Commit 256aeed9 authored by Martin Řepa's avatar Martin Řepa

Successfully migrated to PyTorch. Working loss function

parent 78bdbab5
......@@ -14,6 +14,7 @@ def np_arrays_from_scored_csv(file_name: str, label: int,
See usage in main
"""
# TODO enable loading a zero-size array as well
content = pandas.read_csv(Path(dirname(__file__)) / Path('scored') / Path(file_name))
batch = []
labels = []
......
import logging
import operator
from collections import Counter
from itertools import count
from typing import List
......@@ -10,7 +9,8 @@ import pulp
from config import RootConfig
from src.data.loader import np_arrays_from_scored_csv
from src.neural_networks.network import NeuralNetwork
from src.neural_networks.network import NeuralNetwork, FormattedBenignData, \
FormattedMaliciousData
logger = logging.getLogger(__name__)
......@@ -44,35 +44,46 @@ class GameSolver:
self.utility = conf.base_conf.utility_function
train = conf.nn_train_conf
self.benign_data = np_arrays_from_scored_csv(
train.benign_data_file_name,
0, train.benign_data_count)
self.benign_data_prob = self.calculate_benign_data_prob()
def calculate_benign_data_prob(self):
# TODO maybe this rounding is not really good for real results
benign_data = list(map(lambda x: tuple(map(lambda y: round(y, 2), x)),
self.benign_data[0]))
benign_data_prob = Counter(benign_data)
for key, val in benign_data_prob.items():
benign_data_prob[key] = val / len(benign_data)
return benign_data_prob
def _get_trained_nn(self, attacker_features_x, attacker_actions) -> NeuralNetwork:
raw_benign_x, _ = np_arrays_from_scored_csv(train.benign_data_file_name,
0, train.benign_data_count)
self.benign_data = self.prepare_benign_data(raw_benign_x)
def prepare_benign_data(self, raw_x_data) -> FormattedBenignData:
unique, counts = np.unique(raw_x_data, axis=0, return_counts=True)
probs = counts / len(raw_x_data)
benign_y = np.zeros(len(unique))
return FormattedBenignData(unique, probs, benign_y)
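# Minimal standalone sketch (made-up toy values, not part of this commit) of
# what prepare_benign_data computes: np.unique with axis=0 and
# return_counts=True collapses duplicate rows, normalising the counts by the
# dataset size gives the empirical probability of each unique benign point,
# and every benign point is labelled 0.
import numpy as np

raw = np.array([[0.1, 0.2], [0.1, 0.2], [0.3, 0.4]])
unique, counts = np.unique(raw, axis=0, return_counts=True)
probs = counts / len(raw)            # -> [2/3, 1/3]
benign_y = np.zeros(len(unique))     # -> [0., 0.]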
# def calculate_benign_data_with_probs(self):
# # TODO maybe this rounding is not really good for real results
# benign_data = list(map(lambda x: tuple(map(lambda y: round(y, 2), x)),
# self.benign_data[0]))
# benign_data_counter = Counter(benign_data)
# benign_data_points = []
# benign_data_probs = []
# for key, val in benign_data_counter.items():
# benign_data_points.append(key)
# benign_data_probs.append(val / len(benign_data))
# return np.array(benign_data_points), np.array(benign_data_probs)
def _get_trained_nn(self, attack: FormattedMaliciousData) -> NeuralNetwork:
# Initialize the model
network = NeuralNetwork(self.conf.base_conf.features_count,
self.conf.nn_conf,
self.conf.nn_train_conf)
network.set_attacker_actions(attacker_actions)
network.train(attacker_features_x, self.benign_data)
network.set_data(self.benign_data, attack)
network.train()
# TODO use different dataset to calc false_positives
# network.calc_n0_false_positives(self.benign_data[0])
return network
def double_oracle(self, actions_p1: List) -> Result:
non_attack = FormattedMaliciousData(np.empty(0), np.empty(0), np.empty(0))
# Seed the game with the first attacker action and a defender NN trained on benign data only
played_actions_p1 = set(actions_p1[:1])
played_actions_p2 = {self._get_trained_nn([[]])}
played_actions_p2 = {self._get_trained_nn(non_attack)}
for i in count():
logger.debug(f'Iteration: {i}\n')
......@@ -149,21 +160,16 @@ class GameSolver:
lambda a2: self.utility(a1, a2), actions_2), p2)))
def best_response_p2(self, used_actions_p1, probs_p1):
malicious_features = []
for ai, pi in zip(used_actions_p1, probs_p1):
counter = int(self.conf.nn_train_conf.malicious_data_count * pi)
for _ in range(counter):
malicious_features.append(ai)
# Keep only attacker actions played with non-zero probability
non_zero_p = np.where(np.asarray(probs_p1) != 0)
actions_2 = np.asarray(used_actions_p1)[non_zero_p]
p2 = np.asarray(probs_p1)[non_zero_p]
attacker_actions = (actions_2, p2)
unique_attack_x = np.asarray(used_actions_p1)[non_zero_p]
attack_probs = np.asarray(probs_p1)[non_zero_p]
attack_y = np.ones(len(unique_attack_x))
attack = FormattedMaliciousData(unique_attack_x, attack_probs, attack_y)
logger.debug('Let\'s train a new NN with this malicious data:')
logger.debug(f'{malicious_features}\n')
return self._get_trained_nn(malicious_features, attacker_actions)
logger.debug(f'{unique_attack_x}\n')
return self._get_trained_nn(attack)
def solve_zero_sum_game_pulp(self, actions_p1: List[List[float]],
actions_p2: List[NeuralNetwork]):
......@@ -195,7 +201,8 @@ class GameSolver:
# Calc false positive cost with benign data probability distribution
fp_cost = 0
for features, features_prob in self.benign_data_prob.items():
benign_points, benign_probs = self.benign_data.unique_x, self.benign_data.probs_x
for features, features_prob in zip(benign_points, benign_probs):
for nn, nn_prob in zip(actions_p2, probs_p_two):
# limit_predict now expects a torch.Tensor (assumes torch is imported in this module)
limit = nn.limit_predict(torch.tensor([features]).float())[0].item()
fp_cost += (limit ** 4) * features_prob * nn_prob
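# Standalone numeric sketch (made-up values, not part of this commit): the
# false-positive term above is a double expectation over benign points and
# over the defender's mixture of trained networks,
# fp_cost = sum_f sum_nn limit_nn(f)**4 * P(f) * P(nn).
import numpy as np

limits = np.array([[0.0, 0.2],        # rows: benign points, columns: networks
                   [0.4, 0.1]])
p_features = np.array([0.75, 0.25])   # empirical probability of each benign point
p_nns = np.array([0.5, 0.5])          # defender's mixed strategy over networks
fp_cost = np.sum((limits ** 4) * np.outer(p_features, p_nns))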
......
import logging
from pathlib import Path
from typing import List, Tuple
import attr
import numpy as np
import torch
from sklearn.model_selection import train_test_split
......@@ -14,6 +14,20 @@ from src.data.loader import np_arrays_from_scored_csv
logger = logging.getLogger(__name__)
# TODO one class is enough
@attr.s
class FormattedBenignData:
unique_x: np.array = attr.ib()
probs_x: np.array = attr.ib()
y: np.array = attr.ib()
@attr.s
class FormattedMaliciousData:
features: np.array = attr.ib()
probs_features: np.array = attr.ib()
y: np.array = attr.ib()
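# Usage sketch (made-up toy values, not part of this commit): attr.s generates
# an __init__ from the attr.ib() fields above, so the containers are built
# directly from the aggregated arrays.
import numpy as np

benign = FormattedBenignData(unique_x=np.array([[0.1, 0.2]]),
                             probs_x=np.array([1.0]),
                             y=np.zeros(1))
attack = FormattedMaliciousData(features=np.array([[0.9, 0.8]]),
                                probs_features=np.array([1.0]),
                                y=np.ones(1))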
class OrderCounter:
order = 0
......@@ -36,71 +50,100 @@ class NeuralNetwork:
nn.Linear(12, 1),
nn.Sigmoid()
)
self.loss_fn = nn.BCELoss()
self.attacker_actions = None
self.epochs = nn_conf.epochs
self.validation_split = nn_train_conf.validation_split
self.id = OrderCounter.next()
self.order = OrderCounter.next()
def set_attacker_actions(self, attacker_actions: Tuple):
self.attacker_actions = attacker_actions
def loss_function(self):
pass
def _prepare_data(self, attacker_features_x: List[List[float]],
benign_data: Tuple[np.ndarray, np.ndarray]):
x, y = benign_data
# Add attacker's malicious actions to dataset
attacker_features_x = np.array(attacker_features_x)
if len(attacker_features_x[0]):
attacker_features_y = [[1] for _ in attacker_features_x]
x = np.concatenate((x, attacker_features_x), axis=0)
y = np.concatenate((y, attacker_features_y), axis=0)
# Variables used for loss function
self.attacker_actions: FormattedMaliciousData = None
self.benign_data: FormattedBenignData = None
# Shuffle benign and malicious data
x, y = shuffle(x, y, random_state=1)
# Split data so we have train dataset and validation dataset
data = train_test_split(x, y, test_size=self.validation_split)
# Convert data to float() for pyTorch model compatibility
data = tuple(map(lambda a: torch.from_numpy(a).float(), data))
# Return final data (x_train, x_validate, y_train, y_validate)
return data
def train(self,
attacker_features_x: List[List[float]],
benign_data: Tuple[np.ndarray, np.ndarray]):
data = self._prepare_data(attacker_features_x, benign_data)
x_train, x_validate, y_train, y_validate = data
self._train(x_train, y_train, x_validate, y_validate)
# TODO Just tmp
self.loss_fn = nn.BCELoss()
def _train(self, x, y, x_validate, y_validate):
learning_rate = 1e-2
def __str__(self):
return f'Neural network with order: {self.order}'
def set_data(self, benign_data: FormattedBenignData, attack: FormattedMaliciousData):
self.attacker_actions = attack
self.benign_data = benign_data
def loss_function(self, x, limits, real_y, probs):
zero_sum_part = real_y*(1-limits)*torch.prod(x, dim=1)*probs
fp_cost = (1-real_y)*probs*torch.pow(limits, 4)
sum_loss = torch.add(torch.sum(zero_sum_part), torch.sum(fp_cost))
return torch.div(sum_loss, len(x))
# Calc false positive cost
# def_indexes = (real_y == 0)
# def_limits = limits[def_indexes]
# def_probs = real_y[def_indexes]
# fp_cost = torch.pow(torch.pow(def_limits, 4), def_probs)
#
# # Calc zero sum part
# attacker_indexes = (real_y == 1)
# att_limits = limits[attacker_indexes]
# att_x = x[attacker_indexes]
# att_probs = probs[attacker_indexes]
# att_rewards = torch.prod(att_x, dim=1)
# att_rewards = torch.pow(att_rewards, att_probs)
# zero_sum = torch.pow(att_rewards, torch.sub(1, att_limits))
#
# final_fp_cost = torch.sum(fp_cost)
# final_zero_sum_part = torch.sum(zero_sum)
# loss = torch.add(final_fp_cost, final_zero_sum_part)
# return loss
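# Standalone numeric sketch (made-up toy tensors, not part of this commit) of
# the active loss_function above. For malicious rows (real_y == 1) the term is
# the attacker's reward prod(x) scaled by the action probability and by
# (1 - limit), i.e. how much of it gets past the defender's limit; for benign
# rows (real_y == 0) it is the false-positive penalty limit**4 weighted by the
# point's empirical probability.
import torch

x = torch.tensor([[0.9, 0.8],    # one malicious point
                  [0.1, 0.2]])   # one benign point
limits = torch.tensor([0.6, 0.3], requires_grad=True)
real_y = torch.tensor([1.0, 0.0])
probs = torch.tensor([1.0, 1.0])

zero_sum_part = real_y * (1 - limits) * torch.prod(x, dim=1) * probs
fp_cost = (1 - real_y) * probs * torch.pow(limits, 4)
loss = (zero_sum_part.sum() + fp_cost.sum()) / len(x)
loss.backward()   # gradients flow back into `limits` (and, in training, the model)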
def _prepare_data(self):
defender = self.benign_data
attacker = self.attacker_actions
x = np.concatenate((defender.unique_x, attacker.features), axis=0)
y = np.concatenate((defender.y, attacker.y), axis=0)
probs = np.concatenate((defender.probs_x, attacker.probs_features), axis=0)
# Shuffle before splitting
x, y, probs = shuffle(x, y, probs, random_state=1)
# Split into train and test data given the ratio in config
data = train_test_split(x, y, probs, test_size=self.validation_split)
x_train, x_test, y_train, y_test, probs_train, probs_test = data
self.x_train = torch.from_numpy(x_train).float()
self.x_test = torch.from_numpy(x_test).float()
self.y_train = torch.from_numpy(y_train).float()
self.y_test = torch.from_numpy(y_test).float()
self.probs_train = torch.from_numpy(probs_train).float()
self.probs_test = torch.from_numpy(probs_test).float()
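# Sketch (toy data and a made-up 0.33 split ratio, not part of this commit):
# sklearn's train_test_split accepts several parallel arrays and splits them
# all with the same shuffled indices, returning train/test pairs in the order
# the arrays were passed, so x, y and the per-point probabilities stay aligned.
import numpy as np
from sklearn.model_selection import train_test_split

x = np.arange(12).reshape(6, 2)
y = np.array([0, 0, 0, 1, 1, 1])
probs = np.full(6, 1 / 6)
splits = train_test_split(x, y, probs, test_size=0.33, random_state=1)
x_tr, x_te, y_tr, y_te, p_tr, p_te = splits   # rows stay aligned across the three arrays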
def train(self):
self._prepare_data()
self._train()
def _train(self):
learning_rate = 1e-4
optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate)
x.requires_grad = True
for e in range(self.epochs):
logger.debug(f'Running epoch number {e}/{self.epochs}')
# Forward pass: compute predicted y by passing x to the model.
y_pred = self.model(x)
train_limits = self.limit_predict(self.x_train, with_grad=True)
# for l in train_limits:
# print(l.dtype, end=' ')
# print()
# Compute and print loss.
loss = self.loss_fn(y_pred, y)
logger.debug(f'TestLoss: {loss.item()}, ValidateLoss: todo') # todo
# Compute loss.
loss = self.loss_function(self.x_train, train_limits, self.y_train,
self.probs_train)
# loss = self.loss_fn(train_limits, self.y_train)
# Compute validation loss and report some info
if e % 5 == 0:
with torch.no_grad():
y_validate_pred = self.model(x_validate)
validate_loss = self.loss_fn(y_validate_pred, y_validate)
test_limits = self.limit_predict(self.x_test)
validate_loss = self.loss_function(self.x_test, test_limits,
self.y_test, self.probs_test)
logger.debug(f'Epoch: {e}/{self.epochs},\t'
f'TrainLoss: {loss.item()},\t'
f'ValidateLoss: {validate_loss},\t')
f'TrainLoss: {loss},\t'
f'ValidateLoss: {validate_loss.item()},\t')
# Before the backward pass, use the optimizer object to zero all of
# the gradients for the variables it will update
......@@ -114,17 +157,21 @@ class NeuralNetwork:
# parameters
optimizer.step()
def raw_predict(self, x):
with torch.no_grad():
tensor = torch.tensor(x).float()
res = self.model(tensor)
return res.numpy()
def _raw_predict(self, tensor: torch.Tensor):
# TODO maybe this can help
return self.model(tensor)
def limit_predict(self, x):
raw_prediction = self.raw_predict(x)
def limit_predict(self, x: torch.Tensor, with_grad=False):
if with_grad:
raw_prediction = self._raw_predict(x)
else:
with torch.no_grad():
raw_prediction = self._raw_predict(x)
np_limit_func = np.vectorize(lambda p: 0 if p < 0.5 else (p - 0.5) * 2)
return np_limit_func(raw_prediction)
# The same as lambda p: 0 if p < 0.5 else (p - 0.5) * 2
clamped = raw_prediction.clamp(min=0.5, max=1)
limit = torch.mul(torch.add(clamped, -0.5), 2)
return limit
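# Quick check (made-up probabilities, not part of this commit) that the tensor
# form above matches the old element-wise rule 0 if p < 0.5 else (p - 0.5) * 2,
# while keeping the operation differentiable for with_grad=True.
import torch

p = torch.tensor([0.1, 0.5, 0.75, 1.0])
limit = (p.clamp(min=0.5, max=1) - 0.5) * 2
# limit -> tensor([0.0000, 0.0000, 0.5000, 1.0000])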
def setup_loger(conf):
......@@ -136,13 +183,24 @@ def setup_loger(conf):
if __name__ == '__main__':
setup_loger(RootConfig())
benign_x, benign_y = np_arrays_from_scored_csv(
benign_x, _ = np_arrays_from_scored_csv(
Path('all_benign_scored.csv'), 0, 1000)
malicious_x, malicious_y = np_arrays_from_scored_csv(
Path('scored_malicious.csv'), 1, 0)
malicious_x, _ = np_arrays_from_scored_csv(
Path('scored_malicious.csv'), 1, 500)
benign_unique_x, counts = np.unique(benign_x, axis=0, return_counts=True)
probs_benign = np.array([count / len(benign_x) for count in counts])
benign_y = np.zeros(len(benign_unique_x))
benign_data = FormattedBenignData(benign_unique_x, probs_benign, benign_y)
malicious_unique_x, counts = np.unique(malicious_x, axis=0, return_counts=True)
probs_malicious = np.array([count / len(malicious_x) for count in counts])
malicious_y = np.ones(len(malicious_unique_x))
malicious_data = FormattedMaliciousData(malicious_unique_x, probs_malicious, malicious_y)
nn = NeuralNetwork()
nn.train(malicious_x, (benign_x, benign_y))
nn.set_data(benign_data, malicious_data)
nn.train()
# test_loss, test_acc = network.model.evaluate(x_test, y_test)
# print('Test loss:', test_loss)
......