load_data.py
import os
import pickle
import sys

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset

import data_preprocessing
from data_preprocessing import resample_data
from utils.compare_dataset_features import unique_2018_attributes, get_attribute_map, attributes_2018, attributes_2017

CIC_2017 = 'cic-2017'
CIC_2018 = 'cic-2018'
BENIGN_LABEL_2018 = 'Benign'
BENIGN_LABEL_2017 = 'BENIGN'


def load_data(dset, data_path, pkl_path=None):
    """
    Read in the entire 2017 or 2018 dataset and return a train/test split
    :param dset: Which dataset to load, CIC_2017 or CIC_2018
    :param data_path: the path to the root data directory
    :param pkl_path: Path to the directory holding cached pickle objects
    :return: a tuple of (data_train, data_test, labels_train, labels_test)
    """
    is_2018 = dset == CIC_2018
    if pkl_path is not None:
        pkl_path = os.path.join(pkl_path, 'all_data_%s.pkl' % ('2018' if is_2018 else '2017'))
        if os.path.exists(pkl_path):
            # Reuse the cached train/test split
            with open(pkl_path, 'rb') as pkl_file:
                return pickle.load(pkl_file)
        # Otherwise fall through, rebuild from the raw CSVs, and cache below
        print('No pickle found at %s, rebuilding from raw data' % pkl_path, file=sys.stderr)
    all_data = None
    all_labels = []
    all_dropped = 0
    for file in os.listdir(data_path):
        print('Loading file: %s ...' % file)
        data, labels, num_dropped = get_data(os.path.join(data_path, file), is_2018=is_2018)
        if all_data is None:
            all_data = data
        else:
            all_data = np.concatenate((all_data, data))
        all_labels += labels
        all_dropped += num_dropped
    print('Total number of invalid values: %d' % all_dropped)
    print('Total data values: %d' % len(all_labels))
    print('Invalid data: %.2f%%' % (all_dropped / float(all_data.size) * 100))
    # Collect the distinct labels in order of first appearance, for logging
    label_mapping = list(dict.fromkeys(all_labels))
    print('Dataset labels: %s' % str(label_mapping))
    # Perform train/test split
    data_train, data_test, labels_train, labels_test = train_test_split(all_data, all_labels, test_size=0.20)
    # Resample the training data and drop under-represented classes from both splits
    data_train, labels_train, classes_to_drop = resample_data(data_train, labels_train, is_2018=is_2018)
    data_test, labels_test = data_preprocessing.drop_classes(data_test, labels_test, classes_to_drop)
    if pkl_path:
        with open(pkl_path, 'wb') as pkl_file:
            pickle.dump((data_train, data_test, labels_train, labels_test), pkl_file)
    return data_train, data_test, labels_train, labels_test


def get_datasets(dset, data_path, pkl_path=None):
    """
    Load the data into PyTorch Dataset structures for PyTorch processing
    :param dset: String for the dataset desired, CIC_2017 or CIC_2018
    :param data_path: Path to the root directory for the dataset
    :param pkl_path: Path to the pickle data objects
    :return: Training and testing datasets
    """
    data_train, data_test, labels_train, labels_test = load_data(dset, data_path, pkl_path)
    data_train = torch.tensor(data_train)
    data_test = torch.tensor(data_test)
    # Map each string label to an integer index; scan both splits so a
    # label that only appears in the training set still gets an index
    label_mapping = {}
    for label in list(labels_train) + list(labels_test):
        if label not in label_mapping:
            label_mapping[label] = len(label_mapping)
    labels_train = torch.tensor([label_mapping[label] for label in labels_train])
    labels_test = torch.tensor([label_mapping[label] for label in labels_test])
    classes = list(label_mapping.keys())
    dataset_train = TensorDataset(data_train, labels_train)
    dataset_test = TensorDataset(data_test, labels_test)
    dataset_train.classes = classes
    dataset_test.classes = classes
    return dataset_train, dataset_test


def get_data(file, is_2018=True, pkl_path=None):
    """
    Reads the csv file using pandas and returns the data and labels as numpy arrays
    :param file: The file to read from
    :param is_2018: Flag for whether to handle as 2018 or 2017 data
    :param pkl_path: Path to the directory holding cached pickle objects
    :return: a tuple of the data array, the label list, and the number of dropped values
    """
    filename = os.path.splitext(os.path.basename(file))[0] + '.pkl'
    if pkl_path is not None:
        pkl_path = os.path.join(pkl_path, filename)
        if os.path.exists(pkl_path):
            # Reuse the cached per-file arrays
            with open(pkl_path, 'rb') as pkl_file:
                return pickle.load(pkl_file)
        # Otherwise fall through, parse the CSV, and cache the result below
        print('No pickle found at %s, parsing the CSV' % pkl_path, file=sys.stderr)
    if is_2018:
        df = pd.read_csv(file, dtype={'Timestamp': 'string'})
        df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
        # Some 2018 CSVs repeat the header row mid-file, which forces Dst Port
        # to object dtype; keep only the rows whose Dst Port value is numeric
        if df.dtypes['Dst Port'].name != 'int64':
            df = df[np.where(df['Dst Port'].str.isdigit(), True, False)]
        data = df.drop('Label', axis=1)
        # Drop identifier columns that are not meaningful features
        for column in ('Flow ID', 'Src IP', 'Src Port', 'Dst IP'):
            if column in data:
                data = data.drop(column, axis=1)
        # Drop attributes that aren't in the 2017 dataset
        for attribute in unique_2018_attributes:
            if attribute in data:
                data = data.drop(attribute, axis=1)
    else:
        df = pd.read_csv(file)
        df.columns = df.columns.str.strip()
        # Drop attributes that have no counterpart in the 2018 data
        for attribute in unique_2018_attributes:
            if attribute in df:
                df = df.drop(attribute, axis=1)
        # Rename and reorder the columns to match the 2018 schema; rename()
        # replaces whole column names, avoiding the partial substring matches
        # a str.replace over overlapping attribute names could produce
        attribute_map = get_attribute_map()
        df = df.rename(columns=attribute_map)
        df = df[attributes_2018]
        data = df.drop('Label', axis=1)
    labels_list = df['Label'].tolist()
    if not is_2018:
        # Use the 2018 spelling of the benign label so the two datasets agree
        labels_list = [BENIGN_LABEL_2018 if label == BENIGN_LABEL_2017 else label
                       for label in labels_list]
    data_np = data.to_numpy(dtype=np.float32, na_value=0)
    data_np, num_dropped = data_preprocessing.clean_np_data(data_np, labels_list)
    print('Data contains NaN: %s' % str(np.any(np.isnan(data_np))))
    print('Data is all finite: %s' % str(np.all(np.isfinite(data_np))))
    # Normalize data
    data_np = normalize(data_np)
    if pkl_path:
        with open(pkl_path, 'wb') as pkl_file:
            pickle.dump((data_np, labels_list, num_dropped), pkl_file)
    return data_np, labels_list, num_dropped


def normalize(array):
    """
    Normalize each column of a numpy array to the range [0, 1)
    :param array: The data array
    :return: the normalized data
    """
    col_min = np.amin(array, axis=0)
    array -= col_min
    col_max = np.amax(array, axis=0)
    # The small epsilon keeps constant (now all-zero) columns from dividing by zero
    array /= (col_max + 1e-3)
    return array