-
Notifications
You must be signed in to change notification settings - Fork 3
/
divide_data_sortdate.py
50 lines (39 loc) · 1.47 KB
/
divide_data_sortdate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import sys
import os
import pandas
import numpy
import preprocess
# Split the rows of <path><project>.csv into training / validation / test
# partitions by row position and write the assignment, one row of 0/1 flags per
# CSV row, to files/<project>_3sets.txt.
#
# The rows are partitioned in the order they appear in the CSV; no shuffling or
# sorting happens here.
# NOTE(review): presumably the CSV is already sorted by creation date (see the
# script name) -- confirm against the step that produces it.

# assumes preprocess.arg_passing returns a mapping keyed by the command-line
# flag names -- TODO confirm against preprocess.py
args = preprocess.arg_passing(sys.argv)
path = args['-path']
project = args['-project']

# `path` is used as a plain string prefix, so it must already end with a path
# separator.
data_path = path + project + '.csv'
data = pandas.read_csv(data_path)

# The 'created' column is only present in the Tawosi dataset. It holds the
# creation date-time of each issue, which is used to sort the issues but is not
# needed here. errors='ignore' keeps this a no-op for CSVs without the column.
data = data.drop(['created'], axis=1, errors='ignore')
data = data.values
keys = data[:, 0]  # first CSV column; only its length is used below

# Partition sizes as percentages of the total number of rows.
trainingSize = 60
validationSize = 20
testSize = 20

if trainingSize + validationSize + testSize == 100:
    numData = len(keys)
    # Floor division keeps the sizes integral on both Python 2 and Python 3;
    # with true division the slice bounds below would be floats.
    numTrain = (trainingSize * numData) // 100
    numValidation = (validationSize * numData) // 100
    numTest = (testSize * numData) // 100

    # Single-argument parenthesised print emits the same text on Python 2 and 3.
    print("Total data: %s" % numData)
    print("Training size: %s, validation size: %s, testing size: %s" % (numTrain, numValidation, numTest))
    # May be smaller than numData because each size is rounded down.
    print("Total: %s" % (numTrain + numValidation + numTest))

    # One row per CSV row; columns are the train / valid / test flags.
    divided_set = numpy.zeros((len(keys), 3)).astype('int64')

    # NOTE(review): the boundaries are shifted by one, so training actually gets
    # numTrain - 1 rows, validation gets numValidation rows, and test gets every
    # remaining row (the rounding remainder plus the row training gave up). This
    # differs from the sizes printed above. It is kept as-is because changing it
    # would change the generated split files -- confirm whether it is intended.
    divided_set[0:numTrain - 1, 0] = 1
    divided_set[numTrain - 1:numTrain + numValidation - 1, 1] = 1
    divided_set[numTrain + numValidation - 1:numData, 2] = 1

    # The output directory is relative to the current working directory.
    if not os.path.exists('files/'):
        os.makedirs('files/')

    # The context manager closes the file even if a write fails.
    with open('files/' + project + '_3sets.txt', 'w') as f:
        f.write('train\tvalid\ttest')
        # Each row is written with a leading newline, so the file has no
        # trailing newline.
        for s in divided_set:
            f.write('\n%d\t%d\t%d' % (s[0], s[1], s[2]))
else:
    # The three percentages above must add up to exactly 100.
    print("check size")