forked from yitepeli/LOFDetectionInProteinMutation
-
Notifications
You must be signed in to change notification settings - Fork 0
/
evalData.py
90 lines (71 loc) · 2.37 KB
/
evalData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 5 21:42:39 2018
@author: yitepeli
"""
import csv
import numpy as np
from sklearn.svm import SVC
from sklearn import preprocessing
import itertools
import pandas as pd
def processData(path='PPData.csv'):
    """Load amino-acid pairs and LOF labels from a CSV file.

    Every non-empty row contributes one sample. Columns 1 and 2 (0-based)
    form the feature pair, with a ``*`` in column 2 rewritten to ``X``.
    Column 6 supplies the label.

    Args:
        path: CSV file to read. Defaults to ``'PPData.csv'`` in the current
            working directory.

    Returns:
        A tuple ``(outData, outLabel)``. ``outData`` is a 2-D string array
        holding one ``[column 1, column 2]`` pair per sample. ``outLabel``
        is a 1-D integer array holding 1 where column 6 equals ``"LOF"``
        and 0 otherwise. Both are empty 1-D arrays when the file contains
        no data rows.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a non-empty row has fewer than seven columns.
    """
    # Collect into plain lists and convert once at the end: calling
    # np.append per row copies the whole array each time (quadratic cost).
    pairs = []
    labelValues = []
    with open(path, newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            # csv.reader yields [] for blank lines (e.g. a trailing newline).
            if not row:
                continue
            # If new aa is * then it is converted to X
            newAa = "X" if row[2] == "*" else row[2]
            pairs.append([row[1], newAa])
            # If LOF then label = 1, else label = 0
            labelValues.append(1 if row[6] == "LOF" else 0)
    return np.array(pairs), np.array(labelValues)
def predict():
    """Train an SVM on one-hot encoded samples and print two accuracies.

    Samples come from processData(). Both feature columns are one-hot
    encoded, the first 80% of the rows (in file order, no shuffling) are
    used for training and the remainder for testing. Two lines are printed:
    the majority-class baseline accuracy on the test rows and the accuracy
    of the SVM's predictions on those same rows.
    """
    samples, targets = processData()

    # Encode amino acids: one indicator column per distinct value in each
    # of the two feature columns.
    frame = pd.DataFrame({
        'A': samples[:, 0].tolist(),
        'B': samples[:, 1].tolist(),
    })
    dummies = pd.get_dummies(frame, prefix=['aa0', 'aa1'])
    encoded = np.array(dummies.values.tolist())

    # Train and test data, 80% / 20%.
    splitAt = round(8 * len(targets) / 10)
    trainX, testX = encoded[:splitAt], encoded[splitAt:]
    trainY, testY = targets[:splitAt], targets[splitAt:]
    testCount = len(testY)

    # Base accuracy: what always guessing the more frequent test label
    # would score.
    positives = int(np.count_nonzero(testY == 1))
    baseline = max(positives, testCount - positives) / testCount
    print("Base Accuracy: " + str(baseline))

    # Create and fit the SVM model, then predict the held-out rows.
    model = SVC(gamma='auto')
    model.fit(trainX, trainY)
    predicted = model.predict(testX)

    # Prediction statistics: fraction of test rows predicted correctly.
    hits = int(np.count_nonzero(predicted == testY))
    accuracy = hits / testCount
    print("Prediction Accuracy: " + str(accuracy))
def main():
    """Script entry point: run the train-and-evaluate pipeline once."""
    predict()
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()