-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_iterator.py
79 lines (57 loc) · 2.71 KB
/
data_iterator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import string
from collections import Counter
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
# One-letter amino-acid codes in alphabetical order. The position of a letter
# in this list is the column it occupies in the one-hot encoding.
PROTEIN_ALPHABET = list("ACDEFGHIKLMNPQRSTVWY")
def _read_peptides(path):
    """Return the non-blank lines of ``path`` with surrounding whitespace removed."""
    with open(path, 'r', encoding='utf-8') as f:
        stripped = (line.strip() for line in f)
        # Blank lines would become empty peptides, which cannot be one-hot
        # encoded into a (length, alphabet) matrix, so they are dropped here.
        return [line for line in stripped if line]


def read_data_from_files(neg_path='neg_A0201.txt', pos_path='pos_A0201.txt'):
    """Load negative and positive peptides and build the matching label list.

    Each file is expected to hold one peptide per line. Negative examples get
    label 0 and positive examples get label 1; negatives come first in the
    returned lists.

    Args:
        neg_path: Path of the file with negative peptides.
        pos_path: Path of the file with positive peptides.

    Returns:
        A ``(data_text, data_label)`` tuple of two equally long lists.

    Raises:
        OSError: If either file cannot be opened.
    """
    neg_text = _read_peptides(neg_path)
    neg_label = [0] * len(neg_text)
    print(f"number of neg labels: {len(neg_label)}")

    pos_text = _read_peptides(pos_path)
    pos_label = [1] * len(pos_text)
    print(f"number of pos labels: {len(pos_label)}")

    data_text = neg_text + pos_text
    data_label = neg_label + pos_label
    return data_text, data_label
def get_data_for_training(data_text, data_label, batch_size, test_size=0.10, random_state=42):
    """Split the data and wrap both parts in ``DataLoader`` objects.

    Args:
        data_text: Sequence of peptide strings.
        data_label: Sequence of labels aligned with ``data_text``.
        batch_size: Batch size used by both loaders.
        test_size: Passed to ``train_test_split``; share of the data held out
            for the test loader.
        random_state: Passed to ``train_test_split`` as the seed of the split.

    Returns:
        A ``(train_loader, test_loader)`` tuple. The training loader draws
        its samples through the weighted sampler from ``get_sampler``; the
        test loader iterates the held-out data in order.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        data_text, data_label, test_size=test_size, random_state=random_state
    )

    train_dataset = PeptidesDataset(x_train, y_train)
    test_dataset = PeptidesDataset(x_test, y_test)

    # The sampler is built from the training labels only, so the test split
    # keeps whatever class ratio the split produced.
    sampler_train = get_sampler(y_train)
    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, sampler=sampler_train)
    test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size)
    return train_loader, test_loader
def get_sampler(y_train):
    """Build a sampler that weights every sample by the inverse of its class size.

    Labels must be 0 or 1. Each sample's weight is ``1 / count(label)``, so
    the total weight of the two classes is equal and the smaller class is
    oversampled.

    Args:
        y_train: Sequence of integer labels (0 or 1) of the training samples.

    Returns:
        A ``WeightedRandomSampler`` that yields ``len(y_train)`` indices per
        epoch, drawn with replacement.
    """
    counter = Counter(y_train)
    class_sample_count = torch.tensor([counter[0], counter[1]], dtype=torch.float)
    weights = 1. / class_sample_count

    # One vectorised lookup instead of building a 0-d tensor per sample.
    label_index = torch.as_tensor(y_train, dtype=torch.long)
    samples_weights = weights[label_index]

    # Oversampling the minority class needs the same index to be drawn more
    # than once, hence sampling with replacement (stated explicitly).
    sampler = WeightedRandomSampler(
        weights=samples_weights,
        num_samples=len(samples_weights),
        replacement=True,
    )
    return sampler
class PeptidesDataset(Dataset):
    """Map-style dataset of peptide strings with optional labels.

    Every item is a dict holding the raw peptide (``gene``), its one-hot
    encoding over ``PROTEIN_ALPHABET`` (``encoded_gene``) and, when labels
    were supplied, the label as a tensor (``label``).
    """

    def __init__(self, data_text, data_labels=None):
        """Store the peptides and, optionally, their labels.

        Args:
            data_text: Sequence of peptide strings.
            data_labels: Sequence of labels aligned with ``data_text``, or
                ``None`` for unlabeled data.
        """
        self.data_text = data_text
        self.data_labels = data_labels

    def __len__(self):
        """Return the number of peptides."""
        return len(self.data_text)

    def __getitem__(self, index: int):
        """Return the item at ``index`` as a dict.

        The ``label`` key is present only when the dataset was built with
        labels, so an unlabeled dataset can still be iterated.
        """
        peptide = self.data_text[index]
        item = dict(gene=peptide, encoded_gene=self.one_hot_encoder(peptide))
        if self.data_labels is not None:
            item["label"] = torch.tensor(self.data_labels[index])
        return item

    @staticmethod
    def one_hot_encoder(text):
        """One-hot encode ``text`` over ``PROTEIN_ALPHABET``.

        Args:
            text: Peptide string.

        Returns:
            A float tensor of shape ``(len(text), len(PROTEIN_ALPHABET))``.
            A letter that is not in the alphabet yields an all-zero row.
        """
        column_of = {char: col for col, char in enumerate(PROTEIN_ALPHABET)}
        # Allocating the full matrix up front keeps the second dimension
        # even for an empty peptide.
        encoding = torch.zeros(len(text), len(PROTEIN_ALPHABET))
        for row, letter in enumerate(text):
            col = column_of.get(letter)
            if col is not None:
                encoding[row, col] = 1.0
        return encoding

    @staticmethod
    def one_hot_decoder(one_hot_tensor):
        """Turn a one-hot matrix back into a peptide string.

        Args:
            one_hot_tensor: Tensor of shape ``(length, len(PROTEIN_ALPHABET))``
                as produced by ``one_hot_encoder``.

        Returns:
            The decoded string. Each row maps to the letter of its largest
            entry; an all-zero row (a letter the encoder did not know) is
            rendered as the placeholder ``'X'``.
        """
        letters = []
        for row in one_hot_tensor:
            if bool(row.any()):
                letters.append(PROTEIN_ALPHABET[int(row.argmax())])
            else:
                letters.append("X")
        return "".join(letters)