# CRC_Dataset.py
import torch
from torchvision.transforms import Compose
from transforms import ToTensor
import numpy as np
import glob
import os


class CV_Splits(object):
    '''
    Generator for k-fold cross validation splits, compatible with PyTorch Datasets.

    Args:
        :cv_folds: number k of folds for k-fold cross validation
        :subset_size: optional cap on how many samples are used per cross validation iteration
        :shuffle: bool, shuffle indices before creating the splits
        :dataset: optionally initialize the generator with a PyTorch Dataset
    '''

    def __init__(self, cv_folds, subset_size=None, shuffle=True, dataset=None):
        self.folds = cv_folds
        self.dataset = dataset
        self.shuffle = shuffle
        # determine how much of the available training data is used for each cross validation iteration
        self.subset_size = subset_size
        if subset_size:
            self.n_train = int((cv_folds - 1) / cv_folds * subset_size)
            self.n_val = int(subset_size / cv_folds)
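        # Example (illustrative): with cv_folds=5 and subset_size=1000, each cross
        # validation iteration trains on n_train = int(4/5 * 1000) = 800 samples and
        # validates on n_val = int(1000/5) = 200 samples taken from the shuffled splits.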

    def __call__(self, dataset):
        '''
        Args:
            :dataset: PyTorch dataset to reassign
        '''
        self.dataset = dataset

    def __iter__(self):
        '''
        Generator method, yields the training and validation set of each fold as PyTorch Datasets.
        '''
        self.indices = np.arange(len(self.dataset))

        if self.shuffle:
            self.indices = np.random.permutation(self.indices)

        # create splits
        self.splits = np.array_split(self.indices, self.folds)

        # iterate over fold combinations
        for fold in range(self.folds):
            train_idx = np.concatenate([split for i, split in enumerate(self.splits) if i != fold])
            valid_idx = self.splits[fold]

            # return training and validation set as Subsets of the original dataset
            if self.subset_size:
                dataset_train = torch.utils.data.Subset(self.dataset, train_idx[:self.n_train])
                dataset_valid = torch.utils.data.Subset(self.dataset, valid_idx[:self.n_val])
            else:
                dataset_train = torch.utils.data.Subset(self.dataset, train_idx)
                dataset_valid = torch.utils.data.Subset(self.dataset, valid_idx)

            yield dataset_train, dataset_valid
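
# Example usage (sketch, not executed): iterate over the folds produced by
# CV_Splits; the DataLoader settings below are illustrative only.
#
#   cv = CV_Splits(cv_folds=5, shuffle=True)
#   cv(dataset)                                  # assign a dataset via __call__
#   for fold, (ds_train, ds_valid) in enumerate(cv):
#       train_loader = torch.utils.data.DataLoader(ds_train, batch_size=32, shuffle=True)
#       valid_loader = torch.utils.data.DataLoader(ds_valid, batch_size=32)
#       # ... train and validate on this fold ...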


class CRC_Dataset(torch.utils.data.Dataset):
    '''
    CRC tissue segmentation dataset. Files are assumed to lie in a root directory with
    subfolders 'frames' and 'masks', stored in the compressed .npz file format.
    (This may change once the training/test split is settled, or if loading compressed
    .npz files becomes a bottleneck in data loading and preprocessing.)
    Furthermore, the images are assumed to be unprocessed RGB images.
    '''

    def __init__(self, root_dir, transforms=[ToTensor()]):
        '''
        Args:
            :root_dir: string containing the root directory of the dataset.
            :transforms: list of transformations to apply to the samples.
        '''
        self.root_dir = root_dir
        self.transforms = transforms
        self.composed_trsfm = Compose(transforms)

        # collect the image and mask file names; sorting keeps image/mask pairs aligned
        # and os.path.join keeps the glob pattern platform independent
        self.images = sorted(glob.glob(os.path.join(root_dir, 'frames', '*.npz')))
        self.labels = sorted(glob.glob(os.path.join(root_dir, 'masks', '*.npz')))

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        '''
        Loads an image and its segmentation mask from disk, applies the transformations and returns them as PyTorch tensors.
        '''
        # load image and label from disk (.npz archives store the array under 'arr_0')
        image = np.load(self.images[idx])['arr_0']
        label = np.load(self.labels[idx])['arr_0']

        # apply transformations
        image, label = self.composed_trsfm((image, label))

        return image, label
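
# Example usage (sketch): build the dataset with an explicit transform list and wrap
# it in a DataLoader. The directory layout follows the class docstring; the batch
# size and worker count are illustrative only.
#
#   dataset = CRC_Dataset(root_dir=os.path.join(os.getcwd(), 'data', 'train'),
#                         transforms=[ToTensor()])
#   loader = torch.utils.data.DataLoader(dataset, batch_size=32, num_workers=2)
#   image, label = dataset[0]   # single sample as PyTorch tensors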


def compute_mean_and_std(dataloader, device):
    '''
    Computes the per-channel mean and standard deviation of the whole dataset from its first and second moments.
    '''
    # running sums of the first and second moment per channel
    mean = torch.zeros(3).to(device)
    mean_sq = torch.zeros(3).to(device)

    # loop over the training set in batches and accumulate per-batch channel means
    for img_batch, _ in dataloader:
        n_img, channel = img_batch.size(0), img_batch.size(1)
        img_batch = img_batch.view(n_img, channel, -1).to(device)
        mean += torch.mean(img_batch, (0, 2))
        mean_sq += torch.mean(img_batch**2, (0, 2))

    # divide by the number of batches (exact if all batches have the same size;
    # a smaller last batch introduces only a negligible bias)
    mean /= len(dataloader)
    mean_sq /= len(dataloader)

    # std = sqrt(E[X^2] - E[X]^2)
    return mean.cpu(), torch.sqrt(mean_sq - mean**2).cpu()
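
# Example (sketch): how the computed statistics would typically be consumed. Note that
# torchvision's Normalize operates on a single image tensor, while the transforms in
# this project act on (image, label) tuples, so this only illustrates the usual
# normalization step and is not code from this repository.
#
#   from torchvision.transforms import Normalize
#   mean, std = compute_mean_and_std(dataloader, device="cpu")
#   normalize = Normalize(mean=mean.tolist(), std=std.tolist())
#   image_normalized = normalize(image)   # image: float tensor of shape (3, H, W)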


def compute_mean_class_occurences(dataloader):
    '''
    Computes, for each of the three class labels (0, 1, 2), the mean number of pixels per image belonging to that class.
    '''
    cl_occurences = np.zeros(3)

    for _, mask_batch in dataloader:
        # flatten each mask to one row of pixel labels
        batch_np = mask_batch.view(mask_batch.size(0), -1).numpy()
        for i in range(3):
            cl_occurences[i] += np.mean(np.count_nonzero(batch_np == i, axis=1))

    # average of the per-batch means over all batches
    return cl_occurences / len(dataloader)
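
# Example (sketch, not from this repository): a common way to turn the mean class
# occurrences into class weights for an imbalanced segmentation loss is
# inverse-frequency weighting.
#
#   occurrences = compute_mean_class_occurences(dataloader)   # e.g. roughly [131383, 68638, 49979]
#   weights = occurrences.sum() / (len(occurrences) * occurrences)
#   criterion = torch.nn.CrossEntropyLoss(weight=torch.as_tensor(weights, dtype=torch.float32))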


if __name__ == "__main__":
    import time
    from utils import MirrorPad
    import matplotlib.pyplot as plt

    # quick test of the dataset and the statistics helpers
    dataset = CRC_Dataset(root_dir=os.path.join(os.getcwd(), 'data', 'train'))

    #print(dataset.images[:10], dataset.labels[:10], len(dataset))
    #t1 = time.perf_counter()
    #image, label = dataset[np.random.randint(len(dataset))]
    #t2 = time.perf_counter() - t1
    #print(f"{CRC_Dataset.__getitem__} took {t2}s")
    #print(image.shape, label.shape, image.dtype, label.dtype)
    #plt.imshow(image.numpy().transpose((1, 2, 0)))
    #plt.show()

    dataloader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=32,
        pin_memory=True,
        num_workers=2
    )

    #occurences = compute_mean_class_occurences(dataloader)
    #print(occurences)  # rounded [131383, 68638, 49979]

    mean, std = compute_mean_and_std(dataloader, device="cpu")  # set device to "cuda:0" if a GPU is available
    print(mean, std)  # prints approx. mean=(0.7979, 0.6772, 0.7768), std=(0.1997, 0.3007, 0.2039)
    # new (11,033 samples): mean=(0.7942, 0.6693, 0.7722), std=(0.1998, 0.3008, 0.2037)