-
Notifications
You must be signed in to change notification settings - Fork 0
/
compress_h5.py
109 lines (81 loc) · 3.5 KB
/
compress_h5.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import os
import h5py
import argparse
from readimc import MCDFile
import re
import numpy as np
import exifread
from tifffile import imread
import pandas as pd
def parse(argv=None):
    """Parse the command-line options of the compression script.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. The default (``None``) makes argparse
            read ``sys.argv[1:]``, as before.

    Returns:
        argparse.Namespace: with ``source_folder`` (str), ``out_file`` (str)
        and ``level`` (int). All three options are mandatory.
    """
    parser = argparse.ArgumentParser()
    # ``required`` takes a boolean flag. The previous code passed the ``bool``
    # type itself, which only behaved as "required" because a class is truthy.
    parser.add_argument("--source_folder", type=str, required=True, help="Path to data to compress.")
    parser.add_argument("--out_file", type=str, required=True, help="Output h5file.")
    parser.add_argument("--level", type=int, required=True, help="Compression level.")
    return parser.parse_args(argv)
def read_IMC_rois(mcd_file):
    """Read the acquisitions of the first slide of an MCD file.

    Acquisitions whose image data cannot be read are skipped (best effort),
    but a warning is emitted so that dropped ROIs do not go unnoticed.

    Args:
        mcd_file: Path of the ``.mcd`` file, opened with ``MCDFile``.

    Returns:
        dict: maps the ROI number (the first run of digits found in the
        acquisition description) to a dict with the keys ``"image"`` (the
        value returned by ``read_acquisition``) and ``"channels"`` (a numpy
        array built from the acquisition's channel labels). If two
        acquisitions yield the same ROI number, the later one wins.

    Raises:
        ValueError: if an acquisition description contains no digits.
    """
    import warnings

    rois = {}
    with MCDFile(mcd_file) as f:
        # Only the first slide of the file is inspected.
        for acq in f.slides[0].acquisitions:
            try:
                img = f.read_acquisition(acq)
            except Exception as err:
                # ``Exception`` instead of a bare ``except`` so that
                # KeyboardInterrupt / SystemExit still propagate.
                warnings.warn(
                    f"Skipping unreadable acquisition {acq.description!r} in {mcd_file}: {err}",
                    stacklevel=2,
                )
                continue
            match = re.search(r"[0-9]+", acq.description)
            if match is None:
                raise ValueError(
                    f"No ROI number found in acquisition description {acq.description!r} of {mcd_file}"
                )
            rois[int(match.group())] = {
                "image": img,
                "channels": np.array(acq.channel_labels),
            }
    return rois
def read_IF_rois(tif_files):
    """Group single-channel TIF files into one channel-first stack per ROI.

    For every file the ROI number is parsed from EXIF tag ``0xB0B7`` and the
    channel letter from the file name (an upper-case letter preceded by ``_``
    and followed by ``.``). Files sharing a ROI number are sorted by channel
    letter and stacked along a new leading axis.

    Args:
        tif_files: Iterable of paths to the TIF files of one folder.

    Returns:
        dict: maps each ROI number to a dict with the keys ``"image"``
        (stacked array, channel axis first) and ``"channels"`` (numpy array
        of the channel letters in stack order). Empty if ``tif_files`` is
        empty.

    Raises:
        KeyError: if a file has no ``Image Tag 0xB0B7`` EXIF entry.
        ValueError: if no ROI number or no channel letter can be parsed.
    """
    info = []
    for file in tif_files:
        with open(file, 'rb') as f:
            tags = exifread.process_file(f)
            tag_text = str(tags["Image Tag 0xB0B7"])

        # Prefer the "ROI_<n>" spelling and fall back to "ROI<n>", as before,
        # but without using a bare ``except`` as control flow.
        match = (
            re.search(r"(?<=[rR][oO][iI]_)[0-9]+", tag_text)
            or re.search(r"(?<=[rR][oO][iI])[0-9]+", tag_text)
        )
        if match is None:
            raise ValueError(f"No ROI number found in EXIF tag 0xB0B7 of {file}")
        roi_num = int(match.group())

        # Match against the base name only, so that a directory such as
        # "run_A.1/" cannot be mistaken for the channel marker.
        channel_hits = re.findall(r"(?<=_)[A-Z]\.", os.path.basename(file))
        if not channel_hits:
            raise ValueError(f"No channel letter found in file name {file}")
        channel = channel_hits[0][0]

        info.append([roi_num, channel, file])

    info = pd.DataFrame(info, columns=["roi", "channel", "filename"])

    rois = {}
    for roi in np.unique(info["roi"]):
        group = info[info["roi"] == roi].sort_values("channel")
        rois[roi] = {
            # Stacking on axis 0 yields the same (channel, row, col) values as
            # stacking last and transposing (2, 0, 1).
            # NOTE(review): assumes every TIF decodes to one 2-D plane - confirm.
            "image": np.stack([imread(name) for name in group["filename"]], axis=0),
            "channels": np.array(group["channel"].tolist()),
        }
    return rois
def compress_rois_to_h5(modality_group, roi_list, l):
    """Store every ROI of one modality as a gzip-compressed subgroup.

    For each entry of ``roi_list`` a subgroup named after the stringified key
    is created inside ``modality_group``. The channel names are saved in the
    subgroup attribute ``channel_names`` and the image in a dataset ``image``.

    Args:
        modality_group: Group-like object offering ``create_group``.
        roi_list: Mapping of ROI key to a dict with ``"image"`` and
            ``"channels"`` entries.
        l: gzip compression level, forwarded as ``compression_opts``.

    Returns:
        The ``modality_group`` that was passed in.
    """
    for roi_id in roi_list:
        entry = roi_list[roi_id]
        subgroup = modality_group.create_group(str(roi_id))
        channel_names = entry['channels'].astype(str).tolist()
        subgroup.attrs['channel_names'] = channel_names
        subgroup.create_dataset(
            'image',
            data=entry['image'],
            compression='gzip',
            compression_opts=l,
        )
    return modality_group
def compress_folder_to_h5(out, root, files, level):
    """Compress the IF and IMC data of one folder into ``<out>/<folder>.h5``.

    The output file contains a group ``IF`` (filled from the ``.TIF`` files)
    and a group ``IMC`` (filled from the first ``.mcd`` file), each holding
    one subgroup per ROI. A folder that contains neither kind of file is
    skipped and no output file is created for it.

    Args:
        out: Directory that receives the ``.h5`` file; created if missing.
        root: Folder the files live in; its base name names the output file.
        files: File names (not paths) found directly inside ``root``.
        level: gzip compression level passed on to the datasets.

    Returns:
        None.
    """
    mcd_files = [os.path.join(root, name) for name in files if name.endswith('.mcd')]
    tif_files = [os.path.join(root, name) for name in files if name.endswith('.TIF')]

    # Nothing to compress: return before an empty .h5 file gets created.
    if not mcd_files and not tif_files:
        return

    if out:
        os.makedirs(out, exist_ok=True)

    # normpath strips a trailing separator, which would otherwise make the
    # base name empty and the output file just ".h5".
    folder_name = os.path.basename(os.path.normpath(root))

    with h5py.File(os.path.join(out, folder_name + '.h5'), 'w') as h5file:
        # Create subgroup for IF and a sub-subgroup for each ROI and fill with compressed data
        IF = h5file.create_group('IF')
        compress_rois_to_h5(IF, read_IF_rois(tif_files), level)

        # Create subgroup for IMC and a sub-subgroup for each ROI and fill with compressed data
        IMC = h5file.create_group('IMC')
        if mcd_files:
            # Only the first .mcd file of the folder is read. Without this
            # guard a folder lacking one raised IndexError on mcd_files[0].
            compress_rois_to_h5(IMC, read_IMC_rois(mcd_files[0]), level)
def compress_all_folders(source, out, lev):
    """Walk ``source`` and compress every folder that contains files.

    Args:
        source: Top-level folder that is walked recursively.
        out: Output location handed to ``compress_folder_to_h5``.
        lev: Compression level handed to ``compress_folder_to_h5``.

    Returns:
        None.
    """
    for root, _, files in os.walk(source):
        # Skip every folder without files, not only an empty top-level one:
        # purely structural directories hold nothing to compress and used to
        # be passed on, where they failed for lack of an .mcd file.
        if not files:
            continue
        compress_folder_to_h5(out, root, files, lev)
if __name__ == "__main__":
    # Script entry point: read the CLI options and compress every folder
    # found below the source folder.
    cli_options = parse()
    # NOTE: despite its name, --out_file is used as the directory into which
    # the per-folder .h5 files are written.
    compress_all_folders(
        source=cli_options.source_folder,
        out=cli_options.out_file,
        lev=cli_options.level,
    )