-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate_hdf5_10.py
86 lines (76 loc) · 2.74 KB
/
generate_hdf5_10.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import numpy as np
import pandas as pd
import h5py
import glob
import os
#from unicode import *
def seq_to_matrix(seq, seq_matrix, seq_order):
    """
    One-hot encode a single RNA sequence into one row of ``seq_matrix``.

    For every position ``i`` of ``seq`` whose character is A, C, G or U
    (upper or lower case), ``seq_matrix[seq_order, i, c]`` is set to 1,
    where ``c`` is 0, 1, 2 or 3 respectively. Any other character leaves
    the matrix untouched at that position.

    :param seq: the sequence to encode
    :param seq_matrix: array indexed as [sequence, position, channel];
                       it is modified in place
    :param seq_order: index of this sequence along the first axis
    :return: the same ``seq_matrix`` object that was passed in
    """
    channel_of = {
        'A': 0, 'a': 0,
        'C': 1, 'c': 1,
        'G': 2, 'g': 2,
        'U': 3, 'u': 3,
    }
    for position, nucleotide in enumerate(seq):
        channel = channel_of.get(nucleotide)
        if channel is not None:
            seq_matrix[seq_order, position, channel] = 1
    return seq_matrix
def genarate_matrix_for_train(seq_shape, seq_series):
    """
    Generate the one-hot matrix for a collection of sequences.

    A zero array of shape ``seq_shape`` is allocated with ``np.zeros`` and
    each sequence is encoded into its own row by ``seq_to_matrix``.

    :param seq_shape: shape of the output, (seq number, sequence_length, channels)
    :param seq_series: the sequences, in order; any iterable works
                       (numpy array, list, pandas Series)
    :return: the filled matrix of shape ``seq_shape``
    """
    seq_matrix = np.zeros(seq_shape)
    # Iterate positionally rather than via seq_series[i]: integer indexing
    # on a pandas Series is a label lookup, and plain lists have no .shape.
    for row, seq_tem in enumerate(seq_series):
        seq_matrix = seq_to_matrix(seq_tem, seq_matrix, row)
    return seq_matrix
def mkdir(path):
    """
    Create directory ``path`` (and any missing parents) if it is absent.

    :param path: directory path to create
    :return: True if this call created the directory, False if something
             already exists at ``path``
    :raises OSError: if creation fails and nothing exists at ``path``
    """
    # Attempt the creation first instead of checking beforehand, so that a
    # directory appearing between the check and the creation is not an error.
    try:
        os.makedirs(path)
    except OSError:
        if os.path.exists(path):
            return False
        raise
    return True
def generate_dataset_matrix(file_path, seq_len=81, channels=9):
    """
    Convert the ``train.fa`` file inside ``file_path`` into an HDF5 file.

    The input is read with pandas as a space-separated table without a
    header. Column 0 is stored as ``seq_idx``, column 1 is one-hot encoded
    and stored as ``sequences``, and column 2 is stored as ``labs``.
    The output is written to ``<dir>/train.hdf5`` where ``<dir>`` is the
    input directory with "motif_discovery" replaced by "HDF5t".

    :param file_path: directory that contains ``train.fa``
    :param seq_len: length of the position axis of the encoded matrix
    :param channels: size of the channel axis of the encoded matrix
    :return: None
    """
    # os.path.join keeps this working on any platform, not only Windows.
    filenames = glob.glob(os.path.join(file_path, "train.fa"))
    for allFileFa in filenames:
        AllTem = os.path.splitext(os.path.basename(allFileFa))[0]
        # Take the directory part explicitly; splitting the full path on the
        # file stem would cut at an earlier "train" inside a directory name.
        output_dir = os.path.dirname(allFileFa).replace("motif_discovery", "HDF5t")

        ChipSeqlFileFa = pd.read_csv(allFileFa, sep=' ', header=None, index_col=None, engine='python')
        seq_name = np.asarray(ChipSeqlFileFa.loc[:, 0])
        seq_series = np.asarray(ChipSeqlFileFa.loc[:, 1])
        seq_label_out = np.asarray(ChipSeqlFileFa.loc[:, 2])

        # NOTE(review): seq_to_matrix only writes channels 0-3, so with the
        # default of 9 channels the remaining ones stay zero -- presumably
        # expected by the downstream consumer; confirm.
        seq_matrix_out = genarate_matrix_for_train(
            (seq_series.shape[0], seq_len, channels), seq_series
        )

        mkdir(output_dir)
        # NOTE(review): seq_name / seq_label_out are passed to h5py exactly
        # as pandas produced them; if they are object-dtype strings they may
        # need encoding first -- confirm against the h5py version in use.
        # The context manager closes the file even if a write fails.
        with h5py.File(os.path.join(output_dir, AllTem + ".hdf5"), 'w') as f:
            f.create_dataset("sequences", data=seq_matrix_out)
            f.create_dataset("labs", data=seq_label_out)
            f.create_dataset("seq_idx", data=seq_name)
        print(f)
if __name__ == '__main__':
    import sys

    # Channel index -> nucleotide, matching the order used by seq_to_matrix.
    # Not referenced elsewhere in this script.
    base = {0: "A", 1: "C", 2: "G", 3: "U"}

    # Root folder holding one sub-directory per data set. Pass it as the
    # first command-line argument, or edit the default below.
    default_root = r"F:\Download\lengent\venv\MAHyNet-main\demo\motif_discovery"
    motif_root = sys.argv[1] if len(sys.argv) > 1 else default_root

    # Every entry directly under the root is handed to the converter, which
    # looks for a train.fa file inside it.
    allFileFaList = glob.glob(os.path.join(motif_root, "*"))
    for FilePath in allFileFaList:
        generate_dataset_matrix(FilePath)