-
Notifications
You must be signed in to change notification settings - Fork 0
/
scalings.py
61 lines (44 loc) · 1.45 KB
/
scalings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
def get_boundaries(ins_table, resolution, sample_id):
    """
    Select the usable boundary rows for one sample.

    Params:
        ins_table: mapping-like object indexed by sample id; each entry must
            expose an ``is_bad_bin`` column and an
            ``is_boundary_<resolution>`` column.
        resolution: value appended to ``'is_boundary_'`` to build the name
            of the boundary flag column.
        sample_id: key used to look up the sample's table in ``ins_table``.
    Returns:
        boundaries_df: the rows of ``ins_table[sample_id]`` where
        ``is_bad_bin`` equals False and the boundary flag equals True.
    """
    flag_column = f'is_boundary_{resolution}'
    sample_table = ins_table[sample_id]
    # Explicit comparisons are kept on purpose: `== False` / `== True` do not
    # behave like `~` / truthiness on non-boolean or NaN-containing columns.
    good_bin = sample_table.is_bad_bin == False  # noqa: E712
    flagged = sample_table[flag_column] == True  # noqa: E712
    return sample_table[good_bin & flagged]
def overlap(ipg, boundaries):
    """
    Overlap a set of intervals with a set of boundaries.

    Thin wrapper around ``bioframe.overlap`` called with its default
    arguments.

    Params:
        ipg: interval dataframe passed as the first (left) argument to
            ``bioframe.overlap``.
        boundaries: boundaries_df passed as the second (right) argument.
    Return:
        overlap_df: whatever ``bioframe.overlap(ipg, boundaries)`` returns.
        NOTE(review): presumably right-hand columns carry a ``_`` suffix
        (e.g. ``chrom_``) under bioframe's defaults -- confirm against the
        installed bioframe version.
    """
    # Use the `ipg` argument itself. The previous body referenced an
    # undefined name (`dld_ipg`), which only worked if a module-level
    # variable of that name happened to exist.
    return bioframe.overlap(ipg, boundaries)
def agg(overlap_df, sample_id):
    """
    Count overlapping boundaries per interval and tag the result with a sample.

    Params:
        overlap_df: dataframe with columns ``chrom``, ``start``, ``end``,
            ``length``, ``state`` and ``chrom_``.
        sample_id: value written into the ``sample`` column of every row.
    Return:
        concat_df: one row per distinct
        (``chrom``, ``start``, ``end``, ``length``, ``state``) group, with a
        ``boundaries`` column holding the number of non-null ``chrom_``
        values in the group and a ``sample`` column set to ``sample_id``.
    """
    interval_keys = ('chrom', 'start', 'end', 'length', 'state')
    per_interval = overlap_df.groupby(list(interval_keys))
    # 'count' ignores nulls, so rows whose `chrom_` is missing add nothing.
    counts = per_interval.agg(boundaries=('chrom_', 'count'))
    counts['sample'] = sample_id
    return counts.reset_index()
def normalize(concat_df):
    """
    Sum boundary counts and lengths per (state, sample) and normalize them.

    Params:
        concat_df: dataframe with columns ``state``, ``sample``,
            ``boundaries`` and ``length``.
    Returns:
        norm_df: one row per (``state``, ``sample``) pair with columns
        ``boundaries_sum``, ``length_sum`` and ``bound_norm``, where
        ``bound_norm = boundaries_sum / (length_sum / 1_000_000)``.
        NOTE(review): presumably ``length`` is in base pairs, which would
        make ``bound_norm`` a per-megabase density -- confirm with the
        caller.
    """
    length_unit = 1_000_000
    # Use the "sum" string alias rather than the builtin `sum` callable:
    # passing the builtin to GroupBy.agg is deprecated in recent pandas and
    # is slated to change behaviour; the alias keeps the groupby sum.
    totals = concat_df.groupby(['state', 'sample']).agg(
        boundaries_sum=('boundaries', 'sum'),
        length_sum=('length', 'sum'),
    )
    totals['bound_norm'] = totals['boundaries_sum'] / (totals['length_sum'] / length_unit)
    return totals.reset_index()
def norm_boundaries_ipg(ins_table, dld_ipg, resolution, sample_id_list):
    """
    Run the boundary-normalization pipeline for several samples.

    For each sample id, in order: ``get_boundaries`` -> ``overlap`` ->
    ``agg`` -> ``normalize``. The per-sample results are then concatenated
    with ``pd.concat``.

    Params:
        ins_table: passed through to ``get_boundaries``.
        dld_ipg: interval dataframe passed as the first argument to
            ``overlap``.
        resolution: passed through to ``get_boundaries``.
        sample_id_list: iterable of sample ids to process.
    Returns:
        The concatenation of the per-sample ``normalize`` outputs, in the
        order of ``sample_id_list``; original row indices are kept as
        produced by ``pd.concat`` defaults.
    """
    def _normalized_for(sample_id):
        # One pass of the per-sample pipeline.
        sample_boundaries = get_boundaries(ins_table, resolution, sample_id)
        hits = overlap(dld_ipg, sample_boundaries)
        return normalize(agg(hits, sample_id))

    per_sample = [_normalized_for(sample_id) for sample_id in sample_id_list]
    return pd.concat(per_sample)