-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
178 lines (131 loc) · 5.78 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import pandas as pd
from bioc import biocxml
from bs4 import BeautifulSoup
import re
from itertools import product
import requests
def translateProtein2SingleChar(proteinNotation, timeout=30):
    """Look up a protein variant description via the Mutalyzer normalize API.

    Sends ``proteinNotation`` to ``https://mutalyzer.nl/api/normalize/`` and
    returns the text after the last ``:`` of the first protein-level
    (``'p'``) entry under ``equivalent_descriptions`` in the JSON reply.
    NOTE(review): judging by the function name this is expected to be the
    single-letter amino-acid form; that depends on what Mutalyzer lists
    first -- confirm against the API.

    Args:
        proteinNotation: Variant description inserted verbatim into the
            request URL.
        timeout: Seconds to wait for the HTTP request before giving up
            (the original call had no timeout and could block forever).

    Returns:
        The extracted description string, or ``None`` when the request
        fails or the reply does not have the expected structure.
    """
    print("start translateProtein2SingleChar")
    print("proteinNotation",proteinNotation)
    try:
        r = requests.get(
            f'https://mutalyzer.nl/api/normalize/{proteinNotation}?only_variants=false',
            timeout=timeout,
        )
        j = r.json()
        return j['equivalent_descriptions']['p'][0]['description'].split(":")[-1]
    except (requests.RequestException, ValueError, KeyError, IndexError,
            TypeError, AttributeError):
        # Best-effort lookup: network errors, non-JSON bodies and replies
        # without a protein-level description all mean "no translation".
        return None
def sortVariantAlias(variantAlias,documents):
    """Unfinished helper: scan pages for regex matches of a variant.

    NOTE(review): as written this function cannot run. Neither parameter
    (``variantAlias``, ``documents``) is used; the body instead reads
    ``source_doc`` and ``current_variant``, which are not defined in this
    function or anywhere in the visible part of the file, so a call raises
    ``NameError`` unless they exist as globals elsewhere. The match list is
    assigned and then discarded, and nothing is returned.
    """
    # NOTE(review): `source_doc` is undefined here -- presumably meant to be
    # the `documents` parameter; confirm before changing.
    for page in source_doc:
        # NOTE(review): `current_variant` is undefined -- presumably one of
        # the aliases in `variantAlias`. It is used as a regex pattern
        # against `page.page_content`; the result is never used.
        c_rsids = re.findall(current_variant,page.page_content)
def _span_value(cell, attr):
    """Return a cell's ``rowspan``/``colspan`` as an int, defaulting to 1.

    Missing, non-numeric or negative attribute values fall back to 1.
    0 is returned unchanged so the caller can apply its "span to the
    edge of the table" handling.
    """
    try:
        span = int(cell.get(attr, 1))
    except (TypeError, ValueError):
        return 1
    return span if span >= 0 else 1


def table_to_2d(table_tag):
    """Expand a table element into a rectangular list of lists of cell text.

    Cells with ``rowspan``/``colspan`` are copied into every grid position
    they cover, so each row of the result has the same number of columns.
    Positions not covered by any cell are left as ``None``.

    Args:
        table_tag: An element exposing ``find_all``; its ``tr`` descendants
            are the rows, and each row's direct ``td``/``th`` children are
            the cells (read via ``get`` for attributes and ``get_text``).

    Returns:
        A list with one inner list per ``tr``; each inner list holds the
        cell text (or ``None``) for every column.
    """
    rows = table_tag.find_all(['tr'])

    # First scan: work out how many columns the grid needs.
    colcount = 0
    pending = []  # remaining row counts of cells still spanning downwards
    for r, row in enumerate(rows):
        cells = row.find_all(['td', 'th'], recursive=False)
        # A colspan of 0 still occupies at least one column.
        width = sum(_span_value(c, 'colspan') or 1 for c in cells)
        colcount = max(colcount, width + len(pending))
        # Update rowspan bookkeeping; 0 is a span to the bottom, matching
        # the interpretation used when the grid is filled below.
        pending += [_span_value(c, 'rowspan') or len(rows) - r for c in cells]
        pending = [s - 1 for s in pending if s > 1]

    # It doesn't matter if there are still rowspan numbers 'active'; no
    # extra rows to show in the table means the larger-than-1 rowspan
    # numbers in the last table row are ignored.

    # Build an empty matrix for all possible cells.
    table = [[None] * colcount for _ in rows]

    # Fill the matrix from the row data.
    rowspans = {}  # pending rowspans: column number -> remaining row count
    for row, row_elem in enumerate(rows):
        span_offset = 0  # columns skipped so far due to row and colspans
        for col, cell in enumerate(row_elem.find_all(['td', 'th'], recursive=False)):
            # Adjust for preceding row and colspans.
            col += span_offset
            while rowspans.get(col, 0):
                span_offset += 1
                col += 1

            # 0 means "extend to the last row" / "extend to the last column".
            rowspan = rowspans[col] = _span_value(cell, 'rowspan') or len(rows) - row
            colspan = _span_value(cell, 'colspan') or colcount - col
            # The next cell is offset by this cell's colspan.
            span_offset += colspan - 1
            value = cell.get_text()
            for drow, dcol in product(range(rowspan), range(colspan)):
                try:
                    table[row + drow][col + dcol] = value
                    rowspans[col + dcol] = rowspan
                except IndexError:
                    # rowspan or colspan outside the confines of the table
                    pass

        # Update rowspan bookkeeping for the next row.
        rowspans = {c: s - 1 for c, s in rowspans.items() if s > 1}

    return table
def convert2DF(xml_data):
    '''
    Convert the XML/HTML markup of one table into a pandas DataFrame.

    input: xml_data: the xml component of a table (a string)
    return: the converted dataframe, with row/col spans collapsed into a
            flat 2D structure by table_to_2d

    Behaviour:
    - Literal "\\xNN"-style escape sequences (a backslash, "x" and the two
      following characters) are stripped from the markup before parsing.
    - Without a <thead>, the first table row becomes the column names.
    - With a <thead>, the header rows are merged per column by joining the
      distinct values with a space, in top-to-bottom order.
    - Literal "\\n" sequences (backslash + "n") are removed from body cells.
    - An empty DataFrame is returned when no <table> (or no row at all) is
      found; a missing <tbody> yields a frame with headers and no rows.
    No attempt is made to reconcile differing header/body widths.
    '''
    # parse XML string with BeautifulSoup
    xml_data = re.sub(r'\\x..', '', xml_data)
    soup = BeautifulSoup(xml_data, 'lxml')

    # find the table in the soup
    table = soup.find('table')
    if table is None:
        return pd.DataFrame()

    heads = table.find('thead')
    if not heads:
        # no thead tag in the table: use the first row as the header
        new_body = table_to_2d(table)
        if not new_body:
            return pd.DataFrame()
        return pd.DataFrame(new_body[1:], columns=new_body[0])

    # merge multiple header rows by concatenating the unique values of each
    # column; dict.fromkeys de-duplicates while keeping the row order, so the
    # merged names are the same on every run (a set would not guarantee that)
    header_rows = [
        ['' if value is None else value for value in row]
        for row in table_to_2d(heads)
    ]
    my_merged_header = [
        ' '.join(dict.fromkeys(column)) for column in zip(*header_rows)
    ]

    bodies = table.find('tbody')
    body_rows = table_to_2d(bodies) if bodies is not None else []
    data = [
        ['' if value is None else value.replace("\\n", '') for value in row]
        for row in body_rows
    ]
    return pd.DataFrame(data, columns=my_merged_header)
def extractTablesFromXML(XML_path):
    '''
    XML_path: the path of the XML paper file (BioC XML, read with
              biocxml.iterparse)
    return a list of dataframes, each df represents one table

    Every passage whose infons 'type' equals 'table' is converted with
    convert2DF from its infons 'xml' entry; passages of any other type, or
    without a 'type' entry, are skipped. Tables are returned in the order
    they are encountered.
    '''
    all_tables = []
    with biocxml.iterparse(XML_path) as reader:
        for document in reader:
            for passage in document.passages:
                if passage.infons.get('type') != 'table':
                    continue
                all_tables.append(convert2DF(passage.infons['xml']))
    return all_tables
def reduceIntransDuplicates(text_in_trans_list):
    '''
    Flatten comma-separated answer strings into a list of unique items.

    text_in_trans_list: iterable of strings, each possibly holding several
                        comma-separated items
    return: list of distinct items in order of first appearance

    An answer is skipped entirely when it contains "none" but not "contain"
    (case-insensitive). Items are stripped of surrounding whitespace before
    de-duplication so that "a" and " a" count as the same entry, and empty
    items are dropped.
    '''
    unique_items = {}  # dict keys keep insertion order and are unique
    for cur_answers in text_in_trans_list:
        lowered = cur_answers.lower()
        if 'none' in lowered and 'contain' not in lowered:
            continue
        for item in cur_answers.split(','):
            item = item.strip()
            if item:
                unique_items.setdefault(item, None)
    return list(unique_items)
if __name__ == "__main__":
    # Manual smoke test: parse one BioC XML file and print how many tables
    # were extracted. The file can be given as the first command-line
    # argument; without one, the original hard-coded sample path is used.
    import sys

    print("test extractTablesFromXML")
    default_path = "/autofs/bal36md0/smli/smli/LLM-genome-curation/literatures/subset_report_XML/36546626.xml"
    test_path = sys.argv[1] if len(sys.argv) > 1 else default_path
    table_list = extractTablesFromXML(test_path)
    print(len(table_list))