# acknow_mod.py
import csv
import glob
import re
from datetime import datetime

import nltk
import spacy
from nltk.tokenize import sent_tokenize
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer
from pdfminer.pdfparser import PDFSyntaxError

nltk.download('punkt')  # sentence tokenizer model used by sent_tokenize
nlp = spacy.load("en_core_web_sm")
def extract_page(f):
    """
    Extracts the page containing the acknowledgements/funding section.
    :param f: path to the PDF file
    :return: (page_text(str), page_number), or an empty tuple if no page matches
    """
    page = ()
    count = 0
    for page_layout in extract_pages(f):  # layout analysis from pdfminer.six
        count += 1
        s = ""
        for element in page_layout:
            if isinstance(element, LTTextContainer):  # keep only the text elements
                s += element.get_text()
        # matches "acknowledgment(s)" and "acknowledgement(s)"
        x = re.search(r'acknowledge?ments?', s, re.IGNORECASE)
        # the "Funding" heading, capitalised or in all caps
        y = re.search(r'Funding|FUNDING', s)
        if x or y:
            page = (s, count)  # keep the last matching page
    return page
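
# Example (a minimal sketch; the path is hypothetical):
#   page = extract_page("/data/articles/sample.pdf")
#   if page:
#       text, page_n = page
#       print(f"acknowledgements/funding text found on page {page_n}")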
def extract_sentences(page):
    """
    Extracts the sentences that contain funding information.
    :param page: tuple (page_text, page_number)
    :return: list of the sentences containing funding-related lemmas
    """
    l = []
    if page:
        # restore the parentheses that pdfminer maps to private-use characters,
        # so that the syntactic structure is not disrupted
        st = page[0].replace("\U0010fc2f", "(")
        st = st.replace("\U0010fc30", ")")
        st = st.replace("-\n", "")  # rejoin words hyphenated across line breaks
        st = st.replace("\n", " ")  # remove the remaining line breaks
        st = re.sub(r'[Nn]o\.? ', "No.", st)  # glue "No. 1234" into "No.1234"
        st = re.sub(r' +', " ", st)  # collapse runs of spaces into one
        st = st.replace("- ", "")  # remove leftover hyphenation
        l_s = sent_tokenize(st)
        l_lemma = ["funding", "financial", "finance", "fund", "support", "auspex", "sponsor"]
        for el in l_s:
            sent = nlp(el)
            for token in sent:  # keep the sentences with lemmas linked to funding
                if token.lemma_ in l_lemma:
                    s = sent.text
                    if s[-1] != ".":
                        s += "."
                    # rewrite demonstratives so the sentence can stand alone
                    s = re.sub(r'\bthis\b', "that", s)
                    s = re.sub(r'\bThis\b', "That", s)
                    # skip sentences with possessives that refer outside the sentence
                    if "their" not in s and "his" not in s and s not in l:
                        l.append(s)
    return l
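
# Example (a minimal sketch; the text is hypothetical):
#   extract_sentences(("This work was funded by the Wellcome Trust.", 5))
# would return ["That work was funded by the Wellcome Trust."], since "funded"
# lemmatises to "fund" and "This" is rewritten to "That".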
def parsing(l_sent):
    """
    Extracts the necessary named entities.
    :param l_sent: list of the necessary sentences
    :return: the list of named entities for the doc
    """
    li_ne = []
    s = ""
    beg = False  # flag: inside the beginning of an entity
    l_beg = ["compound", "nmod", "nummod", "amod", "csubj"]  # beginning of the ne
    l_mid = ["compound", "poss", "punct", "case", "conj", "nmod", "preconj", "nummod", "cc", "amod"]  # middle of the ne
    l_root = ["dobj", "pobj", "attr"]  # end of the ne
    l_prep = ["of", "for", "on"]  # the prepositions that can occur inside entity names
    if l_sent:
        for el in l_sent:
            # remove everything between parentheses, including line breaks
            el = re.sub(r'\(.*?\)', "", el, flags=re.DOTALL)
            doc = nlp(el)  # apply spaCy to the sentence
            for i in range(len(doc)):
                if doc[i].dep_ in l_beg:
                    s += doc[i].text + " "
                    beg = True
                elif beg and doc[i].dep_ in l_mid:
                    s += doc[i].text + " "  # continue adding
                elif beg and doc[i].text in l_prep:
                    s += doc[i].text + " "  # continue adding
                elif doc[i].dep_ in l_root:
                    if s:
                        # if the next token is a preposition that can occur inside
                        # an entity name, keep adding
                        if i + 1 < len(doc) and doc[i + 1].text in l_prep:
                            s += doc[i].text + " "
                        else:
                            s += doc[i].text + " "
                            # keep only entities with at least one capital letter
                            if s.rstrip() not in li_ne and s != s.lower():
                                li_ne.append(s.rstrip())
                            beg = False  # the entity is complete
                            s = ""
                else:
                    beg = False  # the pattern broke, so reset the buffer
                    s = ""
    return li_ne
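
# Example (a minimal sketch; the sentence is hypothetical, and the exact
# dependency labels depend on the spaCy model):
#   parsing(["That work was supported by the National Science Foundation."])
# would assemble "National Science Foundation" from the compound/amod tokens
# ending at the pobj "Foundation".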
def extract_cga(l_sent):
    """
    Extracts contracts, grants and abbreviations.
    :param l_sent: list of the necessary sentences
    :return: the list of contracts, grants and abbreviations
    """
    l_cga = []
    if l_sent:
        for s in l_sent:
            s = s.replace("-\n", "-")  # keep line-break hyphens: they can be part of a contract number
            s = s.replace("\n", " ")  # remove the remaining line breaks
            # normalise the different spellings of "number" to "No."
            s = re.sub(r'[Nn]o\.? ', "No.", s)
            s = re.sub(r'[Nn]r\. ', "No.", s)
            s = re.sub(r'[Nn]umber ', "No.", s)
            s = re.sub(r'[Nn]o: ', "No.", s)
            pattern_c = r'[Cc]ontract (No\.)?[A-Za-z0-9-]+'  # contract numbers
            for match in re.finditer(pattern_c, s):
                if match.group(0) not in l_cga:
                    l_cga.append(match.group(0))
            pattern_g = r'[Gg]rant ?([Aa]greement)? (No\.)?[A-Z0-9-–\.\/\\]+'  # grant numbers
            for match in re.finditer(pattern_g, s):
                if match.group(0).rstrip(".") not in l_cga:
                    l_cga.append(match.group(0).rstrip("."))
            pattern_ab = r'[A-Z]{3,10}[ \.)]+'  # all-caps abbreviations
            for match in re.finditer(pattern_ab, s):
                ab = match.group(0).rstrip(" .)")  # strip the trailing delimiters
                if ab and ab not in l_cga:
                    l_cga.append(ab)
    return l_cga
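
# Example (a minimal sketch; the sentence is hypothetical):
#   extract_cga(["Supported by NASA under contract No. NAS5-26555."])
# would roughly yield ["contract No.NAS5-26555", "NASA"]: the "No. " -> "No."
# normalisation glues the number to "No.", and "NASA" matches the all-caps
# abbreviation pattern.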
def to_csv(path):
    """
    Goes through all the PDF documents under `path` and writes the named
    entities linked to the funding information to a csv file.
    :param path: root directory of the articles
    Adds the named entities to a csv file in the format [path, page_n, ne, type_ne]
    and the paths of the articles with no matches to a separate text file.
    """
    with open("/data/output/no_matches.txt", 'w') as nm, \
            open("/data/output/ne.csv", 'w', newline='') as ner:
        writer = csv.writer(ner)
        writer.writerow(['path', 'page_n', 'ne', 'type_ne'])  # column names
        l_fn = []  # the files already processed, so glob duplicates are skipped
        for filename in glob.iglob(path + '/**/*', recursive=True):
            if filename.endswith(".pdf"):
                try:
                    start_time = datetime.now()
                    f = filename
                    if f not in l_fn:
                        l_fn.append(f)
                        page = extract_page(f)
                        if page:
                            page_n = page[1]
                            li = extract_sentences(page)
                            l_ne = parsing(li)
                            li_cga = extract_cga(li)
                            if not l_ne:  # no named entities: log the file in "no_matches"
                                nm.write(f'item: {f}\n')
                            end_time = datetime.now()
                            print(f)
                            print('Duration: {}'.format(end_time - start_time))
                            for ne in l_ne:
                                writer.writerow([f, page_n, ne, "ne"])  # write the entity to the csv
                            for cga in li_cga:
                                writer.writerow([f, page_n, cga, "cga"])  # write the contract/grant/abbreviation
                except PDFSyntaxError:
                    print(filename, " IS NOT A PDF")
                except Exception as e:
                    print("Failed to parse %s" % filename)
                    print(e)
            else:
                print("reading: %s" % filename)
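
# Example output rows (a sketch; the paths and values are hypothetical):
#   /data/articles/sub/a1.pdf,7,European Research Council,ne
#   /data/articles/sub/a1.pdf,7,Grant No.682107,cga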

if __name__ == "__main__":
    to_csv("/data/articles")