##Copyright (c) 2022 duncan g. smith
##
##Permission is hereby granted, free of charge, to any person obtaining a
##copy of this software and associated documentation files (the "Software"),
##to deal in the Software without restriction, including without limitation
##the rights to use, copy, modify, merge, publish, distribute, sublicense,
##and/or sell copies of the Software, and to permit persons to whom the
##Software is furnished to do so, subject to the following conditions:
##
##The above copyright notice and this permission notice shall be included
##in all copies or substantial portions of the Software.
##
##THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
##OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
##FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
##THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
##OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
##ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
##OTHER DEALINGS IN THE SOFTWARE.
# various functions / classes that don't obviously belong elsewhere
from pathlib import Path
import json
from nltk.corpus import wordnet as wn
"""Find synonyms and hyponyms via WordNet"""
def synonyms(synset, as_names=True):
    # Returns the synonyms (lemmas) for synset.
    # Returns strings if as_names is True.
    if as_names:
        # The .split('.')[0] is not necessary below, but is included
        # in case the API is changed to make lemma names consistent with
        # synset names
        return [' '.join(lemma.name().split('.')[0].lower().split('_'))
                for lemma in synset.lemmas()]
    return synset.lemmas()
def hyponyms(synset, as_names=True):
    # Returns the hyponyms for synset.
    # Returns strings if as_names is True.
    if as_names:
        return [' '.join(hyponym.name().split('.')[0].lower().split('_'))
                for hyponym in synset.hyponyms()]
    return synset.hyponyms()
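# Illustrative usage sketch for synonyms / hyponyms (assumes the NLTK
# WordNet corpus is available, e.g. after nltk.download('wordnet');
# 'dog.n.01' is an example synset, not one taken from the rules data,
# and exact outputs depend on the installed WordNet version):
# >>> dog = wn.synset('dog.n.01')
# >>> synonyms(dog)
# ['dog', 'domestic dog', 'canis familiaris']
# >>> 'puppy' in hyponyms(dog)
# True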
def all_words_from_rules(rules_data):
    # This function is specific to the rules
    # defined in "Code Sheet.doc.x" (with minor adjustments).
    # A rule is a mapping of a key to a list containing a string
    # (longer description) and two lists, the first containing
    # terms that will often be present, and the second containing
    # terms that will often follow the terms in the first list.
    # The function simply extracts all the words in the two lists across
    # all the rules.
    words = set()
    for key, data in rules_data.items():
        for lis in data[1:]: # we don't want the description
            words.update(lis)
    return words
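# Minimal sketch of the expected rules structure; the key, description and
# terms below are made up for illustration, not taken from "Code Sheet.doc.x":
# >>> rules = {'R1': ['rule description', ['injury'], ['fracture', 'sprain']]}
# >>> sorted(all_words_from_rules(rules))
# ['fracture', 'injury', 'sprain']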
def create_syndict(words):
    # Create mapping of words to synonyms.
    # "words" is an iterable containing strings and / or
    # Synset instances.
    # For strings the synsets are generated and the first is
    # taken as the relevant sense of the word.
    # The return value is a mapping (dict) of words to a list
    # containing a string (the word / phrase definition) and a list
    # of synonyms.
    # In some cases the first in the list of synsets will not
    # be the correct sense (hence the inclusion of the definition)
    # and the relevant synset will need to be found and
    # substituted for the word in a subsequent function call.
    syndict = {}
    for word in set(words):
        if isinstance(word, str):
            synsets = wn.synsets(word)
            if synsets:
                synset = synsets[0] # try first item
                syndict[word] = [synset.definition(), synonyms(synset)]
            else:
                # word / phrase not in WordNet
                syndict[word] = ['', [word]]
        else:
            # word is a synset
            synset = word
            word = ' '.join(synset.name().split('.')[0].lower().split('_'))
            syndict[word] = [synset.definition(), synonyms(synset)]
    return syndict
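# Hedged sketch; 'car' is an illustrative input and the returned definition /
# synonym list depends on the installed WordNet data:
# >>> sd = create_syndict(['car'])
# >>> definition, syns = sd['car']
# >>> 'car' in syns
# True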
def create_hypdict(words):
    # Create mapping of words to hyponyms (derived terms).
    # Other aspects of this function are identical to
    # those of create_syndict.
    hypdict = {}
    for word in set(words):
        if isinstance(word, str):
            synsets = wn.synsets(word)
            if synsets:
                synset = synsets[0] # try first item
                hypdict[word] = [synset.definition(), hyponyms(synset)]
            else:
                # word / phrase not in WordNet
                hypdict[word] = ['', [word]]
        else:
            # word is a synset
            synset = word
            word = ' '.join(synset.name().split('.')[0].lower().split('_'))
            hypdict[word] = [synset.definition(), hyponyms(synset)]
    return hypdict
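# Sketch of supplying a specific Synset when the first sense returned by
# wn.synsets would be the wrong one ('dog.n.01' is illustrative):
# >>> hd = create_hypdict([wn.synset('dog.n.01')])
# >>> 'puppy' in hd['dog'][1]
# True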
def reverse_map(dic):
    # Create a mapping (dict) of synonyms / hyponyms to words.
    # "dic" is a mapping of the kind returned by a call to
    # either "create_syndict" or "create_hypdict".
    # Can be used to harmonize text data.
    reverse_map = {}
    for word, (definition, values) in dic.items():
        for val in values:
            reverse_map[val] = word
    return reverse_map
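# Sketch of harmonizing variant terms back to a head word (a toy dict of the
# shape returned by create_syndict, not real project data):
# >>> rev = reverse_map({'car': ['a motor vehicle', ['car', 'auto', 'automobile']]})
# >>> rev['automobile']
# 'car'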
def read_json_data(filename):
    # Read data from a json formatted file.
    # If filename is a bare name (no directory part) the
    # file is assumed to be in the same directory as this module.
    if filename == Path(filename).name:
        # assume file is in the same directory as module
        filename = Path(__file__).parent / filename
    with open(filename, 'r') as f:
        return json.load(f)
def write_json_data(data, filename):
    # Write data to a text file in json format.
    # If filename is a bare name (no directory part) the
    # file will be placed in the same directory as this module.
    if filename == Path(filename).name:
        # assume file is in the same directory as module
        filename = Path(__file__).parent / filename
    with open(filename, 'w') as f:
        json.dump(data, f, indent=4, sort_keys=True)
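# Round-trip sketch; 'example_data.json' is a hypothetical bare filename, so
# both calls resolve it to the directory containing this module:
# >>> write_json_data({'a': 1}, 'example_data.json')
# >>> read_json_data('example_data.json')
# {'a': 1}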
"""Utility function for chaining generators"""
class Pipeline(list):
    # Concatenates generators into an object
    # that behaves like a single generator.
    # >>> p_line = Pipeline([lambda x: (y**2 for y in x), lambda x: (y+2 for y in x)])
    # >>> list(p_line([0,1,2,3]))
    # [2, 3, 6, 11]
    def __call__(self, arg):
        for item in self:
            arg = item(arg)
        return arg
    def __getitem__(self, i):
        if isinstance(i, slice):
            return self.__class__(list.__getitem__(self, i))
        else:
            return list.__getitem__(self, i)
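# Because __getitem__ preserves the class for slices, a slice of a Pipeline
# is itself callable (continuing the example above):
# >>> list(p_line[:1]([0, 1, 2, 3]))
# [0, 1, 4, 9]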
"""Function for replacing words according to supplied mapping"""
def find_replace(words, mapping):
    # "words" is a list of strings.
    # "mapping" is a dict mapping words to
    # the words that will replace them
    # in the returned list.
    return [mapping.get(word, word) for word in words]
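# Small sketch; the words and mapping are illustrative:
# >>> find_replace(['the', 'auto'], {'auto': 'car'})
# ['the', 'car']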
"""Convenience / factory functions for constructing pipelines"""
def replacement_factory(mapping):
    # returns a function that takes an iterable of word lists
    # and generates word lists with words replaced according to mapping
    def f(word_lists):
        return (find_replace(word_list, mapping) for word_list in word_lists)
    return f
def filter_factory(func):
    # returns a function that takes an iterable of word lists
    # and generates word lists with words filtered according
    # to func (i.e. all words w s.t. func(w) returns True)
    def f(word_lists):
        return (filter(func, word_list) for word_list in word_lists)
    return f
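# Sketch combining the factories with Pipeline; the mapping, filter and word
# lists are illustrative, not taken from the project's rules:
# >>> norm = replacement_factory({'automobile': 'car'})
# >>> keep_long = filter_factory(lambda w: len(w) > 2)
# >>> pipe = Pipeline([norm, keep_long])
# >>> [list(words) for words in pipe([['an', 'automobile'], ['a', 'car']])]
# [['car'], ['car']]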
##import io
##
##from pdfminer.converter import TextConverter
##from pdfminer.layout import LAParams
##from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
##from pdfminer.pdfpage import PDFPage
##
##def pdfparser(data):
##    rsrcmgr = PDFResourceManager()
##    retstr = io.StringIO()
##    codec = 'utf-8'
##    laparams = LAParams()
##    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
##
##    interpreter = PDFPageInterpreter(rsrcmgr, device)
##    password = ""
##    maxpages = 0
##    caching = True
##    pagenos = set()
##
##    with open(data, 'rb') as fp:
##        for page in PDFPage.get_pages(fp,
##                                      pagenos,
##                                      maxpages=maxpages,
##                                      password=password,
##                                      caching=caching,
##                                      check_extractable=False):
##            interpreter.process_page(page)
##
##    # As pointed out in another answer, this goes outside the loop
##    text = retstr.getvalue()
##
##    device.close()
##    retstr.close()
##    return text