-
Notifications
You must be signed in to change notification settings - Fork 0
/
cntext_search_with_gui.py
102 lines (93 loc) · 4.05 KB
/
cntext_search_with_gui.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import pandas as pd
import re
import spacy
from tkinter import ttk
import tkinter
import en_core_web_md
from spacy.pipeline import EntityRuler
# Load the spaCy 'en_core_web_md' pipeline once at import time; MyWindow.add
# reuses this shared object for every search.
nlp = spacy.load('en_core_web_md')
# Look up the 'MONEY' string in the pipeline's vocab string store.
# NOTE(review): MONEY is not referenced anywhere else in the visible part of
# this file - confirm whether it is still needed.
MONEY = nlp.vocab.strings['MONEY']
class MyWindow:
    """Search window: a query box, Search/Delete buttons and a results pane.

    ``add`` answers a query in two ways: a plain keyword match against the
    ``Description`` column of ``Train-Data.xlsx``, and a price-bounded match
    driven by MONEY entities that spaCy finds in the query text (presumably
    phrases such as "<item> under <number> rupees").
    """

    # Words that, when they link an item to a MONEY entity, are treated as
    # "price must be lower than the amount".
    _LESS_WORDS = ('less', 'under', 'below')

    def __init__(self, win):
        """Create all widgets as children of *win* and place them.

        Args:
            win: The Tk container (normally the root window) owning the UI.
        """
        # Widgets and END are reached through the ``tkinter`` module: the file
        # imports only ``tkinter`` and ``ttk``, so the bare names ``Label``,
        # ``Entry``, ``Text`` and ``END`` were never in scope.
        self.lbl = tkinter.Label(win, text='Results', font='ariel 12 bold',
                                 bg='#856ff8')
        # Every widget is parented to ``win`` rather than to a module global.
        self.t1 = tkinter.Entry(win, bd=3)
        self.t2 = tkinter.Text(win, width=10)
        self.b1 = ttk.Button(win, text='Search', command=self.add)
        self.btn = ttk.Button(
            win, text='Delete',
            command=lambda: self.t2.delete('1.0', tkinter.END))

        self.t1.place(x=200, y=50, width=250)
        self.b1.place(x=460, y=50)
        self.lbl.place(x=200, y=175)
        self.t2.place(x=200, y=200, width=500)
        self.btn.place(x=400, y=600)

    def clean_noun(self, doc):
        """Return the noun chunks of *doc* whose root token is not "rupees".

        Args:
            doc: Object exposing ``noun_chunks`` (a spaCy ``Doc``).

        Returns:
            list: The retained chunks, in document order.
        """
        return [chunk for chunk in doc.noun_chunks
                if chunk.root.text != "rupees"]

    def filter_spans(self, spans):
        """Drop overlapping spans, preferring longer (then earlier) ones.

        Args:
            spans: Iterable of objects with integer ``start``/``end`` token
                offsets (``end`` exclusive).

        Returns:
            list: Non-overlapping spans sorted by ``start``.
        """
        def longest_then_earliest(span):
            return (span.end - span.start, -span.start)

        kept = []
        seen_tokens = set()
        for span in sorted(spans, key=longest_then_earliest, reverse=True):
            # ``end - 1`` because ``end`` is exclusive: test the last token.
            if span.start in seen_tokens or span.end - 1 in seen_tokens:
                continue
            kept.append(span)
            # Only accepted spans claim tokens, so a rejected span cannot
            # block a later span that does not overlap anything kept.
            seen_tokens.update(range(span.start, span.end))
        kept.sort(key=lambda span: span.start)
        return kept

    def extract_currency_relations(self, doc):
        """Find (item, link, money) token triples around MONEY entities.

        Entities and the noun chunks kept by ``clean_noun`` are first merged
        into single tokens (this mutates *doc*); the dependency label of each
        MONEY token then decides which neighbours form the triple.

        Args:
            doc: A parsed spaCy ``Doc`` with entities set.

        Returns:
            list[tuple]: ``(head, child, money)`` for appos/npadvmod/nsubj/conj
            MONEY tokens, ``(head.head, head, money)`` for pobj/dobj ones.
        """
        spans = self.filter_spans(list(doc.ents) + self.clean_noun(doc))
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)

        relations = []
        for money in doc:
            if money.ent_type_ != "MONEY":
                continue
            dep = money.dep_
            # No truthiness test on ``money.children``: it is a generator in
            # spaCy and therefore always truthy; an empty one simply skips
            # the loop body.
            for child in money.children:
                if dep in ("appos", "npadvmod", "nsubj", "conj"):
                    relations.append((money.head, child, money))
                elif dep in ("pobj", "dobj"):
                    relations.append((money.head.head, money.head, money))
            # Exact comparison: ``dep in "pobj"`` was a substring test that
            # also matched "", "obj", "p", ...
            # NOTE(review): a pobj MONEY token that has children is reported
            # once per child above and once more here - confirm whether the
            # duplicates are intended.
            if dep == "pobj":
                relations.append((money.head.head, money.head, money))
        return relations

    @staticmethod
    def _price_limit(money_text):
        """Return the first whole-number token of *money_text*, or None."""
        for token in money_text.split():
            if token.isdecimal():
                return int(token)
        return None

    @staticmethod
    def _description_pattern(item_text):
        """Build a regex matching the words of *item_text* in order.

        Words are escaped so that query characters such as ``+`` or ``(``
        are matched literally instead of being parsed as regex syntax.
        """
        escaped = (re.escape(word) for word in item_text.split())
        return '.*?' + '.*'.join(escaped) + '.*'

    def add(self):
        """Run the search for the text in the entry box and show the hits.

        Reads ``Train-Data.xlsx`` (columns ``Description`` and
        ``Selling_Price``) on every call and appends matching descriptions,
        one per line, to the results pane.
        """
        text = self.t1.get()
        df = pd.read_excel('Train-Data.xlsx')

        # Keyword search: only for short queries (fewer than four words).
        words = [word.lower() for word in text.split()]
        if len(words) < 4:
            for sen in df["Description"]:
                # Blank cells are read as NaN (a float), not as a string.
                if not isinstance(sen, str):
                    continue
                lowered = sen.lower()
                # Both sides are lower-cased so the match is really
                # case-insensitive.
                if all(word in lowered for word in words):
                    self.t2.insert(tkinter.END, sen + '\n')

        # Register the "<number> rupees" MONEY rule once per process.
        # NOTE(review): this is the spaCy v2 component API (object passed to
        # add_pipe); spaCy v3 expects nlp.add_pipe("entity_ruler", ...) -
        # confirm the installed version.
        if 'entity_ruler' not in nlp.pipe_names:
            ruler = EntityRuler(nlp, overwrite_ents=True)
            ruler.add_patterns([
                {"label": "MONEY",
                 "pattern": [{'LIKE_NUM': True}, {"LOWER": "rupees"}]},
            ])
            nlp.add_pipe(ruler, after="ner")

        # Price-bounded search driven by the parsed query.
        for item, link, money in self.extract_currency_relations(nlp(text)):
            if link.text not in self._LESS_WORDS:
                continue
            limit = self._price_limit(money.orth_)
            if limit is None:
                # e.g. a spelled-out amount ("ten rupees"): nothing to compare.
                continue
            pattern = self._description_pattern(item.orth_)
            cheaper = df.loc[df["Selling_Price"] < limit, "Description"]
            for hits in cheaper.str.findall(pattern, re.I):
                # NaN descriptions yield NaN (truthy, not iterable) here.
                if not isinstance(hits, list) or not hits:
                    continue
                # One insert with plain text: extra positional arguments to
                # Text.insert are interpreted as tag lists, not as text.
                self.t2.insert(tkinter.END, '\n'.join(hits) + '\n')
# Build the root window, attach the search UI and hand control to Tk.
# ``Tk`` is reached through the ``tkinter`` module because the bare name is
# never imported in this file.
window = tkinter.Tk()
mywin = MyWindow(window)
window['background'] = '#856ff8'
window.title('Contextual Search')
window.geometry("800x800+10+10")
# Blocks until the window is closed.
window.mainloop()