# extract.py
import spacy
import textacy.extract
import textacy.ke
import requests
import json
import os.path
import pymongo
from bs4 import BeautifulSoup
from nltk import sent_tokenize

BASE_URLS_DIR = "./urls/"
BASE_PAGES_DIR = "./pages/"
"""
HOW:
(1) Fetch the top wikipedia links
(2) Request a top link (Mediawiki) - parse it for interesting facts
(2.1) Send a request via MediaWiki API with the link title
(2.2) Parse the keywords from the link body
(2.3) Use each keyword as entity for obtaining semi structured statements - (entity, cue, fragment) triple
- cue – verb lemma with which entity is associated
(2.4) Finding the sentence using its (entity, cue, fragment) triple
(2.5) Using the paper's techniques like - Linguistic Features - Superlative Words, Contradictory Words, Root Word of Sentence, Subject of Sentence, Readability Score (Gunning Fog Index)
(3) Store the link's content with the facts in a mongoDB database
(4) Front-end for retrival of facts (if time permits)
Future Scope: Can be made into an online learning model
"""
def load_spacy_model():
    nlp = spacy.load('en_core_web_md')
    print("Loaded Spacy Model .....")
    return nlp

def parse_and_extract_facts(nlp, text, entity):
    # Yields textacy semi-structured statements: (entity, cue, fragment) triples of spaCy Spans.
    doc = nlp(text)
    statements = textacy.extract.semistructured_statements(doc, entity)
    return statements

def fetch_all_links(link):
    req = requests.get(link)
    soup = BeautifulSoup(req.text, 'html.parser')
    # Customized for pages like https://en.wikipedia.org/wiki/Wikipedia:2019_Top_50_Report
    cells = soup.find_all("table", class_="wikitable")[0].find_all("td")
    hrefs = list()
    for cell in cells:
        anchors = cell.find_all("a")
        # Keep only cells that contain exactly one link (the article link itself).
        if len(anchors) != 1:
            continue
        try:
            url = anchors[0]["href"]
            if "File:" not in url:
                hrefs.append(url)
        except KeyError:
            continue
    return hrefs

def store_urls(hrefs, year):
    with open(BASE_URLS_DIR + str(year) + ".json", 'w', encoding='utf-8') as f:
        json.dump(hrefs, f, ensure_ascii=False, indent=4)

def media_wiki_call(link):
    HEAD = "https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&explaintext&redirects=1&titles="
    TAIL = "&origin=*"
    req = requests.get(HEAD + link.replace("/wiki/", '') + TAIL)
    # Response shape: {"query": {"pages": {"<pageid>": {"pageid": ..., "title": ..., "extract": ...}}}}
    page = req.json()["query"]["pages"]
    key = list(page.keys())[0]
    text_body = page[key]["extract"]
    title = page[key]["title"]
    pageid = page[key]["pageid"]
    return (text_body, title, pageid)

def store_pages(text, title, facts, keywords, pageid):
    form = {"text": text, "title": title, "facts": facts, "keywords": keywords, "pageid": pageid}
    with open(BASE_PAGES_DIR + title + ".json", 'w', encoding='utf-8') as f:
        json.dump(form, f, ensure_ascii=False, indent=4)

def find_facts(spacy_facts, offset=2000):
    all_facts = list()
    for fact in spacy_facts:
        # Skip keywords that produced no statements.
        if fact == list():
            continue
        for sub_fact in fact:
            document = str(sub_fact[2].doc)
            fragment = sub_fact[2].text
            base_index = document.index(fragment)
            # Tokenize a window of text around the fragment and keep the sentence containing it.
            left = max(base_index - offset, 0)
            right = base_index + offset
            probable_sents = sent_tokenize(document[left:right])
            try:
                full_fact = [f for f in probable_sents if fragment in f][0]
            except IndexError:
                continue
            if full_fact not in all_facts:
                all_facts.append(full_fact)
    return all_facts

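# Illustrative usage (hypothetical, not called by the pipeline): score the sentences
# returned by find_facts with the step (2.5) features sketched near the top of the file.
# The weighting below is an assumed heuristic, not the paper's scoring.
def rank_facts(nlp, all_facts):
    scored = []
    for fact in all_facts:
        doc = nlp(fact)
        sent = next(doc.sents)
        feats = linguistic_features(sent)
        # Prefer superlative / contradictory sentences, lightly penalize hard-to-read ones.
        score = feats["superlative"] + feats["contradictory"] - 0.05 * feats["gunning_fog"]
        scored.append((score, fact))
    return [f for _, f in sorted(scored, reverse=True)]
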
def mongo_store(mongo_client, mongo_db, mongo_col, text, title, all_facts, keywords, pageid):
    form = {"text": text, "title": title, "facts": all_facts, "keywords": keywords, "pageid": pageid}
    mongo_col.insert_one(form)

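# Step (4) sketch (hypothetical helper, not part of the pipeline): retrieve the stored
# facts for a given article title from the "facts" collection written by mongo_store above.
def fetch_facts_by_title(mongo_col, title):
    doc = mongo_col.find_one({"title": title})
    return doc["facts"] if doc else []
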
if __name__ == "__main__":
    top_wiki_links = {2019: "https://en.wikipedia.org/wiki/Wikipedia:2019_Top_50_Report",
                      2018: "https://en.wikipedia.org/wiki/Wikipedia:2018_Top_50_Report"}
    mongo_client = pymongo.MongoClient("mongodb://localhost:27017/")
    mongo_db = mongo_client["facts_database"]
    mongo_col = mongo_db["facts"]
    # Cache the per-year link lists on disk so repeated runs skip the scrape.
    for year in top_wiki_links:
        if not os.path.isfile(BASE_URLS_DIR + str(year) + ".json"):
            hrefs = fetch_all_links(top_wiki_links[year])
            store_urls(hrefs, year)
    print("Verified Top Wiki Links Paths .....")
    nlp = load_spacy_model()
    for year in top_wiki_links:
        with open(BASE_URLS_DIR + str(year) + ".json", encoding='utf-8') as f:
            links = json.load(f)
        for link in links:
            print("For Link: {} .....".format(link))
            text, title, pageid = media_wiki_call(link)
            # Top 10 single-word keyterms via YAKE (lowercased); each item is a (term, score) pair.
            keywords = list(textacy.ke.yake(nlp(text), normalize="lower", topn=10, ngrams=1))
            spacy_facts = [list(parse_and_extract_facts(nlp, text, keyword[0])) for keyword in keywords]
            all_facts = find_facts(spacy_facts)
            print("Storing: {} .....".format(title))
            # store_pages(text, title, all_facts, keywords, pageid)
            mongo_store(mongo_client, mongo_db, mongo_col, text, title, all_facts, keywords, pageid)