# WikiProcessing.py
# Wiki annotated corpus processing
import json
import os

from KGconfig import *  # expected to provide args['wiki_corpus_path'] (and wiki_corpus_path) used below
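# Corpus layout assumed by the functions below (inferred from this file rather than an external
# spec): <wiki_corpus_path>/<wikiDir>/<wikiFile>, where each corpus file holds one JSON object per
# line with at least the fields 'id', 'url', 'text', and 'annotations' (a list of dicts carrying a
# 'surface_form'). An article's title is taken to be the last path segment of its 'url'.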
def printWikiFile(wikiDir, wikiFilePath):
    # Print the fields of the first article in one corpus file, for quick inspection.
    data = []
    with open(os.path.join(wiki_corpus_path, wikiDir, wikiFilePath)) as wf:
        for line in wf:
            data.append(json.loads(line))
    if data:  # only the first article is inspected
        print(data[0])
        print(data[0]['url'])
        print(data[0]['id'])
        print(len(data[0]['text']))
        print(data[0]['annotations'][0])
        print(data[0]['annotations'][0]['surface_form'])
# Returns a {title: text} dict with at most one entry; empty if the title is not in this file.
def getArticleText2(qtitle, wikiDir, wikiFilePath):
    text_data = dict()
    with open(os.path.join(args['wiki_corpus_path'], wikiDir, wikiFilePath)) as wf:
        for line in wf:
            article = json.loads(line)
            art_title = article['url'].split('/')[-1]  # title = last path segment of the URL
            if art_title == qtitle:
                text_data[art_title] = article['text']
                break
    return text_data
# Returns a {title: text} dict for every article in one corpus file.
def getArticleText(wikiDir, wikiFilePath):
    text_data = dict()
    with open(os.path.join(args['wiki_corpus_path'], wikiDir, wikiFilePath)) as wf:
        for line in wf:
            article = json.loads(line)
            text_data[article['url'].split('/')[-1]] = article['text']
    print('Retrieved ' + str(len(text_data)) + ' articles from wikiFile ' + wikiDir + '/' + wikiFilePath)
    return text_data
# Yields the names of the (non-hidden) corpus files inside one wiki sub-directory.
def getWikiFile(wikiDir):
    for wikiFil in os.listdir(os.path.join(args['wiki_corpus_path'], wikiDir)):
        if not wikiFil.startswith('.'):
            yield wikiFil
# Scans the corpus for a single title; returns its {title: text} dict, or None if not found.
def fetchWikiText(article_title):
    for wikiDir in os.listdir(args['wiki_corpus_path']):
        if not wikiDir.startswith('.') and os.path.isdir(os.path.join(args['wiki_corpus_path'], wikiDir)):
            articleText = dict()
            for wikiContainerFile in getWikiFile(wikiDir):
                articleText = getArticleText2(article_title, wikiDir, wikiContainerFile)
                if len(articleText) > 0:
                    print("Found " + article_title + ' in ' + wikiDir + '/' + wikiContainerFile)
                    break
            if len(articleText) > 0:
                return articleText
    return None  # title not found anywhere in the corpus
## For batch mode: fetch several titles in one pass over the corpus.
## Returns a dict mapping each found title to its {title: text} dict; stops after `limit` hits.
def fetchWikiText2(article_titles, limit=581):
    articles = dict()
    for wikiDir in os.listdir(args['wiki_corpus_path']):
        if not wikiDir.startswith('.') and os.path.isdir(os.path.join(args['wiki_corpus_path'], wikiDir)):
            for wikiContainerFile in getWikiFile(wikiDir):
                for article_title in article_titles:
                    articleText = getArticleText2(article_title, wikiDir, wikiContainerFile)
                    if len(articleText) > 0:
                        print("Found " + article_title + ' in ' + wikiDir + '/' + wikiContainerFile)
                        articles[article_title] = articleText
                        if len(articles) == limit:
                            return articles
                if len(articles) < len(article_titles):
                    # Drop titles already fetched before scanning the next corpus file.
                    article_titles = [k for k in article_titles if k not in articles]
                else:
                    return articles  # every requested title has been found
    return articles  # corpus exhausted; return whatever was found
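
# Minimal usage sketch (the directory 'AA', file 'wiki_00', and titles below are illustrative
# assumptions, not values taken from this repository):
#
#   printWikiFile('AA', 'wiki_00')                          # inspect the first article of one file
#   single = fetchWikiText('Some_article_title')            # {'Some_article_title': '...'} or None
#   batch = fetchWikiText2(['Title_1', 'Title_2'], limit=2) # dict keyed by title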