forked from thamizha/tawiktionary-offline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsearcher.py
73 lines (62 loc) · 2.16 KB
/
searcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/python
# -*- encoding: UTF-8 -*-
'''
The text entered during the searching should be parsed into search objects for
the Whoosh library.
This file receives the text entered, parses it into searchable objects and
performs search operations.
'''
import re
import bz2
import os
import BeautifulSoup
from whoosh import index
from whoosh.fields import *
from whoosh.qparser import QueryParser
def search_for(text):
    ''' Search the Whoosh index in "indexdir" for the given query string.

    The query is parsed against the "word" field of the index schema and
    every hit is returned (no result limit).

    text -- the search query. Byte strings are decoded as UTF-8; anything
            else is converted with unicode().

    Returns a list of dictionaries, one per hit, each holding the stored
    'word' and 'meaning' fields of the matching document.
    '''
    # The query parser is given unicode. A plain unicode(text) call decodes
    # byte strings with the ASCII codec and fails on non-ASCII input (for
    # example what raw_input() returns for a Tamil word), so decode byte
    # strings explicitly. ASCII-only input decodes to the same value as before.
    if isinstance(text, bytes):
        query_text = text.decode('utf-8')
    else:
        query_text = unicode(text)

    ix = index.open_dir("indexdir")
    res = []
    with ix.searcher() as searcher:
        query = QueryParser("word", ix.schema).parse(query_text)
        # Copy the stored fields out while the searcher is still open.
        for hit in searcher.search(query, limit=None):
            res.append({'meaning': hit['meaning'], 'word': hit['word']})
    return res
def get_markup(word, meaning):
    ''' Return the wiki text for an index hit.

    The 'meaning' value of a hit is either the wiki text itself or the name
    of a compressed chunk file ("chunk-<digits>.xml.bz2") inside the
    "chunks" directory that contains the wiki text.

    word    -- the word of the hit; used to locate its page in the chunk.
    meaning -- the wiki text, or the chunk file name.

    Returns 'meaning' unchanged when it is not a chunk file name. Otherwise
    the chunk is read, the lines from each line containing 'word' up to the
    next line containing '</page>' are collected, and the text of the first
    <text> element in the collected markup is returned. An empty unicode
    string is returned when no <text> element is found.
    '''
    # Escape the dots and anchor the end so that ordinary wiki text which
    # merely starts with something resembling a chunk name is not mistaken
    # for a file name.
    if not re.match(r"chunk-[0-9]+\.xml\.bz2\Z", meaning):
        return meaning

    fragments = ['<page>']
    capturing = False
    bzfile = bz2.BZ2File(os.path.join("chunks", meaning))
    try:
        for raw_line in bzfile:
            line = unicode(raw_line, 'utf-8')
            # NOTE(review): this is a plain substring test, so any line that
            # mentions the word starts a capture, not only the page's title
            # line - confirm against the chunk file layout.
            if word in line:
                capturing = True
            if capturing:
                fragments.append(line)
                if '</page>' in line:
                    capturing = False
    finally:
        # Always release the file handle, even if decoding fails part-way.
        bzfile.close()

    soup = BeautifulSoup.BeautifulSoup(''.join(fragments))
    text_node = soup.find("text")
    if text_node is None:
        # Nothing matched in the chunk; avoid an AttributeError on None.
        return u''
    return text_node.text
# Simple command-line front end: ask for a search term, list the numbered
# hits, ask which one to show and print its wiki text.
if __name__ == "__main__":
    searchterm = raw_input("Enter the Search Term: ")
    hits = search_for(searchterm)
    if not hits:
        print("No results found.")
    else:
        # enumerate() numbers every hit by its real position; list.index()
        # would repeat the first position for hits that compare equal.
        for position, hit in enumerate(hits):
            print(u"%d %s" % (position, unicode(hit['word'])))
        try:
            choice = int(raw_input('Enter your option: '))
        except ValueError:
            choice = -1
        # Reject anything outside the listed numbers, including negative
        # values that would otherwise silently index from the end.
        if 0 <= choice < len(hits):
            opt = hits[choice]
            print(get_markup(opt['word'], opt['meaning']))
        else:
            print("Invalid option.")