-
Notifications
You must be signed in to change notification settings - Fork 96
/
run.py
31 lines (24 loc) · 1.01 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import os.path
import requests
from download import download_wikipedia_abstracts
from load import load_documents
from search.timing import timing
from search.index import Index
@timing
def index_documents(documents, index):
    """Add every item of *documents* to *index* and return the index.

    Each document is handed to ``index.index_document``. On every 5000th
    iteration (including the very first) a progress line is printed that
    ends with a carriage return, so successive progress lines overwrite
    each other on the terminal instead of scrolling.

    The number shown is the zero-based position of the document that was
    just processed.
    """
    for position, doc in enumerate(documents):
        index.index_document(doc)
        if position % 5000:
            # Not on a reporting boundary; stay quiet.
            continue
        print(f'Indexed {position} documents', end='\r')
    return index
if __name__ == '__main__':
    # The XML dump is fetched only when no local copy exists. To force a
    # fresh download, delete the file and run the script again.
    dump_path = 'data/enwiki-latest-abstract.xml.gz'
    if not os.path.exists(dump_path):
        download_wikipedia_abstracts()

    index = index_documents(load_documents(), Index())
    print(f'Index contains {len(index.documents)} documents')

    # Run the same query four ways: AND / OR without the rank argument,
    # then AND / OR with rank=True. The return values are not used here.
    query = 'London Beer Flood'
    for extra_kwargs in ({}, {'rank': True}):
        for search_type in ('AND', 'OR'):
            index.search(query, search_type=search_type, **extra_kwargs)