forked from fastai/course-nlp
-
Notifications
You must be signed in to change notification settings - Fork 1
/
nlputils.py
51 lines (41 loc) · 1.63 KB
/
nlputils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
from fastai.basics import *
import re
def get_wiki(path,lang):
    """Download the latest `{lang}` Wikipedia dump into `path` and extract
    its article text with the `wikiextractor` tool.

    Args:
        path: working directory (pathlib-style; must support the `/` operator).
        lang: Wikipedia language code, e.g. 'en', 'fr'.

    Side effects only (network download, shells out to git and python,
    moves/deletes files); returns None. The extracted article text ends up
    in the single file `path/{lang}wiki`.
    """
    name = f'{lang}wiki'
    # Already extracted on a previous run -- nothing to do.
    if (path/name).exists():
        print(f"{path/name} already exists; not downloading")
        return

    xml_fn = f"{lang}wiki-latest-pages-articles.xml"
    zip_fn = f"{xml_fn}.bz2"
    # Download and decompress the dump only if the raw XML is not yet present.
    # NOTE(review): `download_url`, `bunzip` and `working_directory` are
    # presumably provided by the `from fastai.basics import *` at the top of
    # the file -- confirm they exist in the fastai version being used.
    if not (path/xml_fn).exists():
        print("downloading...")
        download_url(f'https://dumps.wikimedia.org/{name}/latest/{zip_fn}', path/zip_fn)
        print("unzipping...")
        bunzip(path/zip_fn)

    # Run WikiExtractor from inside `path` so its `text/` output lands there.
    with working_directory(path):
        if not (path/'wikiextractor').exists(): os.system('git clone https://github.com/attardi/wikiextractor.git')
        print("extracting...")
        # -b 100G: a single huge shard, so every article lands in text/AA/wiki_00.
        os.system("python wikiextractor/WikiExtractor.py --processes 4 --no_templates " +
            f"--min_text_length 1800 --filter_disambig_pages --log_file log -b 100G -q {xml_fn}")
    # Collapse the single extracted shard into `path/{lang}wiki` and clean up.
    shutil.move(str(path/'text/AA/wiki_00'), str(path/name))
    shutil.rmtree(path/'text')
def split_wiki(path,lang):
    """Split the extracted `{lang}wiki` dump file under `path` into one
    `<title>.txt` file per article inside `path/'docs'`.

    Articles are delimited by `<doc id=... title="...">` header lines (the
    headers themselves are not written to the output; closing `</doc>` lines
    are). Articles whose title exceeds 150 characters are skipped entirely,
    as are `/` characters in titles (replaced with `_`) to keep filenames
    filesystem-safe.

    Args:
        path: directory containing the `{lang}wiki` dump (pathlib-style).
        lang: Wikipedia language code used in the dump's article URLs.

    Returns:
        The destination directory `path/'docs'`. If it already exists,
        splitting is skipped and it is returned unchanged.
    """
    dest = path/'docs'
    name = f'{lang}wiki'
    if dest.exists():
        print(f"{dest} already exists; not splitting")
        return dest

    dest.mkdir(exist_ok=True, parents=True)
    title_re = re.compile(rf'<doc id="\d+" url="https://{lang}.wikipedia.org/wiki\?curid=\d+" title="([^"]+)">')
    f = None
    # Close the source dump when done (the original leaked this handle).
    with (path/name).open() as lines:
        for i,l in enumerate(lines):
            if i%100000 == 0: print(i)
            if l.startswith('<doc id="'):
                # New article: close the previous output file first, so that
                # a skipped (over-long-title) article cannot leak its body
                # into the previous article's file.
                if f: f.close()
                f = None
                title = title_re.findall(l)[0].replace('/','_')
                if len(title)>150: continue  # skip: title too long for a filename
                f = (dest/f'{title}.txt').open('w')
            elif f:
                # Body line of an article we are keeping; the `f` guard also
                # protects against stray lines before the first header.
                f.write(l)
    if f: f.close()
    return dest