-
Notifications
You must be signed in to change notification settings - Fork 1
/
generate_corpus.py
32 lines (26 loc) · 1.04 KB
/
generate_corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
from tqdm.auto import tqdm
from datasets import load_dataset
import nltk
from nltk import tokenize
from argparse import ArgumentParser
import os
def main():
    """Build a plain-text sentence corpus from English Wikipedia.

    Downloads the '20200501.en' Wikipedia dump via HuggingFace `datasets`,
    sentence-tokenizes every article with NLTK punkt, and writes the result
    to `<corpus_dir>/corpus.txt` — one sentence per line, documents separated
    by a blank line (a common pretraining-corpus format).

    Command-line flags:
        --cache_dir: HuggingFace datasets cache directory.
        --corpus_dir: output directory; created if missing.
        --limit: optional cap on the number of documents processed
            (replaces the old commented-out debug break; default None = all).
    """
    parser = ArgumentParser()
    parser.add_argument("--cache_dir", type=str, default="/input/osilab-nlp",
                        help="HuggingFace datasets cache directory.")
    parser.add_argument("--corpus_dir", type=str, default="/input/osilab-nlp/wikipedia",
                        help="Directory where corpus.txt is written.")
    parser.add_argument("--limit", type=int, default=None,
                        help="Process at most this many documents (debugging aid).")
    args = parser.parse_args()

    # punkt is the model sent_tokenize needs; download is a no-op if cached.
    nltk.download('punkt')
    os.makedirs(args.corpus_dir, exist_ok=True)

    wiki = load_dataset('wikipedia', '20200501.en', split='train', cache_dir=args.cache_dir)
    with open(os.path.join(args.corpus_dir, "corpus.txt"), "w", encoding="utf-8") as fp:
        # total= gives tqdm a real progress bar/ETA instead of a bare counter.
        for idx, document in tqdm(enumerate(wiki), total=len(wiki)):
            if args.limit is not None and idx >= args.limit:
                break
            # Flatten intra-article newlines so punkt sees one running text.
            text = document["text"].replace("\n", " ")
            for sentence in tokenize.sent_tokenize(text):
                fp.write(sentence)
                fp.write("\n")
            # Blank line marks the document boundary.
            fp.write("\n")
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()