word2vec.py
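Trains a gensim word2vec model over an entity-linked MEDLINE corpus streamed from a JSONL file, then dumps the model itself, a <PAD>/<UNK>-prefixed embedding matrix (*.npy), and a word2id map (*.json).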
# -*- coding: utf-8 -*-
import argparse
import json
import logging
import os
from pathlib import Path

import numpy as np
import gensim.models.word2vec as w2v

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
class CorpusIterator:
    """Streams one tokenized sentence per line so the corpus never has to fit in memory."""

    def __init__(self, fname):
        self.fname = fname

    def __iter__(self):
        with open(self.fname, encoding='utf-8', errors='ignore') as rf:
            for line in rf:
                line = line.strip()
                if not line:
                    continue
                # each line is a JSON object whose 'text' field is whitespace-tokenizable
                jsonl = json.loads(line)
                sentence_tokens = jsonl['text'].split()
                yield sentence_tokens
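# Illustrative input record (assumed shape, inferred from the jsonl['text'] access
# above; real entity-linked MEDLINE lines may carry additional fields):
#   {"text": "aspirin reduced the risk of myocardial_infarction"}
# which yields the token list:
#   ['aspirin', 'reduced', 'the', 'risk', 'of', 'myocardial_infarction']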
def train_and_dump_word2vec(
    medline_entities_linked_fname,
    output_dir,
    n_workers=4,
    n_iter=3
):
    # fix the embedding dim to 50 and cap the final vocab at 50k
    # (gensim 3.x API: `size` and `iter` were renamed `vector_size` and `epochs` in gensim 4)
    model = w2v.Word2Vec(size=50, workers=n_workers, iter=n_iter, max_final_vocab=50000)
    sentences = CorpusIterator(medline_entities_linked_fname)

    logger.info(f'Building word2vec vocab on {medline_entities_linked_fname} ...')
    model.build_vocab(sentences)

    logger.info('Training ...')
    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)

    os.makedirs(output_dir, exist_ok=True)
    logger.info('Saving word2vec model ...')
    model.save(os.path.join(output_dir, 'word2vec.pubmed2019.50d.gz'))

    wv = model.wv
    del model  # free up memory

    # reserve row 0 for padding and row 1 for unknown tokens
    word2id = {"<PAD>": 0, "<UNK>": 1}
    mat = np.zeros((len(wv.vocab) + 2, 50))
    # initialize the UNK embedding with a random normal vector; PAD stays all-zeros
    mat[1] = np.random.randn(50)

    for word in sorted(wv.vocab.keys()):
        vocab_item = wv.vocab[word]
        vector = wv.vectors[vocab_item.index]
        mat[len(word2id)] = vector
        word2id[word] = len(word2id)

    mat_fname = Path(output_dir) / 'word2vec.pubmed2019.50d_mat.npy'
    map_fname = Path(output_dir) / 'word2vec.pubmed2019.50d_word2id.json'
    logger.info(f'Saving word2id at {map_fname} and numpy matrix at {mat_fname} ...')
    np.save(mat_fname, mat)
    with open(map_fname, 'w', encoding='utf-8') as wf:
        json.dump(word2id, wf)
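# A minimal sketch of how the dumped artifacts can be consumed downstream
# (illustrative only; the paths and lookup word are placeholders):
#
#   import json
#   import numpy as np
#
#   mat = np.load('out/word2vec.pubmed2019.50d_mat.npy')
#   with open('out/word2vec.pubmed2019.50d_word2id.json') as rf:
#       word2id = json.load(rf)
#   # row 0 is <PAD> (all zeros), row 1 is <UNK>; OOV words fall back to <UNK>
#   vec = mat[word2id.get('aspirin', word2id['<UNK>'])]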
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--medline_entities_linked_fname", action="store", required=True, type=str,
        help="Path to *.jsonl concepts linked file."
    )
    parser.add_argument(
        "--output_dir", action="store", required=True, type=str,
        help="Path to output directory where the word2id map and numpy matrix will be saved."
    )
    args = parser.parse_args()

    import pprint
    pprint.pprint(vars(args))

    train_and_dump_word2vec(
        args.medline_entities_linked_fname,
        args.output_dir
    )
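# Example invocation (hypothetical paths):
#   python word2vec.py \
#       --medline_entities_linked_fname data/medline_entities_linked.jsonl \
#       --output_dir out/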