-
Notifications
You must be signed in to change notification settings - Fork 10
/
train_non_contextualized_embeddings.sh
executable file
·62 lines (47 loc) · 2.82 KB
/
train_non_contextualized_embeddings.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/usr/bin/env bash
# Settings for the joint-training + alignment embedding pipeline.
# This section only assigns variables; the processing steps that consume
# them come afterwards.

# ---- Language pair and hyper-parameters ----
src_lang="en"
tgt_lang="fr"
topk="200000"    # forwarded as --topk to topk_embedding.py
# The spelling "thredshold" is kept on purpose: later steps refer to this name.
thredshold="90"  # forwarded as --threshold; suggested values {70, 80, 90, 95}
lr="10"          # forwarded as --lr to the aligner; suggested values {1, 10, 25, 50}
epoch="10"       # forwarded as --niter to the aligner; suggested values {10, 20}

# ---- Monolingual corpora (tokenized) and their concatenation ----
src_corpus="en.wikipedia.tok"
tgt_corpus="fr.wikipedia.tok"
concat_corpus="en-fr.wikipedia.tok"

# ---- External tools, resolved relative to the current working directory ----
MUSE_PATH="${PWD}/tools/MUSE"
RCSLS_PATH="${PWD}/tools/fastText/alignment"
FASTTEXT="${PWD}/tools/fastText/fasttext"
FASTBPE="${PWD}/tools/fastBPE/fast"

# ---- Bilingual dictionaries shipped with MUSE (train / test splits) ----
TRAIN_DICO_PATH="${MUSE_PATH}/data/crosslingual/dictionaries/${src_lang}-${tgt_lang}.0-5000.txt"
TEST_DICO_PATH="${MUSE_PATH}/data/crosslingual/dictionaries/${src_lang}-${tgt_lang}.5000-6500.txt"

# ---- Per-language-pair working directory: vocabularies and embeddings ----
LANG_PAIR_PATH="${PWD}/word_embeddings/${src_lang}_${tgt_lang}"

# Vocabulary files (source, target, and both together).
SRC_VOCAB="${LANG_PAIR_PATH}/${src_lang}.word.vocab"
TGT_VOCAB="${LANG_PAIR_PATH}/${tgt_lang}.word.vocab"
JOINT_VOCAB="${LANG_PAIR_PATH}/${src_lang}.${tgt_lang}.word.vocab"

# Embedding files: the jointly trained input, the three files produced by
# vocabulary reallocation (suffixed with the threshold), and the final output.
JOINT_EMBED="${LANG_PAIR_PATH}/fasttext.${src_lang}-${tgt_lang}.word.joint.300"
SRC_ONLY_EMBED="${LANG_PAIR_PATH}/${src_lang}_only_embedding.${thredshold}"
TGT_ONLY_EMBED="${LANG_PAIR_PATH}/${tgt_lang}_only_embedding.${thredshold}"
JOINT_ONLY_EMBED="${LANG_PAIR_PATH}/joint_only_embedding.${thredshold}"
OUTPUT_EMBED="${LANG_PAIR_PATH}/joint_align_embedding"
# From here on, stop at the first failing command, on any unset variable, and
# on a failure anywhere inside a pipeline, so that later steps never run on
# missing or partially written files.
set -euo pipefail

# Create the working directory. -p also creates missing parent directories and
# does not fail when the directory already exists (e.g. when re-running).
mkdir -p -- "$LANG_PAIR_PATH"

# down (or up) sample the source corpus to the size of the target corpus and concatenate them
# Feeding the file on stdin makes wc print only the line count, without a file name.
tgt_line_count=$(wc -l < "$tgt_corpus")
# Some wc implementations pad the count with blanks; strip them.
tgt_line_count=${tgt_line_count//[[:space:]]/}
# NOTE(review): the other helper scripts are invoked with a ".py" suffix —
# confirm that "sample_corpus" (no suffix) is the intended path.
python sample_corpus --corpus "$src_corpus" --target_size "$tgt_line_count" --output "${src_corpus}.sampled"
# Interleave the sampled source corpus and the target corpus in random line order.
cat -- "${src_corpus}.sampled" "$tgt_corpus" | shuf > "$concat_corpus"
# train joint fastText embeddings
# fastText treats -output as a file-name prefix and writes "<prefix>.bin" and
# "<prefix>.vec".
"$FASTTEXT" skipgram -dim 300 -thread 24 -input "$concat_corpus" -output "${concat_corpus}.300"
# The following steps read the joint embedding from $JOINT_EMBED, which nothing
# else in this script creates, so place the trained text vectors there.
# NOTE(review): assumes the downstream scripts expect the textual .vec format
# (not the .bin model) — confirm against topk_embedding.py.
cp -- "${concat_corpus}.300.vec" "$JOINT_EMBED"
# get the vocab and counts for the next steps
# One vocabulary per side, plus one computed over both corpora together.
# All expansions are quoted so that paths containing spaces (they derive from
# the current working directory) are passed and redirected as single words.
"$FASTBPE" getvocab "${src_corpus}.sampled" > "$SRC_VOCAB"
"$FASTBPE" getvocab "$tgt_corpus" > "$TGT_VOCAB"
"$FASTBPE" getvocab "${src_corpus}.sampled" "$tgt_corpus" > "$JOINT_VOCAB"
# Select the topk words of the joint embedding.
# Reads $JOINT_EMBED and writes the reduced embedding to "$JOINT_EMBED.$topk".
# NOTE(review): the exact selection rule lives in topk_embedding.py; the
# dictionary directory is passed in, presumably so dictionary words are kept —
# confirm.
python topk_embedding.py \
  --src_vocab "$SRC_VOCAB" \
  --tgt_vocab "$TGT_VOCAB" \
  --input_embedding "$JOINT_EMBED" \
  --output_path "${JOINT_EMBED}.${topk}" \
  --topk "$topk" \
  --src_lang "${src_lang}" \
  --tgt_lang "${tgt_lang}" \
  --dico_path "${MUSE_PATH}/data/crosslingual/dictionaries"

# Vocabulary Reallocation
# Splits the topk embedding into source-only, target-only and joint-only
# files, using the configured threshold and the three vocabularies.
python vocab_reallocation.py \
  --threshold "${thredshold}" \
  --src_vocab "$SRC_VOCAB" \
  --tgt_vocab "$TGT_VOCAB" \
  --joint_vocab "$JOINT_VOCAB" \
  --input_embedding "${JOINT_EMBED}.${topk}" \
  --src_only_output_path "$SRC_ONLY_EMBED" \
  --tgt_only_output_path "$TGT_ONLY_EMBED" \
  --joint_only_output_path "$JOINT_ONLY_EMBED"
# Alignment Refinement using RCSLS, replace this command with other alignment methods if needed
# Aligns the source-only embedding to the target-only embedding using the
# train/test dictionaries and writes the result to "$SRC_ONLY_EMBED.aligned".
python "${RCSLS_PATH}/align.py" \
  --lr "${lr}" \
  --niter "${epoch}" \
  --src_emb "$SRC_ONLY_EMBED" \
  --tgt_emb "$TGT_ONLY_EMBED" \
  --dico_train "$TRAIN_DICO_PATH" \
  --dico_test "$TEST_DICO_PATH" \
  --output "${SRC_ONLY_EMBED}.aligned"

# Merge embeddings
# Combines the aligned source-only, the target-only and the joint-only
# embeddings into the final output file.
python merge_embeddings.py \
  --input_embedding_1 "${SRC_ONLY_EMBED}.aligned" \
  --input_embedding_2 "$TGT_ONLY_EMBED" \
  --input_embedding_3 "$JOINT_ONLY_EMBED" \
  --output_path "$OUTPUT_EMBED"