forked from protonish/cipherdaug-nmt
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmulti_binarize.sh
93 lines (73 loc) · 2.12 KB
/
multi_binarize.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/bin/bash
# Path configuration used by the rest of this script.
# Every path below is derived from LOC, so LOC is normally the only value
# that has to be edited for a new machine.

# Root project location -- set this to your own.
# NOTE: the value ends in '/', so the derived paths contain a '//'.
LOC="/cs/lab-folder/"
ROOT="$LOC/username/cipherdaug-nmt"

# Data root -- set this to your own if it differs.
DATAROOT="$ROOT/data"
# Parent directory of the per-config data folders (see TEXT in the configs).
DATABIN="$ROOT/data-bin"

# fairseq checkout inside the project and its scripts directory.
FAIRSEQ="$ROOT/fairseq"
FAIRSCRIPTS="$FAIRSEQ/scripts"
#######################################
# Select the "dex_en_2keys" IWSLT14 configuration.
# Globals:
#   DATABIN (read)
#   SRCS, TGTS, TEXT (written; consumed by the top-level binarization code)
# Arguments:
#   None
# Outputs:
#   Prints a reminder, the contents of ${TEXT}/data.config.txt and a blank
#   line to stdout; creates ${TEXT}/bin if it does not exist.
#######################################
dex_en_2keys() {
  # Target-side language tags.
  TGTS=("en" "de")
  # Source-side language tags: plain "de" plus two numbered variants
  # (presumably one enciphered copy of "de" per key -- confirm against the
  # data preparation step).
  SRCS=("de" "de1" "de2")

  # Working directory of this configuration; binarized output goes to bin/.
  TEXT="${DATABIN}/iwslt14/dex_en_2keys"
  mkdir -p "${TEXT}/bin"

  # Show the stored config so the user can check it before binarizing.
  printf '%s\n' "make sure this is the config you want:"
  cat "${TEXT}/data.config.txt"
  printf '\n'
}
#######################################
# Select the "dex_en_5keys" IWSLT14 configuration.
# Globals:
#   DATABIN (read)
#   SRCS, TGTS, TEXT (written; consumed by the top-level binarization code)
# Arguments:
#   None
# Outputs:
#   Prints a reminder, the contents of ${TEXT}/data.config.txt and a blank
#   line to stdout; creates ${TEXT}/bin if it does not exist.
#######################################
dex_en_5keys() {
  # Target-side language tags.
  TGTS=("en" "de")
  # Source-side language tags: plain "de" plus five numbered variants
  # (presumably one enciphered copy of "de" per key -- confirm against the
  # data preparation step).
  SRCS=("de" "de1" "de2" "de3" "de4" "de5")

  # Working directory of this configuration; binarized output goes to bin/.
  TEXT="${DATABIN}/iwslt14/dex_en_5keys"
  mkdir -p "${TEXT}/bin"

  # Show the stored config so the user can check it before binarizing.
  printf '%s\n' "make sure this is the config you want:"
  cat "${TEXT}/data.config.txt"
  printf '\n'
}
##############################
#### call the config here ####
# Exactly one config function should be active; it sets SRCS, TGTS and TEXT.
dex_en_2keys
#dex_en_5keys
##############################

# File name of the dictionary shared by every language; best if left untouched.
DICT=jointdict.txt
vocab_file="${TEXT}/bpe/spm.bpe.vocab"
# Set to 1 when any step fails so the script can still finish the remaining
# work but report the failure through its exit status.
failed=0

echo "Generating joined dictionary for all languages based on BPE.."
# The status of the pipeline below is that of the final sed, so a missing
# vocab file would silently yield an empty dictionary. Stop here instead.
if [[ ! -r "${vocab_file}" ]]; then
  echo "error: cannot read vocabulary file: ${vocab_file}" >&2
  exit 1
fi
# strip the first three special tokens and append fake counts for each vocabulary
tail -n +4 "${vocab_file}" | cut -f1 | sed 's/$/ 100/g' > "${TEXT}/bin/${DICT}"

echo "binarizing pairwise langs .."
for SRC in "${SRCS[@]}"; do
  for TGT in "${TGTS[@]}"; do
    # A tag that appears on both sides (e.g. "de") is not paired with itself.
    if [[ "${SRC}" != "${TGT}" ]]; then
      echo "binarizing data ${SRC}-${TGT} data.."
      # Source and target both use the joint dictionary generated above.
      if ! fairseq-preprocess --source-lang "${SRC}" --target-lang "${TGT}" \
        --destdir "${TEXT}/bin" \
        --trainpref "${TEXT}/bpe/train.bpe.${SRC}-${TGT}" \
        --validpref "${TEXT}/bpe/valid.bpe.${SRC}-${TGT}" \
        --testpref "${TEXT}/bpe/test.bpe.${SRC}-${TGT}" \
        --srcdict "${TEXT}/bin/${DICT}" --tgtdict "${TEXT}/bin/${DICT}" \
        --workers 4; then
        echo "error: fairseq-preprocess failed for ${SRC}-${TGT}" >&2
        failed=1
      fi
    fi
  done
done

echo ""
echo "Creating langs file based on binarised dicts .."
# The stdout of utils.py (run with --getlangs on the bin dir) is the langs file.
if ! python "${ROOT}/cipher/utils.py" -i "${TEXT}/bin/" --getlangs > "${TEXT}/bin/langs.file"; then
  echo "error: could not create ${TEXT}/bin/langs.file" >&2
  failed=1
fi
echo "--> ${TEXT}/bin/langs.file"

# Surface any failure recorded above to the caller.
if (( failed )); then
  exit 1
fi