-
Notifications
You must be signed in to change notification settings - Fork 3
/
build_lang_id_model.sh
69 lines (37 loc) · 2.07 KB
/
build_lang_id_model.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/env bash
#
# Runs the langid.py training scripts, in order, to build a language
# identification model from a training corpus.
#
# Usage:
#   build_lang_id_model.sh python_cmd train_corpus n
#
# Arguments:
#   python_cmd    Command used to launch the training scripts. It is split on
#                 whitespace, so a multi-word value such as "python3 -u" works.
#   train_corpus  Path of the training corpus directory. It is used as a
#                 prefix for the model directory name, so pass it without a
#                 trailing slash.
#   n             Maximum n-gram order (positive integer), forwarded to the
#                 scripts that accept --max_order.
#
# Output:
#   The model directory "<train_corpus>_model_<n>_grams/". Any existing
#   directory with that name is DELETED before training starts.
#
# The training scripts are resolved relative to the current working directory
# (langid.py-master/langid/train/), so run this from the directory that
# contains langid.py-master.
#
# Exit status:
#   0 on success, 1 on bad arguments, otherwise the exit status of the first
#   training step that failed.

# Print an error message to stderr and abort.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

if [[ $# -ne 3 ]]; then
  printf 'usage %s python_cmd train_corpus n\n' "${0}" >&2
  exit 1
fi

python_cmd=${1}
train_corpus=${2}
n=${3}

# Validate before anything is deleted. An empty train_corpus would turn the
# corpus argument "<train_corpus>/" into "/" and the rm target into a
# relative "_model_<n>_grams/".
[[ -n "${python_cmd}" ]] || die "python_cmd must not be empty"
[[ -n "${train_corpus}" ]] || die "train_corpus must not be empty"
[[ "${n}" =~ ^[1-9][0-9]*$ ]] || die "n must be a positive integer, got '${n}'"

# Split python_cmd into words once so that it may carry interpreter options,
# while every other argument stays safely quoted.
read -r -a python_argv <<< "${python_cmd}"

readonly train_dir="langid.py-master/langid/train"
readonly model_dir="${train_corpus}_model_${n}_grams/"

printf "python_cmd: %s\n" "${python_cmd}"
printf "n gram: %s\n" "${n}"
printf "train_corpus: %s\n" "${train_corpus}"

printf "removing %s\n" "${model_dir}"
rm -rf -- "${model_dir}"

#######################################
# Announce and run one training step; abort the whole run if it fails.
# All steps operate on the same model directory, so continuing after a
# failed step would run the remaining scripts on an incomplete model.
# Globals:   python_argv (read), train_dir (read)
# Arguments: $1 - step title printed before the step runs
#            $2 - script file name inside train_dir
#            $@ - remaining arguments, passed to the script unchanged
# Outputs:   step title on stdout; failure message on stderr
# Returns:   0 on success; on failure exits the script with the step's status
#######################################
run_step() {
  local title=$1
  local script=$2
  shift 2

  printf "\n\n%s\n" "${title}"

  local status=0
  "${python_argv[@]}" "${train_dir}/${script}" "$@" || status=$?
  if (( status != 0 )); then
    printf 'error: %s failed with status %d (%s)\n' \
      "${script}" "${status}" "${title}" >&2
    exit "${status}"
  fi
}

# To pause between steps while debugging, insert after any step:
#   read -n1 -r -p "Press space to continue..." key

run_step "step 1: index the corpus" \
  index.py "${train_corpus}/" -m "${model_dir}"

run_step "step 2: tokenization" \
  tokenize.py --max_order "${n}" "${model_dir}"

run_step "step 3: choose features by document frequency" \
  DFfeatureselect.py --max_order "${n}" "${model_dir}"

run_step "step 4: compute the IG weights for domain" \
  IGweight.py -d "${model_dir}"

run_step "step 5: compute the IG weights for language" \
  IGweight.py -lb "${model_dir}"

run_step "step 6: LD feature selection: take the IG weights and use them to select a feature set" \
  LDfeatureselect.py "${model_dir}"

run_step "step 7: build a scanner on the basis of a feature set" \
  scanner.py "${model_dir}"

run_step "step 8: learn NB parameters using an indexed corpus and a scanner" \
  NBtrain.py "${model_dir}"

# Original author's note, presumably an example of loading a trained model:
#python langid.py -m multidialect_model/model