Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tool improvements #101

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ be found in [10], together with an experimental evaluation.
* set the environment variable 'LASER' to the root of the installation, e.g.
`export LASER="${HOME}/projects/laser"`
* download encoders from Amazon S3, e.g. with `bash ./nllb/download_models.sh`
* install Cython in your Python environment
* download third-party software with `bash ./install_external_tools.sh`
* download the data used in the example tasks (see description for each task)

Expand Down
86 changes: 48 additions & 38 deletions install_external_tools.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,44 +49,54 @@ MKDIR $tools_ext
###################################################################

# Download the Moses tokenizer scripts and the per-language non-breaking
# prefix lists from the mosesdecoder RELEASE-4.0 tree into
# ${tools_ext}/moses-tokenizer.
# Uses tools_ext and the MKDIR helper, neither of which is defined in this
# function.
# NOTE(review): each file set is fetched by two mechanisms below -- a per-file
# loop (moses_git + array) and a single batched wget call. This looks like the
# before/after of one change shown together; confirm which variant is intended.
InstallMosesTools () {
# base URL and relative paths consumed by the per-file download loop
moses_git="https://raw.githubusercontent.com/moses-smt/mosesdecoder/RELEASE-4.0/scripts"
moses_files=("tokenizer/tokenizer.perl" "tokenizer/detokenizer.perl" \
"tokenizer/normalize-punctuation.perl" \
"tokenizer/remove-non-printing-char.perl" \
"tokenizer/deescape-special-chars.perl" \
"tokenizer/lowercase.perl" \
"tokenizer/basic-protected-patterns" \
)

# target directory for the tokenizer scripts; the loop below has no -P option
# and tests for files by basename, so it depends on the cd
wdir="${tools_ext}/moses-tokenizer/tokenizer"
MKDIR ${wdir}
cd ${wdir}
# base URL used by the batched tokenizer download
moses_tokenizer_root_url="https://raw.githubusercontent.com/moses-smt/mosesdecoder/RELEASE-4.0/scripts/tokenizer"

# fetch each listed file unless a file with the same basename already exists
# in the current directory
for f in ${moses_files[@]} ; do
if [ ! -f `basename ${f}` ] ; then
echo " - download ${f}"
wget -q ${moses_git}/${f}
fi
done
# make the perl scripts executable (the *perl glob does not match
# basic-protected-patterns)
chmod 755 *perl
# batched download into ${wdir}: -nc skips files that already exist,
# -nd avoids recreating the remote directory hierarchy, -nv reduces output
wget -P ${wdir} -nc -nd -nv \
"${moses_tokenizer_root_url}/tokenizer.perl" \
"${moses_tokenizer_root_url}/detokenizer.perl" \
"${moses_tokenizer_root_url}/normalize-punctuation.perl" \
"${moses_tokenizer_root_url}/remove-non-printing-char.perl" \
"${moses_tokenizer_root_url}/deescape-special-chars.perl" \
"${moses_tokenizer_root_url}/lowercase.perl" \
"${moses_tokenizer_root_url}/basic-protected-patterns"

# same permission change, addressed by explicit path instead of the cwd
chmod 755 ${wdir}/*perl

# download non-breaking prefixes per language
# path stem (relative to moses_git) and language codes used by the loop at
# the end of this function
moses_non_breakings="share/nonbreaking_prefixes/nonbreaking_prefix"
moses_non_breaking_langs=( \
"ca" "cs" "de" "el" "en" "es" "fi" "fr" "ga" "hu" "is" \
"it" "lt" "lv" "nl" "pl" "pt" "ro" "ru" "sk" "sl" "sv" \
"ta" "yue" "zh" )

# target directory for the non-breaking prefix files
wdir="${tools_ext}/moses-tokenizer/share/nonbreaking_prefixes"
MKDIR ${wdir}
cd ${wdir}
# URL stem; the language code is appended as a suffix (.ca, .cs, ...)
moses_nonbreaking_prefixes_root_url="https://raw.githubusercontent.com/moses-smt/mosesdecoder/RELEASE-4.0/scripts/share/nonbreaking_prefixes/nonbreaking_prefix"

# batched download of all prefix files into ${wdir}, skipping existing ones
wget -P ${wdir} -nc -nd -nv \
"${moses_nonbreaking_prefixes_root_url}.ca" \
"${moses_nonbreaking_prefixes_root_url}.cs" \
"${moses_nonbreaking_prefixes_root_url}.de" \
"${moses_nonbreaking_prefixes_root_url}.el" \
"${moses_nonbreaking_prefixes_root_url}.en" \
"${moses_nonbreaking_prefixes_root_url}.es" \
"${moses_nonbreaking_prefixes_root_url}.fi" \
"${moses_nonbreaking_prefixes_root_url}.fr" \
"${moses_nonbreaking_prefixes_root_url}.ga" \
"${moses_nonbreaking_prefixes_root_url}.hu" \
"${moses_nonbreaking_prefixes_root_url}.is" \
"${moses_nonbreaking_prefixes_root_url}.it" \
"${moses_nonbreaking_prefixes_root_url}.lt" \
"${moses_nonbreaking_prefixes_root_url}.lv" \
"${moses_nonbreaking_prefixes_root_url}.nl" \
"${moses_nonbreaking_prefixes_root_url}.pl" \
"${moses_nonbreaking_prefixes_root_url}.pt" \
"${moses_nonbreaking_prefixes_root_url}.ro" \
"${moses_nonbreaking_prefixes_root_url}.ru" \
"${moses_nonbreaking_prefixes_root_url}.sk" \
"${moses_nonbreaking_prefixes_root_url}.sl" \
"${moses_nonbreaking_prefixes_root_url}.sv" \
"${moses_nonbreaking_prefixes_root_url}.ta" \
"${moses_nonbreaking_prefixes_root_url}.yue" \
"${moses_nonbreaking_prefixes_root_url}.zh"

# per-language loop: builds <stem>.<lang> relative to moses_git and downloads
# it into the current directory when no file with that basename exists
for l in ${moses_non_breaking_langs[@]} ; do
f="${moses_non_breakings}.${l}"
if [ ! -f `basename ${f}` ] ; then
echo " - download ${f}"
wget -q ${moses_git}/${f}
fi
done
}


Expand All @@ -100,10 +110,10 @@ InstallFastBPE () {
cd ${tools_ext}
if [ ! -x fastBPE/fast ] ; then
echo " - download fastBPE software from github"
wget https://github.com/glample/fastBPE/archive/master.zip
unzip master.zip
/bin/rm master.zip
mv fastBPE-master fastBPE
wget https://github.com/glample/fastBPE/archive/1fd33189c126dae356b9e187d93d93302fa45cef.zip
unzip 1fd33189c126dae356b9e187d93d93302fa45cef.zip
/bin/rm 1fd33189c126dae356b9e187d93d93302fa45cef.zip
mv fastBPE-1fd33189c126dae356b9e187d93d93302fa45cef fastBPE
cd fastBPE
echo " - compiling"
g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast
Expand Down Expand Up @@ -150,20 +160,20 @@ InstallMecab () {
cd ${tools_ext}
if [ ! -x mecab/mecab/bin/mecab ] ; then
echo " - download mecab from github"
wget https://github.com/taku910/mecab/archive/master.zip
unzip master.zip
wget https://github.com/taku910/mecab/archive/3a07c4eefaffb4e7a0690a7f4e5e0263d3ddb8a3.zip
unzip 3a07c4eefaffb4e7a0690a7f4e5e0263d3ddb8a3.zip
#/bin/rm 3a07c4eefaffb4e7a0690a7f4e5e0263d3ddb8a3.zip
if [ ! -s mecab/bin/mecab ] ; then
mkdir mecab
cd mecab-master/mecab
cd mecab-3a07c4eefaffb4e7a0690a7f4e5e0263d3ddb8a3/mecab
echo " - installing code"
./configure --prefix ${tools_ext}/mecab && make && make install
if [ $? -q 1 ] ; then
echo "ERROR: installation failed, please install manually"; exit
fi
fi
if [ ! -d mecab/lib/mecab/dic/ipadic ] ; then
cd ${tools_ext}/mecab-master/mecab-ipadic
cd ${tools_ext}/mecab-3a07c4eefaffb4e7a0690a7f4e5e0263d3ddb8a3/mecab-ipadic
echo " - installing dictionaries"
./configure --prefix ${tools_ext}/mecab --with-mecab-config=${tools_ext}/mecab/bin/mecab-config \
&& make && make install
Expand Down
38 changes: 11 additions & 27 deletions install_models.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,40 +9,24 @@
# is a toolkit to calculate multilingual sentence embeddings
# and to use them for document classification, bitext filtering
# and mining
#
#
#-------------------------------------------------------
#
# This bash script installs sentence encoders from Amazon s3
#

# Abort early when LASER is unset or empty; everything below derives its
# paths from it.
# NOTE(review): the test line appears twice but only one closing fi follows.
# Presumably this is the before/after of a whitespace-only change shown
# together; confirm that the real script has a single if.
if [ -z ${LASER} ] ; then
if [ -z ${LASER} ] ; then
echo "Please set the environment variable 'LASER'"
exit
fi

# directory that receives the downloaded model files
mdir="${LASER}/models"

# available encoders
# base URL and file names consumed by the per-file loop below
s3="https://dl.fbaipublicfiles.com/laser/models"
networks=("bilstm.eparl21.2018-11-19.pt" \
"eparl21.fcodes" "eparl21.fvocab" \
"bilstm.93langs.2018-12-26.pt" \
"93langs.fcodes" "93langs.fvocab")


echo "Downloading networks"

# create the model directory on first use
if [ ! -d ${mdir} ] ; then
echo " - creating directory ${mdir}"
mkdir -p ${mdir}
fi

# per-file variant: downloads into the current directory, hence the cd;
# files that already exist are reported and skipped
cd ${mdir}
for f in ${networks[@]} ; do
if [ -f ${f} ] ; then
echo " - ${mdir}/${f} already downloaded"
else
echo " - ${f}"
wget -q ${s3}/${f}
fi
done
# batched variant: one wget call writing into ${mdir} (-P); -nc skips files
# that already exist and -nd avoids recreating the remote directory hierarchy
# NOTE(review): this fetches the same six files as the loop above; presumably
# only one of the two variants is meant to remain -- confirm.
echo "Downloading networks to ${mdir}"
mkdir -p ${mdir}
wget -P ${mdir} -nc -nd \
"https://dl.fbaipublicfiles.com/laser/models/bilstm.eparl21.2018-11-19.pt" \
"https://dl.fbaipublicfiles.com/laser/models/eparl21.fcodes" \
"https://dl.fbaipublicfiles.com/laser/models/eparl21.fvocab" \
"https://dl.fbaipublicfiles.com/laser/models/bilstm.93langs.2018-12-26.pt" \
"https://dl.fbaipublicfiles.com/laser/models/93langs.fcodes" \
"https://dl.fbaipublicfiles.com/laser/models/93langs.fvocab"