From 3999b647ed5081e6bd9e0efbb2a9f34795b8cf93 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Thu, 31 Oct 2019 12:59:58 +0200 Subject: [PATCH 1/3] Downloads: Use a single wget invocation for faster downloads Wget can reuse the same connection for all files. The `-nc` flag will have it skip pre-existing files. --- install_external_tools.sh | 70 ++++++++++++++++++++++----------------- install_models.sh | 38 ++++++--------------- 2 files changed, 51 insertions(+), 57 deletions(-) diff --git a/install_external_tools.sh b/install_external_tools.sh index b250982c..d061c47b 100755 --- a/install_external_tools.sh +++ b/install_external_tools.sh @@ -49,44 +49,54 @@ MKDIR () { ################################################################### InstallMosesTools () { - moses_git="https://raw.githubusercontent.com/moses-smt/mosesdecoder/RELEASE-4.0/scripts" - moses_files=("tokenizer/tokenizer.perl" "tokenizer/detokenizer.perl" \ - "tokenizer/normalize-punctuation.perl" \ - "tokenizer/remove-non-printing-char.perl" \ - "tokenizer/deescape-special-chars.perl" \ - "tokenizer/lowercase.perl" \ - "tokenizer/basic-protected-patterns" \ - ) - wdir="${tools_ext}/moses-tokenizer/tokenizer" MKDIR ${wdir} - cd ${wdir} + moses_tokenizer_root_url="https://raw.githubusercontent.com/moses-smt/mosesdecoder/RELEASE-4.0/scripts/tokenizer" - for f in ${moses_files[@]} ; do - if [ ! -f `basename ${f}` ] ; then - echo " - download ${f}" - wget -q ${moses_git}/${f} - fi - done - chmod 755 *perl + wget -P ${wdir} -nc -nd -nv \ + "${moses_tokenizer_root_url}/tokenizer.perl" \ + "${moses_tokenizer_root_url}/detokenizer.perl" \ + "${moses_tokenizer_root_url}/normalize-punctuation.perl" \ + "${moses_tokenizer_root_url}/remove-non-printing-char.perl" \ + "${moses_tokenizer_root_url}/deescape-special-chars.perl" \ + "${moses_tokenizer_root_url}/lowercase.perl" \ + "${moses_tokenizer_root_url}/basic-protected-patterns" + + chmod 755 ${wdir}/*perl # download non-breaking prefixes per language - moses_non_breakings="share/nonbreaking_prefixes/nonbreaking_prefix" - moses_non_breaking_langs=( \ - "ca" "cs" "de" "el" "en" "es" "fi" "fr" "ga" "hu" "is" \ - "it" "lt" "lv" "nl" "pl" "pt" "ro" "ru" "sk" "sl" "sv" \ - "ta" "yue" "zh" ) + wdir="${tools_ext}/moses-tokenizer/share/nonbreaking_prefixes" MKDIR ${wdir} - cd ${wdir} + moses_nonbreaking_prefixes_root_url="https://raw.githubusercontent.com/moses-smt/mosesdecoder/RELEASE-4.0/scripts/share/nonbreaking_prefixes/nonbreaking_prefix" + + wget -P ${wdir} -nc -nd -nv \ + "${moses_nonbreaking_prefixes_root_url}.ca" \ + "${moses_nonbreaking_prefixes_root_url}.cs" \ + "${moses_nonbreaking_prefixes_root_url}.de" \ + "${moses_nonbreaking_prefixes_root_url}.el" \ + "${moses_nonbreaking_prefixes_root_url}.en" \ + "${moses_nonbreaking_prefixes_root_url}.es" \ + "${moses_nonbreaking_prefixes_root_url}.fi" \ + "${moses_nonbreaking_prefixes_root_url}.fr" \ + "${moses_nonbreaking_prefixes_root_url}.ga" \ + "${moses_nonbreaking_prefixes_root_url}.hu" \ + "${moses_nonbreaking_prefixes_root_url}.is" \ + "${moses_nonbreaking_prefixes_root_url}.it" \ + "${moses_nonbreaking_prefixes_root_url}.lt" \ + "${moses_nonbreaking_prefixes_root_url}.lv" \ + "${moses_nonbreaking_prefixes_root_url}.nl" \ + "${moses_nonbreaking_prefixes_root_url}.pl" \ + "${moses_nonbreaking_prefixes_root_url}.pt" \ + "${moses_nonbreaking_prefixes_root_url}.ro" \ + "${moses_nonbreaking_prefixes_root_url}.ru" \ + "${moses_nonbreaking_prefixes_root_url}.sk" \ + "${moses_nonbreaking_prefixes_root_url}.sl" \ + "${moses_nonbreaking_prefixes_root_url}.sv" \ + "${moses_nonbreaking_prefixes_root_url}.ta" \ + "${moses_nonbreaking_prefixes_root_url}.yue" \ + "${moses_nonbreaking_prefixes_root_url}.zh" - for l in ${moses_non_breaking_langs[@]} ; do - f="${moses_non_breakings}.${l}" - if [ ! -f `basename ${f}` ] ; then - echo " - download ${f}" - wget -q ${moses_git}/${f} - fi - done } diff --git a/install_models.sh b/install_models.sh index 32e25317..94f09190 100755 --- a/install_models.sh +++ b/install_models.sh @@ -9,40 +9,24 @@ # is a toolkit to calculate multilingual sentence embeddings # and to use them for document classification, bitext filtering # and mining -# +# #------------------------------------------------------- # # This bash script installs sentence encoders from Amazon s3 # -if [ -z ${LASER} ] ; then +if [ -z ${LASER} ] ; then echo "Please set the environment variable 'LASER'" exit fi mdir="${LASER}/models" - -# available encoders -s3="https://dl.fbaipublicfiles.com/laser/models" -networks=("bilstm.eparl21.2018-11-19.pt" \ - "eparl21.fcodes" "eparl21.fvocab" \ - "bilstm.93langs.2018-12-26.pt" \ - "93langs.fcodes" "93langs.fvocab") - - -echo "Downloading networks" - -if [ ! -d ${mdir} ] ; then - echo " - creating directory ${mdir}" - mkdir -p ${mdir} -fi - -cd ${mdir} -for f in ${networks[@]} ; do - if [ -f ${f} ] ; then - echo " - ${mdir}/${f} already downloaded" - else - echo " - ${f}" - wget -q ${s3}/${f} - fi -done +echo "Downloading networks to ${mdir}" +mkdir -p ${mdir} +wget -P ${mdir} -nc -nd \ + "https://dl.fbaipublicfiles.com/laser/models/bilstm.eparl21.2018-11-19.pt" \ + "https://dl.fbaipublicfiles.com/laser/models/eparl21.fcodes" \ + "https://dl.fbaipublicfiles.com/laser/models/eparl21.fvocab" \ + "https://dl.fbaipublicfiles.com/laser/models/bilstm.93langs.2018-12-26.pt" \ + "https://dl.fbaipublicfiles.com/laser/models/93langs.fcodes" \ + "https://dl.fbaipublicfiles.com/laser/models/93langs.fvocab" From 615c7b978bcb8d822819f599fbc6c99c9738f320 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Thu, 31 Oct 2019 13:18:23 +0200 Subject: [PATCH 2/3] Note that Cython is required (to compile fastBPE's Python module) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index aa1a8a27..b0690e8b 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,7 @@ be found in [6], together with an extensive experimental evaluation. * set the environment variable 'LASER' to the root of the installation, e.g. `export LASER="${HOME}/projects/laser"` * download encoders from Amazon s3 by `bash ./install_models.sh` +* install Cython in your Python environment * download third party software by `bash ./install_external_tools.sh` * download the data used in the example tasks (see description for each task) From 49271e66735cd5a4a7906eceef74035a5817e5cb Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Thu, 31 Oct 2019 13:18:43 +0200 Subject: [PATCH 3/3] Pin external tool versions (`master` is a moving target) --- install_external_tools.sh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/install_external_tools.sh b/install_external_tools.sh index d061c47b..734bde06 100755 --- a/install_external_tools.sh +++ b/install_external_tools.sh @@ -110,10 +110,10 @@ InstallFastBPE () { cd ${tools_ext} if [ ! -x fastBPE/fast ] ; then echo " - download fastBPE software from github" - wget https://github.com/glample/fastBPE/archive/master.zip - unzip master.zip - /bin/rm master.zip - mv fastBPE-master fastBPE + wget https://github.com/glample/fastBPE/archive/1fd33189c126dae356b9e187d93d93302fa45cef.zip + unzip 1fd33189c126dae356b9e187d93d93302fa45cef.zip + /bin/rm 1fd33189c126dae356b9e187d93d93302fa45cef.zip + mv fastBPE-1fd33189c126dae356b9e187d93d93302fa45cef fastBPE cd fastBPE echo " - compiling" g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast @@ -136,12 +136,12 @@ InstallMecab () { cd ${tools_ext} if [ ! -x mecab/mecab/bin/mecab ] ; then echo " - download mecab from github" - wget https://github.com/taku910/mecab/archive/master.zip - unzip master.zip + wget https://github.com/taku910/mecab/archive/3a07c4eefaffb4e7a0690a7f4e5e0263d3ddb8a3.zip + unzip 3a07c4eefaffb4e7a0690a7f4e5e0263d3ddb8a3.zip #/bin/rm master.zip if [ ! -s mecab/bin/mecab ] ; then mkdir mecab - cd mecab-master/mecab + cd mecab-3a07c4eefaffb4e7a0690a7f4e5e0263d3ddb8a3/mecab echo " - installing code" ./configure --prefix ${tools_ext}/mecab && make && make install if [ $? -q 1 ] ; then @@ -149,7 +149,7 @@ InstallMecab () { fi fi if [ ! -d mecab/lib/mecab/dic/ipadic ] ; then - cd ${tools_ext}/mecab-master/mecab-ipadic + cd ${tools_ext}/mecab-3a07c4eefaffb4e7a0690a7f4e5e0263d3ddb8a3/mecab-ipadic echo " - installing dictionaries" ./configure --prefix ${tools_ext}/mecab --with-mecab-config=${tools_ext}/mecab/bin/mecab-config \ && make && make install