Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update distr ft benchmarks #708

Open
wants to merge 3 commits into
base: ft-future
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion evaluation/benchmarks/bio/bio-align/genome-diff.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# bacteria), and any regions with less than 10 supporting reads.

# Requires: samtools, minimap2, bcftools
# Data: http://ndr.md/data/bio/R1.fastq.gz http://ndr.md/data/bio/R2.fastq.gz http://ndr.md/data/bio/ref.fa
# Data: atlas-group.cs.brown.edu/data/bio/R1.fastq.gz atlas-group.cs.brown.edu/data/bio/R2.fastq.gz atlas-group.cs.brown.edu/data/bio/ref.fa

# https://github.com/samtools/samtools/releases/latest
# https://github.com/lh3/minimap2
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/bio/bio-align/genquality.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# http://thegenomefactory.blogspot.com/2019/09/25-reasons-assemblies-dont-make-it-into.html

# Require: csvkit
# Data: http://ndr.md/data/bio/genbank.txt
# Data: atlas-group.cs.brown.edu/data/bio/genbank.txt

IN=./input/genbank.txt
OUT=./output/out.txt
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/bio/bio1/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ mkdir -p input
mkdir -p output
cd input
if [[ ! -f R1.fastq ]]; then
wget ndr.md/data/bio/{R1.fastq.gz,R2.fastq.gz,ref.fa}
wget atlas-group.cs.brown.edu/data/bio/{R1.fastq.gz,R2.fastq.gz,ref.fa}

gunzip R1.fastq.gz
gunzip R2.fastq.gz
Expand Down
4 changes: 2 additions & 2 deletions evaluation/benchmarks/max-temp/max-temp-preprocess.sh
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
#!/bin/bash

sed 's;^;http://ndr.md/data/noaa/;' |
sed 's;^;atlas-group.cs.brown.edu/data/noaa/;' |
sed 's;$;/;' |
xargs -r -n 1 curl -s |
grep gz |
tr -s ' \n' |
cut -d ' ' -f9 |
sed 's;^\(.*\)\(20[0-9][0-9]\).gz;\2/\1\2\.gz;' |
sed 's;^;http://ndr.md/data/noaa/;' |
sed 's;^;atlas-group.cs.brown.edu/data/noaa/;' |
xargs -n1 curl -s |
gunzip
2 changes: 1 addition & 1 deletion evaluation/benchmarks/max-temp/max-temp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

FROM=${FROM:-2015}
TO=${TO:-2015}
IN=${IN:-'http://ndr.md/data/noaa/'}
IN=${IN:-'atlas-group.cs.brown.edu/data/noaa/'}
fetch=${fetch:-"curl -s"}

seq $FROM $TO |
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/max-temp/temp-analytics.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

FROM=${FROM:-2015}
TO=${TO:-2015}
IN=${IN:-'http://ndr.md/data/noaa/'}
IN=${IN:-'atlas-group.cs.brown.edu/data/noaa/'}
fetch=${fetch:-"curl -s"}

data_file=temperatures.txt
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/nlp/input/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ setup_dataset() {
cd pg
if [[ "$1" == "--full" ]]; then
echo 'N.b.: download/extraction will take about 10min'
wget ndr.md/data/pg.tar.xz
wget atlas-group.cs.brown.edu/data/pg.tar.xz # FIXME: moving to PG soon
if [ $? -ne 0 ]; then
cat <<-'EOF' | sed 's/^ *//'
Downloading input dataset failed, thus need to manually rsync all books from project gutenberg:
Expand Down
8 changes: 4 additions & 4 deletions evaluation/benchmarks/oneliners/input/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ setup_dataset() {
fi

if [ ! -f ./1M.txt ]; then
curl -sf 'http://ndr.md/data/dummy/1M.txt' > 1M.txt
curl -sf 'atlas-group.cs.brown.edu/data/dummy/1M.txt' > 1M.txt
if [ $? -ne 0 ]; then
echo 'cannot find 1M.txt -- please contact the developers of pash'
exit 1
Expand All @@ -51,7 +51,7 @@ setup_dataset() {
fi

if [ ! -f ./1G.txt ]; then
curl -sf 'http://ndr.md/data/dummy/1G.txt' > 1G.txt
curl -sf 'atlas-group.cs.brown.edu/data/dummy/1G.txt' > 1G.txt
if [ $? -ne 0 ]; then
echo 'cannot find 1G.txt -- please contact the developers of pash'
exit 1
Expand All @@ -61,7 +61,7 @@ setup_dataset() {

# download wamerican-insane dictionary and sort according to machine
if [ ! -f ./dict.txt ]; then
curl -sf 'http://ndr.md/data/dummy/dict.txt' | sort > dict.txt
curl -sf 'atlas-group.cs.brown.edu/data/dummy/dict.txt' | sort > dict.txt
if [ $? -ne 0 ]; then
echo 'cannot find dict.txt -- please contact the developers of pash'
exit 1
Expand All @@ -70,7 +70,7 @@ setup_dataset() {
fi

if [ ! -f ./all_cmds.txt ]; then
curl -sf 'http://ndr.md/data/dummy/all_cmds.txt' > all_cmds.txt
curl -sf 'atlas-group.cs.brown.edu/data/dummy/all_cmds.txt' > all_cmds.txt
if [ $? -ne 0 ]; then
# This should be OK for tests, no need for abort
ls /usr/bin/* > all_cmds.txt
Expand Down
3 changes: 1 addition & 2 deletions evaluation/benchmarks/web-index/input/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,7 @@ setup_dataset() {
wget $wiki_archive || eexit "cannot fetch wikipedia"
7za x wikipedia-en-html.tar.7z
tar -xvf wikipedia-en-html.tar
wget http://ndr.md/data/wikipedia/index.txt # || eexit "cannot fetch wiki indices"
# It is actually OK if we don't have this index since we download the 500/1000 below
wget atlas-group.cs.brown.edu/data/wikipedia/index.txt # FIXME: we download index below?
fi

if [ "$1" = "--small" ]; then
Expand Down
27 changes: 27 additions & 0 deletions evaluation/distr_benchmarks/intro/check-ft-correctness.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/bin/bash

# Compare each script's fault-injected outputs against its fault-free
# distributed output to verify fault-tolerance correctness.

# Folder where the benchmark .out files are located.
folder="$DISH_TOP/evaluation/distr_benchmarks/intro/outputs"

# Number of workers whose crashes were mocked (must match run.distr.faults.sh).
num_workers=3

# Without nullglob an empty outputs folder would iterate over the literal
# pattern string and feed a nonexistent path to diff.
shopt -s nullglob

for script_distr_out in "$folder"/*distr.out; do
    # Strip the directory and the .distr.out suffix to recover the script name.
    script_name=$(basename "$script_distr_out" .distr.out)
    for ((i = 1; i <= num_workers; i++)); do
        # Corresponding .faults_$crashed_worker.out file for this worker.
        crashed_worker="worker$i"
        script_faults_out="$folder/$script_name.faults_$crashed_worker.out"

        echo "Comparing faults_$crashed_worker.out and distr.out for script $script_name.sh"
        if [ ! -f "$script_faults_out" ]; then
            # Report a missing run explicitly instead of letting diff error out.
            echo "Missing $script_faults_out -- did the faults run complete?" >&2
        elif diff -q "$script_faults_out" "$script_distr_out"; then
            echo "Outputs are identical"
        else
            echo "Files are different. Differences are as follows:"
            diff -y "$script_faults_out" "$script_distr_out"
        fi
        echo "-------------------------------------------"
    done
done
16 changes: 16 additions & 0 deletions evaluation/distr_benchmarks/intro/demo-spell.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/sh

# Spell-check demo: list words appearing in the HDFS input file that are
# absent from the pre-sorted dictionary.
# Requires: PASH_TOP and DISH_TOP to be set; input staged in HDFS by setup.sh.

cd "$(dirname "$0")" || exit 1

# Fail early (nonzero) with a hint when PASH_TOP is not exported;
# the original bare `exit` returned success from this failure path.
[ -z "$PASH_TOP" ] && {
	echo "PASH_TOP not set, maybe $(git rev-parse --show-toplevel)?"
	exit 1
}

DICT="$DISH_TOP/evaluation/distr_benchmarks/intro/input/sorted_words"
IN=${IN:-/intro/100M.txt}

# Lowercase, split into one word per line, deduplicate, then keep only the
# words missing from the dictionary (comm -13 = lines unique to input 2).
hdfs dfs -cat -ignoreCrc "$IN" |
	tr A-Z a-z |
	tr -cs A-Za-z '\n' |
	sort |
	uniq |
	comm -13 "$DICT" -
3 changes: 3 additions & 0 deletions evaluation/distr_benchmarks/intro/input/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
100M.txt
words
sorted_words
53 changes: 53 additions & 0 deletions evaluation/distr_benchmarks/intro/input/setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#!/bin/bash

# Set up inputs for the intro benchmarks: fetch (or synthesize) 100M.txt and
# a word dictionary, sort the dictionary for this machine, and upload the
# input files to HDFS under /intro.
# rm-files, eexit and append_nl_if_not come from scripts/utils.sh.

PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)}
. "$PASH_TOP/scripts/utils.sh"
cd "$(dirname "$0")" || exit 1

input_files=("100M.txt")
# NOTE(review): was misspelled `local_fils`; currently unused in this script --
# kept for symmetry with the other setup scripts, verify before removing.
local_files=("dict.txt")

[ "$1" = "-c" ] && rm-files 100M.txt words sorted_words

hdfs dfs -mkdir -p /intro

if [ ! -f ./100M.txt ]; then
    # Primary source (migrated from the retired ndr.md host, consistent with
    # the rest of the benchmarks); falls back to building the file locally.
    curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/100M.txt' > 100M.txt
    if [ $? -ne 0 ]; then
        # Pipe curl through tac (twice) in order to consume all the output from curl.
        # This way, curl can write the whole page and not emit an error code.
        curl -fL 'http://www.gutenberg.org/files/2600/2600-0.txt' | tac | tac | head -c 1M > 1M.txt
        [ $? -ne 0 ] && eexit 'cannot find 1M.txt'
        # Concatenate the 1M chunk 100 times to reach ~100MB.
        touch 100M.txt
        for (( i = 0; i < 100; i++ )); do
            cat 1M.txt >> 100M.txt
        done
    fi
    append_nl_if_not ./100M.txt
fi

if [ ! -f ./words ]; then
    # Try the benchmark mirror first, then Zenodo, then the system dictionary.
    curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/words' > words
    if [ $? -ne 0 ]; then
        curl -sf 'https://zenodo.org/record/7650885/files/words' > words
        if [ $? -ne 0 ]; then
            if [ "$(uname)" = 'Darwin' ]; then
                cp /usr/share/dict/web2 words || eexit "cannot find dict file"
            else
                # apt install wamerican-insane
                cp /usr/share/dict/words words || eexit "cannot find dict file"
            fi
        fi
    fi
    append_nl_if_not words
fi

## Re-sort words for this machine
if [ ! -f ./sorted_words ]; then
    sort words > sorted_words
fi

# Upload inputs to HDFS and drop the local copies.
for file in "${input_files[@]}"; do
    hdfs dfs -put "$file" "/intro/$file"
    rm -f "$file"
done
66 changes: 66 additions & 0 deletions evaluation/distr_benchmarks/intro/run.distr.faults.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Shared pash flags and the dictionary input for the fault-tolerance runs.
PASH_FLAGS='--width 8 --r_split'
export TIMEFORMAT=%R
export dict="$PASH_TOP/evaluation/distr_benchmarks/oneliners/input/dict.txt"
# Fetch the dictionary sorted for this machine's locale. URL migrated from
# the retired ndr.md host, consistent with the rest of this changeset.
curl -sf 'atlas-group.cs.brown.edu/data/dummy/dict.txt' | sort > "$dict"


intro_pash(){
    # Run the demo-spell script under pash and record its wall-clock time.
    #   $1 - pash flags (default: $PASH_FLAGS)
    #   $2 - prefix used to name the result/output/log files (default: par)
    flags=${1:-$PASH_FLAGS}
    prefix=${2:-par}

    times_file="$prefix.res"
    outputs_suffix="$prefix.out"
    time_suffix="$prefix.time"
    outputs_dir="outputs"
    pash_logs_dir="pash_logs_$prefix"

    mkdir -p "$outputs_dir"
    mkdir -p "$pash_logs_dir"

    # Preserve any previous results in a .d history file before overwriting.
    touch "$times_file"
    cat "$times_file" >> "$times_file.d"
    echo "executing one-liners with $prefix pash with data $(date)" | tee "$times_file"
    echo '' >> "$times_file"

    script="demo-spell"

    # Pad the script label to a fixed 30-column width for aligned output.
    printf -v pad %30s
    padded_script="${script}.sh:${pad}"
    padded_script=${padded_script:0:30}

    outputs_file="${outputs_dir}/${script}.${outputs_suffix}"
    pash_log="${pash_logs_dir}/${script}.pash.log"
    single_time_file="${outputs_dir}/${script}.${time_suffix}"

    echo -n "${padded_script}" | tee -a "$times_file"
    # `time` reports to stderr, which is redirected into the per-script time
    # file; $flags stays unquoted on purpose so it word-splits into options.
    { time "$PASH_TOP/pa.sh" $flags --log_file "${pash_log}" ${script}.sh > "$outputs_file"; } 2> "${single_time_file}"
    cat "${single_time_file}" | tee -a "$times_file"
}

intro_faults() {
    # Re-run the benchmark once per worker, mocking a crash of that worker.
    num_workers=3
    # It's important to set the timeout long enough for now to avoid the
    # "crashed" worker coming back alive while its replacement does work --
    # until that is fully supported.
    timeout=100

    for ((i = 1; i <= num_workers; i++)); do
        crashed_worker="worker$i"
        echo "Mocking crash for $crashed_worker with timeout of $timeout seconds"
        echo ----------------------------------------------------------------
        # Use $timeout and $crashed_worker; previously the timeout was
        # hard-coded to 100, silently ignoring the variable above.
        intro_pash "$PASH_FLAGS --distributed_exec --worker_timeout $timeout --worker_timeout_choice $crashed_worker" "faults_$crashed_worker"
    done
}

# Clear previous outputs, then run once fault-free and once per mocked crash.
outputs_dir="outputs"
# Bug fix: this previously removed "$outputs" (an unset variable), so stale
# outputs were never actually cleared between runs.
rm -rf "$outputs_dir"

intro_pash "$PASH_FLAGS --distributed_exec" "distr"

intro_faults
23 changes: 17 additions & 6 deletions evaluation/distr_benchmarks/nlp/input/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ if [ ! -e ./pg ]; then
cd pg
if [[ "$1" == "--full" ]]; then
echo 'N.b.: download/extraction will take about 10min'
wget ndr.md/data/pg.tar.xz
wget atlas-group.cs.brown.edu/data/pg.tar.xz # FIXME: moving to PG soon
if [ $? -ne 0 ]; then
cat <<-'EOF' | sed 's/^ *//'
Downloading input dataset failed, thus need to manually rsync all books from project gutenberg:
Expand All @@ -31,10 +31,21 @@ if [ ! -e ./pg ]; then
cat pg.tar.xz | tar -xJ

else
wget http://pac-n4.csail.mit.edu:81/pash_data/nlp.zip
unzip nlp.zip
mv data/* .
rm nlp.zip data -rf
# wget http://pac-n4.csail.mit.edu:81/pash_data/nlp.zip
# unzip nlp.zip
# mv data/* .
# rm nlp.zip data -rf

# Mock 1
for (( i = 0; i < 60; i++ )); do
touch "$i".txt
cat ../genesis >> "$i".txt
done
# Mock 2
for (( i = 61; i < 120; i++ )); do
touch "$i".txt
cat ../exodus >> "$i".txt
done
fi

for f in *.txt; do
Expand All @@ -48,4 +59,4 @@ fi
hdfs dfs -mkdir /nlp
hdfs dfs -put exodus /nlp/exodus
hdfs dfs -put genesis /nlp/genesis
hdfs dfs -put pg /nlp/pg
hdfs dfs -put pg /nlp
27 changes: 27 additions & 0 deletions evaluation/distr_benchmarks/oneliners/check_ft_correctness.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/bin/bash

# Compare each script's fault-injected outputs against its fault-free
# distributed output to verify fault-tolerance correctness.

# Folder where the benchmark .out files are located.
folder="$DISH_TOP/evaluation/distr_benchmarks/oneliners/outputs"

# Number of workers whose crashes were mocked (must match the faults run script).
num_workers=3

# Without nullglob an empty outputs folder would iterate over the literal
# pattern string and feed a nonexistent path to diff.
shopt -s nullglob

for script_distr_out in "$folder"/*distr.out; do
    # Strip the directory and the .distr.out suffix to recover the script name.
    script_name=$(basename "$script_distr_out" .distr.out)
    for ((i = 1; i <= num_workers; i++)); do
        # Corresponding .faults_$crashed_worker.out file for this worker.
        crashed_worker="worker$i"
        script_faults_out="$folder/$script_name.faults_$crashed_worker.out"

        echo "Comparing faults_$crashed_worker.out and distr.out for script $script_name.sh"
        if [ ! -f "$script_faults_out" ]; then
            # Report a missing run explicitly instead of letting diff error out.
            echo "Missing $script_faults_out -- did the faults run complete?" >&2
        elif diff -q "$script_faults_out" "$script_distr_out"; then
            echo "Outputs are identical"
        else
            echo "Files are different. Differences are as follows:"
            diff -y "$script_faults_out" "$script_distr_out"
        fi
        echo "-------------------------------------------"
    done
done
Loading
Loading