Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update distr ft benchmarks #708

Open
wants to merge 3 commits into
base: ft-future
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion evaluation/benchmarks/bio/bio-align/genome-diff.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# bacteria), and any regions with less than 10 supporting reads.

# Requires: samtools, minimap2, bcftools
# Data: http://ndr.md/data/bio/R1.fastq.gz http://ndr.md/data/bio/R2.fastq.gz http://ndr.md/data/bio/ref.fa
# Data: atlas-group.cs.brown.edu/data/bio/R1.fastq.gz atlas-group.cs.brown.edu/data/bio/R2.fastq.gz atlas-group.cs.brown.edu/data/bio/ref.fa

# https://github.com/samtools/samtools/releases/latest
# https://github.com/lh3/minimap2
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/bio/bio-align/genquality.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# http://thegenomefactory.blogspot.com/2019/09/25-reasons-assemblies-dont-make-it-into.html

# Require: csvkit
# Data: http://ndr.md/data/bio/genbank.txt
# Data: atlas-group.cs.brown.edu/data/bio/genbank.txt

IN=./input/genbank.txt
OUT=./output/out.txt
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/bio/bio1/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ mkdir -p input
mkdir -p output
cd input
if [[ ! -f R1.fastq ]]; then
wget ndr.md/data/bio/{R1.fastq.gz,R2.fastq.gz,ref.fa}
wget atlas-group.cs.brown.edu/data/bio/{R1.fastq.gz,R2.fastq.gz,ref.fa}

gunzip R1.fastq.gz
gunzip R2.fastq.gz
Expand Down
4 changes: 2 additions & 2 deletions evaluation/benchmarks/max-temp/max-temp-preprocess.sh
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
#!/bin/bash

sed 's;^;http://ndr.md/data/noaa/;' |
sed 's;^;atlas-group.cs.brown.edu/data/noaa/;' |
sed 's;$;/;' |
xargs -r -n 1 curl -s |
grep gz |
tr -s ' \n' |
cut -d ' ' -f9 |
sed 's;^\(.*\)\(20[0-9][0-9]\).gz;\2/\1\2\.gz;' |
sed 's;^;http://ndr.md/data/noaa/;' |
sed 's;^;atlas-group.cs.brown.edu/data/noaa/;' |
xargs -n1 curl -s |
gunzip
2 changes: 1 addition & 1 deletion evaluation/benchmarks/max-temp/max-temp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

FROM=${FROM:-2015}
TO=${TO:-2015}
IN=${IN:-'http://ndr.md/data/noaa/'}
IN=${IN:-'atlas-group.cs.brown.edu/data/noaa/'}
fetch=${fetch:-"curl -s"}

seq $FROM $TO |
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/max-temp/temp-analytics.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

FROM=${FROM:-2015}
TO=${TO:-2015}
IN=${IN:-'http://ndr.md/data/noaa/'}
IN=${IN:-'atlas-group.cs.brown.edu/data/noaa/'}
fetch=${fetch:-"curl -s"}

data_file=temperatures.txt
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/nlp/input/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ setup_dataset() {
cd pg
if [[ "$1" == "--full" ]]; then
echo 'N.b.: download/extraction will take about 10min'
wget ndr.md/data/pg.tar.xz
wget atlas-group.cs.brown.edu/data/pg.tar.xz # FIXME: moving to PG soon
if [ $? -ne 0 ]; then
cat <<-'EOF' | sed 's/^ *//'
Downloading input dataset failed, thus need to manually rsync all books from project gutenberg:
Expand Down
8 changes: 4 additions & 4 deletions evaluation/benchmarks/oneliners/input/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ setup_dataset() {
fi

if [ ! -f ./1M.txt ]; then
curl -sf 'http://ndr.md/data/dummy/1M.txt' > 1M.txt
curl -sf 'atlas-group.cs.brown.edu/data/dummy/1M.txt' > 1M.txt
if [ $? -ne 0 ]; then
echo 'cannot find 1M.txt -- please contact the developers of pash'
exit 1
Expand All @@ -51,7 +51,7 @@ setup_dataset() {
fi

if [ ! -f ./1G.txt ]; then
curl -sf 'http://ndr.md/data/dummy/1G.txt' > 1G.txt
curl -sf 'atlas-group.cs.brown.edu/data/dummy/1G.txt' > 1G.txt
if [ $? -ne 0 ]; then
echo 'cannot find 1G.txt -- please contact the developers of pash'
exit 1
Expand All @@ -61,7 +61,7 @@ setup_dataset() {

# download wamerican-insane dictionary and sort according to machine
if [ ! -f ./dict.txt ]; then
curl -sf 'http://ndr.md/data/dummy/dict.txt' | sort > dict.txt
curl -sf 'atlas-group.cs.brown.edu/data/dummy/dict.txt' | sort > dict.txt
if [ $? -ne 0 ]; then
echo 'cannot find dict.txt -- please contact the developers of pash'
exit 1
Expand All @@ -70,7 +70,7 @@ setup_dataset() {
fi

if [ ! -f ./all_cmds.txt ]; then
curl -sf 'http://ndr.md/data/dummy/all_cmds.txt' > all_cmds.txt
curl -sf 'atlas-group.cs.brown.edu/data/dummy/all_cmds.txt' > all_cmds.txt
if [ $? -ne 0 ]; then
# This should be OK for tests, no need for abort
ls /usr/bin/* > all_cmds.txt
Expand Down
3 changes: 1 addition & 2 deletions evaluation/benchmarks/web-index/input/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,7 @@ setup_dataset() {
wget $wiki_archive || eexit "cannot fetch wikipedia"
7za x wikipedia-en-html.tar.7z
tar -xvf wikipedia-en-html.tar
wget http://ndr.md/data/wikipedia/index.txt # || eexit "cannot fetch wiki indices"
# It is actually OK if we don't have this index since we download the 500/1000 below
wget atlas-group.cs.brown.edu/data/wikipedia/index.txt # FIXME: we download index below?
fi

if [ "$1" = "--small" ]; then
Expand Down
27 changes: 27 additions & 0 deletions evaluation/distr_benchmarks/intro/check-ft-correctness.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/bin/bash

# Compare each script's fault-injected outputs against its fault-free
# distributed output to verify fault-tolerance correctness.

# Folder where the benchmark .out files are located.
folder="$DISH_TOP/evaluation/distr_benchmarks/intro/outputs"

# Number of workers whose crashes were mocked (must match run.distr.faults.sh).
num_workers=3

# Without nullglob an empty outputs folder would iterate over the literal
# pattern string and feed a nonexistent path to diff.
shopt -s nullglob

for script_distr_out in "$folder"/*distr.out; do
    # Strip the directory and the .distr.out suffix to recover the script name.
    script_name=$(basename "$script_distr_out" .distr.out)
    for ((i = 1; i <= num_workers; i++)); do
        # Corresponding .faults_$crashed_worker.out file for this worker.
        crashed_worker="worker$i"
        script_faults_out="$folder/$script_name.faults_$crashed_worker.out"

        echo "Comparing faults_$crashed_worker.out and distr.out for script $script_name.sh"
        if [ ! -f "$script_faults_out" ]; then
            # Report a missing run explicitly instead of letting diff error out.
            echo "Missing $script_faults_out -- did the faults run complete?" >&2
        elif diff -q "$script_faults_out" "$script_distr_out"; then
            echo "Outputs are identical"
        else
            echo "Files are different. Differences are as follows:"
            diff -y "$script_faults_out" "$script_distr_out"
        fi
        echo "-------------------------------------------"
    done
done
16 changes: 16 additions & 0 deletions evaluation/distr_benchmarks/intro/demo-spell.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/sh

# Spell-check demo: list words appearing in the HDFS input file that are
# absent from the pre-sorted dictionary.
# Requires: PASH_TOP and DISH_TOP to be set; input staged in HDFS by setup.sh.

cd "$(dirname "$0")" || exit 1

# Fail early (nonzero) with a hint when PASH_TOP is not exported;
# the original bare `exit` returned success from this failure path.
[ -z "$PASH_TOP" ] && {
	echo "PASH_TOP not set, maybe $(git rev-parse --show-toplevel)?"
	exit 1
}

DICT="$DISH_TOP/evaluation/distr_benchmarks/intro/input/sorted_words"
IN=${IN:-/intro/100M.txt}

# Lowercase, split into one word per line, deduplicate, then keep only the
# words missing from the dictionary (comm -13 = lines unique to input 2).
hdfs dfs -cat -ignoreCrc "$IN" |
	tr A-Z a-z |
	tr -cs A-Za-z '\n' |
	sort |
	uniq |
	comm -13 "$DICT" -
3 changes: 3 additions & 0 deletions evaluation/distr_benchmarks/intro/input/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
100M.txt
words
sorted_words
53 changes: 53 additions & 0 deletions evaluation/distr_benchmarks/intro/input/setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#!/bin/bash

# Set up inputs for the intro benchmarks: fetch (or synthesize) 100M.txt and
# a word dictionary, sort the dictionary for this machine, and upload the
# input files to HDFS under /intro.
# rm-files, eexit and append_nl_if_not come from scripts/utils.sh.

PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)}
. "$PASH_TOP/scripts/utils.sh"
cd "$(dirname "$0")" || exit 1

input_files=("100M.txt")
# NOTE(review): was misspelled `local_fils`; currently unused in this script --
# kept for symmetry with the other setup scripts, verify before removing.
local_files=("dict.txt")

[ "$1" = "-c" ] && rm-files 100M.txt words sorted_words

hdfs dfs -mkdir -p /intro

if [ ! -f ./100M.txt ]; then
    # Primary source (migrated from the retired ndr.md host, consistent with
    # the rest of the benchmarks); falls back to building the file locally.
    curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/100M.txt' > 100M.txt
    if [ $? -ne 0 ]; then
        # Pipe curl through tac (twice) in order to consume all the output from curl.
        # This way, curl can write the whole page and not emit an error code.
        curl -fL 'http://www.gutenberg.org/files/2600/2600-0.txt' | tac | tac | head -c 1M > 1M.txt
        [ $? -ne 0 ] && eexit 'cannot find 1M.txt'
        # Concatenate the 1M chunk 100 times to reach ~100MB.
        touch 100M.txt
        for (( i = 0; i < 100; i++ )); do
            cat 1M.txt >> 100M.txt
        done
    fi
    append_nl_if_not ./100M.txt
fi

if [ ! -f ./words ]; then
    # Try the benchmark mirror first, then Zenodo, then the system dictionary.
    curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/words' > words
    if [ $? -ne 0 ]; then
        curl -sf 'https://zenodo.org/record/7650885/files/words' > words
        if [ $? -ne 0 ]; then
            if [ "$(uname)" = 'Darwin' ]; then
                cp /usr/share/dict/web2 words || eexit "cannot find dict file"
            else
                # apt install wamerican-insane
                cp /usr/share/dict/words words || eexit "cannot find dict file"
            fi
        fi
    fi
    append_nl_if_not words
fi

## Re-sort words for this machine
if [ ! -f ./sorted_words ]; then
    sort words > sorted_words
fi

# Upload inputs to HDFS and drop the local copies.
for file in "${input_files[@]}"; do
    hdfs dfs -put "$file" "/intro/$file"
    rm -f "$file"
done
66 changes: 66 additions & 0 deletions evaluation/distr_benchmarks/intro/run.distr.faults.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Shared pash flags and the dictionary input for the fault-tolerance runs.
PASH_FLAGS='--width 8 --r_split'
export TIMEFORMAT=%R
export dict="$PASH_TOP/evaluation/distr_benchmarks/oneliners/input/dict.txt"
# Fetch the dictionary sorted for this machine's locale. URL migrated from
# the retired ndr.md host, consistent with the rest of this changeset.
curl -sf 'atlas-group.cs.brown.edu/data/dummy/dict.txt' | sort > "$dict"


intro_pash(){
    # Run the demo-spell script under pash and record its wall-clock time.
    #   $1 - pash flags (default: $PASH_FLAGS)
    #   $2 - prefix used to name the result/output/log files (default: par)
    flags=${1:-$PASH_FLAGS}
    prefix=${2:-par}

    times_file="$prefix.res"
    outputs_suffix="$prefix.out"
    time_suffix="$prefix.time"
    outputs_dir="outputs"
    pash_logs_dir="pash_logs_$prefix"

    mkdir -p "$outputs_dir"
    mkdir -p "$pash_logs_dir"

    # Preserve any previous results in a .d history file before overwriting.
    touch "$times_file"
    cat "$times_file" >> "$times_file.d"
    echo "executing one-liners with $prefix pash with data $(date)" | tee "$times_file"
    echo '' >> "$times_file"

    script="demo-spell"

    # Pad the script label to a fixed 30-column width for aligned output.
    printf -v pad %30s
    padded_script="${script}.sh:${pad}"
    padded_script=${padded_script:0:30}

    outputs_file="${outputs_dir}/${script}.${outputs_suffix}"
    pash_log="${pash_logs_dir}/${script}.pash.log"
    single_time_file="${outputs_dir}/${script}.${time_suffix}"

    echo -n "${padded_script}" | tee -a "$times_file"
    # `time` reports to stderr, which is redirected into the per-script time
    # file; $flags stays unquoted on purpose so it word-splits into options.
    { time "$PASH_TOP/pa.sh" $flags --log_file "${pash_log}" ${script}.sh > "$outputs_file"; } 2> "${single_time_file}"
    cat "${single_time_file}" | tee -a "$times_file"
}

intro_faults() {
    # Re-run the benchmark once per worker, mocking a crash of that worker.
    num_workers=3
    # It's important to set the timeout long enough for now to avoid the
    # "crashed" worker coming back alive while its replacement does work --
    # until that is fully supported.
    timeout=100

    for ((i = 1; i <= num_workers; i++)); do
        crashed_worker="worker$i"
        echo "Mocking crash for $crashed_worker with timeout of $timeout seconds"
        echo ----------------------------------------------------------------
        # Use $timeout and $crashed_worker; previously the timeout was
        # hard-coded to 100, silently ignoring the variable above.
        intro_pash "$PASH_FLAGS --distributed_exec --worker_timeout $timeout --worker_timeout_choice $crashed_worker" "faults_$crashed_worker"
    done
}

# Clear previous outputs, then run once fault-free and once per mocked crash.
outputs_dir="outputs"
# Bug fix: this previously removed "$outputs" (an unset variable), so stale
# outputs were never actually cleared between runs.
rm -rf "$outputs_dir"

intro_pash "$PASH_FLAGS --distributed_exec" "distr"

intro_faults
23 changes: 17 additions & 6 deletions evaluation/distr_benchmarks/nlp/input/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ if [ ! -e ./pg ]; then
cd pg
if [[ "$1" == "--full" ]]; then
echo 'N.b.: download/extraction will take about 10min'
wget ndr.md/data/pg.tar.xz
wget atlas-group.cs.brown.edu/data/pg.tar.xz # FIXME: moving to PG soon
if [ $? -ne 0 ]; then
cat <<-'EOF' | sed 's/^ *//'
Downloading input dataset failed, thus need to manually rsync all books from project gutenberg:
Expand All @@ -31,10 +31,21 @@ if [ ! -e ./pg ]; then
cat pg.tar.xz | tar -xJ

else
wget http://pac-n4.csail.mit.edu:81/pash_data/nlp.zip
unzip nlp.zip
mv data/* .
rm nlp.zip data -rf
# wget http://pac-n4.csail.mit.edu:81/pash_data/nlp.zip
# unzip nlp.zip
# mv data/* .
# rm nlp.zip data -rf

# Mock 1
for (( i = 0; i < 60; i++ )); do
touch "$i".txt
cat ../genesis >> "$i".txt
done
# Mock 2
for (( i = 61; i < 120; i++ )); do
touch "$i".txt
cat ../exodus >> "$i".txt
done
fi

for f in *.txt; do
Expand All @@ -48,4 +59,4 @@ fi
hdfs dfs -mkdir /nlp
hdfs dfs -put exodus /nlp/exodus
hdfs dfs -put genesis /nlp/genesis
hdfs dfs -put pg /nlp/pg
hdfs dfs -put pg /nlp
27 changes: 27 additions & 0 deletions evaluation/distr_benchmarks/oneliners/check_ft_correctness.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/bin/bash

# Compare each script's fault-injected outputs against its fault-free
# distributed output to verify fault-tolerance correctness.

# Folder where the benchmark .out files are located.
folder="$DISH_TOP/evaluation/distr_benchmarks/oneliners/outputs"

# Number of workers whose crashes were mocked (must match the faults run script).
num_workers=3

# Without nullglob an empty outputs folder would iterate over the literal
# pattern string and feed a nonexistent path to diff.
shopt -s nullglob

for script_distr_out in "$folder"/*distr.out; do
    # Strip the directory and the .distr.out suffix to recover the script name.
    script_name=$(basename "$script_distr_out" .distr.out)
    for ((i = 1; i <= num_workers; i++)); do
        # Corresponding .faults_$crashed_worker.out file for this worker.
        crashed_worker="worker$i"
        script_faults_out="$folder/$script_name.faults_$crashed_worker.out"

        echo "Comparing faults_$crashed_worker.out and distr.out for script $script_name.sh"
        if [ ! -f "$script_faults_out" ]; then
            # Report a missing run explicitly instead of letting diff error out.
            echo "Missing $script_faults_out -- did the faults run complete?" >&2
        elif diff -q "$script_faults_out" "$script_distr_out"; then
            echo "Outputs are identical"
        else
            echo "Files are different. Differences are as follows:"
            diff -y "$script_faults_out" "$script_distr_out"
        fi
        echo "-------------------------------------------"
    done
done
Loading
Loading