#!/usr/bin/env bash
#SBATCH --account=rrg-fiona-ad
#SBATCH --time=02:59:00
#SBATCH --job-name=microbedb-fetch
#SBATCH --mem-per-cpu=2000M
#SBATCH --export=ALL
#SBATCH --mail-type=FAIL
#SBATCH --mail-user=nolan_w@sfu.ca
#SBATCH --output=%x_%a.out
# brinkman-ws+microbedb@sfu.ca
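# Intended to run as a SLURM array job, one task per chunk of the query results, e.g.
# (the array range depends on how many chunks the upstream query step produced):
#   sbatch --array=0-99 fetch.sh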
set -e -o pipefail # Halt on error; pipefail makes a pipeline fail if any component fails
shopt -s nullglob # Unmatched globs expand to nothing instead of themselves
# For each chunk of the query results:
# - Download genomic data from NCBI
# - Split assembly gbff into separate files per replicon
# - Generate fna, faa, ffn, and PTT for each replicon
# - Populate the assembly, datasets, and summaries tables
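#
# The following variables are expected from the submitting environment (exported via --export=ALL);
# their roles are inferred from how they are used below:
#   OUTDIR     - destination directory for downloaded datasets
#   REPOPATH   - mount path of the existing CVMFS repository (enables differential sync when present)
#   SRCDIR     - directory containing temp_tables.sql
#   CLEAN      - if non-empty, ignore the existing repository and download everything
#   SKIP_RSYNC - if non-empty, only regenerate the checksum CSVs without downloading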
FTP_GENOMES_PREFIX="genomes/" # The NCBI rsync server returns an error when targeting the module root; this prefix is the minimum path that avoids that
REPONAME="microbedb.brinkmanlab.ca"
WORKDIR="$(pwd)"
IGNOREEXIT=24 # rsync exit code 24 ("some files vanished before they could be transferred") is tolerated
IGNOREOUT='^(file has vanished: |rsync warning: some files vanished before they could be transferred)' # rsync output lines to suppress
RETRIES=10 # Number of times to reattempt rsync network errors before failing
# Skip if no records to process
if [[ $(jq 'length' "${SLURM_ARRAY_TASK_ID}.json") == 0 ]]; then
    echo "No records to process, skipping fetch ${SLURM_ARRAY_TASK_ID}."
    exit 0
fi
# Download data
echo "Preparing download lists..."
# Verify that the node has the most recent commit (trunk) of the CVMFS repo mounted:
# if any of CVMFS_REPOSITORY_TAG, CVMFS_REPOSITORY_DATE, or CVMFS_ROOT_HASH is set,
# the mount targets a pinned revision rather than trunk.
if [[ -z $CLEAN && -d $REPOPATH && $(cvmfs_config showconfig "$REPONAME" | grep -P 'CVMFS_REPOSITORY_TAG=$|CVMFS_REPOSITORY_DATE=$|CVMFS_ROOT_HASH=$' | wc -l) != 3 ]]; then
    echo "ERROR: $REPONAME targets a commit other than trunk. The diff should be against the most recent commit."
    exit 1
fi
# Generate file lists per ftp host (there should only be one, but if that ever changes...)
# Output one file path per line to {hostname}_${SLURM_ARRAY_TASK_ID}.files
# ${SLURM_ARRAY_TASK_ID}.paths stores the Entrez uid to path mapping
rm -f *_${SLURM_ARRAY_TASK_ID}.files *_${SLURM_ARRAY_TASK_ID}.md5 "${SLURM_ARRAY_TASK_ID}.paths"
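# The jq program below maps each Entrez docsum to a TSV line of uid, FTP host, and the dataset
# path relative to ${FTP_GENOMES_PREFIX}. For example (illustrative accession), an ftppath_refseq of
#   ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/825/GCF_000005825.2_ASM582v2
# would produce:
#   <uid>\tftp.ncbi.nlm.nih.gov\tall/GCF/000/005/825/GCF_000005825.2_ASM582v2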
jq -r -f <(
    cat <<EOF
to_entries | .[].value | {uid: .uid} + (
    if (.ftppath_refseq | length) > 0 then
        .ftppath_refseq
    else
        .ftppath_genbank
    end | capture("^ftp://(?<host>[^/]+)/${FTP_GENOMES_PREFIX}(?<path>.*)$")
) |
"\(.uid)\t\(.host)\t\(.path)"
EOF
) "${SLURM_ARRAY_TASK_ID}.json" |
while IFS=$'\t' read -r id host path; do
    # For each assembly, prepare directory and add to rsync --files-from
    mkdir -p "${OUTDIR}/${path}"
    echo "${path}/md5checksums.txt" >>"${host}_${SLURM_ARRAY_TASK_ID}.md5" # Append to list of md5 checksum files to rsync
    echo "${path}" >>"${host}_${SLURM_ARRAY_TASK_ID}.files" # Append to list of files to rsync
    printf "%s\t%s\n" "${id}" "${path}" >>"${SLURM_ARRAY_TASK_ID}.paths" # Append to tsv of uid to path mappings
done
# Convert .md5 to checksums_*.csv
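# NCBI publishes an md5checksums.txt per assembly; each line is of the form
#   <md5sum>  ./<filename>
# to_checksums reads a list of md5checksums.txt paths on stdin and converts every line of each
# file into a CSV row of checksum, decompressed path, and path as published:
#   "<md5sum>","<dataset dir>/<filename minus .gz>","<dataset dir>/<filename>"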
to_checksums () {
    # awk variable PATH holds the dataset directory; $1 is the checksum, $2 the "./<filename>" entry
    local TOSCHEMA='BEGIN{FS=" ";OFS=","}{file=substr($2,2); sub(/\.gz$/, "", file); print "\""$1"\"", "\"" PATH file "\"", "\"" PATH substr($2,2) "\""}'
    while read -r path; do
        gawk -v PATH="$(dirname "$path")" "$TOSCHEMA" "${OUTDIR}/${path}" >>"checksums_${SLURM_ARRAY_TASK_ID}.csv"
    done
}
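# sync <files-from list> [extra rsync arg]
# Wraps rsync with retry logic: timeout-related exit codes 30 (timeout in data send/receive)
# and 35 (timeout waiting for daemon connection) are retried up to $RETRIES times, and exit
# code $IGNOREEXIT (24, source files vanished during transfer) is treated as success.
# Note: $host is read from the calling scope.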
sync () {
    local ret=30 # Seed with a retryable code to enter the loop
    local retries=$RETRIES
    while (( (ret == 30 || ret == 35) && retries > 0 )); do
        # $2 is intentionally unquoted so an empty second argument expands to nothing
        rsync -rvvm --no-g --no-p --chmod=u+rwX,go+rX --ignore-missing-args --files-from="$1" $2 --inplace "rsync://${host}/${FTP_GENOMES_PREFIX}" "${OUTDIR}" 2>&1 | (grep -vP "$IGNOREOUT" || true)
        ret=$? # pipefail ensures this reflects rsync's exit code, not grep's
        ((--retries))
    done
    if (( ret != IGNOREEXIT && ret != 0 )); then
        return $ret
    fi
    return 0
}
truncate -s0 "checksums_${SLURM_ARRAY_TASK_ID}.csv"
if [[ -n $SKIP_RSYNC ]]; then
    echo "Skipping rsync."
    for md5s in *_"${SLURM_ARRAY_TASK_ID}.md5"; do
        to_checksums <"$md5s"
    done
    exit 0
fi
# Rsync datasets per ftp host
# Passes the previously generated {hostname}_${SLURM_ARRAY_TASK_ID}.files to rsync's '--files-from' argument
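# e.g. "ftp.ncbi.nlm.nih.gov_17.files" yields host "ftp.ncbi.nlm.nih.gov"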
for files in *_${SLURM_ARRAY_TASK_ID}.files; do # For each host
    if [[ "${files}" =~ ^(.*)_[^_]+\.files$ ]]; then # Extract host from file name
        host="${BASH_REMATCH[1]}"
        echo "Downloading genomic data from ${host}..."
        set +e
        if [[ -d $REPOPATH && -z $CLEAN ]]; then
            # Sync comparing to existing CVMFS repo
            if [[ -f "${host}_${SLURM_ARRAY_TASK_ID}.md5" && -f "${REPOPATH}/microbedb.sqlite" ]]; then
                # Download the .md5 file for each dataset
                # and use the checksums to only download files that have changed
                sync "${host}_${SLURM_ARRAY_TASK_ID}.md5"
                ret=$?
                if (( ret != 0 )); then
                    echo "Detected rsync error ($ret) during md5checksums sync."
                    exit $ret
                fi
                # Compare checksums and rewrite $files content to explicitly specify individual files
                to_checksums <"${host}_${SLURM_ARRAY_TASK_ID}.md5"
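                # temp_tables.sql presumably creates the temporary `checksums` table (checksum, path,
                # ncbi_path) that the CSV is imported into; the query then lists the NCBI path of every
                # file that is new or whose checksum differs from the copy already in the repository.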
                sqlite3 -bail -readonly "${REPOPATH}/microbedb.sqlite" <<EOF >"$files"
.read ${SRCDIR}/temp_tables.sql
.mode csv
.import checksums_${SLURM_ARRAY_TASK_ID}.csv checksums
.mode list
SELECT c.ncbi_path FROM checksums c LEFT JOIN datasets d ON d.path = c.path WHERE d.path IS NULL OR d.checksum != c.checksum;
EOF
            fi
            # Download new or changed files, using the existing repo copy to skip unchanged ones
            sync "${files}" --compare-dest="${REPOPATH}"
            ret=$?
        else
            # Download everything without comparing
            sync "${files}"
            ret=$?
            if (( ret == 0 )); then
                to_checksums <"${host}_${SLURM_ARRAY_TASK_ID}.md5"
            fi
        fi
        if (( ret != 0 )); then
            echo "Detected rsync error ($ret)."
            exit $ret
        fi
        set -e
    else
        echo "Unable to extract host from '$files'" >&2
    fi
done
echo "Done fetch."
echo "$SLURM_ARRAY_TASK_ID" >> "${WORKDIR}/completed_fetch" # Record this chunk as successfully fetched