Skip to content

Commit

Permalink
Attempt sitemap approach
Browse files (browse the repository at this point in the history)
  • Loading branch information
wesley-dean-gsa committed Oct 18, 2024
1 parent 2fe90a7 commit 8b77c15
Showing 1 changed file with 35 additions and 4 deletions.
39 changes: 35 additions & 4 deletions bin/archive_website.bash
Original file line number | Diff line number | Diff line change
Expand Up @@ -105,7 +105,8 @@ mirror_site() {
slugified_url="$(slugify "$URL")"
now="$(date +%Y%m%d%H%M)"
tarball="site-archive-${slugified_url}-${now}.tar.gz"
logfile="site-archive-${slugified_url}-${now}.log"
logfile="site-archive-${slugified_url}-${now}.log.txt"
sitemapfile="site-archive-${slugified_url}-${now}.sitemap.txt"

## perform some cleanup

Expand All @@ -114,6 +115,11 @@ mirror_site() {
rm -rf "${tarball}"
fi

if [ -e "${sitemapfile}" ]; then
echo "Removing old sitemap '${sitemapfile}'" 1>&2
rm -rf "${sitemapfile}"
fi

if [ -e "${slugified_url}" ]; then
echo "Removing old directory '${slugified_url}'" 1>&2
rm -rf "${slugified_url}"
Expand All @@ -126,17 +132,42 @@ mirror_site() {
mkdir -p "${slugified_url}"
fi

## acquire the sitemap

touch "$logfile"

echo "Downloading sitemap" 1>&2
wget \
"${URL}sitemap.xml" \
--append-output="${logfile}" \
--output-document=- \
| sed -Ene "/<loc>/p" \
| sed -Ee "s/<[^>]*>//g" \
> "$sitemapfile"



## download the site

echo "Beginning download" 1>&2
wget \
--wait 0 \
--level=inf \
--limit-rate=5000K \
--recursive \
--user-agent=TTSSiteArchiver \
--no-host-directories \
--directory-prefix="${slugified_url}" \
--output-file="${logfile}" \
--mirror \
--no-clobber \
--no-parent \
--page-requisites \
--convert-links \
--execute "robots=off" \
--input-file="$sitemapfile" \
"$@" \
"$URL" || true
|| true \
2>&1 \
| tee -a "${logfile}"

## scan the results looking for failing HTTP responses

Expand Down

0 comments on commit 8b77c15

Please sign in to comment.