From 9c7afa3775beaf67e114815283c072eed5d658f8 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Thu, 1 Jul 2021 09:36:56 -0400 Subject: [PATCH] Established aclanthology.org as canonical site (#1328) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * changed canonical to aclanthology.org * straightened out mirror and main uploads * removed mirror target since it will just be redirects * updated host target in some support scripts * Adapt .htaccess to new canonical host - rewrite from www.aclanthology.org → aclanthology.org - http → https - make anthology-files a symlink so we don't have to modify its location in the .htaccess file * moved thumbnail script Co-authored-by: Arne Köhn --- .github/workflows/publish-aclanthology.yml | 35 ---------------- .github/workflows/publish.yml | 11 ++--- Makefile | 24 ++++------- .../build_thumbnails.sh | 0 bin/anthology/data.py | 4 +- bin/create_mirror.py | 2 +- bin/create_sitemapindex.sh | 2 +- bin/upload_files.py | 9 ++-- hugo/static/.htaccess | 42 ++++++++----------- 9 files changed, 37 insertions(+), 92 deletions(-) delete mode 100644 .github/workflows/publish-aclanthology.yml rename bin/{ => aclanthology.org}/build_thumbnails.sh (100%) diff --git a/.github/workflows/publish-aclanthology.yml b/.github/workflows/publish-aclanthology.yml deleted file mode 100644 index 0e57bac17d..0000000000 --- a/.github/workflows/publish-aclanthology.yml +++ /dev/null @@ -1,35 +0,0 @@ -# Publishes the build to our current mirror, aclanthology.org - -name: publish-aclanthology - -on: - push: - branches: - - master - -jobs: - publish-aclanthology: - runs-on: ubuntu-20.04 - steps: - - name: install hugo - run: wget https://github.com/gohugoio/hugo/releases/download/v0.58.3/hugo_extended_0.58.3_Linux-64bit.deb && sudo dpkg -i hugo_extended*.deb - - name: update - run: sudo apt-get update - - name: install other deps - run: sudo apt-get install -y jing bibutils openssh-client rsync libyaml-dev libpython3.8-dev - - name: dump secret key - env: - SSH_KEY: ${{ secrets.PUBLISH_SSH_KEY }} - run: | - mkdir -p $HOME/.ssh/ - echo "$SSH_KEY" > $HOME/.ssh/id_rsa - chmod 600 $HOME/.ssh/id_rsa - - uses: actions/checkout@v1 - - name: build - env: - ANTHOLOGY_PREFIX: https://aclanthology.org - run: | - make ANTHOLOGY_PREFIX=${ANTHOLOGY_PREFIX} check site - - name: publish - run: | - make ANTHOLOGY_PREFIX=${ANTHOLOGY_PREFIX} upload-mirror diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index d7ba574d7f..3f91021fa1 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -1,6 +1,6 @@ -# Publishes the build to our main site at aclweb.org/anthology +# Publishes the build to our current mirror, aclanthology.org -name: publish +name: publish-aclanthology on: push: @@ -8,7 +8,7 @@ on: - master jobs: - publish: + publish-aclanthology: runs-on: ubuntu-20.04 steps: - name: install hugo @@ -27,12 +27,9 @@ jobs: - uses: actions/checkout@v1 - name: build env: - ANTHOLOGY_PREFIX: https://www.aclweb.org/anthology + ANTHOLOGY_PREFIX: https://aclanthology.org run: | make ANTHOLOGY_PREFIX=${ANTHOLOGY_PREFIX} check site - name: publish - env: - PUBLISH_TARGET: 50.87.169.12:anthology-static - ANTHOLOGY_PREFIX: https://www.aclweb.org/anthology run: | make ANTHOLOGY_PREFIX=${ANTHOLOGY_PREFIX} upload diff --git a/Makefile b/Makefile index f8ba774809..1972af0f05 100644 --- a/Makefile +++ b/Makefile @@ -29,14 +29,14 @@ SHELL := /bin/bash # If you want to host the anthology on your own, set ANTHOLOGY_PREFIX # in your call to make to your prefix, e.g. # -# ANTHOLOGY_PREFIX="https://aclanthology.org" make +# ANTHOLOGY_PREFIX="https://www.aclweb.org/anthology" make # # PLEASE NOTE that the prefix cannot contain any '#' character, or a Perl regex # below will fail. # The following line ensures that it is exported as an environment variable # for all sub-processes: -export ANTHOLOGY_PREFIX ?= https://www.aclweb.org/anthology +export ANTHOLOGY_PREFIX ?= https://aclanthology.org SLASHATEND:=$(shell echo ${ANTHOLOGY_PREFIX} | grep -q '/$$'; echo $$?) @@ -58,7 +58,7 @@ endif # Root location for PDF and attachment hierarchy. # This is the directory where you have to put all the papers and attachments. # Easiest if the server can just serve them from /anthology-files. -ANTHOLOGYFILES ?= /anthology-files +ANTHOLOGYFILES ?= /var/www/anthology-files HUGO_ENV ?= production @@ -245,8 +245,8 @@ build/.hugo: build/.static build/.pages build/.bibtex build/.mods build/.endnote --cleanDestinationDir \ --minify @cd build/website/$(ANTHOLOGYDIR) \ - && perl -i -pe 's|ANTHOLOGYDIR|$(ANTHOLOGYDIR)|g' .htaccess \ - && perl -i -pe 's|ANTHOLOGYFILES|$(ANTHOLOGYFILES)|g' .htaccess + && ln -s $(ANTHOLOGYFILES) anthology-files \ + && perl -i -pe 's|ANTHOLOGYDIR|$(ANTHOLOGYDIR)|g' .htaccess @touch build/.hugo .PHONY: mirror @@ -306,20 +306,14 @@ serve: @echo "INFO Starting a server at http://localhost:8000/" @cd build/website && python3 -m http.server 8000 -# this target does not use ANTHOLOGYDIR because the official website -# only works if ANTHOLOGYDIR == anthology. +# Main site: aclanthology.org. Requires ANTHOLOGYDIR to be unset. .PHONY: upload upload: - @if [ $(ANTHOLOGYDIR) != "anthology" ]; then \ - echo "WARNING: Can't upload because ANTHOLOGYDIR was set to '${ANTHOLOGYDIR}' instead of 'anthology'"; \ + @if [[ $(ANTHOLOGYDIR) != "" ]]; then \ + echo "WARNING: Can't upload because ANTHOLOGYDIR was set to '${ANTHOLOGYDIR}' instead of being empty"; \ exit 1; \ fi - @echo "INFO Running rsync for main site..." - @rsync -aze "ssh -o StrictHostKeyChecking=accept-new" --delete build/website/anthology/ aclwebor@50.87.169.12:anthology-static - -.PHONY: upload-mirror -upload-mirror: - @echo "INFO Running rsync for aclanthology.org mirror..." + @echo "INFO Running rsync for main site (aclanthology.org)..." @rsync -aze "ssh -o StrictHostKeyChecking=accept-new" build/website/ anthologizer@aclanthology.org:/var/www/aclanthology.org # Push a preview to the mirror diff --git a/bin/build_thumbnails.sh b/bin/aclanthology.org/build_thumbnails.sh similarity index 100% rename from bin/build_thumbnails.sh rename to bin/aclanthology.org/build_thumbnails.sh diff --git a/bin/anthology/data.py b/bin/anthology/data.py index c8f63af5bf..3aff1c9715 100644 --- a/bin/anthology/data.py +++ b/bin/anthology/data.py @@ -25,7 +25,7 @@ # this is the canonical URL. In contrast to all other # URL templates, it always links to the official anthology. -CANONICAL_URL_TEMPLATE = "https://www.aclweb.org/anthology/{}" +CANONICAL_URL_TEMPLATE = "https://aclanthology.org/{}" # the prefix is used in different programs and we need to set it everywhere # We use a environment variable to set this and not have to forward the value @@ -34,7 +34,7 @@ try: ANTHOLOGY_PREFIX = os.environ["ANTHOLOGY_PREFIX"] except: - ANTHOLOGY_PREFIX = "https://www.aclweb.org/anthology" + ANTHOLOGY_PREFIX = "https://aclanthology.org" ATTACHMENT_PREFIX = ANTHOLOGY_PREFIX + "/attachments" ATTACHMENT_TEMPLATE = ATTACHMENT_PREFIX + "/{}" diff --git a/bin/create_mirror.py b/bin/create_mirror.py index 878fe30bad..1fde74bf68 100755 --- a/bin/create_mirror.py +++ b/bin/create_mirror.py @@ -22,7 +22,7 @@ fetch everything. Options: - --source=SRC where to fetch the files from [default: https://www.aclweb.org/anthology] + --source=SRC where to fetch the files from [default: https://aclanthology.org] --to=DIR Directory to write files to [default: {scriptdir}/../build/anthology-files] --only-papers Do not mirror attachments, only papers. --debug Output debug-level log messages. diff --git a/bin/create_sitemapindex.sh b/bin/create_sitemapindex.sh index 0365090a0b..e86abf546e 100755 --- a/bin/create_sitemapindex.sh +++ b/bin/create_sitemapindex.sh @@ -22,7 +22,7 @@ echo '' for f in "$@"; do echo ' ' - echo ' https://www.aclweb.org/anthology/'$f'' + echo ' https://aclanthology.org/'$f'' echo ' ' done echo '' diff --git a/bin/upload_files.py b/bin/upload_files.py index 44b87b7657..569cccef4b 100755 --- a/bin/upload_files.py +++ b/bin/upload_files.py @@ -43,14 +43,11 @@ from anthology.utils import deconstruct_anthology_id from typing import List -# Name for the SSH alias in ~/.ssh/config. -SSH_CONFIG_TARGET = "aclweb" - # The root directory for files, currently containing pdf/ and attachments/ -ACLWEB_FILE_ROOT = "/home3/aclwebor/anthology-files" +ACLWEB_FILE_ROOT = "/home/anthologizer/anthology-files" -# The ssh shortcut or full hostname -ACLWEB_HOST = "aclweb" +# The ssh shortcut (in ~/.ssh/config) or full hostname +ACLWEB_HOST = "anth" def get_dest_path(filepath: str): diff --git a/hugo/static/.htaccess b/hugo/static/.htaccess index 6276b91270..eb933bca33 100644 --- a/hugo/static/.htaccess +++ b/hugo/static/.htaccess @@ -28,30 +28,22 @@ RewriteEngine On # The redirects are only applicable to the official aclweb.org anthology; all the # 301 redirects are guarded with a RewriteCond. -# Redirect non-canonical URL bases to https://aclweb.org/anthology/ -RewriteCond %{HTTP_HOST} ^anthology\.aclweb\.org$ [OR] -RewriteCond %{HTTP_HOST} ^www\.anthology\.aclweb\.org$ -RewriteRule ^/?(.*) https://www.aclweb.org/anthology/$1 [R=301,L] - -# Now we know we are at the canonical URL. -# Now make sure that we redirect from non-HTTPS to HTTPS. -# **Note**: at this point, paths are relative to /anthology, that is, they *do not include* the /anthology/ text. -RewriteCond %{HTTP_HOST} aclweb.org RewriteCond %{HTTPS} !=on [OR] -RewriteCond %{HTTP_HOST} !^www\. [NC] -RewriteRule ^(.*)$ https://www.aclweb.org/anthology/$1 [R=301,L] +RewriteCond %{HTTP_HOST} www\.aclanthology\.org [NC] +RewriteRule ^(.*)$ https://aclanthology.org/$1 [R=301,L] + ## Nested file paths # Redirect old nested file paths (e.g., P/P17/P17-1069.pdf -> https://www.aclweb.org/anthology/P17-1069.pdf) # Note that since capture patterns can't be reused in the capture portion of the string, this is a leaky match -# that will also redirect X/Z19/P17-1069.pdf -> ANTHOLOGYFILES/pdf/P/P17/P17-1069.pdf -RewriteCond %{HTTP_HOST} ^www\.aclweb\.org$ -RewriteRule ^[A-Za-z]/[A-Za-z]\d{2}/([A-Za-z])(\d{2})\-(\d{4})(/|\.[a-z]+)?$ https://www.aclweb.org/anthology/$1$2-$3$4 [R=301,L] +# that will also redirect X/Z19/P17-1069.pdf -> anthology-files/pdf/P/P17/P17-1069.pdf +RewriteCond %{HTTP_HOST} ^aclanthology\.org$ +RewriteRule ^[A-Za-z]/[A-Za-z]\d{2}/([A-Za-z])(\d{2})\-(\d{4})(/|\.[a-z]+)?$ https://aclanthology.org/$1$2-$3$4 [R=301,L] # Redirect nested paper pages to the canonical location (e.g., papers/P/P19/P19-1001/ -> P19-1001) # This way there is only one page. Should maintain for backward compatibility for some time after August 2019. -RewriteCond %{HTTP_HOST} ^www\.aclweb\.org$ -RewriteRule ^papers/[A-Za-z]/[A-Za-z]\d{2}/([A-Za-z])(\d{2})\-(\d{4})/?$ https://www.aclweb.org/anthology/$1$2-$3 [R=301,L] +RewriteCond %{HTTP_HOST} ^aclanthology\.org$ +RewriteRule ^papers/[A-Za-z]/[A-Za-z]\d{2}/([A-Za-z])(\d{2})\-(\d{4})/?$ https://aclanthology.org/$1$2-$3 [R=301,L] # # INTERNAL RETRIEVALS @@ -66,20 +58,20 @@ RewriteRule ^([A-Za-z]\d{2}\-\d{1,2})/?$ /ANTHOLOGYDIR/volumes/$1 [L,NC] RewriteRule ^(\d{4}\.[a-zA-Z\d]+\-[a-zA-Z\d]+)/?$ /ANTHOLOGYDIR/volumes/$1 [L,NC] # Volume URLs (e.g., P17-1.pdf loads P/P17/P17-1.pdf, 2020.acl-main loads acl/2020.acl-main.pdf) -RewriteRule ^([A-Za-z])(\d{2})\-(\d{1,2})\.pdf$ ANTHOLOGYFILES/pdf/$1/$1$2/$1$2-$3.pdf [L,NC] -RewriteRule ^(\d{4})\.([a-zA-Z\d]+)\-([a-zA-Z\d]+)\.pdf$ ANTHOLOGYFILES/pdf/$2/$1.$2-$3.pdf [L,NC] +RewriteRule ^([A-Za-z])(\d{2})\-(\d{1,2})\.pdf$ anthology-files/pdf/$1/$1$2/$1$2-$3.pdf [L,NC] +RewriteRule ^(\d{4})\.([a-zA-Z\d]+)\-([a-zA-Z\d]+)\.pdf$ anthology-files/pdf/$2/$1.$2-$3.pdf [L,NC] # PDF link, revisions, and errata (P17-1069[v2].pdf loads P/P17/P17-1069[v2].pdf --- with "v2" optional) # TODO: decide on a new format for revisions and errata -RewriteRule ^([A-Za-z])(\d{2})\-(\d{4})([ve]\d+)?\.pdf$ ANTHOLOGYFILES/pdf/$1/$1$2/$1$2-$3$4.pdf [L,NC] -RewriteRule ^(\d{4})\.([a-zA-Z\d]+)\-([a-zA-Z\d]+)\.(\d+([ve]\d+)?)\.pdf$ ANTHOLOGYFILES/pdf/$2/$1.$2-$3.$4.pdf [L,NC] +RewriteRule ^([A-Za-z])(\d{2})\-(\d{4})([ve]\d+)?\.pdf$ anthology-files/pdf/$1/$1$2/$1$2-$3$4.pdf [L,NC] +RewriteRule ^(\d{4})\.([a-zA-Z\d]+)\-([a-zA-Z\d]+)\.(\d+([ve]\d+)?)\.pdf$ anthology-files/pdf/$2/$1.$2-$3.$4.pdf [L,NC] -# Attachments (e.g., P17-1069.Poster.pdf loads ANTHOLOGYFILES/attachments/P/P17/P17-1069.Poster.pdf) -RewriteRule ^attachments/([A-Za-z])(\d{2})\-(\d{4})(\..*)?$ ANTHOLOGYFILES/attachments/$1/$1$2/$1$2-$3$4 [L,NC] -RewriteRule ^attachments/(\d{4})\.([a-zA-Z\d]+)\-([a-zA-Z\d]+\.\d+)\.(.*)$ ANTHOLOGYFILES/attachments/$2/$1.$2-$3.$4 [L,NC] +# Attachments (e.g., P17-1069.Poster.pdf loads anthology-files/attachments/P/P17/P17-1069.Poster.pdf) +RewriteRule ^attachments/([A-Za-z])(\d{2})\-(\d{4})(\..*)?$ anthology-files/attachments/$1/$1$2/$1$2-$3$4 [L,NC] +RewriteRule ^attachments/(\d{4})\.([a-zA-Z\d]+)\-([a-zA-Z\d]+\.\d+)\.(.*)$ anthology-files/attachments/$2/$1.$2-$3.$4 [L,NC] -# Thumbnails (e.g., /thumb/P17-1069.jpg loads ANTHOLOGYFILES/thumb/P17-1069.jpg) -RewriteRule ^thumb/(.*)$ ANTHOLOGYFILES/thumb/$1 [L,NC] +# Thumbnails (e.g., /thumb/P17-1069.jpg loads anthology-files/thumb/P17-1069.jpg) +RewriteRule ^thumb/(.*)$ anthology-files/thumb/$1 [L,NC] # Author pages (e.g., people/arya-d-mccarthy loads people/a/arya-d-mccarthy) RewriteRule ^people/([a-z])([\-a-z0-9]*)$ people/$1/$1$2/ [L,NC]