Commit
Merge branch 'master' into python-dev
mbollmann committed Nov 3, 2024
2 parents 194473d + 0b67533 commit ec402f6
Showing 362 changed files with 100,870 additions and 5,396 deletions.
8 changes: 8 additions & 0 deletions .github/ISSUE_TEMPLATE/01-metadata-correction.yml
@@ -5,6 +5,14 @@ labels: ["correction", "metadata"]
assignees:
- anthology-assist
body:
- type: markdown
attributes:
value: >
This form will report paper metadata issues to Anthology staff.
Each paper's metadata is stored in an [XML](https://github.com/acl-org/acl-anthology/tree/master/data/xml)
record, and is supposed to match what appears in the PDF.
If you can, submitting a __pull request__ instead of this form will expedite the process
(please include a link to the paper as a comment in the pull request). Thanks!
- type: checkboxes
id: paper_metadata_confirmation
attributes:
8 changes: 8 additions & 0 deletions .github/ISSUE_TEMPLATE/02-name-correction.yml
@@ -5,6 +5,14 @@ labels: ["correction", "metadata"]
assignees:
- anthology-assist
body:
- type: markdown
attributes:
value: >
This form will report author metadata issues to Anthology staff.
For simple cases (where paper metadata in the [XML](https://github.com/acl-org/acl-anthology/tree/master/data/xml)
record doesn't match the PDF, or
[`name_variants.yaml`](https://github.com/acl-org/acl-anthology/blob/master/data/yaml/name_variants.yaml) needs modification),
submitting a __pull request__ instead will expedite the process. Thanks!
- type: textarea
id: name_pages_affected
attributes:
18 changes: 16 additions & 2 deletions .github/ISSUE_TEMPLATE/04-ingestion-request.yml
@@ -24,12 +24,19 @@ body:
placeholder: ex. emnlp, repl4nlp
validations:
required: true
- type: input
id: venue_sig
attributes:
label: "ACL SIG(s) sponsoring or endorsing the whole venue"
description: |
Provide a comma-separated list of any SIGs that apply to the whole venue. If there are multiple subvenues/volumes with different SIGs, provide the mapping under Supporting Information.
placeholder: ex. SIGLEX, SIGSEM
- type: input
id: volume_title
attributes:
label: Volume Title
description: |
What is the title of the volume that should be published?
What is the title of the (main) volume that should be published?
placeholder: ex. Proceedings of the 2019 Meeting of the Conference on Empirical Methods in Natural Language Processing (EMNLP)
validations:
required: true
@@ -54,9 +61,16 @@ body:
description: |
When would you like the material to be published on the ACL Anthology? If you are submitting material that can be published immediately (e.g. for conferences that already happened in the past), you can leave this field blank.
placeholder: ex. 2023-12-31
- type: input
id: volume_address
attributes:
label: Location
description: |
What address should be included in bibliography entries, if any? For conferences this is the location of the conference. For a fully-online event use "Online", optionally following the host team location. Ensure the address field is consistent across submitted volumes.
placeholder: ex. Barcelona, Spain (Online)
- type: textarea
id: ingestion_information
attributes:
label: Supporting Information
description: |
If there is anything else we should know about this ingestion request, please provide the information here. You can also use this field to **provide links or attach files** of the material, if you already have them.
If there is anything else we should know about this ingestion request, please provide the information here. E.g. for venues with multiple volumes, list them with the volume identifier, volume title, and any SIGs for the volume. You can also use this field to **provide links or attach files** of the material, if you already have them.
12 changes: 12 additions & 0 deletions .github/ingestion-review-checklist.md
@@ -0,0 +1,12 @@
1. [ ] In the GitHub sidebar, add the PR to work items and the current milestone
1. [ ] In the GitHub sidebar, under "Development", make sure to link to the corresponding issue
1. [ ] Make sure the branch is merged with the latest `master` branch
1. [ ] Ensure that there are editors listed in the `<meta>` block
1. [ ] For workshops, add a `<venue>ws</venue>` tag to its meta block
1. [ ] For workshops, add a backlink from the main event's `<event>` block
1. [ ] Add events to their relevant SIGs
1. [ ] Look at the venue listing for prior years, and ensure that the new volume titles are consistent. You can do this by clicking on the venue name from a paper page, which will take you to the venue listing.
1. [ ] Navigate to the event page preview (e.g., https://preview.aclanthology.org/icnlsp-ingestion/events/icnlsp-2021/), and page through, to see if there are any glaring mistakes
1. [ ] Skim through the complete listing, looking for mis-parsed author names.
1. [ ] Download the frontmatter and verify that the table of contents matches at least three randomly-selected papers
1. [ ] Download 3–5 PDFs (including the first and last one) and make sure they are correct (title, authors, page numbers).
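
Several of these checks lend themselves to scripting. As a rough sketch only (a hypothetical helper, not part of this commit, assuming the `data/xml` layout of `<volume>` → `<meta>` → `<editor>`/`<venue>` elements), the editor and `<venue>ws</venue>` items could be verified like this:

import sys
import xml.etree.ElementTree as etree

def check_collection(xml_path):
    """Flag volumes whose <meta> block is missing <editor> entries or a <venue>ws</venue> tag."""
    problems = []
    for volume in etree.parse(xml_path).getroot().findall("volume"):
        vol_id = volume.get("id")
        meta = volume.find("meta")
        if meta is None:
            problems.append(f"volume {vol_id}: no <meta> block")
            continue
        # Checklist: ensure that there are editors listed in the <meta> block
        if meta.find("editor") is None:
            problems.append(f"volume {vol_id}: no <editor> in <meta>")
        # Checklist: workshops need a <venue>ws</venue> tag in their meta block
        if "ws" not in [v.text for v in meta.findall("venue")]:
            problems.append(f"volume {vol_id}: no <venue>ws</venue> (only relevant for workshops)")
    return problems

if __name__ == "__main__":
    for problem in check_collection(sys.argv[1]):
        print(problem)
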
47 changes: 47 additions & 0 deletions .github/workflows/link-to-checklist.yml
@@ -0,0 +1,47 @@
name: link-to-checklist

on:
workflow_dispatch:
pull_request_target:
types: [opened]

jobs:
add-review-checklist:
if : ${{ github.event_name == 'pull_request_target' && github.event.action == 'opened' && startsWith(github.event.pull_request.title, 'ingestion') == true}}
runs-on: ubuntu-latest
steps:
- name: Check out repository
uses: actions/checkout@v3
- name: Print event details
run: |
echo "Event Name: ${{ github.event_name }}"
echo "Action: ${{ github.event.action }}"
echo "PR Title: ${{ github.event.pull_request.title }}"
echo "Starts with ingestion: ${{ startsWith(github.event.pull_request.title, 'ingestion') }}"
- name: Log from GitHub script
uses: actions/github-script@v6
with:
script: |
console.log('This is a test log.');
console.log('Event Name:', context.eventName);
console.log('Action:', context.payload.action);
console.log('PR Title:', context.payload.pull_request.title);
console.log('Starts with ingestion:', context.payload.pull_request.title.startsWith('ingestion'));
- name: Add review checklist
uses: actions/github-script@v6
with:
script: |
const fs = require('fs');
console.log('Reading checklist file...');
const review_checklist = fs.readFileSync('.github/ingestion-review-checklist.md', 'utf8');
const old_description = context.payload.pull_request.body || '';
console.log('Old description:', old_description);
const updated_body = !old_description.trim() ? review_checklist : old_description + "\n\n" + review_checklist;
github.rest.pulls.update({
pull_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
body: updated_body,
});
console.log('Updated body:', updated_body);
17 changes: 17 additions & 0 deletions .github/workflows/print-info.yml
@@ -0,0 +1,17 @@
name: print-info

on:
workflow_dispatch:
pull_request_target:
types: [opened]

jobs:
add-review-checklist:
runs-on: ubuntu-latest
steps:
- name: Print event details
run: |
echo "Event Name: ${{ github.event_name }}"
echo "Action: ${{ github.event.action }}"
echo "PR Title: ${{ github.event.pull_request.title }}"
echo "Original PR Body: ${{ github.event.pull_request.body }}"
2 changes: 2 additions & 0 deletions .gitignore
@@ -23,6 +23,8 @@ Icon

#######################################
**/__pycache__
.coverage
.idea

# generated website
/build/
6 changes: 5 additions & 1 deletion .pre-commit-config.yaml
@@ -6,7 +6,11 @@ repos:
- id: check-ast
- id: check-merge-conflict
- id: check-xml
exclude: ^hugo/layouts/sitemap.xml$
exclude: |
(?x)^(
hugo/layouts/sitemap.xml|
hugo/layouts/_default/rss.xml
)$
- id: check-yaml
exclude: ^python/mkdocs.yml$
- id: end-of-file-fixer
3 changes: 3 additions & 0 deletions Makefile
@@ -307,6 +307,9 @@ autofix: check_staged_xml venv/bin/activate
[ "$${PRE_DIFF}" = "$${POST_DIFF}" ] || EXIT_STATUS=1 ;\
[ $${EXIT_STATUS} -eq 0 ]

.PHONY: reformat
reformat: autofix

.PHONY: serve
serve:
@echo "INFO Starting a server at http://localhost:8000/"
34 changes: 25 additions & 9 deletions bin/aclpub2_format_check.py
@@ -1,10 +1,11 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Last updated 2023-11-23 by Matt Post.
Last updated 2024-08-07 by Matt Post
TODO:
- Ensure no LaTeX in titles (e.g., \emph{...})
- Ensure no LaTeX in titles
- If there is only a single name, it should be the last name, not the first
- Clean TeX from abstracts, too (e.g., 2023.findings-emnlp.439)
@@ -110,13 +111,13 @@ def main(args):
conference_details_path := rootdir / "inputs" / "conference_details.yml"
).exists():
logger.error(f"x File '{conference_details_path}' does not exist")
else:
elif args.verbose:
logger.info(f"✓ Found {conference_details_path}")

# papers.yml
if not (papers_path := rootdir / "inputs" / "papers.yml").exists():
logger.fatal(f"File '{papers_path}' not found")
else:
elif args.verbose:
logger.info(f"✓ Found {papers_path}")

# Read through papers.yml. At the top level of the file is a list
@@ -125,28 +126,36 @@
# sure that exists.
papers = yaml.safe_load(papers_path.read_text())
for paper in papers:
paper_id = paper["id"]

# For each file, there should be a file {rootdir}/watermarked_pdfs/{id}.pdf
if "archival" not in paper or paper['archival']:
if not (
pdf_path := rootdir / "watermarked_pdfs" / f'{paper["id"]}.pdf'
).exists():
logger.error(f"Paper file '{pdf_path}' not found")
else:
elif args.verbose:
logger.info(f"✓ Found PDF file {pdf_path}")

for author in paper.get("authors", []):
if "@" in author.get("name", ""):
logger.error(
f"Paper ID {paper_id}: Author name '{author['name']}' contains an email address"
)

if "attachments" in paper:
for attachment in paper["attachments"]:
if not (
attachment_path := rootdir / "attachments" / attachment["file"]
).exists():
logger.error(f"Attachment file '{attachment_path}' not found")
else:
elif args.verbose:
logger.info(f"✓ Found attachment file {attachment_path}")

# Check for frontmatter
if not (frontmatter_path := rootdir / "watermarked_pdfs" / "0.pdf").exists():
logger.error(f"Frontmatter {frontmatter_path} not found")
else:
elif args.verbose:
logger.info(f"✓ Found frontmatter at {frontmatter_path}")

# If there were any warnings or errors, exit with a non-zero status
@@ -155,7 +164,7 @@ def main(args):
if isinstance(handler, CounterHandler):
if handler.count > 0:
print(
f"Script found {handler.count} warnings or errors. Please fix them before submitting."
f"FAILURE: script found {handler.count} warnings or errors. Please fix them before submitting."
)
sys.exit(1)

@@ -168,10 +177,17 @@
)
parser.add_argument(
"--import-dir",
"-i",
type=str,
default="output",
default=".",
help="Root directory for Anthology import",
)
parser.add_argument(
"-v",
"--verbose",
action="store_true",
help="Print successes in addition to errors.",
)
args = parser.parse_args()

main(args)
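
The exit-status check near the end of main() relies on a `CounterHandler` attached to the logger. Roughly speaking (a sketch for orientation only; the real class is defined elsewhere in the script and may differ), such a handler simply counts every record at WARNING level or above so the script can fail at the end:

import logging

class CounterHandler(logging.Handler):
    """Counts WARNING-or-worse log records; checked at the end to decide the exit status."""

    def __init__(self, level=logging.WARNING):
        super().__init__(level)
        self.count = 0

    def emit(self, record):
        self.count += 1

logger = logging.getLogger("aclpub2_format_check")
logger.setLevel(logging.INFO)
logger.addHandler(CounterHandler())         # counts warnings/errors only
logger.addHandler(logging.StreamHandler())  # prints all messages
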
36 changes: 20 additions & 16 deletions bin/add_dois.py
@@ -72,21 +72,25 @@ def add_doi(xml_node, collection_id, volume_id, force=False):

doi_url = f"{data.DOI_URL_PREFIX}{data.DOI_PREFIX}{anth_id}"
for tries in [1, 2, 3]: # lots of random failures
result = test_url_code(doi_url)
if result.status_code == 200:
doi = make_simple_element("doi", text=new_doi_text)
print(f"-> Adding DOI {new_doi_text}", file=sys.stderr)
xml_node.append(doi)
return True
elif result.status_code == 429: # too many requests
pause_for = int(result.headers["Retry-After"])
print(f"--> Got 429, pausing for {pause_for} seconds", file=sys.stderr)
sleep(pause_for + 1)
elif result.status_code == 404: # not found
print("--> Got 404", file=sys.stderr)
break
else:
print(f"--> Other problem: {result}", file=sys.stderr)
try:
result = test_url_code(doi_url)
if result.status_code == 200:
doi = make_simple_element("doi", text=new_doi_text)
print(f"-> Adding DOI {new_doi_text}", file=sys.stderr)
xml_node.append(doi)
return True
elif result.status_code == 429: # too many requests
pause_for = int(result.headers["Retry-After"])
print(f"--> Got 429, pausing for {pause_for} seconds", file=sys.stderr)
sleep(pause_for + 1)
elif result.status_code == 404: # not found
print("--> Got 404", file=sys.stderr)
break
else:
print(f"--> Other problem: {result}", file=sys.stderr)

except Exception as e:
print(e)

print(f"-> Couldn't add DOI for {doi_url}", file=sys.stderr)
return False
@@ -121,7 +125,7 @@ def process_volume(anthology_volume):
added = add_doi(paper, collection_id, volume_id, force=args.force)
if added:
num_added += 1
sleep(1)
sleep(0.1)

indent(tree.getroot())

20 changes: 13 additions & 7 deletions bin/anthology/index.py
@@ -217,9 +217,18 @@ def create_bibkey(self, paper, vidx=None):
raise Exception(
"Cannot create bibkeys when AnthologyIndex is instantiated with fast_load=True"
)

# Regular papers use the first title word, then add title words until uniqueness is achieved
title = [
w
for w in slugify(paper.get_title("plain")).split("-")
if not self._is_stopword(w, paper)
]

if paper.is_volume:
# Proceedings volumes use venue acronym instead of authors/editors
# Proceedings volumes use venue acronym instead of authors/editors, e.g., lrec-tutorials-2024
bibnames = slugify(paper.get_venue_acronym())
bibkey = f"{bibnames}-{paper.get('year')}-{paper.volume_id}"
else:
# Regular papers use author/editor names
names = paper.get("author")
@@ -232,12 +241,9 @@ def create_bibkey(self, paper, vidx=None):
bibnames = "-".join(slugify(n.last) for n, _ in names)
else:
bibnames = "nn"
title = [
w
for w in slugify(paper.get_title("plain")).split("-")
if not self._is_stopword(w, paper)
]
bibkey = f"{bibnames}-{paper.get('year')}-{title.pop(0)}"

bibkey = f"{bibnames}-{paper.get('year')}-{title.pop(0)}"

while bibkey in self.bibkeys: # guarantee uniqueness
if title:
bibkey += f"-{title.pop(0)}"
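
The effect is easier to see on concrete keys. As a rough illustration (stand-in helpers with hypothetical names, not the actual Anthology classes), the two bibkey shapes produced by this logic look like:

import re

def slugify(text):
    # crude stand-in for the real slugify helper
    return re.sub(r"[^a-z0-9]+", "-", text.lower()).strip("-")

def volume_bibkey(venue_acronym, year, volume_id):
    # proceedings volume: venue acronym + year + volume id
    return f"{slugify(venue_acronym)}-{year}-{volume_id}"

def paper_bibkey(last_names, year, title_words, existing):
    # regular paper: author last names + year + first title word(s)
    bibnames = "-".join(slugify(n) for n in last_names) if last_names else "nn"
    bibkey = f"{bibnames}-{year}-{title_words.pop(0)}"
    while bibkey in existing and title_words:  # append title words until unique
        bibkey += f"-{title_words.pop(0)}"
    return bibkey

print(volume_bibkey("LREC Tutorials", "2024", "1"))                                   # lrec-tutorials-2024-1
print(paper_bibkey(["Smith"], "2024", ["neural", "parsing"], set()))                  # smith-2024-neural
print(paper_bibkey(["Smith"], "2024", ["neural", "parsing"], {"smith-2024-neural"}))  # smith-2024-neural-parsing
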
6 changes: 3 additions & 3 deletions bin/anthology/latexcodec.py
@@ -8,7 +8,7 @@
# PSF (Python Software Foundation) license found here:
# http://www.python.org/psf/license/

'''Translates unicode to bibtex-friendly encoding.
r"""Translates unicode to bibtex-friendly encoding.
bibtex-friendly features:
@@ -54,7 +54,7 @@
D. Eppstein, October 2003.
'''
"""

from __future__ import generators
import codecs
@@ -548,7 +548,7 @@ def candidates(self, offset):
_blacklist.add(None) # shortcut candidate generation at end of data

# Construction of inverse translation table
_l2u = {'\ ': ord(" ")} # unexpanding space makes no sense in non-TeX contexts
_l2u = {'\\ ': ord(" ")} # unexpanding space makes no sense in non-TeX contexts

for _tex in latex_equivalents:
if _tex <= 0x0020 or (_tex <= 0x007F and len(latex_equivalents[_tex]) <= 1):