Merge pull request #282 from arXiv/develop

Reasons file for arxiv-browse, test changes, header changes
arXiv · May 29, 2024 · 68492a1 · 68492a1
2 parents 0d588d0 + 0fb6c76
commit 68492a1
Show file tree

Hide file tree

Showing 5 changed files with 66 additions and 75 deletions.
diff --git a/arxiv/integration/fastly/headers.py b/arxiv/integration/fastly/headers.py
@@ -1,6 +1,7 @@
 from typing import Dict, List
+from werkzeug.datastructures import Headers
 
-def add_surrogate_key(headers: Dict[str,str], keys:List[str])-> Dict[str, str]:
+def add_surrogate_key(headers: Dict[str,str], keys:List[str])-> Headers:
     """adds surrogate key(s) to a response header, 
     will update the header dictionary with new keys while retaining the rest of the header information"""
     old_keys=f' {headers.get("Surrogate-Key","").strip()} '

diff --git a/arxiv/legacy/papers/dissemination/reasons.py b/arxiv/legacy/papers/dissemination/reasons.py
@@ -11,6 +11,8 @@
 
 from google.cloud import storage
 
+from arxiv.files import FileObj
+
 FORMATS = Literal['ps', 'pdf', 'html', 'dvi', 'postscript']
 
 DEFAULT_REASONS_GS_URL = "gs://arxiv-production-data/reasons.json"
@@ -19,33 +21,22 @@
 
 _reasons_data = None
 
-def ensure_reasons_data(location:Optional[str]=None) -> None:
-    """Checks that reasons data loads from Google storage and is not empty.
-
-    Raises an Exception if there are problems.
-    """
-    rd = get_reasons_data(location)
-    if not rd:
-        raise Exception("Reasons data was empty")
-
 
-def get_reasons_data(location:Optional[str]=None)->dict:
-    """Get the reasons data.
+def get_reasons_data(file: FileObj) -> dict:
+    """Get the reasons' data.
 
     `get_reasons_data()` will attempt to get the data from GS only
     once. If it fails it will cache a "LOAD FAILED" result that will
     cause it to fail when called further in the execution. This is to
     avoid repeted API calls to the GS bucket.
 
-    `location` should be a GS bucket in the form
-    gs://bucketname/some/key/reasons.json. If location is not
-    provided the env var REASONS_GS_URL will be used and if that
-    doesn't exist a default value for the URL will be used.
+    `location` should be `FileObj` like
+    gs://arxiv-production-data/reasons.json.
 
-    This will load from GS once and save in a package level variable.
+    This will load once and save in a package level variable.
 
     If `reasons()` is to be used in an app it makes sense to call
-    `get_reasons_data()` when starting that app to ensure the app has
+    `get_reasons_data(file)` when starting that app to ensure the app has
     access and it is configured correctly.
     """
     global _reasons_data
@@ -54,37 +45,16 @@ def get_reasons_data(location:Optional[str]=None)->dict:
     if _reasons_data == "LOAD FAILED":
         raise Exception("Previous load of reasons data failed, not trying again "
                         "until _reasons_data is cleared by setting it to None")
-
-    if location is None:
-        location = os.environ.get("REASONS_GS_URL", DEFAULT_REASONS_GS_URL)
-
-    if location is None:
-        raise ValueError("Must pass location or set env var REASONS_GS_URL")
-
-    blob = None
     try:
-        bucket_name = location.strip('gs://').split('/')[0]
-        key = '/'.join(location.strip('gs://').split('/')[1:])
-        bucket = storage.Client().bucket(bucket_name)
-        blob = bucket.get_blob(key)
-    except Exception as ex:
-        _reasons_data = "LOAD FAILED"
-        raise ex
-
-    if not blob:
-        _reasons_data = "LOAD FAILED"
-        raise Exception(f"Could not get resons file from {location}")
-
-    try:
-        with blob.open('r') as fp:
+        with file.open('r') as fp:
             _reasons_data = json.load(fp)
             return _reasons_data
     except Exception as ex:
         _reasons_data = "LOAD FAILED"
         raise ex
 
 
-def reasons(id: str, format: FORMATS)-> Optional[str] :
+def reasons(reasons_data: dict, id: str, format: FORMATS) -> Optional[str] :
     """Find any reasons for inability to process this paper.
 
     Find all the recorded reasons for inability to process this paper (if any),
@@ -100,8 +70,9 @@ def reasons(id: str, format: FORMATS)-> Optional[str] :
 
     Returns a list of strings which report reasons for different versions
     or formats fail. List is empty if no reasons are recorded.
+
+    See test_reasons.py for an example of the JSON needed for `reasons_data`.
     """
-    reasons_data = get_reasons_data()
 
     if not id:
         return None

diff --git a/arxiv/legacy/papers/dissemination/tests/test_reasons.py b/arxiv/legacy/papers/dissemination/tests/test_reasons.py
@@ -2,13 +2,10 @@
 import re
 from unittest import TestCase
 
-import arxiv.legacy.papers.dissemination.reasons as reasons_pkg
-
-reasons = reasons_pkg.reasons
+from arxiv.legacy.papers.dissemination.reasons import reasons
 
 def fake_reasons():
-    reasons_pkg._reasons_data = test_reasons_data # pylint: disable=W0212
-
+    return test_reasons_data
 
 
 class TestReasons(TestCase):
@@ -28,15 +25,14 @@ class TestReasons(TestCase):
                   '1808.02949v1', '1612.00844v1']
 
     def test_papers_flaged_versions_in_reasons(self):
-        fake_reasons()
         vrx = re.compile(r'v\d+$')
         for id in self.ids_with_v:
-            assert reasons(id, 'pdf')
-            assert reasons(id, 'ps')
+            assert reasons(fake_reasons(), id, 'pdf')
+            assert reasons(fake_reasons(), id, 'ps')
             nov = re.sub(vrx,'',id)
             assert nov != id
-            assert not reasons(nov, 'pdf')
-            assert not reasons(nov, 'ps')
+            assert not reasons(fake_reasons(), nov, 'pdf')
+            assert not reasons(fake_reasons(), nov, 'ps')
 
 
     ids_with_pdf_reasons = [ 'alg-geom/9411012.pdf',
@@ -66,13 +62,12 @@ def test_papers_flaged_versions_in_reasons(self):
 
 
     def test_papers_flaged_pdf_in_reasons(self):
-        fake_reasons()
         dotpdfrx = re.compile('.pdf$')
         for id in self.ids_with_pdf_reasons:
-            assert reasons(id, 'pdf')
+            assert reasons(fake_reasons(), id, 'pdf')
             nodotpdf = re.sub(dotpdfrx, '', id)
-            assert not reasons(nodotpdf, 'ps')
-            assert not reasons(nodotpdf, 'postscript')
+            assert not reasons(fake_reasons(), nodotpdf, 'ps')
+            assert not reasons(fake_reasons(), nodotpdf, 'postscript')
 
 
 
@@ -813,15 +808,14 @@ def test_papers_flaged_pdf_in_reasons(self):
                         '1310.4962']
 
     def test_papers_flaged_in_reason(self):
-        fake_reasons()
         for id in self.ids_with_reasons:
-            assert reasons(id, 'pdf')
-            assert reasons(id + "v1", 'pdf')
-            assert reasons(id + "v2", 'pdf')
-            assert reasons(id + "v12", 'pdf')
-            assert reasons(id + "v123", 'pdf')
-            assert reasons(id, 'ps')
-            assert reasons(id, 'postscript')
+            assert reasons(fake_reasons(), id, 'pdf')
+            assert reasons(fake_reasons(), id + "v1", 'pdf')
+            assert reasons(fake_reasons(), id + "v2", 'pdf')
+            assert reasons(fake_reasons(), id + "v12", 'pdf')
+            assert reasons(fake_reasons(), id + "v123", 'pdf')
+            assert reasons(fake_reasons(), id, 'ps')
+            assert reasons(fake_reasons(), id, 'postscript')
 
     no_reasons_data=[ 'astro-ph/9811330', 'astro-ph/9811330v1',
                  'astro-ph/9811330v12', 'astro-ph/9811330v1.pdf',
@@ -831,9 +825,8 @@ def test_papers_flaged_in_reason(self):
                 ]
 
     def no_reasons(self):
-        fake_reasons()
         for id in self.no_reasons_data:
-            assert not reasons(id, 'pdf')
+            assert not reasons(fake_reasons(), id, 'pdf')
 
 
 # Take from /users/e-prints/httpd/htdocs/Database/reasons on 2023-01-25.

diff --git a/gcp/service_auth/tests/test_get_token.py b/gcp/service_auth/tests/test_get_token.py
@@ -1,39 +1,61 @@
 import os
+import sys
 import subprocess
 import time
-from typing import Generator
+from typing import Generator, Tuple
 
 import pytest
 import logging
 import datetime
 from pathlib import Path
+import json
 
-from gcp_service_auth import GcpIdentityToken
+try:
+    from gcp_service_auth import GcpIdentityToken
+except ModuleNotFoundError:
+    sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    from gcp_service_auth import GcpIdentityToken
+    pass
+
 
-TEST_URL = os.environ.get('TEST_URL')
 
 @pytest.fixture(scope="module")
 #@pytest.mark.with_op
-def gcp_browse_cred() -> Generator[str, str, str]:
+def gcp_browse_cred() -> Generator[Tuple[str, str], None, None]:
+    """The fixture sets up
+    1. the path to the credentials file
+    2. the URL to the service.
+    """
     logging.basicConfig(level=logging.DEBUG)
     cred_file = os.path.join(Path(__file__).parent, "browse-local.json")
     cred_file = cred_file
+    # the magic numbers are assigned by the 1password.
+    # You'd find the value by running `op item list`, etc.
     if not os.path.exists(cred_file):
         subprocess.run(["op", "document", "get", "4feibaz4tzn6iwk5c7ggvb7xwi", "-o", cred_file])
     os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = cred_file
-    yield cred_file
+    op = subprocess.run(["op", "item", "get", "4feibaz4tzn6iwk5c7ggvb7xwi", "--format", "json"], stdout=subprocess.PIPE)
+    test_meta = json.loads(op.stdout)
+    test_url = ""
+    for field in test_meta['fields']:
+        # the magic number is assigned by 1p. This is a URL to test against
+        if field["id"] == "heltf7phky3h6rnlvd7zi3542u":
+            test_url = field["value"]
+            break
+    yield cred_file, test_url
     os.remove(cred_file)
-    return ""
+    return
 
 @pytest.mark.with_op
-def test_get_token(gcp_browse_cred: str) -> None:
-    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = gcp_browse_cred
+def test_get_token(gcp_browse_cred: Tuple[str, str]) -> None:
+    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = gcp_browse_cred[0]
     logger = logging.getLogger("test")
-    idt = GcpIdentityToken(TEST_URL, logger=logger,
+    idt = GcpIdentityToken(gcp_browse_cred[0][1], logger=logger,
                            expiration=datetime.timedelta(seconds=1))
     token0 = idt.token
     time.sleep(2)
     token1 = idt.token
     assert token0 is not None
     assert token1 is not None
     assert token0 != token1
+
diff --git a/pytest.ini b/pytest.ini
@@ -0,0 +1,4 @@
+[pytest]
+markers =
+    with_op: marks tests to run with 1password CLI
+addopts = -m "not with_op"