From fa57db18ec3bd826ff3bd76b761c5c7c8f72ee15 Mon Sep 17 00:00:00 2001 From: Naoyuki Tai Date: Mon, 20 May 2024 16:58:19 -0400 Subject: [PATCH 1/4] (1) Not select the with_op by default as you may not have 1password CLI (2) Make the test to run from the arxiv-base directory --- gcp/service_auth/tests/test_get_token.py | 29 ++++++++++++++++++------ pytest.ini | 4 ++++ 2 files changed, 26 insertions(+), 7 deletions(-) create mode 100644 pytest.ini diff --git a/gcp/service_auth/tests/test_get_token.py b/gcp/service_auth/tests/test_get_token.py index 366b0a37..0e911199 100644 --- a/gcp/service_auth/tests/test_get_token.py +++ b/gcp/service_auth/tests/test_get_token.py @@ -1,4 +1,5 @@ import os +import sys import subprocess import time from typing import Generator @@ -7,29 +8,42 @@ import logging import datetime from pathlib import Path +import json -from gcp_service_auth import GcpIdentityToken +try: + from gcp_service_auth import GcpIdentityToken +except ModuleNotFoundError: + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + from gcp_service_auth import GcpIdentityToken + pass + -TEST_URL = os.environ.get('TEST_URL') @pytest.fixture(scope="module") #@pytest.mark.with_op -def gcp_browse_cred() -> Generator[str, str, str]: +def gcp_browse_cred() -> (Generator[str, str, str], str): logging.basicConfig(level=logging.DEBUG) cred_file = os.path.join(Path(__file__).parent, "browse-local.json") cred_file = cred_file if not os.path.exists(cred_file): subprocess.run(["op", "document", "get", "4feibaz4tzn6iwk5c7ggvb7xwi", "-o", cred_file]) os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = cred_file - yield cred_file + op = subprocess.run(["op", "item", "get", "4feibaz4tzn6iwk5c7ggvb7xwi", "--format", "json"], stdout=subprocess.PIPE) + test_meta = json.loads(op.stdout) + test_url = "" + for field in test_meta['fields']: + if field["id"] == "heltf7phky3h6rnlvd7zi3542u": + test_url = field["value"] + break + yield (cred_file, test_url) os.remove(cred_file) return "" @pytest.mark.with_op -def test_get_token(gcp_browse_cred: str) -> None: - os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = gcp_browse_cred +def test_get_token(gcp_browse_cred: (str, str)) -> None: + os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = gcp_browse_cred[0] logger = logging.getLogger("test") - idt = GcpIdentityToken(TEST_URL, logger=logger, + idt = GcpIdentityToken(gcp_browse_cred[0][1], logger=logger, expiration=datetime.timedelta(seconds=1)) token0 = idt.token time.sleep(2) @@ -37,3 +51,4 @@ def test_get_token(gcp_browse_cred: str) -> None: assert token0 is not None assert token1 is not None assert token0 != token1 + diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..5c872f0b --- /dev/null +++ b/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +markers = + with_op: marks tests to run with 1password CLI +addopts = -m "not with_op" From 99a6957936f96c8e0c19db71aeb46084da6ef430 Mon Sep 17 00:00:00 2001 From: Naoyuki Tai Date: Tue, 21 May 2024 12:08:27 -0400 Subject: [PATCH 2/4] Fix the bad type annotation --- gcp/service_auth/tests/test_get_token.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/gcp/service_auth/tests/test_get_token.py b/gcp/service_auth/tests/test_get_token.py index 0e911199..1c7c1443 100644 --- a/gcp/service_auth/tests/test_get_token.py +++ b/gcp/service_auth/tests/test_get_token.py @@ -2,7 +2,7 @@ import sys import subprocess import time -from typing import Generator +from typing import Generator, Tuple import pytest import logging @@ -21,10 +21,16 @@ @pytest.fixture(scope="module") #@pytest.mark.with_op -def gcp_browse_cred() -> (Generator[str, str, str], str): +def gcp_browse_cred() -> Generator[Tuple[str, str], None, None]: + """The fixture sets up + 1. the path to the credentials file + 2. the URL to the service. + """ logging.basicConfig(level=logging.DEBUG) cred_file = os.path.join(Path(__file__).parent, "browse-local.json") cred_file = cred_file + # the magic numbers are assigned by the 1password. + # You'd find the value by running `op item list`, etc. if not os.path.exists(cred_file): subprocess.run(["op", "document", "get", "4feibaz4tzn6iwk5c7ggvb7xwi", "-o", cred_file]) os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = cred_file @@ -32,15 +38,16 @@ def gcp_browse_cred() -> (Generator[str, str, str], str): test_meta = json.loads(op.stdout) test_url = "" for field in test_meta['fields']: + # the magic number is assigned by 1p. This is a URL to test against if field["id"] == "heltf7phky3h6rnlvd7zi3542u": test_url = field["value"] break - yield (cred_file, test_url) + yield cred_file, test_url os.remove(cred_file) - return "" + return @pytest.mark.with_op -def test_get_token(gcp_browse_cred: (str, str)) -> None: +def test_get_token(gcp_browse_cred: Tuple[str, str]) -> None: os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = gcp_browse_cred[0] logger = logging.getLogger("test") idt = GcpIdentityToken(gcp_browse_cred[0][1], logger=logger, From e74a3ddfa12d91b768f9686bca83246b66ca8e05 Mon Sep 17 00:00:00 2001 From: kyokukou Date: Thu, 23 May 2024 13:18:20 -0700 Subject: [PATCH 3/4] add surrogate key outputs a headers object --- arxiv/integration/fastly/headers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arxiv/integration/fastly/headers.py b/arxiv/integration/fastly/headers.py index bc5539ac..36498922 100644 --- a/arxiv/integration/fastly/headers.py +++ b/arxiv/integration/fastly/headers.py @@ -1,6 +1,7 @@ from typing import Dict, List +from werkzeug.datastructures import Headers -def add_surrogate_key(headers: Dict[str,str], keys:List[str])-> Dict[str, str]: +def add_surrogate_key(headers: Dict[str,str], keys:List[str])-> Headers: """adds surrogate key(s) to a response header, will update the header dictionary with new keys while retaining the rest of the header information""" old_keys=f' {headers.get("Surrogate-Key","").strip()} ' From db8d807e7746350b9be79816e6c3802c8535340e Mon Sep 17 00:00:00 2001 From: "Brian D. Caruso" Date: Tue, 28 May 2024 17:59:10 -0400 Subject: [PATCH 4/4] Changes how reasons file is loaded. --- arxiv/legacy/papers/dissemination/reasons.py | 53 +++++-------------- .../dissemination/tests/test_reasons.py | 41 ++++++-------- 2 files changed, 29 insertions(+), 65 deletions(-) diff --git a/arxiv/legacy/papers/dissemination/reasons.py b/arxiv/legacy/papers/dissemination/reasons.py index 80ead92a..d2fef69e 100644 --- a/arxiv/legacy/papers/dissemination/reasons.py +++ b/arxiv/legacy/papers/dissemination/reasons.py @@ -11,6 +11,8 @@ from google.cloud import storage +from arxiv.files import FileObj + FORMATS = Literal['ps', 'pdf', 'html', 'dvi', 'postscript'] DEFAULT_REASONS_GS_URL = "gs://arxiv-production-data/reasons.json" @@ -19,33 +21,22 @@ _reasons_data = None -def ensure_reasons_data(location:Optional[str]=None) -> None: - """Checks that reasons data loads from Google storage and is not empty. - - Raises an Exception if there are problems. - """ - rd = get_reasons_data(location) - if not rd: - raise Exception("Reasons data was empty") - -def get_reasons_data(location:Optional[str]=None)->dict: - """Get the reasons data. +def get_reasons_data(file: FileObj) -> dict: + """Get the reasons' data. `get_reasons_data()` will attempt to get the data from GS only once. If it fails it will cache a "LOAD FAILED" result that will cause it to fail when called further in the execution. This is to avoid repeted API calls to the GS bucket. - `location` should be a GS bucket in the form - gs://bucketname/some/key/reasons.json. If location is not - provided the env var REASONS_GS_URL will be used and if that - doesn't exist a default value for the URL will be used. + `location` should be `FileObj` like + gs://arxiv-production-data/reasons.json. - This will load from GS once and save in a package level variable. + This will load once and save in a package level variable. If `reasons()` is to be used in an app it makes sense to call - `get_reasons_data()` when starting that app to ensure the app has + `get_reasons_data(file)` when starting that app to ensure the app has access and it is configured correctly. """ global _reasons_data @@ -54,29 +45,8 @@ def get_reasons_data(location:Optional[str]=None)->dict: if _reasons_data == "LOAD FAILED": raise Exception("Previous load of reasons data failed, not trying again " "until _reasons_data is cleared by setting it to None") - - if location is None: - location = os.environ.get("REASONS_GS_URL", DEFAULT_REASONS_GS_URL) - - if location is None: - raise ValueError("Must pass location or set env var REASONS_GS_URL") - - blob = None try: - bucket_name = location.strip('gs://').split('/')[0] - key = '/'.join(location.strip('gs://').split('/')[1:]) - bucket = storage.Client().bucket(bucket_name) - blob = bucket.get_blob(key) - except Exception as ex: - _reasons_data = "LOAD FAILED" - raise ex - - if not blob: - _reasons_data = "LOAD FAILED" - raise Exception(f"Could not get resons file from {location}") - - try: - with blob.open('r') as fp: + with file.open('r') as fp: _reasons_data = json.load(fp) return _reasons_data except Exception as ex: @@ -84,7 +54,7 @@ def get_reasons_data(location:Optional[str]=None)->dict: raise ex -def reasons(id: str, format: FORMATS)-> Optional[str] : +def reasons(reasons_data: dict, id: str, format: FORMATS) -> Optional[str] : """Find any reasons for inability to process this paper. Find all the recorded reasons for inability to process this paper (if any), @@ -100,8 +70,9 @@ def reasons(id: str, format: FORMATS)-> Optional[str] : Returns a list of strings which report reasons for different versions or formats fail. List is empty if no reasons are recorded. + + See test_reasons.py for an example of the JSON needed for `reasons_data`. """ - reasons_data = get_reasons_data() if not id: return None diff --git a/arxiv/legacy/papers/dissemination/tests/test_reasons.py b/arxiv/legacy/papers/dissemination/tests/test_reasons.py index 70d9b934..20b74f06 100644 --- a/arxiv/legacy/papers/dissemination/tests/test_reasons.py +++ b/arxiv/legacy/papers/dissemination/tests/test_reasons.py @@ -2,13 +2,10 @@ import re from unittest import TestCase -import arxiv.legacy.papers.dissemination.reasons as reasons_pkg - -reasons = reasons_pkg.reasons +from arxiv.legacy.papers.dissemination.reasons import reasons def fake_reasons(): - reasons_pkg._reasons_data = test_reasons_data # pylint: disable=W0212 - + return test_reasons_data class TestReasons(TestCase): @@ -28,15 +25,14 @@ class TestReasons(TestCase): '1808.02949v1', '1612.00844v1'] def test_papers_flaged_versions_in_reasons(self): - fake_reasons() vrx = re.compile(r'v\d+$') for id in self.ids_with_v: - assert reasons(id, 'pdf') - assert reasons(id, 'ps') + assert reasons(fake_reasons(), id, 'pdf') + assert reasons(fake_reasons(), id, 'ps') nov = re.sub(vrx,'',id) assert nov != id - assert not reasons(nov, 'pdf') - assert not reasons(nov, 'ps') + assert not reasons(fake_reasons(), nov, 'pdf') + assert not reasons(fake_reasons(), nov, 'ps') ids_with_pdf_reasons = [ 'alg-geom/9411012.pdf', @@ -66,13 +62,12 @@ def test_papers_flaged_versions_in_reasons(self): def test_papers_flaged_pdf_in_reasons(self): - fake_reasons() dotpdfrx = re.compile('.pdf$') for id in self.ids_with_pdf_reasons: - assert reasons(id, 'pdf') + assert reasons(fake_reasons(), id, 'pdf') nodotpdf = re.sub(dotpdfrx, '', id) - assert not reasons(nodotpdf, 'ps') - assert not reasons(nodotpdf, 'postscript') + assert not reasons(fake_reasons(), nodotpdf, 'ps') + assert not reasons(fake_reasons(), nodotpdf, 'postscript') @@ -813,15 +808,14 @@ def test_papers_flaged_pdf_in_reasons(self): '1310.4962'] def test_papers_flaged_in_reason(self): - fake_reasons() for id in self.ids_with_reasons: - assert reasons(id, 'pdf') - assert reasons(id + "v1", 'pdf') - assert reasons(id + "v2", 'pdf') - assert reasons(id + "v12", 'pdf') - assert reasons(id + "v123", 'pdf') - assert reasons(id, 'ps') - assert reasons(id, 'postscript') + assert reasons(fake_reasons(), id, 'pdf') + assert reasons(fake_reasons(), id + "v1", 'pdf') + assert reasons(fake_reasons(), id + "v2", 'pdf') + assert reasons(fake_reasons(), id + "v12", 'pdf') + assert reasons(fake_reasons(), id + "v123", 'pdf') + assert reasons(fake_reasons(), id, 'ps') + assert reasons(fake_reasons(), id, 'postscript') no_reasons_data=[ 'astro-ph/9811330', 'astro-ph/9811330v1', 'astro-ph/9811330v12', 'astro-ph/9811330v1.pdf', @@ -831,9 +825,8 @@ def test_papers_flaged_in_reason(self): ] def no_reasons(self): - fake_reasons() for id in self.no_reasons_data: - assert not reasons(id, 'pdf') + assert not reasons(fake_reasons(), id, 'pdf') # Take from /users/e-prints/httpd/htdocs/Database/reasons on 2023-01-25.