Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reasons file for arxiv-browse, test changes, header changes #282

Merged
merged 7 commits into from
May 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion arxiv/integration/fastly/headers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Dict, List
from werkzeug.datastructures import Headers

def add_surrogate_key(headers: Dict[str,str], keys:List[str])-> Dict[str, str]:
def add_surrogate_key(headers: Dict[str,str], keys:List[str])-> Headers:
"""adds surrogate key(s) to a response header,
will update the header dictionary with new keys while retaining the rest of the header information"""
old_keys=f' {headers.get("Surrogate-Key","").strip()} '
Expand Down
53 changes: 12 additions & 41 deletions arxiv/legacy/papers/dissemination/reasons.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@

from google.cloud import storage

from arxiv.files import FileObj

FORMATS = Literal['ps', 'pdf', 'html', 'dvi', 'postscript']

DEFAULT_REASONS_GS_URL = "gs://arxiv-production-data/reasons.json"
Expand All @@ -19,33 +21,22 @@

_reasons_data = None

def ensure_reasons_data(location:Optional[str]=None) -> None:
"""Checks that reasons data loads from Google storage and is not empty.

Raises an Exception if there are problems.
"""
rd = get_reasons_data(location)
if not rd:
raise Exception("Reasons data was empty")


def get_reasons_data(location:Optional[str]=None)->dict:
"""Get the reasons data.
def get_reasons_data(file: FileObj) -> dict:
"""Get the reasons' data.

`get_reasons_data()` will attempt to get the data from GS only
once. If it fails it will cache a "LOAD FAILED" result that will
cause it to fail when called further in the execution. This is to
avoid repeted API calls to the GS bucket.

`location` should be a GS bucket in the form
gs://bucketname/some/key/reasons.json. If location is not
provided the env var REASONS_GS_URL will be used and if that
doesn't exist a default value for the URL will be used.
`location` should be `FileObj` like
gs://arxiv-production-data/reasons.json.

This will load from GS once and save in a package level variable.
This will load once and save in a package level variable.

If `reasons()` is to be used in an app it makes sense to call
`get_reasons_data()` when starting that app to ensure the app has
`get_reasons_data(file)` when starting that app to ensure the app has
access and it is configured correctly.
"""
global _reasons_data
Expand All @@ -54,37 +45,16 @@ def get_reasons_data(location:Optional[str]=None)->dict:
if _reasons_data == "LOAD FAILED":
raise Exception("Previous load of reasons data failed, not trying again "
"until _reasons_data is cleared by setting it to None")

if location is None:
location = os.environ.get("REASONS_GS_URL", DEFAULT_REASONS_GS_URL)

if location is None:
raise ValueError("Must pass location or set env var REASONS_GS_URL")

blob = None
try:
bucket_name = location.strip('gs://').split('/')[0]
key = '/'.join(location.strip('gs://').split('/')[1:])
bucket = storage.Client().bucket(bucket_name)
blob = bucket.get_blob(key)
except Exception as ex:
_reasons_data = "LOAD FAILED"
raise ex

if not blob:
_reasons_data = "LOAD FAILED"
raise Exception(f"Could not get resons file from {location}")

try:
with blob.open('r') as fp:
with file.open('r') as fp:
_reasons_data = json.load(fp)
return _reasons_data
except Exception as ex:
_reasons_data = "LOAD FAILED"
raise ex


def reasons(id: str, format: FORMATS)-> Optional[str] :
def reasons(reasons_data: dict, id: str, format: FORMATS) -> Optional[str] :
"""Find any reasons for inability to process this paper.

Find all the recorded reasons for inability to process this paper (if any),
Expand All @@ -100,8 +70,9 @@ def reasons(id: str, format: FORMATS)-> Optional[str] :

Returns a list of strings which report reasons for different versions
or formats fail. List is empty if no reasons are recorded.

See test_reasons.py for an example of the JSON needed for `reasons_data`.
"""
reasons_data = get_reasons_data()

if not id:
return None
Expand Down
41 changes: 17 additions & 24 deletions arxiv/legacy/papers/dissemination/tests/test_reasons.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,10 @@
import re
from unittest import TestCase

import arxiv.legacy.papers.dissemination.reasons as reasons_pkg

reasons = reasons_pkg.reasons
from arxiv.legacy.papers.dissemination.reasons import reasons

def fake_reasons():
reasons_pkg._reasons_data = test_reasons_data # pylint: disable=W0212

return test_reasons_data


class TestReasons(TestCase):
Expand All @@ -28,15 +25,14 @@ class TestReasons(TestCase):
'1808.02949v1', '1612.00844v1']

def test_papers_flaged_versions_in_reasons(self):
fake_reasons()
vrx = re.compile(r'v\d+$')
for id in self.ids_with_v:
assert reasons(id, 'pdf')
assert reasons(id, 'ps')
assert reasons(fake_reasons(), id, 'pdf')
assert reasons(fake_reasons(), id, 'ps')
nov = re.sub(vrx,'',id)
assert nov != id
assert not reasons(nov, 'pdf')
assert not reasons(nov, 'ps')
assert not reasons(fake_reasons(), nov, 'pdf')
assert not reasons(fake_reasons(), nov, 'ps')


ids_with_pdf_reasons = [ 'alg-geom/9411012.pdf',
Expand Down Expand Up @@ -66,13 +62,12 @@ def test_papers_flaged_versions_in_reasons(self):


def test_papers_flaged_pdf_in_reasons(self):
fake_reasons()
dotpdfrx = re.compile('.pdf$')
for id in self.ids_with_pdf_reasons:
assert reasons(id, 'pdf')
assert reasons(fake_reasons(), id, 'pdf')
nodotpdf = re.sub(dotpdfrx, '', id)
assert not reasons(nodotpdf, 'ps')
assert not reasons(nodotpdf, 'postscript')
assert not reasons(fake_reasons(), nodotpdf, 'ps')
assert not reasons(fake_reasons(), nodotpdf, 'postscript')



Expand Down Expand Up @@ -813,15 +808,14 @@ def test_papers_flaged_pdf_in_reasons(self):
'1310.4962']

def test_papers_flaged_in_reason(self):
fake_reasons()
for id in self.ids_with_reasons:
assert reasons(id, 'pdf')
assert reasons(id + "v1", 'pdf')
assert reasons(id + "v2", 'pdf')
assert reasons(id + "v12", 'pdf')
assert reasons(id + "v123", 'pdf')
assert reasons(id, 'ps')
assert reasons(id, 'postscript')
assert reasons(fake_reasons(), id, 'pdf')
assert reasons(fake_reasons(), id + "v1", 'pdf')
assert reasons(fake_reasons(), id + "v2", 'pdf')
assert reasons(fake_reasons(), id + "v12", 'pdf')
assert reasons(fake_reasons(), id + "v123", 'pdf')
assert reasons(fake_reasons(), id, 'ps')
assert reasons(fake_reasons(), id, 'postscript')

no_reasons_data=[ 'astro-ph/9811330', 'astro-ph/9811330v1',
'astro-ph/9811330v12', 'astro-ph/9811330v1.pdf',
Expand All @@ -831,9 +825,8 @@ def test_papers_flaged_in_reason(self):
]

def no_reasons(self):
fake_reasons()
for id in self.no_reasons_data:
assert not reasons(id, 'pdf')
assert not reasons(fake_reasons(), id, 'pdf')


# Take from /users/e-prints/httpd/htdocs/Database/reasons on 2023-01-25.
Expand Down
40 changes: 31 additions & 9 deletions gcp/service_auth/tests/test_get_token.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,61 @@
import os
import sys
import subprocess
import time
from typing import Generator
from typing import Generator, Tuple

import pytest
import logging
import datetime
from pathlib import Path
import json

from gcp_service_auth import GcpIdentityToken
try:
from gcp_service_auth import GcpIdentityToken
except ModuleNotFoundError:
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from gcp_service_auth import GcpIdentityToken
pass


TEST_URL = os.environ.get('TEST_URL')

@pytest.fixture(scope="module")
#@pytest.mark.with_op
def gcp_browse_cred() -> Generator[str, str, str]:
def gcp_browse_cred() -> Generator[Tuple[str, str], None, None]:
"""The fixture sets up
1. the path to the credentials file
2. the URL to the service.
"""
logging.basicConfig(level=logging.DEBUG)
cred_file = os.path.join(Path(__file__).parent, "browse-local.json")
cred_file = cred_file
# the magic numbers are assigned by the 1password.
# You'd find the value by running `op item list`, etc.
if not os.path.exists(cred_file):
subprocess.run(["op", "document", "get", "4feibaz4tzn6iwk5c7ggvb7xwi", "-o", cred_file])
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = cred_file
yield cred_file
op = subprocess.run(["op", "item", "get", "4feibaz4tzn6iwk5c7ggvb7xwi", "--format", "json"], stdout=subprocess.PIPE)
test_meta = json.loads(op.stdout)
test_url = ""
for field in test_meta['fields']:
# the magic number is assigned by 1p. This is a URL to test against
if field["id"] == "heltf7phky3h6rnlvd7zi3542u":
test_url = field["value"]
break
yield cred_file, test_url
os.remove(cred_file)
return ""
return

@pytest.mark.with_op
def test_get_token(gcp_browse_cred: str) -> None:
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = gcp_browse_cred
def test_get_token(gcp_browse_cred: Tuple[str, str]) -> None:
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = gcp_browse_cred[0]
logger = logging.getLogger("test")
idt = GcpIdentityToken(TEST_URL, logger=logger,
idt = GcpIdentityToken(gcp_browse_cred[0][1], logger=logger,
expiration=datetime.timedelta(seconds=1))
token0 = idt.token
time.sleep(2)
token1 = idt.token
assert token0 is not None
assert token1 is not None
assert token0 != token1

4 changes: 4 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[pytest]
markers =
with_op: marks tests to run with 1password CLI
addopts = -m "not with_op"
Loading