Skip to content

Commit

Permalink
Merge pull request #281 from arXiv/bdc34/arxivce-1783-reasons-file
Browse files Browse the repository at this point in the history
Changes how reasons file is loaded.
  • Loading branch information
bdc34 authored May 29, 2024
2 parents 5926b21 + db8d807 commit 0fb6c76
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 65 deletions.
53 changes: 12 additions & 41 deletions arxiv/legacy/papers/dissemination/reasons.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@

from google.cloud import storage

from arxiv.files import FileObj

FORMATS = Literal['ps', 'pdf', 'html', 'dvi', 'postscript']

DEFAULT_REASONS_GS_URL = "gs://arxiv-production-data/reasons.json"
Expand All @@ -19,33 +21,22 @@

_reasons_data = None

def ensure_reasons_data(location:Optional[str]=None) -> None:
    """Verify that the reasons data can be loaded and is not empty.

    Loading is delegated to `get_reasons_data(location)`; a falsy result
    is treated as a failure.

    Raises:
        Exception: if the loaded reasons data is empty. Any exception
            raised by `get_reasons_data` propagates unchanged.
    """
    if get_reasons_data(location):
        return
    raise Exception("Reasons data was empty")


def get_reasons_data(location:Optional[str]=None)->dict:
"""Get the reasons data.
def get_reasons_data(file: FileObj) -> dict:
"""Get the reasons' data.
`get_reasons_data()` will attempt to get the data from GS only
once. If it fails it will cache a "LOAD FAILED" result that will
cause it to fail when called further in the execution. This is to
avoid repeated API calls to the GS bucket.
`location` should be a GS bucket in the form
gs://bucketname/some/key/reasons.json. If location is not
provided the env var REASONS_GS_URL will be used and if that
doesn't exist a default value for the URL will be used.
`file` should be a `FileObj` for a location such as
gs://arxiv-production-data/reasons.json.
This will load from GS once and save in a package level variable.
This will load once and save in a package level variable.
If `reasons()` is to be used in an app it makes sense to call
`get_reasons_data()` when starting that app to ensure the app has
`get_reasons_data(file)` when starting that app to ensure the app has
access and it is configured correctly.
"""
global _reasons_data
Expand All @@ -54,37 +45,16 @@ def get_reasons_data(location:Optional[str]=None)->dict:
if _reasons_data == "LOAD FAILED":
raise Exception("Previous load of reasons data failed, not trying again "
"until _reasons_data is cleared by setting it to None")

if location is None:
location = os.environ.get("REASONS_GS_URL", DEFAULT_REASONS_GS_URL)

if location is None:
raise ValueError("Must pass location or set env var REASONS_GS_URL")

blob = None
try:
bucket_name = location.strip('gs://').split('/')[0]
key = '/'.join(location.strip('gs://').split('/')[1:])
bucket = storage.Client().bucket(bucket_name)
blob = bucket.get_blob(key)
except Exception as ex:
_reasons_data = "LOAD FAILED"
raise ex

if not blob:
_reasons_data = "LOAD FAILED"
raise Exception(f"Could not get resons file from {location}")

try:
with blob.open('r') as fp:
with file.open('r') as fp:
_reasons_data = json.load(fp)
return _reasons_data
except Exception as ex:
_reasons_data = "LOAD FAILED"
raise ex


def reasons(id: str, format: FORMATS)-> Optional[str] :
def reasons(reasons_data: dict, id: str, format: FORMATS) -> Optional[str] :
"""Find any reasons for inability to process this paper.
Find all the recorded reasons for inability to process this paper (if any),
Expand All @@ -100,8 +70,9 @@ def reasons(id: str, format: FORMATS)-> Optional[str] :
Returns a list of strings which report the reasons why different versions
or formats fail. List is empty if no reasons are recorded.
See test_reasons.py for an example of the JSON needed for `reasons_data`.
"""
reasons_data = get_reasons_data()

if not id:
return None
Expand Down
41 changes: 17 additions & 24 deletions arxiv/legacy/papers/dissemination/tests/test_reasons.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,10 @@
import re
from unittest import TestCase

import arxiv.legacy.papers.dissemination.reasons as reasons_pkg

reasons = reasons_pkg.reasons
from arxiv.legacy.papers.dissemination.reasons import reasons

def fake_reasons():
reasons_pkg._reasons_data = test_reasons_data # pylint: disable=W0212

return test_reasons_data


class TestReasons(TestCase):
Expand All @@ -28,15 +25,14 @@ class TestReasons(TestCase):
'1808.02949v1', '1612.00844v1']

def test_papers_flaged_versions_in_reasons(self):
fake_reasons()
vrx = re.compile(r'v\d+$')
for id in self.ids_with_v:
assert reasons(id, 'pdf')
assert reasons(id, 'ps')
assert reasons(fake_reasons(), id, 'pdf')
assert reasons(fake_reasons(), id, 'ps')
nov = re.sub(vrx,'',id)
assert nov != id
assert not reasons(nov, 'pdf')
assert not reasons(nov, 'ps')
assert not reasons(fake_reasons(), nov, 'pdf')
assert not reasons(fake_reasons(), nov, 'ps')


ids_with_pdf_reasons = [ 'alg-geom/9411012.pdf',
Expand Down Expand Up @@ -66,13 +62,12 @@ def test_papers_flaged_versions_in_reasons(self):


def test_papers_flaged_pdf_in_reasons(self):
fake_reasons()
dotpdfrx = re.compile('.pdf$')
for id in self.ids_with_pdf_reasons:
assert reasons(id, 'pdf')
assert reasons(fake_reasons(), id, 'pdf')
nodotpdf = re.sub(dotpdfrx, '', id)
assert not reasons(nodotpdf, 'ps')
assert not reasons(nodotpdf, 'postscript')
assert not reasons(fake_reasons(), nodotpdf, 'ps')
assert not reasons(fake_reasons(), nodotpdf, 'postscript')



Expand Down Expand Up @@ -813,15 +808,14 @@ def test_papers_flaged_pdf_in_reasons(self):
'1310.4962']

def test_papers_flaged_in_reason(self):
fake_reasons()
for id in self.ids_with_reasons:
assert reasons(id, 'pdf')
assert reasons(id + "v1", 'pdf')
assert reasons(id + "v2", 'pdf')
assert reasons(id + "v12", 'pdf')
assert reasons(id + "v123", 'pdf')
assert reasons(id, 'ps')
assert reasons(id, 'postscript')
assert reasons(fake_reasons(), id, 'pdf')
assert reasons(fake_reasons(), id + "v1", 'pdf')
assert reasons(fake_reasons(), id + "v2", 'pdf')
assert reasons(fake_reasons(), id + "v12", 'pdf')
assert reasons(fake_reasons(), id + "v123", 'pdf')
assert reasons(fake_reasons(), id, 'ps')
assert reasons(fake_reasons(), id, 'postscript')

no_reasons_data=[ 'astro-ph/9811330', 'astro-ph/9811330v1',
'astro-ph/9811330v12', 'astro-ph/9811330v1.pdf',
Expand All @@ -831,9 +825,8 @@ def test_papers_flaged_in_reason(self):
]

def no_reasons(self):
fake_reasons()
for id in self.no_reasons_data:
assert not reasons(id, 'pdf')
assert not reasons(fake_reasons(), id, 'pdf')


# Take from /users/e-prints/httpd/htdocs/Database/reasons on 2023-01-25.
Expand Down

0 comments on commit 0fb6c76

Please sign in to comment.