diff --git a/arxiv/base/__init__.py b/arxiv/base/__init__.py index 594620c0..64da60c4 100644 --- a/arxiv/base/__init__.py +++ b/arxiv/base/__init__.py @@ -128,6 +128,5 @@ def register_blueprint(self: Flask, blueprint: Blueprint, # It is the same as the flask_sqlalchemy implementation # See: https://github.com/pallets-eco/flask-sqlalchemy/blob/42a36a3cb604fd39d81d00b54ab3988bbd0ad184/src/flask_sqlalchemy/session.py#L109 @app.teardown_appcontext - def remove_scoped_session (response_or_exc): + def remove_scoped_session (response_or_exc: BaseException | None) -> None: session.remove() - return response_or_exc \ No newline at end of file diff --git a/arxiv/config/__init__.py b/arxiv/config/__init__.py index 2baeff93..b30200fd 100644 --- a/arxiv/config/__init__.py +++ b/arxiv/config/__init__.py @@ -2,6 +2,7 @@ import importlib.metadata from typing import Optional, List, Tuple import os +from sqlalchemy.engine.interfaces import IsolationLevel from secrets import token_hex from urllib.parse import urlparse from pydantic import BaseSettings, SecretStr @@ -171,5 +172,11 @@ class Settings(BaseSettings): CLASSIC_DB_URI: str = DEFAULT_DB LATEXML_DB_URI: str = DEFAULT_LATEXML_DB ECHO_SQL: bool = False + CLASSIC_DB_TRANSACTION_ISOLATION_LEVEL: Optional[IsolationLevel] = None + LATEXML_DB_TRANSACTION_ISOLATION_LEVEL: Optional[IsolationLevel] = None + REQUEST_CONCURRENCY: int = 32 + """ How many requests do we handle at once -> How many db connections should we be able to open at once """ + POOL_PRE_PING: bool = True + """ Liveness check of sqlalchemy connections before checking out of pool """ settings = Settings() \ No newline at end of file diff --git a/arxiv/db/__init__.py b/arxiv/db/__init__.py index 06c203ca..d5126b64 100644 --- a/arxiv/db/__init__.py +++ b/arxiv/db/__init__.py @@ -32,7 +32,7 @@ with transaction() as session: session.add(...) """ -from typing import Generator +from typing import Optional import logging from contextlib import contextmanager @@ -41,7 +41,7 @@ from sqlalchemy import create_engine, MetaData, String from sqlalchemy.orm import sessionmaker, scoped_session, DeclarativeBase - +from sqlalchemy.engine.interfaces import IsolationLevel from ..config import settings @@ -56,9 +56,17 @@ class LaTeXMLBase(DeclarativeBase): logger = logging.getLogger(__name__) engine = create_engine(settings.CLASSIC_DB_URI, - echo=settings.ECHO_SQL) + echo=settings.ECHO_SQL, + isolation_level=settings.CLASSIC_DB_TRANSACTION_ISOLATION_LEVEL, + pool_recycle=600, + max_overflow=(settings.REQUEST_CONCURRENCY - 5), # max overflow is how many + base pool size, which is 5 by default + pool_pre_ping=settings.POOL_PRE_PING) latexml_engine = create_engine(settings.LATEXML_DB_URI, - echo=settings.ECHO_SQL) + echo=settings.ECHO_SQL, + isolation_level=settings.LATEXML_DB_TRANSACTION_ISOLATION_LEVEL, + pool_recycle=600, + max_overflow=(settings.REQUEST_CONCURRENCY - 5), + pool_pre_ping=settings.POOL_PRE_PING) SessionLocal = sessionmaker(autocommit=False, autoflush=False) def _app_ctx_id () -> int: @@ -77,7 +85,7 @@ def get_db (): @contextmanager def transaction (): in_flask = True if has_app_context() else False - db = session if in_flask else SessionLocal() + db = session if in_flask else SessionLocal() try: yield db diff --git a/arxiv/document/metadata.py b/arxiv/document/metadata.py index b0f97060..d0f2a24b 100644 --- a/arxiv/document/metadata.py +++ b/arxiv/document/metadata.py @@ -291,10 +291,10 @@ def raw(self) -> str: # skipping proxy to avoid harvesting of email addresses if self.report_num: rv += f"Report-no: {self.report_num}\n" - if self.msc_class: - rv += f"MSC-class: {self.msc_class}\n" if self.acm_class: rv += f"ACM-class: {self.acm_class}\n" + if self.msc_class: + rv += f"MSC-class: {self.msc_class}\n" if self.journal_ref: rv += f"Journal-ref: {self.journal_ref}\n" if self.doi: diff --git a/arxiv/document/version.py b/arxiv/document/version.py index cd379d95..8c2661ea 100644 --- a/arxiv/document/version.py +++ b/arxiv/document/version.py @@ -144,7 +144,7 @@ def formats(self) -> List[str]: elif self.source_flag.html or self.source_format == "html": formats.extend(['html']) elif self.source_flag.docx or self.source_format == "docx": - formats.extend(['pdf']) + formats.extend(['pdf', 'docx']) else: formats.extend(['pdf', 'ps', 'src']) diff --git a/arxiv/files/object_store.py b/arxiv/files/object_store.py index 5148f946..84c03e7d 100644 --- a/arxiv/files/object_store.py +++ b/arxiv/files/object_store.py @@ -7,9 +7,14 @@ from google.cloud.storage.blob import Blob from google.cloud.storage.bucket import Bucket +from google.cloud.storage.retry import DEFAULT_RETRY from . import FileObj +GCS_RETRY = DEFAULT_RETRY \ + .with_deadline(12) \ + .with_delay(0.25, 2.5) + class ObjectStore(ABC): """ABC for an object store.""" @@ -106,7 +111,10 @@ def to_obj(self, key: str) -> FileObj: Returns `FileDoesNotExist` if there is no object at the key. """ - blob = self.bucket.get_blob(key) + try: + blob = self.bucket.get_blob(key, retry=GCS_RETRY) + except: + blob = None if not blob: return FileDoesNotExist("gs://" + self.bucket.name + '/' + key) else: