From b789628a16d058bb799bbfb28051aac65c6e429c Mon Sep 17 00:00:00 2001 From: Mark Nazzaro Date: Thu, 25 Apr 2024 14:52:37 -0400 Subject: [PATCH 01/14] Add docx as format option --- arxiv/document/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arxiv/document/version.py b/arxiv/document/version.py index cd379d95..8c2661ea 100644 --- a/arxiv/document/version.py +++ b/arxiv/document/version.py @@ -144,7 +144,7 @@ def formats(self) -> List[str]: elif self.source_flag.html or self.source_format == "html": formats.extend(['html']) elif self.source_flag.docx or self.source_format == "docx": - formats.extend(['pdf']) + formats.extend(['pdf', 'docx']) else: formats.extend(['pdf', 'ps', 'src']) From 019032a007855f4c53a8ee4597261f1a56f1d32f Mon Sep 17 00:00:00 2001 From: Charles Frankston Date: Tue, 30 Apr 2024 12:02:13 -0400 Subject: [PATCH 02/14] Swap order of acm-class and msc-class to conform to schema. --- arxiv/document/metadata.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arxiv/document/metadata.py b/arxiv/document/metadata.py index b0f97060..d0f2a24b 100644 --- a/arxiv/document/metadata.py +++ b/arxiv/document/metadata.py @@ -291,10 +291,10 @@ def raw(self) -> str: # skipping proxy to avoid harvesting of email addresses if self.report_num: rv += f"Report-no: {self.report_num}\n" - if self.msc_class: - rv += f"MSC-class: {self.msc_class}\n" if self.acm_class: rv += f"ACM-class: {self.acm_class}\n" + if self.msc_class: + rv += f"MSC-class: {self.msc_class}\n" if self.journal_ref: rv += f"Journal-ref: {self.journal_ref}\n" if self.doi: From cc1f7d305c96d185f27a9037c5d340abeb829a9d Mon Sep 17 00:00:00 2001 From: Mark Nazzaro Date: Wed, 1 May 2024 10:21:09 -0400 Subject: [PATCH 03/14] Add env vars for setting transaction isolation level for both dbs --- arxiv/config/__init__.py | 3 +++ arxiv/db/__init__.py | 6 ++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arxiv/config/__init__.py b/arxiv/config/__init__.py index 2baeff93..53ccb54f 100644 --- a/arxiv/config/__init__.py +++ b/arxiv/config/__init__.py @@ -2,6 +2,7 @@ import importlib.metadata from typing import Optional, List, Tuple import os +from sqlalchemy.engine.interfaces import IsolationLevel from secrets import token_hex from urllib.parse import urlparse from pydantic import BaseSettings, SecretStr @@ -171,5 +172,7 @@ class Settings(BaseSettings): CLASSIC_DB_URI: str = DEFAULT_DB LATEXML_DB_URI: str = DEFAULT_LATEXML_DB ECHO_SQL: bool = False + CLASSIC_DB_TRANSACTION_ISOLATION_LEVEL: IsolationLevel = 'REPEATABLE READ' + LATEXML_DB_TRANSACTION_ISOLATION_LEVEL: IsolationLevel = 'READ COMMITTED' settings = Settings() \ No newline at end of file diff --git a/arxiv/db/__init__.py b/arxiv/db/__init__.py index 06c203ca..cb1f2735 100644 --- a/arxiv/db/__init__.py +++ b/arxiv/db/__init__.py @@ -56,9 +56,11 @@ class LaTeXMLBase(DeclarativeBase): logger = logging.getLogger(__name__) engine = create_engine(settings.CLASSIC_DB_URI, - echo=settings.ECHO_SQL) + echo=settings.ECHO_SQL, + isolation_level=settings.CLASSIC_DB_TRANSACTION_ISOLATION_LEVEL) latexml_engine = create_engine(settings.LATEXML_DB_URI, - echo=settings.ECHO_SQL) + echo=settings.ECHO_SQL, + isolation_level=settings.LATEXML_DB_TRANSACTION_ISOLATION_LEVEL) SessionLocal = sessionmaker(autocommit=False, autoflush=False) def _app_ctx_id () -> int: From c10d97fe1b90bde6084d29073e4b4c1256373eb2 Mon Sep 17 00:00:00 2001 From: Mark Nazzaro Date: Wed, 1 May 2024 10:30:40 -0400 Subject: [PATCH 04/14] per transaction iso levels --- arxiv/db/__init__.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arxiv/db/__init__.py b/arxiv/db/__init__.py index cb1f2735..3838b8d2 100644 --- a/arxiv/db/__init__.py +++ b/arxiv/db/__init__.py @@ -32,7 +32,7 @@ with transaction() as session: session.add(...) """ -from typing import Generator +from typing import Optional import logging from contextlib import contextmanager @@ -77,9 +77,13 @@ def get_db (): db.close() @contextmanager -def transaction (): +def transaction (transaction_isolation_level: Optional[str] = None): in_flask = True if has_app_context() else False - db = session if in_flask else SessionLocal() + db = session if in_flask else SessionLocal() + if transaction_isolation_level: + db.connection(execution_options={ + 'isolation_level': transaction_isolation_level + }) try: yield db From 24ae67b186a5da56d6a23f47ed08684795fd317c Mon Sep 17 00:00:00 2001 From: Mark Nazzaro Date: Wed, 1 May 2024 10:37:19 -0400 Subject: [PATCH 05/14] Fix type --- arxiv/db/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arxiv/db/__init__.py b/arxiv/db/__init__.py index 3838b8d2..2b20a9df 100644 --- a/arxiv/db/__init__.py +++ b/arxiv/db/__init__.py @@ -41,7 +41,7 @@ from sqlalchemy import create_engine, MetaData, String from sqlalchemy.orm import sessionmaker, scoped_session, DeclarativeBase - +from sqlalchemy.engine.interfaces import IsolationLevel from ..config import settings @@ -77,7 +77,7 @@ def get_db (): db.close() @contextmanager -def transaction (transaction_isolation_level: Optional[str] = None): +def transaction (transaction_isolation_level: Optional[IsolationLevel] = None): in_flask = True if has_app_context() else False db = session if in_flask else SessionLocal() if transaction_isolation_level: From e3c45bd83de3efe417721ac61319532131a8f17e Mon Sep 17 00:00:00 2001 From: Mark Nazzaro Date: Thu, 2 May 2024 10:52:06 -0400 Subject: [PATCH 06/14] Change default iso levels because they don't work with sqlite --- arxiv/config/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arxiv/config/__init__.py b/arxiv/config/__init__.py index 53ccb54f..d373d76c 100644 --- a/arxiv/config/__init__.py +++ b/arxiv/config/__init__.py @@ -172,7 +172,7 @@ class Settings(BaseSettings): CLASSIC_DB_URI: str = DEFAULT_DB LATEXML_DB_URI: str = DEFAULT_LATEXML_DB ECHO_SQL: bool = False - CLASSIC_DB_TRANSACTION_ISOLATION_LEVEL: IsolationLevel = 'REPEATABLE READ' - LATEXML_DB_TRANSACTION_ISOLATION_LEVEL: IsolationLevel = 'READ COMMITTED' + CLASSIC_DB_TRANSACTION_ISOLATION_LEVEL: Optional[IsolationLevel] = None + LATEXML_DB_TRANSACTION_ISOLATION_LEVEL: Optional[IsolationLevel] = None settings = Settings() \ No newline at end of file From 97c884a183f3f8e187d70f89ce1598074c42d9c2 Mon Sep 17 00:00:00 2001 From: Mark Nazzaro Date: Mon, 6 May 2024 10:00:44 -0400 Subject: [PATCH 07/14] Remove broken code for transaction level specification of isolation levels --- arxiv/db/__init__.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arxiv/db/__init__.py b/arxiv/db/__init__.py index 2b20a9df..11fb42cc 100644 --- a/arxiv/db/__init__.py +++ b/arxiv/db/__init__.py @@ -77,13 +77,9 @@ def get_db (): db.close() @contextmanager -def transaction (transaction_isolation_level: Optional[IsolationLevel] = None): +def transaction (): in_flask = True if has_app_context() else False db = session if in_flask else SessionLocal() - if transaction_isolation_level: - db.connection(execution_options={ - 'isolation_level': transaction_isolation_level - }) try: yield db From 146460ce4847e2626e9e17cdccbbe49303a1df4f Mon Sep 17 00:00:00 2001 From: Mark Nazzaro Date: Thu, 9 May 2024 11:07:44 -0400 Subject: [PATCH 08/14] exponential backoff for retry, and catch exception for get_blob --- arxiv/files/object_store.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arxiv/files/object_store.py b/arxiv/files/object_store.py index 5148f946..84c03e7d 100644 --- a/arxiv/files/object_store.py +++ b/arxiv/files/object_store.py @@ -7,9 +7,14 @@ from google.cloud.storage.blob import Blob from google.cloud.storage.bucket import Bucket +from google.cloud.storage.retry import DEFAULT_RETRY from . import FileObj +GCS_RETRY = DEFAULT_RETRY \ + .with_deadline(12) \ + .with_delay(0.25, 2.5) + class ObjectStore(ABC): """ABC for an object store.""" @@ -106,7 +111,10 @@ def to_obj(self, key: str) -> FileObj: Returns `FileDoesNotExist` if there is no object at the key. """ - blob = self.bucket.get_blob(key) + try: + blob = self.bucket.get_blob(key, retry=GCS_RETRY) + except: + blob = None if not blob: return FileDoesNotExist("gs://" + self.bucket.name + '/' + key) else: From ed2bd42fc811acde3890f0172b06cca858125849 Mon Sep 17 00:00:00 2001 From: Mark Nazzaro Date: Thu, 9 May 2024 11:38:22 -0400 Subject: [PATCH 09/14] Reset connections more often so we don't try to use idle connections that have been killed on the db side --- arxiv/db/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arxiv/db/__init__.py b/arxiv/db/__init__.py index 11fb42cc..6f60d779 100644 --- a/arxiv/db/__init__.py +++ b/arxiv/db/__init__.py @@ -57,10 +57,12 @@ class LaTeXMLBase(DeclarativeBase): engine = create_engine(settings.CLASSIC_DB_URI, echo=settings.ECHO_SQL, - isolation_level=settings.CLASSIC_DB_TRANSACTION_ISOLATION_LEVEL) + isolation_level=settings.CLASSIC_DB_TRANSACTION_ISOLATION_LEVEL, + pool_recycle=600) latexml_engine = create_engine(settings.LATEXML_DB_URI, echo=settings.ECHO_SQL, - isolation_level=settings.LATEXML_DB_TRANSACTION_ISOLATION_LEVEL) + isolation_level=settings.LATEXML_DB_TRANSACTION_ISOLATION_LEVEL, + pool_recycle=600) SessionLocal = sessionmaker(autocommit=False, autoflush=False) def _app_ctx_id () -> int: From 4f10aa71cf9f928bb3200604fb81afcc096efe14 Mon Sep 17 00:00:00 2001 From: Mark Nazzaro Date: Thu, 9 May 2024 11:43:07 -0400 Subject: [PATCH 10/14] Don't return error from remove_scoped_session --- arxiv/base/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/arxiv/base/__init__.py b/arxiv/base/__init__.py index 594620c0..75f9abdf 100644 --- a/arxiv/base/__init__.py +++ b/arxiv/base/__init__.py @@ -130,4 +130,3 @@ def register_blueprint(self: Flask, blueprint: Blueprint, @app.teardown_appcontext def remove_scoped_session (response_or_exc): session.remove() - return response_or_exc \ No newline at end of file From 446076160c039b30aacca9c7cc1fc037231d8462 Mon Sep 17 00:00:00 2001 From: Mark Nazzaro Date: Thu, 9 May 2024 11:43:55 -0400 Subject: [PATCH 11/14] Add types --- arxiv/base/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arxiv/base/__init__.py b/arxiv/base/__init__.py index 75f9abdf..64da60c4 100644 --- a/arxiv/base/__init__.py +++ b/arxiv/base/__init__.py @@ -128,5 +128,5 @@ def register_blueprint(self: Flask, blueprint: Blueprint, # It is the same as the flask_sqlalchemy implementation # See: https://github.com/pallets-eco/flask-sqlalchemy/blob/42a36a3cb604fd39d81d00b54ab3988bbd0ad184/src/flask_sqlalchemy/session.py#L109 @app.teardown_appcontext - def remove_scoped_session (response_or_exc): + def remove_scoped_session (response_or_exc: BaseException | None) -> None: session.remove() From 1b1a28d065eda234446548bc6cd76e60b8b67ecf Mon Sep 17 00:00:00 2001 From: Mark Nazzaro Date: Thu, 9 May 2024 11:56:23 -0400 Subject: [PATCH 12/14] Allow more connections --- arxiv/db/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arxiv/db/__init__.py b/arxiv/db/__init__.py index 6f60d779..ec8dfe97 100644 --- a/arxiv/db/__init__.py +++ b/arxiv/db/__init__.py @@ -58,11 +58,13 @@ class LaTeXMLBase(DeclarativeBase): engine = create_engine(settings.CLASSIC_DB_URI, echo=settings.ECHO_SQL, isolation_level=settings.CLASSIC_DB_TRANSACTION_ISOLATION_LEVEL, - pool_recycle=600) + pool_recycle=600, + max_overflow=27) latexml_engine = create_engine(settings.LATEXML_DB_URI, echo=settings.ECHO_SQL, isolation_level=settings.LATEXML_DB_TRANSACTION_ISOLATION_LEVEL, - pool_recycle=600) + pool_recycle=600, + max_overflow=27) SessionLocal = sessionmaker(autocommit=False, autoflush=False) def _app_ctx_id () -> int: From 5723bd2769a433dfac4e2ee7a15037e8fc04e4c7 Mon Sep 17 00:00:00 2001 From: Mark Nazzaro Date: Fri, 10 May 2024 09:47:59 -0400 Subject: [PATCH 13/14] Add request concurrency config --- arxiv/config/__init__.py | 3 +++ arxiv/db/__init__.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arxiv/config/__init__.py b/arxiv/config/__init__.py index d373d76c..7d9a602c 100644 --- a/arxiv/config/__init__.py +++ b/arxiv/config/__init__.py @@ -175,4 +175,7 @@ class Settings(BaseSettings): CLASSIC_DB_TRANSACTION_ISOLATION_LEVEL: Optional[IsolationLevel] = None LATEXML_DB_TRANSACTION_ISOLATION_LEVEL: Optional[IsolationLevel] = None + # How many requests do we handle at once -> How many db connections should we be able to open at once + REQUEST_CONCURRENCY: int = 32 + settings = Settings() \ No newline at end of file diff --git a/arxiv/db/__init__.py b/arxiv/db/__init__.py index ec8dfe97..227e025f 100644 --- a/arxiv/db/__init__.py +++ b/arxiv/db/__init__.py @@ -59,12 +59,12 @@ class LaTeXMLBase(DeclarativeBase): echo=settings.ECHO_SQL, isolation_level=settings.CLASSIC_DB_TRANSACTION_ISOLATION_LEVEL, pool_recycle=600, - max_overflow=27) + max_overflow=(settings.REQUEST_CONCURRENCY - 5)) # max overflow is how many + base pool size, which is 5 by default latexml_engine = create_engine(settings.LATEXML_DB_URI, echo=settings.ECHO_SQL, isolation_level=settings.LATEXML_DB_TRANSACTION_ISOLATION_LEVEL, pool_recycle=600, - max_overflow=27) + max_overflow=(settings.REQUEST_CONCURRENCY - 5)) SessionLocal = sessionmaker(autocommit=False, autoflush=False) def _app_ctx_id () -> int: From 805ed440dc770c6bcea199a9cec3683d89311b8d Mon Sep 17 00:00:00 2001 From: Mark Nazzaro Date: Fri, 10 May 2024 10:44:06 -0400 Subject: [PATCH 14/14] Found the config Jonathan is skeptical of --- arxiv/config/__init__.py | 5 +++-- arxiv/db/__init__.py | 6 ++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arxiv/config/__init__.py b/arxiv/config/__init__.py index 7d9a602c..b30200fd 100644 --- a/arxiv/config/__init__.py +++ b/arxiv/config/__init__.py @@ -175,7 +175,8 @@ class Settings(BaseSettings): CLASSIC_DB_TRANSACTION_ISOLATION_LEVEL: Optional[IsolationLevel] = None LATEXML_DB_TRANSACTION_ISOLATION_LEVEL: Optional[IsolationLevel] = None - # How many requests do we handle at once -> How many db connections should we be able to open at once REQUEST_CONCURRENCY: int = 32 - + """ How many requests do we handle at once -> How many db connections should we be able to open at once """ + POOL_PRE_PING: bool = True + """ Liveness check of sqlalchemy connections before checking out of pool """ settings = Settings() \ No newline at end of file diff --git a/arxiv/db/__init__.py b/arxiv/db/__init__.py index 227e025f..d5126b64 100644 --- a/arxiv/db/__init__.py +++ b/arxiv/db/__init__.py @@ -59,12 +59,14 @@ class LaTeXMLBase(DeclarativeBase): echo=settings.ECHO_SQL, isolation_level=settings.CLASSIC_DB_TRANSACTION_ISOLATION_LEVEL, pool_recycle=600, - max_overflow=(settings.REQUEST_CONCURRENCY - 5)) # max overflow is how many + base pool size, which is 5 by default + max_overflow=(settings.REQUEST_CONCURRENCY - 5), # max overflow is how many + base pool size, which is 5 by default + pool_pre_ping=settings.POOL_PRE_PING) latexml_engine = create_engine(settings.LATEXML_DB_URI, echo=settings.ECHO_SQL, isolation_level=settings.LATEXML_DB_TRANSACTION_ISOLATION_LEVEL, pool_recycle=600, - max_overflow=(settings.REQUEST_CONCURRENCY - 5)) + max_overflow=(settings.REQUEST_CONCURRENCY - 5), + pool_pre_ping=settings.POOL_PRE_PING) SessionLocal = sessionmaker(autocommit=False, autoflush=False) def _app_ctx_id () -> int: