Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Docx format option changes + db isolation level environment variable #267

Merged
merged 20 commits into from
May 10, 2024
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
b789628
Add docx as format option
mnazzaro Apr 25, 2024
a2a1256
Merge pull request #262 from arXiv/ARXIVCE-1513-docx
mnazzaro Apr 25, 2024
019032a
Swap order of acm-class and msc-class to conform to schema.
cbf66 Apr 30, 2024
f38eed8
Merge pull request #263 from arXiv/charles/oai-schema
arxiv-admin May 1, 2024
cc1f7d3
Add env vars for setting transaction isolation level for both dbs
mnazzaro May 1, 2024
c10d97f
per transaction iso levels
mnazzaro May 1, 2024
24ae67b
Fix type
mnazzaro May 1, 2024
e3c45bd
Change default iso levels because they don't work with sqlite
mnazzaro May 2, 2024
09fc4b6
Merge pull request #264 from arXiv/ARXIVCE-1596-db-iso-levels
mnazzaro May 2, 2024
97c884a
Remove broken code for transaction level specification of isolation l…
mnazzaro May 6, 2024
aeec4a8
Merge pull request #266 from arXiv/transaction-iso-levels
mnazzaro May 6, 2024
146460c
exponential backoff for retry, and catch exception for get_blob
mnazzaro May 9, 2024
ed2bd42
Reset connections more often so we don't try to use idle connections …
mnazzaro May 9, 2024
4f10aa7
Don't return error from remove_scoped_session
mnazzaro May 9, 2024
4460761
Add types
mnazzaro May 9, 2024
1b1a28d
Allow more connections
mnazzaro May 9, 2024
5723bd2
Add request concurrency config
mnazzaro May 10, 2024
5ac92e6
Merge pull request #270 from arXiv/ARXIVCE-1639-browse-traces
mnazzaro May 10, 2024
805ed44
Found the config Jonathan is skeptical of
mnazzaro May 10, 2024
f707487
Merge pull request #271 from arXiv/ARXIVCE-1639-browse-traces
mnazzaro May 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions arxiv/base/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,5 @@ def register_blueprint(self: Flask, blueprint: Blueprint,
# It is the same as the flask_sqlalchemy implementation
# See: https://github.com/pallets-eco/flask-sqlalchemy/blob/42a36a3cb604fd39d81d00b54ab3988bbd0ad184/src/flask_sqlalchemy/session.py#L109
@app.teardown_appcontext
def remove_scoped_session (response_or_exc):
def remove_scoped_session (response_or_exc: BaseException | None) -> None:
session.remove()
return response_or_exc
6 changes: 6 additions & 0 deletions arxiv/config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import importlib.metadata
from typing import Optional, List, Tuple
import os
from sqlalchemy.engine.interfaces import IsolationLevel
from secrets import token_hex
from urllib.parse import urlparse
from pydantic import BaseSettings, SecretStr
Expand Down Expand Up @@ -171,5 +172,10 @@ class Settings(BaseSettings):
CLASSIC_DB_URI: str = DEFAULT_DB
LATEXML_DB_URI: str = DEFAULT_LATEXML_DB
ECHO_SQL: bool = False
CLASSIC_DB_TRANSACTION_ISOLATION_LEVEL: Optional[IsolationLevel] = None
LATEXML_DB_TRANSACTION_ISOLATION_LEVEL: Optional[IsolationLevel] = None

# Number of requests handled concurrently; this also determines how many db connections we should be able to open at once
REQUEST_CONCURRENCY: int = 32

settings = Settings()
16 changes: 11 additions & 5 deletions arxiv/db/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
with transaction() as session:
session.add(...)
"""
from typing import Generator
from typing import Optional
import logging
from contextlib import contextmanager

Expand All @@ -41,7 +41,7 @@

from sqlalchemy import create_engine, MetaData, String
from sqlalchemy.orm import sessionmaker, scoped_session, DeclarativeBase

from sqlalchemy.engine.interfaces import IsolationLevel

from ..config import settings

Expand All @@ -56,9 +56,15 @@ class LaTeXMLBase(DeclarativeBase):
logger = logging.getLogger(__name__)

engine = create_engine(settings.CLASSIC_DB_URI,
echo=settings.ECHO_SQL)
echo=settings.ECHO_SQL,
isolation_level=settings.CLASSIC_DB_TRANSACTION_ISOLATION_LEVEL,
pool_recycle=600,
max_overflow=(settings.REQUEST_CONCURRENCY - 5)) # max_overflow is how many connections may be opened beyond the base pool size (5 by default), so the total is capped at REQUEST_CONCURRENCY
latexml_engine = create_engine(settings.LATEXML_DB_URI,
echo=settings.ECHO_SQL)
echo=settings.ECHO_SQL,
isolation_level=settings.LATEXML_DB_TRANSACTION_ISOLATION_LEVEL,
pool_recycle=600,
max_overflow=(settings.REQUEST_CONCURRENCY - 5))
SessionLocal = sessionmaker(autocommit=False, autoflush=False)

def _app_ctx_id () -> int:
Expand All @@ -77,7 +83,7 @@ def get_db ():
@contextmanager
def transaction ():
in_flask = True if has_app_context() else False
db = session if in_flask else SessionLocal()
db = session if in_flask else SessionLocal()
try:
yield db

Expand Down
4 changes: 2 additions & 2 deletions arxiv/document/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,10 +291,10 @@ def raw(self) -> str:
# skipping proxy to avoid harvesting of email addresses
if self.report_num:
rv += f"Report-no: {self.report_num}\n"
if self.msc_class:
rv += f"MSC-class: {self.msc_class}\n"
if self.acm_class:
rv += f"ACM-class: {self.acm_class}\n"
if self.msc_class:
rv += f"MSC-class: {self.msc_class}\n"
if self.journal_ref:
rv += f"Journal-ref: {self.journal_ref}\n"
if self.doi:
Expand Down
2 changes: 1 addition & 1 deletion arxiv/document/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ def formats(self) -> List[str]:
elif self.source_flag.html or self.source_format == "html":
formats.extend(['html'])
elif self.source_flag.docx or self.source_format == "docx":
formats.extend(['pdf'])
formats.extend(['pdf', 'docx'])
else:
formats.extend(['pdf', 'ps', 'src'])

Expand Down
10 changes: 9 additions & 1 deletion arxiv/files/object_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,14 @@

from google.cloud.storage.blob import Blob
from google.cloud.storage.bucket import Bucket
from google.cloud.storage.retry import DEFAULT_RETRY

from . import FileObj

# Retry policy passed to GCS blob lookups. It starts from the library's
# DEFAULT_RETRY and overrides two settings:
#   with_deadline(12)      - give up retrying after 12 seconds overall
#   with_delay(0.25, 2.5)  - initial and maximum delay between attempts (seconds)
GCS_RETRY = DEFAULT_RETRY \
.with_deadline(12) \
.with_delay(0.25, 2.5)

class ObjectStore(ABC):
"""ABC for an object store."""

Expand Down Expand Up @@ -106,7 +111,10 @@ def to_obj(self, key: str) -> FileObj:

Returns `FileDoesNotExist` if there is no object at the key.
"""
blob = self.bucket.get_blob(key)
try:
blob = self.bucket.get_blob(key, retry=GCS_RETRY)
except:
blob = None
if not blob:
return FileDoesNotExist("gs://" + self.bucket.name + '/' + key)
else:
Expand Down
Loading