Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Skip errors during harvest and report to Sentry #551

Merged
merged 5 commits into from
Nov 20, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -107,5 +107,6 @@ tmp/*
out.xml
.DS_Store
.vscode
.idea

output/
3 changes: 2 additions & 1 deletion Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ click = "*"
sentry-sdk = "*"
sickle = "*"
smart-open = {extras = ["s3"], version = "*"}
urllib3 = "==1.26.18"

[dev-packages]
bandit = "*"
Expand All @@ -17,7 +18,7 @@ flake8 = "*"
isort = "*"
mypy = "*"
pytest = "*"
vcrpy = "*"
vcrpy = "==4.2.1"

[requires]
python_version = "3.11"
Expand Down
1,130 changes: 572 additions & 558 deletions Pipfile.lock

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion harvester/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@

DEFAULT_RETRY_AFTER = 30
MAX_RETRIES = 10
RETRY_STATUS_CODES = (429, 500, 503)
RETRY_STATUS_CODES = [429, 500, 503]
MAX_ALLOWED_ERRORS = 10


def configure_logger(logger: logging.Logger, verbose: bool) -> str:
Expand Down
7 changes: 7 additions & 0 deletions harvester/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""exceptions.py module."""


class MaxAllowedErrorsReached(Exception):
"""Thrown when maximum numbers of errors reached during GetRecords harvest."""

pass
49 changes: 42 additions & 7 deletions harvester/oai.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,24 @@
"""oai.py module."""

import json
import logging
import os
from typing import Iterator, Literal, Optional
from typing import Any, Iterator, Literal, Optional

from requests import HTTPError
import smart_open
from sickle import Sickle
from sickle.models import Record
from sickle.oaiexceptions import IdDoesNotExist
from sickle.oaiexceptions import IdDoesNotExist, OAIError

from harvester.config import DEFAULT_RETRY_AFTER, MAX_RETRIES, RETRY_STATUS_CODES
from harvester.config import (
DEFAULT_RETRY_AFTER,
MAX_RETRIES,
RETRY_STATUS_CODES,
MAX_ALLOWED_ERRORS,
)
from harvester.exceptions import MaxAllowedErrorsReached
from harvester.utils import send_sentry_message

logger = logging.getLogger(__name__)

Expand All @@ -22,13 +31,15 @@ def __init__(
from_date: Optional[str] = None,
until_date: Optional[str] = None,
set_spec: Optional[str] = None,
max_retries: Optional[int] = MAX_RETRIES,
retry_status_codes: list[int] = RETRY_STATUS_CODES,
) -> None:
self.source_url = source_url
self.client = Sickle(
self.source_url,
default_retry_after=DEFAULT_RETRY_AFTER,
max_retries=MAX_RETRIES,
retry_status_codes=RETRY_STATUS_CODES,
max_retries=max_retries,
retry_status_codes=retry_status_codes,
)
self.metadata_format = metadata_format
self._set_params(metadata_format, from_date, until_date, set_spec)
Expand Down Expand Up @@ -59,9 +70,23 @@ def get_identifiers(self, exclude_deleted: bool) -> Iterator[str]:
yield record.identifier

def get_records(
self, identifiers: Iterator[str], skip_list: Optional[tuple[str]] = None
self,
identifiers: Iterator[str],
skip_list: Optional[tuple[str]] = None,
max_allowed_errors: int = MAX_ALLOWED_ERRORS,
) -> Iterator[Record]:
failed_records: list[tuple[str, Any | str | None]] = []
for identifier in identifiers:
if len(failed_records) == max_allowed_errors:
message = (
f"OAI harvest ABORTED, max errors reached: {max_allowed_errors}."
)
send_sentry_message(
message,
{"failed_records": failed_records},
)
raise MaxAllowedErrorsReached(message)

if skip_list and identifier in skip_list:
logger.warning(
"Skipped retrieving record with identifier %s because it is in the "
Expand All @@ -74,6 +99,9 @@ def get_records(
identifier=identifier, metadataPrefix=self.metadata_format
)
logger.debug("Record retrieved: %s", identifier)
except (HTTPError, OAIError) as e:
failed_records.append((identifier, getattr(e.request, "url", None)))
continue
except IdDoesNotExist:
logger.warning(
"Identifier %s retrieved in identifiers list returned 'id does not "
Expand All @@ -83,6 +111,13 @@ def get_records(
continue
yield record

if len(failed_records) > 0:
send_sentry_message(
f"OAI harvest COMPLETED, but with errors: {len(failed_records)} "
f"records skipped.",
{"failed_records": failed_records},
)

def get_sets(self):
responses = self.client.ListSets()
sets = [{"Set name": set.setName, "Set spec": set.setSpec} for set in responses]
Expand All @@ -104,7 +139,7 @@ def retrieve_records(
return self.list_records(exclude_deleted)
else:
raise ValueError(
'Method must be either "get" or "list", method provided was "{method}"'
f'Method must be either "get" or "list", method provided was "{method}"'
)


Expand Down
21 changes: 21 additions & 0 deletions harvester/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
"""utils.py module."""

import sentry_sdk


def send_sentry_message(
message: str,
scopes: dict | None = None,
level: str = "warning",
):
"""Send message directly to Sentry.

This allows both reporting information without raising an Exception, and optionally
including additional information in the sent Sentry message via "scopes".
ghukill marked this conversation as resolved.
Show resolved Hide resolved
"""
with sentry_sdk.push_scope() as scope:
if scopes:
for scope_key, scope_value in scopes.items():
scope.set_extra(scope_key, scope_value)
send_receipt = sentry_sdk.capture_message(message, level=level)
return send_receipt
3 changes: 3 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,8 @@ ignore_missing_imports = True
[mypy-smart_open.*]
ignore_missing_imports = True

[mypy-requests.*]
ignore_missing_imports = True

[tool:pytest]
log_level = DEBUG
8 changes: 8 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import pytest
from click.testing import CliRunner
from unittest.mock import patch


@pytest.fixture(autouse=True)
Expand All @@ -14,3 +15,10 @@ def test_env():
def cli_runner():
runner = CliRunner()
return runner


@pytest.fixture
def mock_sentry_capture_message():
with patch("harvester.utils.sentry_sdk.capture_message") as mock_capture:
mock_capture.return_value = "message-sent-ok"
yield mock_capture
Loading
Loading