From 37ecbb813500018692c9c787cff15e5084dd11b2 Mon Sep 17 00:00:00 2001
From: davidschober
Date: Mon, 1 Apr 2024 12:14:12 -0500
Subject: [PATCH 1/2] added retries
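
Plain requests.get()/requests.post() calls made exactly one attempt,
so a transient 429 or 5xx from the API failed the whole run. All HTTP
traffic now goes through a module-level requests.Session mounted with
an HTTPAdapter whose urllib3 Retry policy retries GET and POST with
exponential backoff. A minimal sketch of the behaviour this adds
(it mirrors the setup in nuldc/helpers.py; the bare GET against
/search is just for illustration):

    import requests
    from requests.adapters import HTTPAdapter
    import urllib3

    retries = urllib3.Retry(total=5,
                            backoff_factor=1,
                            status_forcelist=[429, 500, 502, 503, 504],
                            allowed_methods=['GET', 'POST'])
    session = requests.Session()
    session.mount('https://', HTTPAdapter(max_retries=retries))

    # a 429/5xx response is retried up to 5 times, backing off
    # exponentially, before the response (or a RetryError) reaches
    # the caller
    resp = session.get('https://api.dc.library.northwestern.edu/api/v2/search')
    print(resp.status_code)

Note: urllib3 renamed Retry's method_whitelist argument to
allowed_methods in 1.26 and removed the old name in 2.0, so
allowed_methods is used throughout.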
---
 nuldc/dump.py    |  7 +++----
 nuldc/helpers.py | 29 +++++++++++++++++++++--------
 pyproject.toml   |  2 +-
 3 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/nuldc/dump.py b/nuldc/dump.py
index bfafafe..ba18083 100644
--- a/nuldc/dump.py
+++ b/nuldc/dump.py
@@ -22,7 +22,6 @@ from nuldc import helpers
 import json
 import re
-import concurrent.futures
 import datetime
 import os
 import sys
 
@@ -67,10 +66,10 @@ def dump_collection(col_id):
         "query": f"collection.id:{col_id}",
         "size": "100",
         "sort": "id:asc"}
-    data = helpers.get_search_results(API,
-                                      "works",
-                                      params, all_results=True, page_limit=5000)
     try:
+        data = helpers.get_search_results(API,
+                                          "works",
+                                          params, all_results=True, page_limit=5000)
         col_title = data['data'][0]['collection']['title']
         filename = f"{slugify(col_title)}-{col_id}"
         save_files(filename, data)
diff --git a/nuldc/helpers.py b/nuldc/helpers.py
index c576c65..5d4bb7c 100644
--- a/nuldc/helpers.py
+++ b/nuldc/helpers.py
@@ -1,10 +1,23 @@
 import requests
+from requests.adapters import HTTPAdapter
+import urllib3
 import unicodecsv as csv
 import tqdm
 import dicttoxml
+
 
 api_base_url = "https://api.dc.library.northwestern.edu/api/v2"
 
+# retry transient errors (429/5xx) on GET and POST with backoff
+retries = urllib3.Retry(total=5,
+                        backoff_factor=1,
+                        status_forcelist=[429, 500, 502, 503, 504],
+                        allowed_methods=['GET', 'POST'])
+
+session = requests.Session()
+adapter = HTTPAdapter(max_retries=retries)
+session.mount('https://', adapter)
+
 
 def get_all_iiif(start_manifest, total_pages, page_limit):
     """ takes items from a IIIF manifest and returns the next_page
@@ -26,7 +39,7 @@ def get_all_iiif(start_manifest, total_pages, page_limit):
     pbar = tqdm.tqdm(total=total_pages, initial=1)
 
     while next:
-        next_results = requests.get(next).json()
+        next_results = session.get(next).json()
         if next_results.get('items')[-1].get('type') == 'Collection':
             next = next_results['items'].pop().get('id')
         else:
@@ -56,7 +69,7 @@
 
     # loop through the results
     while next:
-        next_results = requests.get(next).json()
+        next_results = session.get(next).json()
         results['data'] = results['data'] + next_results.get('data')
         next = next_results.get('pagination').get('next_url')
         pbar.update(1)
@@ -72,7 +85,7 @@ def get_collection_by_id(api_base_url, identifier,
     """returns a collection as IIIF or json"""
 
     url = f"{api_base_url}/collections/{identifier}"
-    results = requests.get(url, params=parameters).json()
+    results = session.get(url, params=parameters).json()
 
     if all_results and parameters.get('as') == 'iiif':
         # fire off a search for total pagecount this powers the progressbar
@@ -80,7 +93,7 @@
         count_params['as'] = 'opensearch'
         count_params['query'] = f'collection.id: {identifier}'
         url = f"{api_base_url}/search"
-        total_pages = requests.get(url, count_params).json()[
+        total_pages = session.get(url, params=count_params).json()[
             'pagination']['total_pages']
         results = get_all_iiif(results, total_pages, page_limit)
 
@@ -113,13 +126,13 @@ def get_search_results(api_base_url, model, parameters,
        to 200"""
 
     url = f"{api_base_url}/search/{model}"
-    search_results = requests.get(url, params=parameters).json()
+    search_results = session.get(url, params=parameters).json()
 
     # Get all results as IIIF
     if all_results and parameters.get('as') == 'iiif':
         count_params = parameters
         count_params['as'] = 'opensearch'
-        total_pages = requests.get(url, count_params).json()[
+        total_pages = session.get(url, params=count_params).json()[
             'pagination']['total_pages']
         search_results = get_all_iiif(search_results, total_pages, page_limit)
     elif all_results:
@@ -132,7 +145,7 @@ def get_work_by_id(api_base_url, identifier, parameters):
     """returns a work as IIIF or json"""
 
     url = f"{api_base_url}/works/{identifier}"
-    return requests.get(url, params=parameters).json()
+    return session.get(url, params=parameters).json()
 
 
 def normalize_format(field):
@@ -216,4 +229,4 @@ def aggregate_by(search_url, query_string, agg, size):
         }
     }
 
-    return requests.post(search_url, json=query)
+    return session.post(search_url, json=query)
diff --git a/pyproject.toml b/pyproject.toml
index 19f49a8..82d80ce 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "nuldc"
-version = "0.9.1"
+version = "0.10.0"
 description = ""
 authors = ["davidschober "]
 license = "MIT"

From ef42a90951b5c182100764f735347bcbb8b384f6 Mon Sep 17 00:00:00 2001
From: davidschober
Date: Mon, 1 Apr 2024 12:18:10 -0500
Subject: [PATCH 2/2] lowered "size"

---
 nuldc/dump.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nuldc/dump.py b/nuldc/dump.py
index ba18083..97c6ffc 100644
--- a/nuldc/dump.py
+++ b/nuldc/dump.py
@@ -64,7 +64,7 @@ def dump_collection(col_id):
 
     params = {
         "query": f"collection.id:{col_id}",
-        "size": "100",
+        "size": "50",
         "sort": "id:asc"}
     try:
         data = helpers.get_search_results(API,