From 6511dd440cf17d37eac7fe2103097102dc95f9ae Mon Sep 17 00:00:00 2001 From: shri Date: Mon, 4 Nov 2024 18:57:33 +0100 Subject: [PATCH] Add scraper timeout as API input parameter --- src/app/domain/jobs/controllers/job_posts.py | 2 +- src/app/domain/jobs/schemas.py | 1 + src/app/lib/scraperapi.py | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/app/domain/jobs/controllers/job_posts.py b/src/app/domain/jobs/controllers/job_posts.py index 00810142..1616f7ad 100644 --- a/src/app/domain/jobs/controllers/job_posts.py +++ b/src/app/domain/jobs/controllers/job_posts.py @@ -122,7 +122,7 @@ async def create_job_post_from_url( if job_link_domain.endswith("workable.com") or job_link_domain.endswith("linkedin.com"): render = True - html_content = await extract_url_content(data.url, render=render) + html_content = await extract_url_content(data.url, render=render, timeout=data.timeout) job_details = await extract_job_details_from_html(html_content) company_url = job_details.get("company", {}).get("url") diff --git a/src/app/domain/jobs/schemas.py b/src/app/domain/jobs/schemas.py index a74f24fc..4f83bb24 100644 --- a/src/app/domain/jobs/schemas.py +++ b/src/app/domain/jobs/schemas.py @@ -51,6 +51,7 @@ class JobPostCreateFromURL(CamelizedBaseStruct): """A job post create from URL schema.""" url: str + timeout: float = 30.0 class JobPostUpdate(CamelizedBaseStruct, omit_defaults=True): diff --git a/src/app/lib/scraperapi.py b/src/app/lib/scraperapi.py index 4457aa00..39360c5f 100644 --- a/src/app/lib/scraperapi.py +++ b/src/app/lib/scraperapi.py @@ -5,7 +5,7 @@ scraper_api_key = os.environ["SCRAPERAPI_API_KEY"] -async def extract_url_content(url: str, render: bool = False) -> str: +async def extract_url_content(url: str, render: bool = False, timeout=30.0) -> str: """Extracts URL content using a 3rd party service""" params = { "api_key": scraper_api_key, @@ -17,6 +17,6 @@ async def extract_url_content(url: str, render: bool = False) -> str: } async with httpx.AsyncClient() as client: - response = await client.get("https://api.scraperapi.com", params=params, timeout=30.0) + response = await client.get("https://api.scraperapi.com", params=params, timeout=timeout) data = response.text return data