From 5d1557da98fb544e7e2656996e3e9a9d98b897d4 Mon Sep 17 00:00:00 2001 From: David Pomerenke <46022183+davidpomerenke@users.noreply.github.com> Date: Thu, 21 Nov 2024 22:40:24 +0100 Subject: [PATCH] refactor(backend): remove Zenrows proxying functions --- .env.example | 1 - .github/workflows/deploy.yml | 1 - .../media_impact_monitor/util/cache.py | 43 ----------- .../media_impact_monitor/util/cache_test.py | 18 +---- .../media_impact_monitor/util/env.py | 2 - backend-python/poetry.lock | 76 ++++++++++++++----- backend-python/pyproject.toml | 1 - 7 files changed, 57 insertions(+), 85 deletions(-) diff --git a/.env.example b/.env.example index 367ede44..ea512764 100644 --- a/.env.example +++ b/.env.example @@ -3,7 +3,6 @@ ACLED_EMAIL= ACLED_KEY= # media data MEDIACLOUD_API_TOKEN= -ZENROWS_API_KEY= # google trends data DATAFORSEO_EMAIL= DATAFORSEO_PASSWORD= diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 971a2174..9efb7f11 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -58,7 +58,6 @@ jobs: MEDIACLOUD_API_TOKEN: ${{ secrets.MEDIACLOUD_API_TOKEN }} ACLED_EMAIL: ${{ secrets.ACLED_EMAIL }} ACLED_KEY: ${{ secrets.ACLED_KEY }} - ZENROWS_API_KEY: ${{ secrets.ZENROWS_API_KEY }} AZURE_API_BASE: ${{ secrets.AZURE_API_BASE }} AZURE_API_VERSION: ${{ secrets.AZURE_API_VERSION }} AZURE_API_KEY: ${{ secrets.AZURE_API_KEY }} diff --git a/backend-python/media_impact_monitor/util/cache.py b/backend-python/media_impact_monitor/util/cache.py index f711a130..76322d2f 100644 --- a/backend-python/media_impact_monitor/util/cache.py +++ b/backend-python/media_impact_monitor/util/cache.py @@ -4,9 +4,6 @@ from joblib import Memory from requests import get as _get, post as _post, Response -from zenrows import ZenRowsClient - -from media_impact_monitor.util.env import ZENROWS_API_KEY memory = Memory("cache", verbose=0) cache = memory.cache @@ -52,43 +49,3 @@ def post(url, sleep=None, **kwargs) -> Response | None: if sleep is not None: _sleep(sleep) return response - - -@cache -def get_proxied(url, **kwargs): - if "timeout" not in kwargs: - kwargs["timeout"] = 10 - try: - response = get(url, **kwargs) - return response - except Exception: - pass - client = ZenRowsClient(ZENROWS_API_KEY, retries=2, concurrency=10) - response = client.get(url, **kwargs) - if response.text.startswith('{"code":'): - zenrows_errors = [ - "REQS001", - "REQS004", - "REQS006", - "RESP004", - "AUTH001", - "AUTH002", - "AUTH003", - "AUTH004", - "AUTH005", - "AUTH009", - "BLK0001", - "AUTH007", - "AUTH006", - "AUTH008", - "CTX0001", - "ERR0001", - "ERR0000", - "RESP003", - ] - if any(error in response.text for error in zenrows_errors): - # problem with zenrows -> inform the developer - raise Exception(response.text) - # otherwise, problem with the site itself -> just don't use this site - return None - return response diff --git a/backend-python/media_impact_monitor/util/cache_test.py b/backend-python/media_impact_monitor/util/cache_test.py index d1dd391b..fd3708af 100644 --- a/backend-python/media_impact_monitor/util/cache_test.py +++ b/backend-python/media_impact_monitor/util/cache_test.py @@ -1,6 +1,4 @@ -import pytest - -from media_impact_monitor.util.cache import get, get_proxied, post +from media_impact_monitor.util.cache import get, post # URLs for the stable testing endpoints (preferably dedicated for testing purposes) GET_URL = "https://httpbin.org/get" @@ -30,17 +28,3 @@ def test_post_retrieval(): assert ( response.json().get("json") == POST_DATA ), "The response body should contain the JSON data we sent" - - -@pytest.mark.skip( - reason="Our API key has expired, we will get a new one once we really need it." -) -def test_get_proxied(): - """ - Test if the `get_proxied` function can successfully retrieve content. - """ - response = get_proxied(GET_URL) - assert response.status_code == 200 - assert ( - "args" in response.json() - ), "The response should contain 'args' to confirm it's from httpbin.org" diff --git a/backend-python/media_impact_monitor/util/env.py b/backend-python/media_impact_monitor/util/env.py index 588c2163..11171d2b 100644 --- a/backend-python/media_impact_monitor/util/env.py +++ b/backend-python/media_impact_monitor/util/env.py @@ -10,7 +10,6 @@ ACLED_EMAIL = environ["ACLED_EMAIL"] ACLED_KEY = environ["ACLED_KEY"] MEDIACLOUD_API_TOKEN = environ["MEDIACLOUD_API_TOKEN"] -ZENROWS_API_KEY = environ["ZENROWS_API_KEY"] AZURE_API_BASE = environ["AZURE_API_BASE"] AZURE_API_VERSION = environ["AZURE_API_VERSION"] AZURE_API_KEY = environ["AZURE_API_KEY"] @@ -24,7 +23,6 @@ assert ACLED_EMAIL assert ACLED_KEY assert MEDIACLOUD_API_TOKEN -assert ZENROWS_API_KEY assert AZURE_API_BASE assert AZURE_API_VERSION assert AZURE_API_KEY diff --git a/backend-python/poetry.lock b/backend-python/poetry.lock index 9dd1f7b9..e46fdf19 100644 --- a/backend-python/poetry.lock +++ b/backend-python/poetry.lock @@ -1595,6 +1595,47 @@ all = ["jieba", "nltk"] arabic = ["nltk"] chinese = ["jieba"] +[[package]] +name = "gql" +version = "3.5.0" +description = "GraphQL client for Python" +optional = false +python-versions = "*" +files = [ + {file = "gql-3.5.0-py2.py3-none-any.whl", hash = "sha256:70dda5694a5b194a8441f077aa5fb70cc94e4ec08016117523f013680901ecb7"}, + {file = "gql-3.5.0.tar.gz", hash = "sha256:ccb9c5db543682b28f577069950488218ed65d4ac70bb03b6929aaadaf636de9"}, +] + +[package.dependencies] +anyio = ">=3.0,<5" +backoff = ">=1.11.1,<3.0" +graphql-core = ">=3.2,<3.3" +requests = {version = ">=2.26,<3", optional = true, markers = "extra == \"requests\""} +requests-toolbelt = {version = ">=1.0.0,<2", optional = true, markers = "extra == \"requests\""} +yarl = ">=1.6,<2.0" + +[package.extras] +aiohttp = ["aiohttp (>=3.8.0,<4)", "aiohttp (>=3.9.0b0,<4)"] +all = ["aiohttp (>=3.8.0,<4)", "aiohttp (>=3.9.0b0,<4)", "botocore (>=1.21,<2)", "httpx (>=0.23.1,<1)", "requests (>=2.26,<3)", "requests-toolbelt (>=1.0.0,<2)", "websockets (>=10,<12)"] +botocore = ["botocore (>=1.21,<2)"] +dev = ["aiofiles", "aiohttp (>=3.8.0,<4)", "aiohttp (>=3.9.0b0,<4)", "black (==22.3.0)", "botocore (>=1.21,<2)", "check-manifest (>=0.42,<1)", "flake8 (==3.8.1)", "httpx (>=0.23.1,<1)", "isort (==4.3.21)", "mock (==4.0.2)", "mypy (==0.910)", "parse (==1.15.0)", "pytest (==7.4.2)", "pytest-asyncio (==0.21.1)", "pytest-console-scripts (==1.3.1)", "pytest-cov (==3.0.0)", "requests (>=2.26,<3)", "requests-toolbelt (>=1.0.0,<2)", "sphinx (>=5.3.0,<6)", "sphinx-argparse (==0.2.5)", "sphinx-rtd-theme (>=0.4,<1)", "types-aiofiles", "types-mock", "types-requests", "vcrpy (==4.4.0)", "websockets (>=10,<12)"] +httpx = ["httpx (>=0.23.1,<1)"] +requests = ["requests (>=2.26,<3)", "requests-toolbelt (>=1.0.0,<2)"] +test = ["aiofiles", "aiohttp (>=3.8.0,<4)", "aiohttp (>=3.9.0b0,<4)", "botocore (>=1.21,<2)", "httpx (>=0.23.1,<1)", "mock (==4.0.2)", "parse (==1.15.0)", "pytest (==7.4.2)", "pytest-asyncio (==0.21.1)", "pytest-console-scripts (==1.3.1)", "pytest-cov (==3.0.0)", "requests (>=2.26,<3)", "requests-toolbelt (>=1.0.0,<2)", "vcrpy (==4.4.0)", "websockets (>=10,<12)"] +test-no-transport = ["aiofiles", "mock (==4.0.2)", "parse (==1.15.0)", "pytest (==7.4.2)", "pytest-asyncio (==0.21.1)", "pytest-console-scripts (==1.3.1)", "pytest-cov (==3.0.0)", "vcrpy (==4.4.0)"] +websockets = ["websockets (>=10,<12)"] + +[[package]] +name = "graphql-core" +version = "3.2.5" +description = "GraphQL implementation for Python, a port of GraphQL.js, the JavaScript reference implementation for GraphQL." +optional = false +python-versions = "<4,>=3.6" +files = [ + {file = "graphql_core-3.2.5-py3-none-any.whl", hash = "sha256:2f150d5096448aa4f8ab26268567bbfeef823769893b39c1a2e1409590939c8a"}, + {file = "graphql_core-3.2.5.tar.gz", hash = "sha256:e671b90ed653c808715645e3998b7ab67d382d55467b7e2978549111bbabf8d5"}, +] + [[package]] name = "graphviz" version = "0.20.3" @@ -2795,29 +2836,24 @@ files = [ {file = "matplotlib-3.9.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd2a59ff4b83d33bca3b5ec58203cc65985367812cb8c257f3e101632be86d92"}, {file = "matplotlib-3.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0fc001516ffcf1a221beb51198b194d9230199d6842c540108e4ce109ac05cc0"}, {file = "matplotlib-3.9.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:83c6a792f1465d174c86d06f3ae85a8fe36e6f5964633ae8106312ec0921fdf5"}, - {file = "matplotlib-3.9.1-cp310-cp310-win_amd64.whl", hash = "sha256:421851f4f57350bcf0811edd754a708d2275533e84f52f6760b740766c6747a7"}, {file = "matplotlib-3.9.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:b3fce58971b465e01b5c538f9d44915640c20ec5ff31346e963c9e1cd66fa812"}, {file = "matplotlib-3.9.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a973c53ad0668c53e0ed76b27d2eeeae8799836fd0d0caaa4ecc66bf4e6676c0"}, {file = "matplotlib-3.9.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:82cd5acf8f3ef43f7532c2f230249720f5dc5dd40ecafaf1c60ac8200d46d7eb"}, {file = "matplotlib-3.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ab38a4f3772523179b2f772103d8030215b318fef6360cb40558f585bf3d017f"}, {file = "matplotlib-3.9.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:2315837485ca6188a4b632c5199900e28d33b481eb083663f6a44cfc8987ded3"}, - {file = "matplotlib-3.9.1-cp311-cp311-win_amd64.whl", hash = "sha256:a0c977c5c382f6696caf0bd277ef4f936da7e2aa202ff66cad5f0ac1428ee15b"}, {file = "matplotlib-3.9.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:565d572efea2b94f264dd86ef27919515aa6d629252a169b42ce5f570db7f37b"}, {file = "matplotlib-3.9.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6d397fd8ccc64af2ec0af1f0efc3bacd745ebfb9d507f3f552e8adb689ed730a"}, {file = "matplotlib-3.9.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:26040c8f5121cd1ad712abffcd4b5222a8aec3a0fe40bc8542c94331deb8780d"}, {file = "matplotlib-3.9.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d12cb1837cffaac087ad6b44399d5e22b78c729de3cdae4629e252067b705e2b"}, {file = "matplotlib-3.9.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:0e835c6988edc3d2d08794f73c323cc62483e13df0194719ecb0723b564e0b5c"}, - {file = "matplotlib-3.9.1-cp312-cp312-win_amd64.whl", hash = "sha256:44a21d922f78ce40435cb35b43dd7d573cf2a30138d5c4b709d19f00e3907fd7"}, {file = "matplotlib-3.9.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:0c584210c755ae921283d21d01f03a49ef46d1afa184134dd0f95b0202ee6f03"}, {file = "matplotlib-3.9.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:11fed08f34fa682c2b792942f8902e7aefeed400da71f9e5816bea40a7ce28fe"}, {file = "matplotlib-3.9.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0000354e32efcfd86bda75729716b92f5c2edd5b947200be9881f0a671565c33"}, {file = "matplotlib-3.9.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4db17fea0ae3aceb8e9ac69c7e3051bae0b3d083bfec932240f9bf5d0197a049"}, {file = "matplotlib-3.9.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:208cbce658b72bf6a8e675058fbbf59f67814057ae78165d8a2f87c45b48d0ff"}, - {file = "matplotlib-3.9.1-cp39-cp39-win_amd64.whl", hash = "sha256:dc23f48ab630474264276be156d0d7710ac6c5a09648ccdf49fef9200d8cbe80"}, {file = "matplotlib-3.9.1-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:3fda72d4d472e2ccd1be0e9ccb6bf0d2eaf635e7f8f51d737ed7e465ac020cb3"}, {file = "matplotlib-3.9.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:84b3ba8429935a444f1fdc80ed930babbe06725bcf09fbeb5c8757a2cd74af04"}, {file = "matplotlib-3.9.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b918770bf3e07845408716e5bbda17eadfc3fcbd9307dc67f37d6cf834bb3d98"}, - {file = "matplotlib-3.9.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:f1f2e5d29e9435c97ad4c36fb6668e89aee13d48c75893e25cef064675038ac9"}, {file = "matplotlib-3.9.1.tar.gz", hash = "sha256:de06b19b8db95dd33d0dc17c926c7c9ebed9f572074b6fac4f65068a6814d010"}, ] @@ -4623,6 +4659,20 @@ files = [ [package.dependencies] requests = ">=1.0.0" +[[package]] +name = "requests-toolbelt" +version = "1.0.0" +description = "A utility belt for advanced users of python-requests" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"}, + {file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"}, +] + +[package.dependencies] +requests = ">=2.0.1,<3.0.0" + [[package]] name = "rfc3339-validator" version = "0.1.4" @@ -6029,20 +6079,6 @@ files = [ idna = ">=2.0" multidict = ">=4.0" -[[package]] -name = "zenrows" -version = "1.3.2" -description = "Python client for ZenRows API" -optional = false -python-versions = ">=3.6" -files = [ - {file = "zenrows-1.3.2-py3-none-any.whl", hash = "sha256:41f3c7403872bf69f6d46926496a018b8c0dd34fd811d250a269f10917598d88"}, - {file = "zenrows-1.3.2.tar.gz", hash = "sha256:1465bba1c53b42a0ca7dd05239a1f27d4cca5379a5543cf8a483ec89906e5f8c"}, -] - -[package.dependencies] -requests = "*" - [[package]] name = "zipp" version = "3.19.2" @@ -6061,4 +6097,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "c810ccfd91b6af30ef56af38a2a5f74f9740f808835c17452511cf488089f5be" +content-hash = "1a5e7188e8c22601c5a45c85897bde3e483643cdb05e2c4f83e136cb077ce5bd" diff --git a/backend-python/pyproject.toml b/backend-python/pyproject.toml index ec79d33a..5a6f1099 100644 --- a/backend-python/pyproject.toml +++ b/backend-python/pyproject.toml @@ -20,7 +20,6 @@ matplotlib = "^3.8.2" mediacloud = "^4.1.3" backoff = "^2.2.1" openai = "^1.12.0" -zenrows = "^1.3.2" html2text = "^2020.1.16" openpyxl = "^3.1.2" fastapi = "^0.110.0"