Skip to content

Commit

Permalink
refactor(backend): remove Zenrows proxying functions
Browse files Browse the repository at this point in the history
  • Loading branch information
davidpomerenke committed Nov 21, 2024
1 parent 7baf7b6 commit 5d1557d
Show file tree
Hide file tree
Showing 7 changed files with 57 additions and 85 deletions.
1 change: 0 additions & 1 deletion .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ ACLED_EMAIL=
ACLED_KEY=
# media data
MEDIACLOUD_API_TOKEN=
ZENROWS_API_KEY=
# google trends data
DATAFORSEO_EMAIL=
DATAFORSEO_PASSWORD=
Expand Down
1 change: 0 additions & 1 deletion .github/workflows/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ jobs:
MEDIACLOUD_API_TOKEN: ${{ secrets.MEDIACLOUD_API_TOKEN }}
ACLED_EMAIL: ${{ secrets.ACLED_EMAIL }}
ACLED_KEY: ${{ secrets.ACLED_KEY }}
ZENROWS_API_KEY: ${{ secrets.ZENROWS_API_KEY }}
AZURE_API_BASE: ${{ secrets.AZURE_API_BASE }}
AZURE_API_VERSION: ${{ secrets.AZURE_API_VERSION }}
AZURE_API_KEY: ${{ secrets.AZURE_API_KEY }}
Expand Down
43 changes: 0 additions & 43 deletions backend-python/media_impact_monitor/util/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,6 @@

from joblib import Memory
from requests import get as _get, post as _post, Response
from zenrows import ZenRowsClient

from media_impact_monitor.util.env import ZENROWS_API_KEY

memory = Memory("cache", verbose=0)
cache = memory.cache
Expand Down Expand Up @@ -52,43 +49,3 @@ def post(url, sleep=None, **kwargs) -> Response | None:
if sleep is not None:
_sleep(sleep)
return response


@cache
def get_proxied(url, **kwargs):
    """
    Fetch `url` directly, falling back to the ZenRows proxy if that fails.

    `timeout=10` is applied unless the caller passes its own `timeout`. The
    same keyword arguments are forwarded to the direct request and to the
    ZenRows client.

    Returns whatever the direct `get` call returns when it does not raise.
    Otherwise returns the ZenRows response, or None when ZenRows answers
    with an error payload that contains none of the codes listed below
    (treated as a problem with the target site itself).

    Raises Exception, carrying the response text, when the error payload
    contains one of the listed codes (treated as a ZenRows-side problem).
    """
    kwargs.setdefault("timeout", 10)
    try:
        # First attempt: request without the proxy.
        # NOTE(review): `get` is presumably this module's own wrapper around
        # requests.get -- it is not visible here; confirm.
        return get(url, **kwargs)
    except Exception:
        # Any failure of the direct attempt falls through to the proxy.
        pass

    proxy = ZenRowsClient(ZENROWS_API_KEY, retries=2, concurrency=10)
    response = proxy.get(url, **kwargs)
    body = response.text

    # Error payloads are recognised by this JSON prefix; anything else is
    # handed back to the caller unchanged.
    if not body.startswith('{"code":'):
        return response

    zenrows_side_codes = (
        "REQS001",
        "REQS004",
        "REQS006",
        "RESP004",
        "AUTH001",
        "AUTH002",
        "AUTH003",
        "AUTH004",
        "AUTH005",
        "AUTH009",
        "BLK0001",
        "AUTH007",
        "AUTH006",
        "AUTH008",
        "CTX0001",
        "ERR0001",
        "ERR0000",
        "RESP003",
    )
    for code in zenrows_side_codes:
        if code in body:
            # The fault lies with ZenRows -> surface it to the developer.
            raise Exception(body)

    # The fault lies with the requested site -> skip this site.
    return None
18 changes: 1 addition & 17 deletions backend-python/media_impact_monitor/util/cache_test.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import pytest

from media_impact_monitor.util.cache import get, get_proxied, post
from media_impact_monitor.util.cache import get, post

# URLs for the stable testing endpoints (preferably dedicated for testing purposes)
GET_URL = "https://httpbin.org/get"
Expand Down Expand Up @@ -30,17 +28,3 @@ def test_post_retrieval():
assert (
response.json().get("json") == POST_DATA
), "The response body should contain the JSON data we sent"


@pytest.mark.skip(
    reason="Our API key has expired, we will get a new one once we really need it."
)
def test_get_proxied():
    """
    Check that `get_proxied` retrieves content from the GET test endpoint.
    """
    proxied = get_proxied(GET_URL)
    assert proxied.status_code == 200
    # Only decode the body after the status check, as the failure message
    # refers to the decoded JSON.
    payload = proxied.json()
    assert (
        "args" in payload
    ), "The response should contain 'args' to confirm it's from httpbin.org"
2 changes: 0 additions & 2 deletions backend-python/media_impact_monitor/util/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
ACLED_EMAIL = environ["ACLED_EMAIL"]
ACLED_KEY = environ["ACLED_KEY"]
MEDIACLOUD_API_TOKEN = environ["MEDIACLOUD_API_TOKEN"]
ZENROWS_API_KEY = environ["ZENROWS_API_KEY"]
AZURE_API_BASE = environ["AZURE_API_BASE"]
AZURE_API_VERSION = environ["AZURE_API_VERSION"]
AZURE_API_KEY = environ["AZURE_API_KEY"]
Expand All @@ -24,7 +23,6 @@
assert ACLED_EMAIL
assert ACLED_KEY
assert MEDIACLOUD_API_TOKEN
assert ZENROWS_API_KEY
assert AZURE_API_BASE
assert AZURE_API_VERSION
assert AZURE_API_KEY
Expand Down
76 changes: 56 additions & 20 deletions backend-python/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion backend-python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ matplotlib = "^3.8.2"
mediacloud = "^4.1.3"
backoff = "^2.2.1"
openai = "^1.12.0"
zenrows = "^1.3.2"
html2text = "^2020.1.16"
openpyxl = "^3.1.2"
fastapi = "^0.110.0"
Expand Down

0 comments on commit 5d1557d

Please sign in to comment.