refactor(google_trends.py): use dataforseo backend rather than pytrends
pytrends is not very stable, not even with ZenRows proxying; DataForSEO is better
davidpomerenke committed May 16, 2024
1 parent bba1b5c commit aa3ecf7
Showing 4 changed files with 35 additions and 213 deletions.
backend-python/media_impact_monitor/google_trends.py
@@ -7,27 +7,42 @@
 For the last 90 days, data is also available with daily resolution; otherwise only weekly.
 """
 
+import base64
 from datetime import date
-from time import sleep
 
 import pandas as pd
-from pytrends.request import TrendReq
 
-from media_impact_monitor.util.cache import cache
+from media_impact_monitor.util.cache import cache, post
+from media_impact_monitor.util.env import DATAFORSEO_EMAIL, DATAFORSEO_PASSWORD  # assumed env names
 
+end_date = date.today()
+
 
 @cache
-def get_google_trends_counts(query: str, end_date: date = date.today()) -> pd.Series:
-    PyTrends = TrendReq(hl="de-DE", tz=60)
-    PyTrends.build_payload([query], timeframe="today 5-y", geo="DE")
-    df = PyTrends.interest_over_time()
-    df = (
-        df[~df["isPartial"]]
-        .drop(columns=["isPartial"])
-        .rename(columns={query: "count"})
-    )
-    df.index = pd.to_datetime(df.index).date
-    df.index.name = "date"
-    # when the rate limit is reached, this should be 60 seconds according to https://github.com/GeneralMills/pytrends
-    sleep(1)
-    return df["count"]
+def get_google_trends_counts(query: str, end_date: date = end_date) -> pd.Series:
+    url = "https://api.dataforseo.com/v3/keywords_data/google_trends/explore/live"
+    location_codes = {"Germany": 2276}
+    payload = [
+        {
+            "time_range": "past_5_years",
+            "type": "web",
+            "keywords": [query],
+            "location_code": location_codes["Germany"],
+            "language_code": "de",
+        }
+    ]
+    credentials = f"{DATAFORSEO_EMAIL}:{DATAFORSEO_PASSWORD}"  # loaded from env, not hardcoded
+    credentials_encoded = base64.b64encode(credentials.encode()).decode()
+    headers = {
+        "Authorization": f"Basic {credentials_encoded}",
+        "Content-Type": "application/json",
+    }
+    response = post(url, headers=headers, json=payload)
+    data = response.json()["tasks"][0]["result"][0]["items"][0]["data"]
+    df = pd.DataFrame(data)
+    df["value"] = df["values"].str[0]
+    # df = df[~df["missing_data"]]
+    df = df.rename(columns={"date_from": "date", "value": "count"})
+    df["date"] = pd.to_datetime(df["date"]).dt.date
+    df = df.set_index("date")["count"]
+    return df
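
The chained indexing in the new code ("tasks" → "result" → "items" → "data") implies a response envelope roughly like the sketch below. This is inferred from the parsing code, not taken from official DataForSEO docs, and the values are made up:

# Assumed shape of the explore/live response (illustrative only):
response_json = {
    "tasks": [
        {
            "result": [
                {
                    "items": [
                        {
                            "data": [
                                {
                                    "date_from": "2019-05-19",
                                    "date_to": "2019-05-25",
                                    "missing_data": False,
                                    # one value per requested keyword; only one keyword
                                    # is sent per request, hence df["values"].str[0]
                                    "values": [42],
                                },
                                # ...one element per week over the past five years
                            ]
                        }
                    ]
                }
            ]
        }
    ]
}

The function then returns a date-indexed pandas Series of weekly counts, e.g. for get_google_trends_counts("Klimawandel").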
9 changes: 2 additions & 7 deletions backend-python/media_impact_monitor/util/cache.py
@@ -1,16 +1,13 @@
"""Cache functions."""

from os import environ
from time import sleep as _sleep

from dotenv import load_dotenv
from joblib import Memory
from requests import get as _get
from requests import post as _post
from zenrows import ZenRowsClient

load_dotenv()

from media_impact_monitor.util.env import ZENROWS_API_KEY

memory = Memory("cache", verbose=0)
cache = memory.cache
@@ -60,9 +57,7 @@ def post(url, sleep=None, **kwargs):

 @cache
 def get_proxied(url, *args, **kwargs):
-    client = ZenRowsClient(
-        environ["ZENROWS_API_KEY"], retries=2, concurrency=concurrency
-    )
+    client = ZenRowsClient(ZENROWS_API_KEY, retries=2, concurrency=concurrency)
     response = client.get(url, *args, **kwargs)
     if '{"code":' in response.text:
         raise ValueError(response.text)
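
The new ZENROWS_API_KEY import suggests that dotenv handling now lives in a central media_impact_monitor/util/env.py module. That module is not part of this diff; a minimal sketch of what it presumably contains (the DataForSEO names are assumptions carried over from google_trends.py above):

# media_impact_monitor/util/env.py -- assumed contents, not shown in this commit
from os import environ

from dotenv import load_dotenv

load_dotenv()  # read .env once, at import time

ZENROWS_API_KEY = environ["ZENROWS_API_KEY"]
DATAFORSEO_EMAIL = environ["DATAFORSEO_EMAIL"]  # assumed name
DATAFORSEO_PASSWORD = environ["DATAFORSEO_PASSWORD"]  # assumed name

Centralizing this avoids calling load_dotenv() in every module that reads a secret.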