
Issue 88/weekly aggregation #98

Merged: 25 commits, May 20, 2024

Commits:
4e1689e
feat(interrupted_time_series.py): add option for weekly aggregated data
davidpomerenke May 9, 2024
7c9e0a2
fix(google_trends.py): make it use weekly data (rather than monthly)
davidpomerenke May 9, 2024
a30e92e
chore(notebooks): run notebook with weekly aggregation
davidpomerenke May 9, 2024
8362214
chore(frontend-alpha): add deployment config
davidpomerenke May 17, 2024
d7b8220
fix(mediacloud_.py): set to germany by default
davidpomerenke May 17, 2024
316833b
test: add debugging config for fastapi
davidpomerenke May 17, 2024
18f57e5
docs: note on deployment env var for railway.app
davidpomerenke May 17, 2024
3e65d9f
fix(frontend-alpha): fix api call
davidpomerenke May 17, 2024
0780f1e
refactor(impact.py): use first trend for impact estimation
davidpomerenke May 17, 2024
c0a66f7
chore: gitignore
davidpomerenke May 17, 2024
c88a6a0
Merge branch 'dev' into issue-88/weekly-aggregation
davidpomerenke May 17, 2024
d607df8
feat(frontend-alpha): display impact time series for single trend
davidpomerenke May 17, 2024
2095c56
style: text formatting
davidpomerenke May 17, 2024
2892a0f
feat(frontend-alpha): display impact chart timelines for all topics
davidpomerenke May 17, 2024
d5dca12
chore(trend): enable genios, extend mediacloud time range
davidpomerenke May 18, 2024
05c4051
feat(impact): both daily and weekly aggregation technically work
davidpomerenke May 18, 2024
0b925a1
refactor(frontend-alpha): make separate page for impacts
davidpomerenke May 19, 2024
74bea73
feat(backend-python): add LLM integration and basic sentiment scoring
davidpomerenke May 19, 2024
a1b611f
feat(backend-python): add mediacloud fulltext retrieval
davidpomerenke May 19, 2024
13fe90f
docs: explain azure setup
davidpomerenke May 19, 2024
f275289
refactor(trend.py): split up into keyword and sentiment trends
davidpomerenke May 19, 2024
48aa98e
feat(backend-python): basic sentiment trend endpoint
davidpomerenke May 19, 2024
d135369
ci: add new env vars to gh action
davidpomerenke May 19, 2024
4bbcacb
chore: add dataforseo env vars
davidpomerenke May 19, 2024
3fa670a
test: adjust tests
davidpomerenke May 19, 2024
Files changed:
7 changes: 6 additions & 1 deletion .env.example
@@ -2,4 +2,9 @@
ACLED_EMAIL=
ACLED_KEY=
MEDIACLOUD_API_TOKEN=
ZENROWS_API_KEY=
-OPENAI_API_KEY=
+AZURE_API_BASE=
+AZURE_API_VERSION=
+AZURE_API_KEY=
+DATAFORSEO_EMAIL=
+DATAFORSEO_PASSWORD=
+PORT=
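These variables are consumed via `media_impact_monitor.util.env`, which the diffs below import from. A minimal sketch of how such a module might expose them, assuming python-dotenv; the actual `util/env.py` is not part of this diff:

```python
# Hypothetical sketch of media_impact_monitor/util/env.py (not shown in this PR).
import os

from dotenv import load_dotenv

load_dotenv()  # populate os.environ from .env during local development

AZURE_API_BASE = os.environ.get("AZURE_API_BASE", "")
AZURE_API_VERSION = os.environ.get("AZURE_API_VERSION", "")
AZURE_API_KEY = os.environ.get("AZURE_API_KEY", "")
DATAFORSEO_EMAIL = os.environ.get("DATAFORSEO_EMAIL", "")
DATAFORSEO_PASSWORD = os.environ.get("DATAFORSEO_PASSWORD", "")
```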
5 changes: 5 additions & 0 deletions .github/workflows/deploy.yml
@@ -59,3 +59,8 @@ jobs:
ACLED_EMAIL: ${{ secrets.ACLED_EMAIL }}
ACLED_KEY: ${{ secrets.ACLED_KEY }}
ZENROWS_API_KEY: ${{ secrets.ZENROWS_API_KEY }}
AZURE_API_BASE: ${{ secrets.AZURE_API_BASE }}
AZURE_API_VERSION: ${{ secrets.AZURE_API_VERSION }}
AZURE_API_KEY: ${{ secrets.AZURE_API_KEY }}
DATAFORSEO_EMAIL: ${{ secrets.DATAFORSEO_EMAIL }}
DATAFORSEO_PASSWORD: ${{ secrets.DATAFORSEO_PASSWORD }}
20 changes: 20 additions & 0 deletions .vscode/launch.json
@@ -0,0 +1,20 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: FastAPI",
"type": "debugpy",
"request": "launch",
"module": "uvicorn",
"args": [
"media_impact_monitor.api:app",
"--host=0.0.0.0",
"--port=8000",
],
"jinja": true
}
]
}
23 changes: 19 additions & 4 deletions backend-python/README.md
@@ -10,11 +10,26 @@
- `poetry run uvicorn media_impact_monitor.api:app --host 0.0.0.0 --port 8000` to serve the API
- `poetry run py-spy record -o profile.svg -- python xyz.py` to do profiling

-## Setting up a server to serve the Docker container
+## Deployment

-- Set up a Ubuntu 22.04 server.
-- Create a `.env` file based on `.env.example` and put it on the server.
-- Run `setup_server.sh` on the server.
+Continuous deployment is currently set up with railway.app.
+
+Important: configure a `PORT` environment variable there; otherwise the app will not be accessible and will keep `Shutting down` for no apparent reason.
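For illustration, a minimal sketch of how the `PORT` variable would be consumed, assuming the container starts uvicorn programmatically (the actual entrypoint may differ):

```python
# Hypothetical entrypoint sketch; railway.app injects PORT at runtime.
import os

import uvicorn

if __name__ == "__main__":
    port = int(os.environ.get("PORT", 8000))
    uvicorn.run("media_impact_monitor.api:app", host="0.0.0.0", port=port)
```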

## 3rd party services

These are configured via a `.env` file, see [`.env.example`](../.env.example).

### Azure OpenAI

We have defined the following deployments on Azure OpenAI:

- `gpt-35-turbo`: `gpt-3.5-turbo-1106` (16k tokens context)
- `gpt-4`: `gpt-4-turbo-2024-04-09` (128k tokens context)

Azure OpenAI applies content filters (also to input texts) that cannot be switched off, but their thresholds can be set to "high".

Similar models are also available via OpenAI directly.
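A minimal call sketch against these deployments, assuming the standard `openai` Azure client and the env var names from `.env.example` (the project may wire this differently, e.g. via litellm):

```python
# Sketch only: query the gpt-35-turbo deployment listed above.
import os

from openai import AzureOpenAI

client = AzureOpenAI(
    azure_endpoint=os.environ["AZURE_API_BASE"],
    api_version=os.environ["AZURE_API_VERSION"],
    api_key=os.environ["AZURE_API_KEY"],
)
completion = client.chat.completions.create(
    model="gpt-35-turbo",  # the Azure *deployment* name, not the OpenAI model id
    messages=[{"role": "user", "content": "Score the sentiment of: 'Die Demo war ein voller Erfolg.'"}],
)
print(completion.choices[0].message.content)
```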

## Conventions

2 changes: 1 addition & 1 deletion backend-python/media_impact_monitor/api.py
@@ -107,7 +107,7 @@ def _get_fulltexts(q: FulltextSearch) -> Response[FulltextSearch, list[Event]]:


@app.post("/impact")
-def _get_impact(q: ImpactSearch) -> Response[ImpactSearch, Impact]:
+def _get_impact(q: ImpactSearch):  # -> Response[ImpactSearch, Impact]:
"""Compute the impact of an event on a media trend."""
impact = get_impact(q)
return Response(query=q, data=impact)
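For reference, a hypothetical client call against this endpoint; the payload fields (`cause`, `effect`, `method`) appear in the handler and in `impact.py` further down, but their exact shapes live in `types_.py` and are assumed here:

```python
# Hypothetical request sketch; field values are illustrative, not authoritative.
import requests

query = {
    "cause": ["acled-event-id"],          # q.cause: ids of protest events
    "effect": {"aggregation": "weekly"},  # q.effect: a TrendSearch-like object
    "method": "interrupted_time_series",  # q.method
}
response = requests.post("http://localhost:8000/impact", json=query)
print(response.json())
```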
mediacloud_.py

@@ -1,45 +1,33 @@
import os
from datetime import date
from typing import Literal

import mediacloud.api
import pandas as pd
from dotenv import load_dotenv
from mcmetadata import extract

-from media_impact_monitor.util.cache import cache
+from media_impact_monitor.util.cache import cache, get_proxied
from media_impact_monitor.util.env import MEDIACLOUD_API_TOKEN

load_dotenv()

+from media_impact_monitor.util.parallel import parallel_tqdm

search = mediacloud.api.SearchApi(MEDIACLOUD_API_TOKEN)
directory = mediacloud.api.DirectoryApi(MEDIACLOUD_API_TOKEN)


Platform = Literal["onlinenews-mediacloud", "onlinenews-waybackmachine"]

+end_date = date.today()


@cache
def get_mediacloud_counts(
query: str,
-start_date: date = date(2023, 1, 1),
-end_date: date = date.today(),
+start_date: date = date(2022, 1, 1),
+end_date: date = end_date,
countries: list | None = None,
platform: Platform = "onlinenews-mediacloud",
) -> pd.Series:
-assert start_date.year >= 2023, "MediaCloud currently only goes back to 2023"
-collection_ids: list[int] = []
-if countries:
-collection_ids = []
-for country in countries:
-# get national newspapers (regional newspapers are also available)
-results = directory.collection_list(name=f"{country} - national")["results"]
-# ignore research collections
-results = [r for r in results if "(Research Only)" not in r["name"]]
-assert (
-len(results) == 1
-), f"Expected 1 result, got {len(results)} for {country}"
-collection_ids.append(results[0]["id"])
+assert start_date.year >= 2022, "MediaCloud currently only goes back to 2022"
+collection_ids = resolve_countries(countries)
data = search.story_count_over_time(
query=query,
start_date=start_date,
@@ -52,3 +40,56 @@ def get_mediacloud_counts(
df["date"] = pd.to_datetime(df["date"]).dt.date
df = df.set_index("date")
return df["count"]


def get_mediacloud_fulltexts(
query: str,
start_date: date = date(2022, 1, 1),
end_date: date = end_date,
countries: list | None = None,
platform: Platform = "onlinenews-mediacloud",
) -> pd.DataFrame:
assert start_date.year >= 2022, "MediaCloud currently only goes back to 2022"
collection_ids = resolve_countries(countries)
data = search.story_list(
query=query,
start_date=start_date,
end_date=end_date,
collection_ids=collection_ids,
platform=platform,
)
pagination_token = data[1]
if pagination_token:
raise NotImplementedError("Pagination not implemented")
df = pd.DataFrame(data[0])
df["publish_date"] = pd.to_datetime(df["publish_date"])
df["text"] = parallel_tqdm(
retrieve_text, df["url"], n_jobs=4, desc="Retrieving fulltexts"
)
df = df.dropna(subset=["text"])
return df


def retrieve_text(url: str) -> str | None:
try:
html = get_proxied(url, timeout=15).text
except ValueError as e:
if "RESP002" in str(e): # zenrows error code for http 404
return None
raise
data = extract(url=url, html_text=html)
# this also contains additional metadata (title, language, extraction method, ...) that could be used
return data["text_content"]


def resolve_countries(countries: list | None) -> list | None:
collection_ids: list[int] = []
for country in countries or []:
# get national newspapers (regional newspapers are also available)
results = directory.collection_list(name=f"{country} - national")["results"]
# ignore research collections
results = [r for r in results if "(Research Only)" not in r["name"]]
assert len(results) == 1, f"Expected 1 result, got {len(results)} for {country}"
collection_ids.append(results[0]["id"])
return collection_ids
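`get_mediacloud_fulltexts` raises as soon as a second result page exists. A possible pagination loop, assuming the MediaCloud client's `story_list` accepts a `pagination_token` argument (a sketch, not the merged implementation):

```python
# Sketch: accumulate all result pages instead of raising NotImplementedError.
def story_list_all_pages(query, start_date, end_date, collection_ids, platform):
    stories, token = [], None
    while True:
        page, token = search.story_list(
            query=query,
            start_date=start_date,
            end_date=end_date,
            collection_ids=collection_ids,
            platform=platform,
            pagination_token=token,
        )
        stories.extend(page)
        if not token:  # no further pages
            return stories
```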
backend-python/media_impact_monitor/data_loaders/news_print/genios.py

@@ -4,12 +4,14 @@

from media_impact_monitor.util.cache import cache, get

+end_date = date.today()


@cache
def get_genios_counts(
query: str,
start_date: date = date(2010, 1, 1),
-end_date: date = date.today(),
+end_date: date = end_date,
) -> pd.Series:
response = get(
"https://www.genios.de/api/searchResult/Alle/Presse",
@@ -32,5 +34,5 @@ def get_genios_counts(
df["date"] = pd.to_datetime(df["date"]).dt.date
df = df.set_index("date")
# there is a bug that sets the count at day -1 to 0
-df = df[df.index >= pd.Timestamp(start_date)]
+df = df[df.index >= start_date]
return df["count"]
Tests for genios.py:

@@ -1,35 +1,39 @@
from datetime import date

import pandas as pd
import pytest

from media_impact_monitor.data_loaders.news_print.genios import get_genios_counts


-@pytest.mark.skip(reason="The Genios API is currently down.")
+# @pytest.mark.skip(reason="The Genios API is currently down.")
def test_get_counts_genios():
df = get_genios_counts(
"Fridays for Future",
pd.Timestamp("2023-06-01"),
pd.Timestamp("2023-12-01"),
date(2023, 6, 1),
date(2023, 12, 1),
)
df.index = pd.to_datetime(df.index)
assert not df.empty, "The dataframe returned is unexpectedly empty."
assert (
df.index >= "2023-06-01"
).all(), "The returned dataframe contains dates before the start date."
assert (
df.index <= "2023-12-01"
).all(), "The returned dataframe contains dates after the end date."
assert (df["count"] >= 0).all(), "The returned dataframe contains negative counts."
print(df["count"].diff().abs())
assert (df >= 0).all(), "The returned dataframe contains negative counts."
print(df.diff().abs())
assert (
-df["count"].diff().abs().fillna(0) <= 1000
+df.diff().abs().fillna(0) <= 1000
).all(), "The returned dataframe contains a count change of more than 1000."
# assume more counts in September (due to global climate strike) than in August
assert (
-df.resample("ME").sum()["count"]["2023-09"].item()
-> df.resample("ME").sum()["count"]["2023-08"].item()
+df.resample("ME").sum()["2023-09"].item()
+> df.resample("ME").sum()["2023-08"].item()
), "The count in September is not higher than in August."
assert (
-df.resample("ME").sum()["count"] < 10_000
+df.resample("ME").sum() < 10_000
).all(), "The count per month is higher than 10,000 for some months."
assert (
-df.resample("ME").sum()["count"] > 10
+df.resample("ME").sum() > 10
).all(), "The count per month is lower than 10 for some months."
google_trends.py

@@ -13,6 +13,7 @@
import pandas as pd

from media_impact_monitor.util.cache import cache, post
+from media_impact_monitor.util.env import DATAFORSEO_EMAIL, DATAFORSEO_PASSWORD

end_date = date.today()

@@ -30,7 +31,7 @@ def get_google_trends_counts(query: str, end_date: date = end_date) -> pd.Series
"language_code": "de",
}
]
credentials = "davidpomerenke@mailbox.org:99e67272d04117b1"
credentials = f"{DATAFORSEO_EMAIL}:{DATAFORSEO_PASSWORD}"
credentials_encoded = base64.b64encode(credentials.encode()).decode()
headers = {
"Authorization": f"Basic {credentials_encoded}",
@@ -40,9 +41,8 @@ def get_google_trends_counts(query: str, end_date: date = end_date) -> pd.Series
data = response.json()["tasks"][0]["result"][0]["items"][0]["data"]
df = pd.DataFrame(data)
df["value"] = df["values"].str[0]
# df = df[~df["missing_data"]]
# df = df[~df["missing_data"]] # this ignores data from the current day/week/month, which is not yet complete
df = df.rename(columns={"date_from": "date", "value": "count"})
df["date"] = pd.to_datetime(df["date"]).dt.date
df = df.set_index("date")["count"]
-print(df)
return df
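A minimal usage sketch of the refactored loader, assuming valid DataForSEO credentials in the environment (the import path is assumed, not confirmed by this diff):

```python
# Sketch: fetch Google Trends counts; returns a pd.Series indexed by date.
from datetime import date

# hypothetical import path; adjust to the actual module location
from media_impact_monitor.data_loaders.web.google_trends import get_google_trends_counts

counts = get_google_trends_counts(query="Fridays for Future", end_date=date(2024, 5, 1))
print(counts.tail())
```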
2 changes: 2 additions & 0 deletions backend-python/media_impact_monitor/events.py
@@ -11,8 +11,10 @@
climate_orgs_aliases,
)
from media_impact_monitor.types_ import EventSearch
+from media_impact_monitor.util.cache import cache


+@cache
def get_events(q: EventSearch) -> pd.DataFrame:
assert q.source == "acled", "Only ACLED is supported."
df = get_acled_events(countries=["Germany"])
57 changes: 44 additions & 13 deletions backend-python/media_impact_monitor/impact.py
@@ -2,35 +2,66 @@

from media_impact_monitor.events import get_events_by_id
from media_impact_monitor.impact_estimators.interrupted_time_series import (
estimate_impacts,
estimate_mean_impact,
)
from media_impact_monitor.trend import get_trend
-from media_impact_monitor.types_ import Impact, ImpactSearch, TrendSearch
+from media_impact_monitor.types_ import Impact, ImpactSearch, Method, TrendSearch
from media_impact_monitor.util.cache import cache


-@cache
+# @cache
def get_impact(q: ImpactSearch) -> Impact:
events = get_events_by_id(q.cause)
-trend = get_trend(TrendSearch(**dict(q.effect)))
-hidden_days_before_protest = 4
+trends = get_trend(TrendSearch(**dict(q.effect)))
applicabilities = []
limitations = []
dfs = dict()
for topic in trends.columns:
trend = trends[topic]
trend.name = "count"
appl, warning, impact = get_impact_for_single_trend(
events=events,
trend=trend,
method=q.method,
aggregation=q.effect.aggregation,
)
dfs[topic] = impact.reset_index().to_dict(orient="records")
applicabilities.append(appl)
limitations.append(warning)
assert len(set(applicabilities)) == 1, "All topics should have same applicability."
assert len(set(limitations)) == 1, "All topics should have same limitations."
impact = dict(
method_applicability=applicabilities[0],
method_applicability_reason=limitations[0],
time_series=dfs,
)
return impact


def get_impact_for_single_trend(
events: pd.DataFrame,
trend: pd.DataFrame,
method: Method,
aggregation="daily",
) -> tuple[str, str, pd.DataFrame]:
hidden_days_before_protest = 7
horizon = 28
-match q.method:
+match method:
case "interrupted_time_series":
-mean_impact = estimate_mean_impact(
+mean_impact, warnings = estimate_mean_impact(
events=events,
article_counts=trend,
horizon=horizon,
hidden_days_before_protest=hidden_days_before_protest,
aggregation=aggregation,
)
case "synthetic_control":
raise NotImplementedError("Synthetic control is not yet implemented.")
case _:
raise ValueError(f"Unsupported method: {q.method}")
return Impact(
method_applicability="maybe",
method_applicability_reason="We're not checking this yet 🤡",
time_series=mean_impact.to_dict(orient="index"),
)
raise ValueError(f"Unsupported method: {method}")
warning = "We are not yet systematically checking the applicability of the impact estimation method.\n\n"
if warnings:
warning += "However, we have determined the following limitations:\n\n"
warning += "\n".join([f"- {w}" for w in warnings])
return "maybe", warning, mean_impact
# TODO: divide impact by number of events on that day (by the same org)
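The weekly aggregation that this PR enables amounts to resampling the daily counts before the fit; a minimal sketch of the idea, not the code in `interrupted_time_series.py`:

```python
# Sketch: weekly aggregation of a daily count series, upstream of the model fit.
import pandas as pd

def aggregate(trend: pd.Series, aggregation: str = "daily") -> pd.Series:
    if aggregation == "weekly":
        return trend.resample("W").sum()  # calendar weeks, counts summed
    return trend

daily = pd.Series(
    [3, 5, 2, 8, 1, 0, 4],
    index=pd.date_range("2024-05-06", periods=7, freq="D"),
)
print(aggregate(daily, "weekly"))
```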