Merge pull request #210 from SocialChangeLab/feature-sentiment-analysis
Sentiment analysis
davidpomerenke authored Aug 7, 2024
2 parents d0803bc + 1d9bf09 commit 1213478
Showing 35 changed files with 785 additions and 329 deletions.
1 change: 1 addition & 0 deletions .vscode/launch.json
@@ -20,6 +20,7 @@
"--host=0.0.0.0",
"--port=8000",
"--reload",
"--reload-dir=backend-python/media_impact_monitor"
],
"jinja": true
}
3 changes: 1 addition & 2 deletions backend-python/README.md
@@ -26,8 +26,7 @@ You will need to add the required API keys (`BUNDESTAG_API_KEY`, `ACLED_KEY`, et

We have defined the following endpoints on Azure OpenAI:

- - `gpt-35-turbo`: `gpt-3.5-turbo-1106` (16k tokens context)
- - `gpt-4`: `gpt-4-turbo-2024-04-09` (128k tokens context)
+ - `gpt-4o-mini`: `gpt-4o-mini-2024-07-18` (128k tokens context)

Azure OpenAI applies content filters (also to the input texts) that cannot be switched off, but their thresholds can be set to "high".

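For reference, a minimal sketch of calling such a deployment with the `openai` Python SDK (not code from this repo; the endpoint and key environment variable names are placeholders):

```python
import os

from openai import AzureOpenAI

client = AzureOpenAI(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],  # e.g. https://<resource>.openai.azure.com
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version="2024-02-01",
)
response = client.chat.completions.create(
    model="gpt-4o-mini",  # the deployment name defined above
    messages=[{"role": "user", "content": "What is the sentiment of this sentence?"}],
)
print(response.choices[0].message.content)
```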
2 changes: 1 addition & 1 deletion backend-python/media_impact_monitor/cron.py
@@ -7,11 +7,11 @@

from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger
-from media_impact_monitor.fulltexts import get_fulltexts
from sentry_sdk.crons import monitor
from tqdm import tqdm

from media_impact_monitor.events import get_events
+from media_impact_monitor.fulltexts import get_fulltexts
from media_impact_monitor.impact import get_impact
from media_impact_monitor.trend import get_trend
from media_impact_monitor.types_ import (
backend-python/media_impact_monitor/data_loaders/news_online/mediacloud_.py
@@ -1,19 +1,22 @@
import base64
-from datetime import date
+import random
+from datetime import date, datetime, timedelta
from typing import Literal

import mediacloud.api
import pandas as pd
+from dateutil.relativedelta import relativedelta
from mcmetadata import extract
from mcmetadata.exceptions import BadContentError

-from media_impact_monitor.util.cache import cache, get_proxied_many
+from media_impact_monitor.util.cache import cache, get
from media_impact_monitor.util.date import verify_dates
from media_impact_monitor.util.env import MEDIACLOUD_API_TOKEN
+from media_impact_monitor.util.parallel import parallel_tqdm

search = mediacloud.api.SearchApi(MEDIACLOUD_API_TOKEN)
directory = mediacloud.api.DirectoryApi(MEDIACLOUD_API_TOKEN)
-search.TIMEOUT_SECS = 10
+search.TIMEOUT_SECS = 60

Platform = Literal["onlinenews-mediacloud", "onlinenews-waybackmachine"]

@@ -31,19 +34,6 @@ def get_mediacloud_counts(
    countries: list | None = None,
    platform: Platform = "onlinenews-waybackmachine",
) -> pd.Series:
-    """
-    Retrieves the MediaCloud counts for a given query and parameters.
-    Args:
-        query (str): The query string to search for.
-        start_date (date, optional): The start date of the time range. Defaults to January 1, 2022.
-        end_date (date, optional): The end date of the time range. Defaults to the current date.
-        countries (list, optional): A list of country names or ISO codes to filter the results by. Defaults to None.
-        platform (Platform, optional): The platform to search on. Defaults to "onlinenews-mediacloud".
-    Returns:
-        pd.Series: A pandas Series containing the MediaCloud counts for each date in the time range.
-    """
    assert start_date.year >= 2022, "MediaCloud currently only goes back to 2022"
    assert verify_dates(start_date, end_date)

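For illustration, a hypothetical call to the counts endpoint (the query, dates, and country are invented; the function returns a `pd.Series` of article counts per day):

```python
from datetime import date

counts = get_mediacloud_counts(
    query='"climate protest"',
    start_date=date(2023, 1, 1),
    end_date=date(2023, 12, 31),
    countries=["Germany"],
)
print(counts.head())
```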
@@ -64,39 +54,23 @@


@cache
-def get_mediacloud_fulltexts(
+def _story_list(**kwargs):
+    return search.story_list(**kwargs)


+def _story_list_all_pages(
    query: str,
-    start_date: date,
    end_date: date,
+    start_date: date = date(2024, 5, 1),
-    countries: list | None = None,
+    collection_ids: list[int] | None = None,
    platform: Platform = "onlinenews-mediacloud",
-) -> pd.DataFrame | None:
-    """
-    Retrieves fulltexts of news articles from MediaCloud based on the given query and params.
-    Args:
-        query (str): The search query to retrieve news articles.
-        start_date (date, optional): The start date to filter news articles. Defaults to January 1, 2022.
-        end_date (date, optional): The end date to filter news articles. Defaults to the current date.
-        countries (list, optional): A list of country names to filter news articles. Defaults to None.
-        platform (Platform, optional): The platform to search for news articles. Defaults to "onlinenews-mediacloud".
-    Returns:
-        pd.DataFrame: A DataFrame containing the retrieved news articles with full texts.
-    Raises:
-        AssertionError: If the start_date is before 2022.
-        NotImplementedError: If pagination is needed.
-    """
-    assert start_date.year >= 2022, "MediaCloud currently only goes back to 2022"
-    assert verify_dates(start_date, end_date)
-    assert isinstance(countries, list) or countries is None
-    collection_ids = [_resolve_country(c) for c in countries] if countries else None
+    sample_frac: float = 1,
+):
    all_stories = []
    more_stories = True
    pagination_token = None
    while more_stories:
-        page, pagination_token = search.story_list(
+        page, pagination_token = _story_list(
            query=query,
            start_date=start_date,
            end_date=end_date,
@@ -110,18 +84,99 @@ def get_mediacloud_fulltexts(
            decoded_token = base64.urlsafe_b64decode(pagination_token + "==").decode(
                "utf-8"
            )
-            print(f"{len(all_stories)=} {pagination_token=} {decoded_token=}")
+            # decode strings like 20240527T135136Z
+            dt = datetime.strptime(decoded_token, "%Y%m%dT%H%M%SZ").strftime(
+                "%Y-%m-%d %H:%M:%S"
+            )
+        else:
+            dt = end_date
+        print(
+            f"retrieved metadata for {len(all_stories)} stories for month {start_date.year}-{start_date.month}, currently at {dt}"
+        )
+    # https://github.com/mediacloud/api-tutorial-notebooks/blob/main/MC02%20-%20attention.ipynb:
+    # > As you may have noted, this can take a while for long time periods. If you look closely you'll notice that it can't be easily parallelized, because it requires content in the results to make the next call. A workaround is to divide you query up by time and query in parallel for something like each day. This can speed up the response. Also just contact us directly if you are trying to do larger data dumps, or hit up against your API quota.
-    if len(all_stories) == 0:
+    # take a random sample of the stories (fraction given by `sample_frac`)
+    sample_size = int(sample_frac * len(all_stories))
+    random.seed(0)
+    all_stories = random.sample(all_stories, sample_size)
+    return all_stories


+def _slice_date_range(start: date, end: date) -> list[tuple[date, date]]:
+    result = []
+    current = start.replace(day=1)
+    while current <= min(end, date.today()):
+        next_month = current + relativedelta(months=1)
+        last_day = min(next_month - timedelta(days=1), date.today())
+        result.append((current, last_day))
+        current = next_month
+    return result


+def _story_list_split_monthly(
+    query: str,
+    start_date: date,
+    end_date: date,
+    collection_ids: list[int] | None = None,
+    platform: Platform = "onlinenews-mediacloud",
+    sample_frac: float = 1,
+):
+    def func(start_and_end):
+        start, end = start_and_end
+        return _story_list_all_pages(
+            query=query,
+            start_date=start,
+            end_date=end,
+            collection_ids=collection_ids,
+            platform=platform,
+            sample_frac=sample_frac,
+        )

+    label = "Downloading metadata by month"
+    stories_lists = parallel_tqdm(
+        func,
+        _slice_date_range(start_date, end_date),
+        desc=f"{label:<{40}}",
+        n_jobs=8,
+    )
+    stories = [s for sl in stories_lists for s in sl]
+    if len(stories) == 0:
        return None
-    df = pd.DataFrame(all_stories)
+    df = pd.DataFrame(stories)
    df["publish_date"] = pd.to_datetime(df["publish_date"]).dt.date
-    responses = get_proxied_many(df["url"], desc="Retrieving fulltexts")
-    df["text"] = [
-        _extract(url, response.text) if response else None
-        for url, response in zip(df["url"], responses)
-    ]
+    return df


+@cache
+def get_mediacloud_fulltexts(
+    query: str,
+    end_date: date,
+    start_date: date | None = None,
+    countries: list | None = None,
+    platform: Platform = "onlinenews-mediacloud",
+    sample_frac: float = 1,
+) -> pd.DataFrame | None:
+    start_date = start_date or date(2022, 1, 1)
+    assert start_date.year >= 2022, "MediaCloud currently only goes back to 2022"
+    assert verify_dates(start_date, end_date)
+    assert isinstance(countries, list) or countries is None
+    collection_ids = [_resolve_country(c) for c in countries] if countries else None
+    df = _story_list_split_monthly(
+        query=query,
+        start_date=start_date,
+        end_date=end_date,
+        collection_ids=collection_ids,
+        platform=platform,
+        sample_frac=sample_frac,
+    )
+    if df is None:
+        return None
+    df = df[~df["url"].str.contains("news.de")]
+    label = "Downloading fulltexts"
+    responses = parallel_tqdm(get, df["url"].tolist(), desc=f"{label:<{40}}", n_jobs=8)
+    urls_and_responses = list(zip(df["url"], responses))
+    label = "Extracting fulltexts"
+    df["text"] = parallel_tqdm(_extract, urls_and_responses, desc=f"{label:<{40}}")
    df = df.dropna(subset=["text"]).rename(columns={"publish_date": "date"})
    df = df[
        [
@@ -139,10 +194,13 @@ def get_mediacloud_fulltexts(
return df


-def _extract(url, html):
+def _extract(url_and_response):
+    url, response = url_and_response
+    if response.status_code != 200:
+        return None
    try:
        # this also contains additional metadata (title, language, extraction method, ...) that could be used
-        return extract(url, html)["text_content"]
+        return cache(extract)(url, response.text)["text_content"]
    except BadContentError:
        return None

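Taken together, the new pipeline slices the date range into months, pages through each month's stories in parallel, takes a seeded random sample, and then downloads and extracts fulltexts in parallel. A hypothetical usage sketch (the query, dates, and country are invented for illustration):

```python
from datetime import date

from media_impact_monitor.data_loaders.news_online.mediacloud_ import (
    get_mediacloud_fulltexts,
)

df = get_mediacloud_fulltexts(
    query='"climate protest"',
    start_date=date(2024, 5, 1),
    end_date=date(2024, 6, 30),
    countries=["Germany"],
    sample_frac=0.1,  # download fulltexts for a 10% sample of matching stories
)
```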
@@ -2,12 +2,44 @@

import pandas as pd
import pytest
+from freezegun import freeze_time

from media_impact_monitor.data_loaders.news_online.mediacloud_ import (
+    _slice_date_range,
    get_mediacloud_counts,
)


+@freeze_time("2023-06-15")
+def test_slicing_normal_case():
+    start = date(2023, 4, 15)
+    end = date(2023, 6, 20)
+    expected = [
+        (date(2023, 4, 1), date(2023, 4, 30)),
+        (date(2023, 5, 1), date(2023, 5, 31)),
+        (date(2023, 6, 1), date(2023, 6, 15)),  # Note: last day is today
+    ]
+    assert _slice_date_range(start, end) == expected


+@freeze_time("2023-06-15")
+def test_slicing_future_end_date():
+    start = date(2023, 5, 1)
+    end = date(2023, 7, 15)
+    expected = [
+        (date(2023, 5, 1), date(2023, 5, 31)),
+        (date(2023, 6, 1), date(2023, 6, 15)),  # Note: last day is today
+    ]
+    assert _slice_date_range(start, end) == expected


+def test_slicing_same_month():
+    start = date(2023, 3, 10)
+    end = date(2023, 3, 20)
+    expected = [(date(2023, 3, 1), date(2023, 3, 31))]
+    assert _slice_date_range(start, end) == expected


@pytest.mark.skip("Currently unavailable")
def test_get_counts_mediacloud():
df = get_mediacloud_counts(
@@ -1,7 +1,6 @@
from datetime import date

import pandas as pd
import pytest

from media_impact_monitor.data_loaders.news_print.genios import get_genios_counts

@@ -7,7 +7,6 @@
"""

import logging
-import os
import warnings
from datetime import date
from typing import Dict
@@ -1,5 +1,5 @@
-from datetime import date
import re
+from datetime import date

import pandas as pd

@@ -1,4 +1,5 @@
+import pytest

from media_impact_monitor.data_loaders.protest.acled_size import get_size_number


@@ -1,4 +1,3 @@
-import os
from datetime import date

import pytest
@@ -1,5 +1,4 @@
from datetime import date
-from time import sleep

import pandas as pd
from bs4 import BeautifulSoup
1 change: 0 additions & 1 deletion backend-python/media_impact_monitor/events.py
@@ -1,4 +1,3 @@
-import math
from datetime import date

import pandas as pd
(Diffs for the remaining changed files are not shown.)