Merge pull request #210 from SocialChangeLab/feature-sentiment-analysis
Sentiment analysis
davidpomerenke authored Aug 7, 2024
2 parents d0803bc + 1d9bf09 commit 1213478
Showing 35 changed files with 785 additions and 329 deletions.
1 change: 1 addition & 0 deletions .vscode/launch.json
@@ -20,6 +20,7 @@
"--host=0.0.0.0",
"--port=8000",
"--reload",
"--reload-dir=backend-python/media_impact_monitor"
],
"jinja": true
}
3 changes: 1 addition & 2 deletions backend-python/README.md
@@ -26,8 +26,7 @@ You will need to add the required API keys (`BUNDESTAG_API_KEY`, `ACLED_KEY`, et

We have defined the following endpoints on Azure OpenAI:

- - `gpt-35-turbo`: `gpt-3.5-turbo-1106` (16k tokens context)
- - `gpt-4`: `gpt-4-turbo-2024-04-09` (128k tokens context)
+ - `gpt-4o-mini`: `gpt-4o-mini-2024-07-18` (128k tokens context)

Azure OpenAI applies content filters (also to the input texts) that cannot be switched off, but their thresholds can be set to "high".

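For reference, a minimal sketch of calling such a deployment with the `openai` Python SDK (not code from this repo; the endpoint and key environment variable names are placeholders):

```python
import os

from openai import AzureOpenAI

client = AzureOpenAI(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],  # e.g. https://<resource>.openai.azure.com
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version="2024-02-01",
)
response = client.chat.completions.create(
    model="gpt-4o-mini",  # the deployment name defined above
    messages=[{"role": "user", "content": "What is the sentiment of this sentence?"}],
)
print(response.choices[0].message.content)
```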
2 changes: 1 addition & 1 deletion backend-python/media_impact_monitor/cron.py
@@ -7,11 +7,11 @@

from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger
-from media_impact_monitor.fulltexts import get_fulltexts
from sentry_sdk.crons import monitor
from tqdm import tqdm

from media_impact_monitor.events import get_events
+from media_impact_monitor.fulltexts import get_fulltexts
from media_impact_monitor.impact import get_impact
from media_impact_monitor.trend import get_trend
from media_impact_monitor.types_ import (
backend-python/media_impact_monitor/data_loaders/news_online/mediacloud_.py
@@ -1,19 +1,22 @@
import base64
-from datetime import date
+import random
+from datetime import date, datetime, timedelta
from typing import Literal

import mediacloud.api
import pandas as pd
+from dateutil.relativedelta import relativedelta
from mcmetadata import extract
from mcmetadata.exceptions import BadContentError

-from media_impact_monitor.util.cache import cache, get_proxied_many
+from media_impact_monitor.util.cache import cache, get
from media_impact_monitor.util.date import verify_dates
from media_impact_monitor.util.env import MEDIACLOUD_API_TOKEN
+from media_impact_monitor.util.parallel import parallel_tqdm

search = mediacloud.api.SearchApi(MEDIACLOUD_API_TOKEN)
directory = mediacloud.api.DirectoryApi(MEDIACLOUD_API_TOKEN)
-search.TIMEOUT_SECS = 10
+search.TIMEOUT_SECS = 60

Platform = Literal["onlinenews-mediacloud", "onlinenews-waybackmachine"]

@@ -31,19 +34,6 @@ def get_mediacloud_counts(
    countries: list | None = None,
    platform: Platform = "onlinenews-waybackmachine",
) -> pd.Series:
-    """
-    Retrieves the MediaCloud counts for a given query and parameters.
-    Args:
-        query (str): The query string to search for.
-        start_date (date, optional): The start date of the time range. Defaults to January 1, 2022.
-        end_date (date, optional): The end date of the time range. Defaults to the current date.
-        countries (list, optional): A list of country names or ISO codes to filter the results by. Defaults to None.
-        platform (Platform, optional): The platform to search on. Defaults to "onlinenews-mediacloud".
-    Returns:
-        pd.Series: A pandas Series containing the MediaCloud counts for each date in the time range.
-    """
    assert start_date.year >= 2022, "MediaCloud currently only goes back to 2022"
    assert verify_dates(start_date, end_date)

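For illustration, a hypothetical call to the counts endpoint (the query, dates, and country are invented; the function returns a `pd.Series` of article counts per day):

```python
from datetime import date

counts = get_mediacloud_counts(
    query='"climate protest"',
    start_date=date(2023, 1, 1),
    end_date=date(2023, 12, 31),
    countries=["Germany"],
)
print(counts.head())
```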
@@ -64,39 +54,23 @@


@cache
-def get_mediacloud_fulltexts(
+def _story_list(**kwargs):
+    return search.story_list(**kwargs)


+def _story_list_all_pages(
    query: str,
-    start_date: date,
    end_date: date,
+    start_date: date = date(2024, 5, 1),
-    countries: list | None = None,
+    collection_ids: list[int] | None = None,
    platform: Platform = "onlinenews-mediacloud",
-) -> pd.DataFrame | None:
-    """
-    Retrieves fulltexts of news articles from MediaCloud based on the given query and params.
-    Args:
-        query (str): The search query to retrieve news articles.
-        start_date (date, optional): The start date to filter news articles. Defaults to January 1, 2022.
-        end_date (date, optional): The end date to filter news articles. Defaults to the current date.
-        countries (list, optional): A list of country names to filter news articles. Defaults to None.
-        platform (Platform, optional): The platform to search for news articles. Defaults to "onlinenews-mediacloud".
-    Returns:
-        pd.DataFrame: A DataFrame containing the retrieved news articles with full texts.
-    Raises:
-        AssertionError: If the start_date is before 2022.
-        NotImplementedError: If pagination is needed.
-    """
-    assert start_date.year >= 2022, "MediaCloud currently only goes back to 2022"
-    assert verify_dates(start_date, end_date)
-    assert isinstance(countries, list) or countries is None
-    collection_ids = [_resolve_country(c) for c in countries] if countries else None
+    sample_frac: float = 1,
+):
    all_stories = []
    more_stories = True
    pagination_token = None
    while more_stories:
-        page, pagination_token = search.story_list(
+        page, pagination_token = _story_list(
            query=query,
            start_date=start_date,
            end_date=end_date,
@@ -110,18 +84,99 @@ def get_mediacloud_fulltexts(
            decoded_token = base64.urlsafe_b64decode(pagination_token + "==").decode(
                "utf-8"
            )
-            print(f"{len(all_stories)=} {pagination_token=} {decoded_token=}")
+            # decode strings like 20240527T135136Z
+            dt = datetime.strptime(decoded_token, "%Y%m%dT%H%M%SZ").strftime(
+                "%Y-%m-%d %H:%M:%S"
+            )
+        else:
+            dt = end_date
+        print(
+            f"retrieved metadata for {len(all_stories)} stories for month {start_date.year}-{start_date.month}, currently at {dt}"
+        )
+    # https://github.com/mediacloud/api-tutorial-notebooks/blob/main/MC02%20-%20attention.ipynb:
+    # > As you may have noted, this can take a while for long time periods. If you look closely you'll notice that it can't be easily parallelized, because it requires content in the results to make the next call. A workaround is to divide you query up by time and query in parallel for something like each day. This can speed up the response. Also just contact us directly if you are trying to do larger data dumps, or hit up against your API quota.
-    if len(all_stories) == 0:
+    # take a random sample of the stories (fraction given by `sample_frac`)
+    sample_size = int(sample_frac * len(all_stories))
+    random.seed(0)
+    all_stories = random.sample(all_stories, sample_size)
+    return all_stories


+def _slice_date_range(start: date, end: date) -> list[tuple[date, date]]:
+    result = []
+    current = start.replace(day=1)
+    while current <= min(end, date.today()):
+        next_month = current + relativedelta(months=1)
+        last_day = min(next_month - timedelta(days=1), date.today())
+        result.append((current, last_day))
+        current = next_month
+    return result


+def _story_list_split_monthly(
+    query: str,
+    start_date: date,
+    end_date: date,
+    collection_ids: list[int] | None = None,
+    platform: Platform = "onlinenews-mediacloud",
+    sample_frac: float = 1,
+):
+    def func(start_and_end):
+        start, end = start_and_end
+        return _story_list_all_pages(
+            query=query,
+            start_date=start,
+            end_date=end,
+            collection_ids=collection_ids,
+            platform=platform,
+            sample_frac=sample_frac,
+        )

+    label = "Downloading metadata by month"
+    stories_lists = parallel_tqdm(
+        func,
+        _slice_date_range(start_date, end_date),
+        desc=f"{label:<{40}}",
+        n_jobs=8,
+    )
+    stories = [s for sl in stories_lists for s in sl]
+    if len(stories) == 0:
        return None
-    df = pd.DataFrame(all_stories)
+    df = pd.DataFrame(stories)
    df["publish_date"] = pd.to_datetime(df["publish_date"]).dt.date
-    responses = get_proxied_many(df["url"], desc="Retrieving fulltexts")
-    df["text"] = [
-        _extract(url, response.text) if response else None
-        for url, response in zip(df["url"], responses)
-    ]
+    return df


+@cache
+def get_mediacloud_fulltexts(
+    query: str,
+    end_date: date,
+    start_date: date | None = None,
+    countries: list | None = None,
+    platform: Platform = "onlinenews-mediacloud",
+    sample_frac: float = 1,
+) -> pd.DataFrame | None:
+    start_date = start_date or date(2022, 1, 1)
+    assert start_date.year >= 2022, "MediaCloud currently only goes back to 2022"
+    assert verify_dates(start_date, end_date)
+    assert isinstance(countries, list) or countries is None
+    collection_ids = [_resolve_country(c) for c in countries] if countries else None
+    df = _story_list_split_monthly(
+        query=query,
+        start_date=start_date,
+        end_date=end_date,
+        collection_ids=collection_ids,
+        platform=platform,
+        sample_frac=sample_frac,
+    )
+    if df is None:
+        return None
+    df = df[~df["url"].str.contains("news.de")]
+    label = "Downloading fulltexts"
+    responses = parallel_tqdm(get, df["url"].tolist(), desc=f"{label:<{40}}", n_jobs=8)
+    urls_and_responses = list(zip(df["url"], responses))
+    label = "Extracting fulltexts"
+    df["text"] = parallel_tqdm(_extract, urls_and_responses, desc=f"{label:<{40}}")
    df = df.dropna(subset=["text"]).rename(columns={"publish_date": "date"})
    df = df[
        [
@@ -139,10 +194,13 @@ def get_mediacloud_fulltexts(
return df


-def _extract(url, html):
+def _extract(url_and_response):
+    url, response = url_and_response
+    if response.status_code != 200:
+        return None
    try:
        # this also contains additional metadata (title, language, extraction method, ...) that could be used
-        return extract(url, html)["text_content"]
+        return cache(extract)(url, response.text)["text_content"]
    except BadContentError:
        return None

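Taken together, the new pipeline slices the date range into months, pages through each month's stories in parallel, takes a seeded random sample, and then downloads and extracts fulltexts in parallel. A hypothetical usage sketch (the query, dates, and country are invented for illustration):

```python
from datetime import date

from media_impact_monitor.data_loaders.news_online.mediacloud_ import (
    get_mediacloud_fulltexts,
)

df = get_mediacloud_fulltexts(
    query='"climate protest"',
    start_date=date(2024, 5, 1),
    end_date=date(2024, 6, 30),
    countries=["Germany"],
    sample_frac=0.1,  # download fulltexts for a 10% sample of matching stories
)
```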
@@ -2,12 +2,44 @@

import pandas as pd
import pytest
+from freezegun import freeze_time

from media_impact_monitor.data_loaders.news_online.mediacloud_ import (
+    _slice_date_range,
    get_mediacloud_counts,
)


+@freeze_time("2023-06-15")
+def test_slicing_normal_case():
+    start = date(2023, 4, 15)
+    end = date(2023, 6, 20)
+    expected = [
+        (date(2023, 4, 1), date(2023, 4, 30)),
+        (date(2023, 5, 1), date(2023, 5, 31)),
+        (date(2023, 6, 1), date(2023, 6, 15)),  # Note: last day is today
+    ]
+    assert _slice_date_range(start, end) == expected


+@freeze_time("2023-06-15")
+def test_slicing_future_end_date():
+    start = date(2023, 5, 1)
+    end = date(2023, 7, 15)
+    expected = [
+        (date(2023, 5, 1), date(2023, 5, 31)),
+        (date(2023, 6, 1), date(2023, 6, 15)),  # Note: last day is today
+    ]
+    assert _slice_date_range(start, end) == expected


+def test_slicing_same_month():
+    start = date(2023, 3, 10)
+    end = date(2023, 3, 20)
+    expected = [(date(2023, 3, 1), date(2023, 3, 31))]
+    assert _slice_date_range(start, end) == expected


@pytest.mark.skip("Currently unavailable")
def test_get_counts_mediacloud():
df = get_mediacloud_counts(
@@ -1,7 +1,6 @@
from datetime import date

import pandas as pd
import pytest

from media_impact_monitor.data_loaders.news_print.genios import get_genios_counts

@@ -7,7 +7,6 @@
"""

import logging
-import os
import warnings
from datetime import date
from typing import Dict
@@ -1,5 +1,5 @@
-from datetime import date
import re
+from datetime import date

import pandas as pd

@@ -1,4 +1,5 @@
+import pytest

from media_impact_monitor.data_loaders.protest.acled_size import get_size_number


@@ -1,4 +1,3 @@
-import os
from datetime import date

import pytest
@@ -1,5 +1,4 @@
from datetime import date
-from time import sleep

import pandas as pd
from bs4 import BeautifulSoup
1 change: 0 additions & 1 deletion backend-python/media_impact_monitor/events.py
@@ -1,4 +1,3 @@
-import math
from datetime import date

import pandas as pd
(Diffs for the remaining changed files are not shown.)