From 772754f15fec2f8e8bd6a61a65ca87d9055f00e2 Mon Sep 17 00:00:00 2001 From: David Pomerenke <46022183+davidpomerenke@users.noreply.github.com> Date: Wed, 14 Aug 2024 19:32:59 +0200 Subject: [PATCH] refactor(mediacloud): return limitation rather than error about data pre 2022 --- .../data_loaders/news_online/mediacloud_.py | 13 +++++++------ .../data_loaders/news_print/genios.py | 2 +- backend-python/media_impact_monitor/trend.py | 16 ++++++++-------- .../media_impact_monitor/trends/keyword_trend.py | 8 +++++--- .../trends/sentiment_trend.py | 9 ++++++--- .../trends/sentiment_trend_test.py | 2 +- .../media_impact_monitor/trends/topic_trend.py | 9 ++++++--- 7 files changed, 34 insertions(+), 25 deletions(-) diff --git a/backend-python/media_impact_monitor/data_loaders/news_online/mediacloud_.py b/backend-python/media_impact_monitor/data_loaders/news_online/mediacloud_.py index 9ffbb4ab..ac42bf39 100644 --- a/backend-python/media_impact_monitor/data_loaders/news_online/mediacloud_.py +++ b/backend-python/media_impact_monitor/data_loaders/news_online/mediacloud_.py @@ -32,8 +32,10 @@ def get_mediacloud_counts( start_date: date = date(2022, 1, 1), countries: list | None = None, platform: Platform = "onlinenews-waybackmachine", -) -> pd.Series: - assert start_date.year >= 2022, "MediaCloud currently only goes back to 2022" +) -> tuple[pd.Series | None, list[str]]: + limitations = [] + if start_date < date(2022, 1, 1): + limitations.append("Start date must be on or after 2022-01-01.") assert verify_dates(start_date, end_date) collection_ids = [_resolve_country(c) for c in countries] if countries else [] @@ -48,8 +50,8 @@ def get_mediacloud_counts( df = df[["date", "count"]] # ignore total_count and ratio df["date"] = pd.to_datetime(df["date"]).dt.date df = df.set_index("date") - df = df[(df.index >= start_date) & (df.index <= end_date)] - return df["count"] + df = df.reindex(pd.date_range(start_date, end_date), fill_value=0) + return df["count"], limitations @cache @@
-155,8 +157,7 @@ def get_mediacloud_fulltexts( platform: Platform = "onlinenews-mediacloud", sample_frac: float = 1, ) -> pd.DataFrame | None: - start_date = start_date or date(2022, 1, 1) - assert start_date.year >= 2022, "MediaCloud currently only goes back to 2022" + start_date = max(start_date or date(2022, 1, 1), date(2022, 1, 1)) assert verify_dates(start_date, end_date) assert isinstance(countries, list) or countries is None collection_ids = [_resolve_country(c) for c in countries] if countries else None diff --git a/backend-python/media_impact_monitor/data_loaders/news_print/genios.py b/backend-python/media_impact_monitor/data_loaders/news_print/genios.py index 37968007..4ee9bcae 100644 --- a/backend-python/media_impact_monitor/data_loaders/news_print/genios.py +++ b/backend-python/media_impact_monitor/data_loaders/news_print/genios.py @@ -11,7 +11,7 @@ def get_genios_counts( query: str, end_date: date, start_date: date = date(2010, 1, 1), -) -> pd.Series: +) -> tuple[pd.Series, list[str]]: assert verify_dates(start_date, end_date) response = get( "https://www.genios.de/api/searchResult/Alle/Presse", diff --git a/backend-python/media_impact_monitor/trend.py b/backend-python/media_impact_monitor/trend.py index 13634c58..a7fe7e2f 100644 --- a/backend-python/media_impact_monitor/trend.py +++ b/backend-python/media_impact_monitor/trend.py @@ -6,14 +6,14 @@ from media_impact_monitor.types_ import Trend, TrendSearch -def get_trend(q: TrendSearch, as_json=True) -> Trend: +def get_trend(q: TrendSearch, as_json=True) -> Trend | pd.DataFrame | None: match q.trend_type: case "keywords": - df = get_keyword_trend(q) + df, lims = get_keyword_trend(q) case "sentiment": - df = get_sentiment_trend(q) + df, lims = get_sentiment_trend(q) case "topic": - df = get_topic_trend(q) + df, lims = get_topic_trend(q) case _: raise ValueError(f"Unsupported trend type: {q.trend_type}") match df: @@ -35,10 +35,10 @@ def get_trend(q: TrendSearch, as_json=True) -> Trend: ) return Trend( 
applicability=True, - limitations=[], + limitations=lims, trends=long_df.to_dict(orient="records"), ) - case str(): + case None: if not as_json: - return df - return Trend(applicability=False, limitations=[df], trends=None) + return None + return Trend(applicability=False, limitations=lims, trends=None) diff --git a/backend-python/media_impact_monitor/trends/keyword_trend.py b/backend-python/media_impact_monitor/trends/keyword_trend.py index 6ae8ca9e..6bcd4439 100644 --- a/backend-python/media_impact_monitor/trends/keyword_trend.py +++ b/backend-python/media_impact_monitor/trends/keyword_trend.py @@ -10,16 +10,18 @@ from media_impact_monitor.util.paths import src -def get_keyword_trend(q: TrendSearch) -> pd.DataFrame: +def get_keyword_trend(q: TrendSearch) -> tuple[pd.DataFrame | None, list[str]]: assert q.trend_type == "keywords" assert q.topic == "climate_change", "Only climate_change is supported." dss = {} + limitations = set() for topic, query in topic_queries(q.media_source).items(): match q.media_source: case "news_online": - ds = get_mediacloud_counts( + ds, lims = get_mediacloud_counts( query=query, countries=["Germany"], end_date=q.end_date ) + limitations.update(lims) case "news_print": ds = get_genios_counts(query=query, end_date=q.end_date) case "web_google": @@ -31,7 +33,7 @@ def get_keyword_trend(q: TrendSearch) -> pd.DataFrame: ds.index.name = "date" dss[topic] = ds df = pd.DataFrame(dss) - return df + return df, list(limitations) def add_quotes(xs: list[str]) -> list[str]: diff --git a/backend-python/media_impact_monitor/trends/sentiment_trend.py b/backend-python/media_impact_monitor/trends/sentiment_trend.py index d4f5de93..d9a7adae 100644 --- a/backend-python/media_impact_monitor/trends/sentiment_trend.py +++ b/backend-python/media_impact_monitor/trends/sentiment_trend.py @@ -6,7 +6,7 @@ @cache -def get_sentiment_trend(q: TrendSearch) -> pd.DataFrame | str: +def get_sentiment_trend(q: TrendSearch) -> tuple[pd.DataFrame | None, list[str]]: """ 
Retrieves the sentiment trend for a given query and start date. @@ -18,7 +18,10 @@ def get_sentiment_trend(q: TrendSearch) -> pd.DataFrame | str: pd.DataFrame | str: A DataFrame containing the sentiment trend with columns for negative, neutral, and positive sentiments, indexed by date: or a string of limitations """ if q.media_source != "news_online": - return f"Sentiment trend requires fulltext analysis, which is only available for news_online, not {q.media_source}." + return None, [f"Sentiment trend requires fulltext analysis, which is only available for news_online, not {q.media_source}."] + limitations = [] + if q.start_date.year < 2022: + limitations.append("MediaCloud only goes back until 2022.") assert q.sentiment_target in ["activism", "policy"] field = f"{q.sentiment_target}_sentiment" params = dict(q) @@ -34,4 +37,4 @@ def get_sentiment_trend(q: TrendSearch) -> pd.DataFrame | str: ) df.index = pd.to_datetime(df.index).date df.index.name = "date" - return df + return df, limitations diff --git a/backend-python/media_impact_monitor/trends/sentiment_trend_test.py b/backend-python/media_impact_monitor/trends/sentiment_trend_test.py index a486cc95..de19337e 100644 --- a/backend-python/media_impact_monitor/trends/sentiment_trend_test.py +++ b/backend-python/media_impact_monitor/trends/sentiment_trend_test.py @@ -8,7 +8,7 @@ def test_get_sentiment_trend_valid_input(): - df = get_sentiment_trend( + df, lims = get_sentiment_trend( TrendSearch( trend_type="sentiment", topic="climate_change", diff --git a/backend-python/media_impact_monitor/trends/topic_trend.py b/backend-python/media_impact_monitor/trends/topic_trend.py index 92340462..362b901c 100644 --- a/backend-python/media_impact_monitor/trends/topic_trend.py +++ b/backend-python/media_impact_monitor/trends/topic_trend.py @@ -7,9 +7,12 @@ @cache -def get_topic_trend(q: TrendSearch) -> pd.DataFrame | str: +def get_topic_trend(q: TrendSearch) -> tuple[pd.DataFrame | None, list[str]]: if q.media_source != 
"news_online": - return f"Topic trend requires fulltext analysis, which is only available for news_online, not {q.media_source}." + return None, f"Topic trend requires fulltext analysis, which is only available for news_online, not {q.media_source}." + limitations = [] + if q.start_date.year < 2022: + limitations.append("MediaCloud only goes back until 2022.") q.start_date = q.start_date or date(2022, 1, 1) params = dict(q) del params["trend_type"] @@ -20,4 +23,4 @@ def get_topic_trend(q: TrendSearch) -> pd.DataFrame | str: df = df.groupby("date").sum() # add 0 for missing dates between q.start_date and q.end_date df = df.reindex(pd.date_range(q.start_date, q.end_date, freq="D"), fill_value=0) - return df + return df, limitations