Skip to content

Commit

Permalink
refactor(mediacloud): return limitation rather than error about data p…
Browse files Browse the repository at this point in the history
…re 2022
  • Loading branch information
davidpomerenke committed Aug 14, 2024
1 parent a192dc1 commit 772754f
Show file tree
Hide file tree
Showing 7 changed files with 34 additions and 25 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,10 @@ def get_mediacloud_counts(
start_date: date = date(2022, 1, 1),
countries: list | None = None,
platform: Platform = "onlinenews-waybackmachine",
) -> pd.Series:
assert start_date.year >= 2022, "MediaCloud currently only goes back to 2022"
) -> tuple[pd.Series | None, list[str]]:
limitations = []
if start_date < date(2022, 1, 1):
limitations.append("Start date must be on or after 2022-01-01.")
assert verify_dates(start_date, end_date)

collection_ids = [_resolve_country(c) for c in countries] if countries else []
Expand All @@ -48,8 +50,8 @@ def get_mediacloud_counts(
df = df[["date", "count"]] # ignore total_count and ratio
df["date"] = pd.to_datetime(df["date"]).dt.date
df = df.set_index("date")
df = df[(df.index >= start_date) & (df.index <= end_date)]
return df["count"]
df = df.reindex(pd.date_range(start_date, end_date), fill_value=0)
return df["count"], limitations


@cache
Expand Down Expand Up @@ -155,8 +157,7 @@ def get_mediacloud_fulltexts(
platform: Platform = "onlinenews-mediacloud",
sample_frac: float = 1,
) -> pd.DataFrame | None:
start_date = start_date or date(2022, 1, 1)
assert start_date.year >= 2022, "MediaCloud currently only goes back to 2022"
start_date = max(start_date or date(2022, 1, 1), date(2022, 1, 1))
assert verify_dates(start_date, end_date)
assert isinstance(countries, list) or countries is None
collection_ids = [_resolve_country(c) for c in countries] if countries else None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def get_genios_counts(
query: str,
end_date: date,
start_date: date = date(2010, 1, 1),
) -> pd.Series:
) -> tuple[pd.Series, list[str]]:
assert verify_dates(start_date, end_date)
response = get(
"https://www.genios.de/api/searchResult/Alle/Presse",
Expand Down
16 changes: 8 additions & 8 deletions backend-python/media_impact_monitor/trend.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@
from media_impact_monitor.types_ import Trend, TrendSearch


def get_trend(q: TrendSearch, as_json=True) -> Trend:
def get_trend(q: TrendSearch, as_json=True) -> Trend | pd.DataFrame | None:
match q.trend_type:
case "keywords":
df = get_keyword_trend(q)
df, lims = get_keyword_trend(q)
case "sentiment":
df = get_sentiment_trend(q)
df, lims = get_sentiment_trend(q)
case "topic":
df = get_topic_trend(q)
df, lims = get_topic_trend(q)
case _:
raise ValueError(f"Unsupported trend type: {q.trend_type}")
match df:
Expand All @@ -35,10 +35,10 @@ def get_trend(q: TrendSearch, as_json=True) -> Trend:
)
return Trend(
applicability=True,
limitations=[],
limitations=lims,
trends=long_df.to_dict(orient="records"),
)
case str():
case None:
if not as_json:
return df
return Trend(applicability=False, limitations=[df], trends=None)
return None
return Trend(applicability=False, limitations=lims, trends=None)
8 changes: 5 additions & 3 deletions backend-python/media_impact_monitor/trends/keyword_trend.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,18 @@
from media_impact_monitor.util.paths import src


def get_keyword_trend(q: TrendSearch) -> pd.DataFrame:
def get_keyword_trend(q: TrendSearch) -> tuple[pd.DataFrame | None, list[str]]:
assert q.trend_type == "keywords"
assert q.topic == "climate_change", "Only climate_change is supported."
dss = {}
limitations = set()
for topic, query in topic_queries(q.media_source).items():
match q.media_source:
case "news_online":
ds = get_mediacloud_counts(
ds, lims = get_mediacloud_counts(
query=query, countries=["Germany"], end_date=q.end_date
)
limitations.update(lims)
case "news_print":
ds = get_genios_counts(query=query, end_date=q.end_date)
case "web_google":
Expand All @@ -31,7 +33,7 @@ def get_keyword_trend(q: TrendSearch) -> pd.DataFrame:
ds.index.name = "date"
dss[topic] = ds
df = pd.DataFrame(dss)
return df
return df, list(limitations)


def add_quotes(xs: list[str]) -> list[str]:
Expand Down
9 changes: 6 additions & 3 deletions backend-python/media_impact_monitor/trends/sentiment_trend.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@


@cache
def get_sentiment_trend(q: TrendSearch) -> pd.DataFrame | str:
def get_sentiment_trend(q: TrendSearch) -> tuple[pd.DataFrame | None, list[str]]:
"""
Retrieves the sentiment trend for a given query and start date.
Expand All @@ -18,7 +18,10 @@ def get_sentiment_trend(q: TrendSearch) -> pd.DataFrame | str:
pd.DataFrame | str: A DataFrame containing the sentiment trend with columns for negative, neutral, and positive sentiments, indexed by date: or a string of limitations
"""
if q.media_source != "news_online":
return f"Sentiment trend requires fulltext analysis, which is only available for news_online, not {q.media_source}."
return None, [f"Sentiment trend requires fulltext analysis, which is only available for news_online, not {q.media_source}."]
limitations = []
if q.start_date.year < 2022:
limitations.append("MediaCloud only goes back until 2022.")
assert q.sentiment_target in ["activism", "policy"]
field = f"{q.sentiment_target}_sentiment"
params = dict(q)
Expand All @@ -34,4 +37,4 @@ def get_sentiment_trend(q: TrendSearch) -> pd.DataFrame | str:
)
df.index = pd.to_datetime(df.index).date
df.index.name = "date"
return df
return df, limitations
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


def test_get_sentiment_trend_valid_input():
df = get_sentiment_trend(
df, lims = get_sentiment_trend(
TrendSearch(
trend_type="sentiment",
topic="climate_change",
Expand Down
9 changes: 6 additions & 3 deletions backend-python/media_impact_monitor/trends/topic_trend.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,12 @@


@cache
def get_topic_trend(q: TrendSearch) -> pd.DataFrame | str:
def get_topic_trend(q: TrendSearch) -> tuple[pd.DataFrame | None, list[str]]:
if q.media_source != "news_online":
return f"Topic trend requires fulltext analysis, which is only available for news_online, not {q.media_source}."
return None, f"Topic trend requires fulltext analysis, which is only available for news_online, not {q.media_source}."
limitations = []
if q.start_date.year < 2022:
limitations.append("MediaCloud only goes back until 2022.")
q.start_date = q.start_date or date(2022, 1, 1)
params = dict(q)
del params["trend_type"]
Expand All @@ -20,4 +23,4 @@ def get_topic_trend(q: TrendSearch) -> pd.DataFrame | str:
df = df.groupby("date").sum()
# add 0 for missing dates between q.start_date and q.end_date
df = df.reindex(pd.date_range(q.start_date, q.end_date, freq="D"), fill_value=0)
return df
return df, limitations

0 comments on commit 772754f

Please sign in to comment.