diff --git a/backend-python/media_impact_monitor/data_loaders/news_online/mediacloud_.py b/backend-python/media_impact_monitor/data_loaders/news_online/mediacloud_.py index 7f85ebe..b506090 100644 --- a/backend-python/media_impact_monitor/data_loaders/news_online/mediacloud_.py +++ b/backend-python/media_impact_monitor/data_loaders/news_online/mediacloud_.py @@ -190,6 +190,7 @@ def get_mediacloud_fulltexts( label = "Extracting fulltexts" df["text"] = parallel_tqdm(_extract, urls_and_responses, desc=f"{label:<{40}}") df = df.dropna(subset=["text"]).rename(columns={"publish_date": "date"}) + df = df[(df["date"] >= start_date) & (df["date"] <= end_date)] df = df[ [ # "id", diff --git a/backend-python/media_impact_monitor/fulltexts_test.py b/backend-python/media_impact_monitor/fulltexts_test.py index 6fb8818..1b4a5e3 100644 --- a/backend-python/media_impact_monitor/fulltexts_test.py +++ b/backend-python/media_impact_monitor/fulltexts_test.py @@ -1,4 +1,4 @@ -from datetime import date, timedelta +from datetime import date import pandas as pd import pytest @@ -41,7 +41,7 @@ def test_get_fulltexts_for_event(): media_source="news_online", event_id=event_id, ), - sample_frac=0.1, + sample_frac=1, ) assert texts is not None assert len(texts) > 0 @@ -124,7 +124,7 @@ def test_get_fulltexts_date_range(default_start_date, default_end_date): start_date=default_start_date, end_date=default_end_date, ) - result = get_fulltexts(q, sample_frac=0.001) + result = get_fulltexts(q, sample_frac=0.01) assert isinstance(result, pd.DataFrame) assert not result.empty assert all( @@ -132,5 +132,5 @@ def test_get_fulltexts_date_range(default_start_date, default_end_date): ) assert "activism_sentiment" in result.columns assert "policy_sentiment" in result.columns - assert all(result["activism_sentiment"].isin([-1, 0, 1])) - assert all(result["policy_sentiment"].isin([-1, 0, 1])) + assert all(result["activism_sentiment"].isin([-1, 0, 1, None])) + assert all(result["policy_sentiment"].isin([-1, 0, 1, None]))