
Commit a632845
refactor(backend): type annotations, minor caching thing
davidpomerenke committed Aug 25, 2024
1 parent 0f96879 commit a632845
Showing 3 changed files with 7 additions and 5 deletions.

@@ -70,6 +70,7 @@ def _story_list_all_pages(
     collection_ids: list[int] | None = None,
     platform: Platform = "onlinenews-mediacloud",
     sample_frac: float = 1,
+    verbose: bool = False,
 ):
     all_stories = []
     more_stories = True
@@ -95,9 +96,10 @@ def _story_list_all_pages(
             )
         else:
             dt = end_date
-        print(
-            f"retrieved metadata for {len(all_stories)} stories for month {start_date.year}-{start_date.month}, currently at {dt}"
-        )
+        if verbose:
+            print(
+                f"retrieved metadata for {len(all_stories)} stories for month {start_date.year}-{start_date.month}, currently at {dt}"
+            )
     # https://github.com/mediacloud/api-tutorial-notebooks/blob/main/MC02%20-%20attention.ipynb:
     # > As you may have noted, this can take a while for long time periods. If you look closely you'll notice that it can't be easily parallelized, because it requires content in the results to make the next call. A workaround is to divide you query up by time and query in parallel for something like each day. This can speed up the response. Also just contact us directly if you are trying to do larger data dumps, or hit up against your API quota.
     # take a 1% sample of stories
Expand All @@ -118,6 +120,7 @@ def _slice_date_range(start: date, end: date) -> list[tuple[date, date]]:
return result


@cache
def _story_list_split_monthly(
query: str,
start_date: date,
Expand Down
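
The hunks above make the progress print opt-in via a new verbose flag and put @cache on the monthly splitter _story_list_split_monthly. The quoted Media Cloud note explains why the splitter exists at all: paginated story listing cannot be parallelized within one window, so the query is divided up by time. As a rough illustration of that idea only (the body of the repository's _slice_date_range is not shown in this diff), a monthly slicer matching the signature from the hunk header could look like this:

# Illustrative sketch only -- it reproduces the _slice_date_range signature
# from the hunk header above: split a date range into per-month windows that
# can be fetched independently (and, per the quoted note, in parallel).
from datetime import date, timedelta


def slice_date_range(start: date, end: date) -> list[tuple[date, date]]:
    """Split [start, end] into consecutive (month_start, month_end) windows."""
    result: list[tuple[date, date]] = []
    current = date(start.year, start.month, 1)
    while current <= end:
        # first day of the following month
        if current.month == 12:
            next_month = date(current.year + 1, 1, 1)
        else:
            next_month = date(current.year, current.month + 1, 1)
        result.append((max(current, start), min(next_month - timedelta(days=1), end)))
        current = next_month
    return result


# slice_date_range(date(2024, 1, 15), date(2024, 3, 10))
# -> [(2024-01-15, 2024-01-31), (2024-02-01, 2024-02-29), (2024-03-01, 2024-03-10)]
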
@@ -5,7 +5,6 @@
 from media_impact_monitor.util.cache import cache


-@cache
 def get_sentiment_trend(q: TrendSearch) -> tuple[pd.DataFrame | None, list[str]]:
     """
     Retrieves the sentiment trend for a given query and start date.
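
Here the @cache decorator is removed from the top-level get_sentiment_trend, while the first file gains it on _story_list_split_monthly: the "minor caching thing" from the commit message moves caching from the whole trend request down to the monthly split. The behavior of media_impact_monitor.util.cache.cache is not shown in this diff, so the following stand-alone sketch uses functools.lru_cache and a hypothetical fetch_month helper purely to illustrate why caching the finer-grained call lets overlapping requests share work:

# Minimal sketch, not the project's code: functools.lru_cache stands in for
# media_impact_monitor.util.cache.cache, and fetch_month is a hypothetical
# per-month fetch. With the cache on the fine-grained call, two requests that
# overlap in time reuse the months they share, instead of only hitting the
# cache on an exact repeat of the whole request.
from functools import lru_cache


@lru_cache(maxsize=None)
def fetch_month(query: str, year: int, month: int) -> int:
    print(f"fetching {query} {year}-{month:02d}")  # runs once per (query, month)
    return 0  # placeholder for an expensive API call


def trend(query: str, months: tuple[tuple[int, int], ...]) -> list[int]:
    return [fetch_month(query, y, m) for y, m in months]


trend("climate", ((2024, 1), (2024, 2)))  # fetches Jan and Feb
trend("climate", ((2024, 2), (2024, 3)))  # Feb comes from the cache, only Mar is fetched
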
2 changes: 1 addition & 1 deletion backend-python/media_impact_monitor/util/parallel.py
@@ -9,7 +9,7 @@ def parallel_tqdm(
     n_jobs: int = 8,
     backend: str = "loky",
     **kwargs,
-):
+) -> list:
     """Parallelize a function with a tqdm progress bar."""
     total = total or len(iter)
     results = Parallel(n_jobs=n_jobs, return_as="generator", backend=backend)(
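
The only change in parallel.py is the new -> list return annotation on parallel_tqdm. The visible lines suggest a thin wrapper around joblib's Parallel(..., return_as="generator") that drains the generator through a tqdm progress bar and collects the results into a list; a sketch along those lines (reconstructed from the hunk, so anything outside the shown lines is a guess) might look like:

# Sketch reconstructed around the lines visible in the hunk above; the
# parameters before n_jobs (func, iter, total) and the delayed(...) call are
# assumptions, since they fall outside the shown lines.
from joblib import Parallel, delayed
from tqdm import tqdm


def parallel_tqdm(
    func,
    iter,  # name kept as in the hunk ("len(iter)"), even though it shadows the builtin
    total: int | None = None,
    n_jobs: int = 8,
    backend: str = "loky",
    **kwargs,
) -> list:
    """Parallelize a function with a tqdm progress bar."""
    total = total or len(iter)
    results = Parallel(n_jobs=n_jobs, return_as="generator", backend=backend)(
        delayed(func)(item, **kwargs) for item in iter
    )
    # Iterating the generator through tqdm reports progress as jobs finish;
    # materializing it into a list matches the new `-> list` annotation.
    return list(tqdm(results, total=total))
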
