Skip to content

Commit

Permalink
feat(api): implement topics via LLMs
Browse files Browse the repository at this point in the history
  • Loading branch information
davidpomerenke committed Aug 14, 2024
1 parent d4b54f2 commit aa34011
Show file tree
Hide file tree
Showing 7 changed files with 288 additions and 71 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

search = mediacloud.api.SearchApi(MEDIACLOUD_API_TOKEN)
directory = mediacloud.api.DirectoryApi(MEDIACLOUD_API_TOKEN)
search.TIMEOUT_SECS = 60
search.TIMEOUT_SECS = 10

Platform = Literal["onlinenews-mediacloud", "onlinenews-waybackmachine"]

Expand All @@ -26,7 +26,6 @@ def _story_count_over_time(**kwargs):
return search.story_count_over_time(**kwargs)


@cache
def get_mediacloud_counts(
query: str,
end_date: date,
Expand Down
101 changes: 54 additions & 47 deletions backend-python/media_impact_monitor/fulltext_coding.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
import asyncio
import json

import backoff
import json_repair
from aiolimiter import AsyncLimiter
from litellm import BadRequestError
from litellm.exceptions import RateLimitError as RateLimitError1
from litellm import BadRequestError as BadRequestError1
from openai import BadRequestError as BadRequestError2
from tqdm.asyncio import tqdm_asyncio
from openai import RateLimitError as RateLimitError2
from media_impact_monitor.util.cache import cache

from media_impact_monitor.util.llm import acompletion, completion
Expand All @@ -29,22 +26,57 @@
"type": "string",
"description": "The reasoning for the choice of topics (1-3 sentences)",
},
# # the original free-text formulation for the topics:
# "topics": {
# "type": "array",
# "items": {
# "type": "string",
# "description": "A very concise free-text topic descriptor of 1-3 words, e.g. 'international relations', 'energy policy', 'olaf scholz', 'biodiversity', 'ukraine war', ...",
# },
# "description": "A list of the 10 most dominant topics in the text",
# },
"topics": {
"type": "array",
"items": {
"type": "string",
"description": "A very concise free-text topic descriptor of 1-3 words, e.g. 'international relations', 'energy policy', 'olaf scholz', 'biodiversity', 'ukraine war', ...",
"type": "object",
"description": "To what extent is the text about the following topics? 0: not at all, 1: a little, 2: somewhat, 3: mostly, 4: entirely",
"properties": {
"protests and activism": {
"type": "number",
"enum": [0, 1, 2, 3, 4],
},
"extreme weather and disasters": {
"type": "number",
"enum": [0, 1, 2, 3, 4],
},
"climate conferences and agreements": {
"type": "number",
"enum": [0, 1, 2, 3, 4],
},
"climate policy proposals": {
"type": "number",
"enum": [0, 1, 2, 3, 4],
},
"scientific research": {
"type": "number",
"enum": [0, 1, 2, 3, 4],
},
"urgency of climate action": {
"type": "number",
"enum": [0, 1, 2, 3, 4],
},
"social and international justice": {
"type": "number",
"enum": [0, 1, 2, 3, 4],
},
},
"description": "A list of the 10 most dominant topics in the text",
},
"activism_reasoning": {
"type": "string",
"description": "The reasoning for the activism extent (1 sentence)",
},
"activism": {
"type": "number",
"enum": [0, 1, 2, 3, 4],
"description": "To what extent is the text about activism? 0: not at all, 1: a little, 2: somewhat, 3: mostly, 4: entirely",
"required": [
"protests and activism",
"extreme weather and disasters",
"climate conferences and agreements",
"climate policy proposals",
"scientific research",
"urgency of climate action",
"social and international justice",
],
},
"activism_sentiment_reasoning": {
"type": ["string", "null"],
Expand All @@ -55,15 +87,6 @@
"enum": [-1, 0, 1],
"description": "What sentiment does the text have towards the activists/protester? -1: negative, 0: neutral, 1: positive. If the text is not about activism, this field should be null.",
},
"policy_reasoning": {
"type": "string",
"description": "The reasoning for the policy extent (1 sentence)",
},
"policy": {
"type": "number",
"enum": [0, 1, 2, 3, 4],
"description": "To what extent is the text about policy? 0: not at all, 1: a little, 2: somewhat, 3: mostly, 4: entirely",
},
"policy_sentiment_reasoning": {
"type": ["string", "null"],
"description": "The reasoning for the policy sentiment (1-5 sentences). If the text is not about policy, this field should be null.",
Expand All @@ -73,29 +96,14 @@
"enum": [-1, 0, 1],
"description": "Does the text point out the insufficiency of existing policies and support progressive policy changes? -1: it supports the status quo or suggests regressive policy changes, 0: neutral, 1: it points out the insufficiency of existing policies or supports progressive policy changes. If the text is not about policy, this field should be null.",
},
"science_reasoning": {
"type": "string",
"description": "The reasoning for the science extent (1 sentence)",
},
"science": {
"type": "number",
"enum": [0, 1, 2, 3, 4],
"description": "To what extent is the text about natural phenomena or scientific research? 0: not at all, 1: a little, 2: somewhat, 3: mostly, 4: entirely",
},
},
"required": [
"topics_reasoning",
"topics",
"activism_reasoning",
"activism",
"activism_sentiment_reasoning",
"activism_sentiment",
"policy_reasoning",
"policy",
"policy_sentiment_reasoning",
"policy_sentiment",
"science_reasoning",
"science",
],
},
},
Expand All @@ -106,8 +114,6 @@
rate_limit = AsyncLimiter(max_rate=1000, time_period=60)


# @cache
# @backoff.on_exception(backoff.expo, [RateLimitError1, RateLimitError2], max_time=120)
async def code_fulltext(text: str) -> dict | None:
if len(text) < 20:
return None
Expand All @@ -124,7 +130,7 @@ async def code_fulltext(text: str) -> dict | None:
temperature=0.0,
max_tokens=4000,
)
except BadRequestError as e:
except (BadRequestError1, BadRequestError2) as e:
print("Error while coding the text with AI:", e)
return
try:
Expand All @@ -138,6 +144,7 @@ async def code_fulltext(text: str) -> dict | None:
data[sent] = (
int(data[sent]) if sent in data and data[sent] is not None else None
)
data["topics"] = data["topics"]
return data
except (json.JSONDecodeError, AssertionError):
print(
Expand Down Expand Up @@ -181,7 +188,7 @@ def get_aspect_sentiment(text: str, aspect: str) -> float:
tool_choice={"type": "function", "function": {"name": "score_sentiment"}},
temperature=0.0,
)
except BadRequestError as e:
except (BadRequestError1, BadRequestError2) as e:
print(e)
print(text)
print(response)
Expand Down
6 changes: 4 additions & 2 deletions backend-python/media_impact_monitor/fulltexts.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
climate_orgs,
)
from media_impact_monitor.events import get_events_by_id
from media_impact_monitor.fulltext_coding import code_fulltext, code_many_fulltexts
from media_impact_monitor.fulltext_coding import (
code_many_fulltexts,
)
from media_impact_monitor.trends.keyword_trend import (
add_quotes,
load_keywords,
Expand All @@ -19,7 +21,6 @@
)
from media_impact_monitor.types_ import FulltextSearch
from media_impact_monitor.util.cache import cache
from media_impact_monitor.util.parallel import parallel_tqdm


@cache
Expand Down Expand Up @@ -95,5 +96,6 @@ def get_fulltexts(q: FulltextSearch, sample_frac: float = 0.1) -> pd.DataFrame |
for field in ["activism_sentiment", "policy_sentiment"]:
df[field] = [r[field] if r and field in r else None for r in coded]
df[field] = df[field].fillna(0).astype(int)
df["topics"] = [r["topics"] if r else None for r in coded]

return df
3 changes: 3 additions & 0 deletions backend-python/media_impact_monitor/trend.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from media_impact_monitor.trends.topic_trend import get_topic_trend
import pandas as pd

from media_impact_monitor.trends.keyword_trend import get_keyword_trend
Expand All @@ -11,6 +12,8 @@ def get_trend(q: TrendSearch, as_json=True) -> Trend:
df = get_keyword_trend(q)
case "sentiment":
df = get_sentiment_trend(q)
case "topic":
df = get_topic_trend(q)
case _:
raise ValueError(f"Unsupported trend type: {q.trend_type}")
match df:
Expand Down
23 changes: 23 additions & 0 deletions backend-python/media_impact_monitor/trends/topic_trend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from datetime import date
import pandas as pd

from media_impact_monitor.fulltexts import get_fulltexts
from media_impact_monitor.types_ import FulltextSearch, TrendSearch
from media_impact_monitor.util.cache import cache


@cache
def get_topic_trend(q: TrendSearch) -> pd.DataFrame | str:
    """Return daily summed topic scores for the fulltexts matching a trend query.

    Args:
        q: The trend query. Only ``media_source == "news_online"`` is
            supported. When ``q.start_date`` is unset, 2022-01-01 is used.
            The query object itself is left unmodified.

    Returns:
        A DataFrame indexed by day (from the start date to ``q.end_date``)
        with one column per topic holding the sum of the per-article topic
        scores for that day; days without any article are filled with 0.
        If no trend can be computed, a string explaining why is returned
        instead.
    """
    if q.media_source != "news_online":
        return f"Topic trend requires fulltext analysis, which is only available for news_online, not {q.media_source}."

    # Resolve the default start date locally instead of writing it back to
    # `q`: mutating the argument of a cached function leaks into the caller.
    start_date = q.start_date or date(2022, 1, 1)
    excluded_fields = {"trend_type", "aggregation"}
    params = {
        field: value
        for field, value in dict(q).items()
        if field not in excluded_fields
    }
    params["start_date"] = start_date

    fulltexts = get_fulltexts(FulltextSearch(**params), sample_frac=0.01)
    # get_fulltexts is annotated as returning a DataFrame or an alternative
    # value (presumably None when nothing matches -- TODO confirm), so guard
    # before indexing into the result.
    if not isinstance(fulltexts, pd.DataFrame) or fulltexts.empty:
        return "No fulltexts were found for this query, so no topic trend could be computed."

    # Convert to pandas timestamps so the grouped index aligns with the
    # DatetimeIndex used for reindexing below; this is a no-op when the
    # column is already datetime64.
    dates = pd.to_datetime(fulltexts["date"])
    # Expand each article's {topic: score} mapping into one column per topic.
    topic_scores = fulltexts["topics"].apply(pd.Series)
    df = pd.concat([dates, topic_scores], axis=1)
    # TODO: normalize!! (values are raw sums of the per-article scores)
    df = df.groupby("date").sum()
    # add 0 for missing dates between the start date and q.end_date
    all_days = pd.date_range(start_date, q.end_date, freq="D")
    return df.reindex(all_days, fill_value=0)
Loading

0 comments on commit aa34011

Please sign in to comment.