Merge pull request #225 from SocialChangeLab/dev
Dev
davidpomerenke authored Oct 20, 2024
2 parents 7ce2068 + 53f2000 commit 61f9cdc
Showing 12 changed files with 351 additions and 15 deletions.
20 changes: 15 additions & 5 deletions .env.example
@@ -1,13 +1,23 @@
+# protest data
 ACLED_EMAIL=
 ACLED_KEY=
+# media data
 MEDIACLOUD_API_TOKEN=
 ZENROWS_API_KEY=
+# google trends data
+DATAFORSEO_EMAIL=
+DATAFORSEO_PASSWORD=
+# tiktok data
+RAPIDAPI_KEY=
+# bundestag data
+BUNDESTAG_API_KEY=
+# ai
 AZURE_API_BASE=
 AZURE_API_VERSION=
 AZURE_API_KEY=
-BUNDESTAG_API_KEY=
-DATAFORSEO_EMAIL=
-DATAFORSEO_PASSWORD=
-PORT=
+# logging
 SENTRY_DSN=
-AI_TREND_RESOLUTION=0.01 # fraction of all articles that should be downloaded and ai-coded for the sentiment and topic trends
+# port where the server should run
+PORT=
+# fraction of all articles that should be downloaded and ai-coded for the sentiment and topic trends
+AI_TREND_RESOLUTION=0.01
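For intuition, a minimal sketch of how such a fraction could be applied when sampling articles for AI coding; the article list and the random-sampling step are illustrative assumptions, not code from this commit:

import random

AI_TREND_RESOLUTION = 0.01  # as configured above

articles = [f"article-{i}" for i in range(10_000)]  # hypothetical corpus
n_to_code = max(1, int(len(articles) * AI_TREND_RESOLUTION))
sample = random.sample(articles, n_to_code)  # ~100 of 10,000 articles get AI-coded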
2 changes: 2 additions & 0 deletions .github/workflows/deploy.yml
@@ -66,3 +66,5 @@ jobs:
       DATAFORSEO_PASSWORD: ${{ secrets.DATAFORSEO_PASSWORD }}
       BUNDESTAG_API_KEY: ${{ secrets.BUNDESTAG_API_KEY }}
       SENTRY_DSN: ${{ secrets.SENTRY_DSN }}
+      RAPIDAPI_KEY: ${{ secrets.RAPIDAPI_KEY }}

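For this to resolve at runtime, the RAPIDAPI_KEY secret must also be created in the repository's GitHub Actions secrets; the workflow line above only passes it through to the job environment.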
2 changes: 1 addition & 1 deletion backend-python/media_impact_monitor/cron.py
@@ -51,7 +51,7 @@ def fill_cache():
             )
         except Exception as e:
             errors.append(f"events {data_source}: {e}")
-    for media_source in ["news_online", "news_print", "web_google"]:
+    for media_source in ["news_online", "news_print", "social_tiktok", "web_google"]:
         for trend_type in ["keywords", "sentiment"]:
             for aggregation in ["daily", "weekly"]:
                 if aggregation == "daily" and media_source == "web_google":
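With social_tiktok added, fill_cache now warms the cache for four media sources, two trend types (keywords, sentiment), and two aggregations (daily, weekly), with the daily web_google combination still special-cased.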
171 changes: 171 additions & 0 deletions backend-python/media_impact_monitor/data_loaders/social_media/tiktok.py
@@ -0,0 +1,171 @@
import re
from collections import Counter
from datetime import datetime
from typing import Any

import pandas as pd
from tqdm.auto import tqdm

from media_impact_monitor.util.cache import get
from media_impact_monitor.util.env import RAPIDAPI_KEY

headers = {
    "x-rapidapi-key": RAPIDAPI_KEY,
    "x-rapidapi-host": "tiktok-scraper7.p.rapidapi.com",
}


def get_videos_for_keywords(
    keywords: str, n: int, cursor: int = 0
) -> list[dict[str, Any]]:
    """
    Get videos for a given set of keywords.
    Problem: This returns max ~150 videos, even for very popular keywords.
    Use hashtag query to get more videos.
    """
    url = "https://tiktok-scraper7.p.rapidapi.com/feed/search"
    query = {
        "keywords": keywords,
        "region": "us",  # location of the proxy server
        "count": 30,  # max: 30
        "cursor": cursor,
        "publish_time": "0",  # 0: all; 1: past 24 hours; 7: this week; 30: this month; 90: last 3 months; 180: last 6 months
        "sort_type": "0",  # 0: relevance; 1: like count; 3: date posted
    }
    response = get(url, headers=headers, params=query)
    data = response.json()["data"]
    videos, cursor, has_more = data["videos"], data["cursor"], data["hasMore"]
    if has_more and cursor < n:
        # recurse with the cursor returned by the API until ~n videos are collected
        videos.extend(get_videos_for_keywords(keywords=keywords, n=n, cursor=cursor))
    return videos


def get_hashtag_suggestions(keywords: str) -> Counter:
    """Count the hashtags occurring in video titles for a keyword search."""
    videos = get_videos_for_keywords(keywords, n=100)
    titles = [video["title"] for video in videos]
    hashtags = [re.findall(r"#(\w+)", title) for title in titles]
    hashtags = [item for sublist in hashtags for item in sublist]  # flatten
    return Counter(hashtags)


def get_hashtag_id(hashtag: str) -> str:
    """Resolve a hashtag ("challenge") name to its numeric id."""
    url = "https://tiktok-scraper7.p.rapidapi.com/challenge/info"
    querystring = {
        "challenge_name": hashtag,
    }
    response = get(url, headers=headers, params=querystring)
    return response.json()["data"]["id"]


def get_videos_for_hashtag_id(
    hashtag_id: str, n: int, cursor: int = 0, verbose: bool = True
) -> list[dict[str, Any]]:
    url = "https://tiktok-scraper7.p.rapidapi.com/challenge/posts"
    query = {
        "challenge_id": hashtag_id,
        "count": 20,  # max: 20
        "cursor": cursor,
    }
    response = get(url, headers=headers, params=query)
    data = response.json()["data"]
    videos, cursor, has_more = data["videos"], data["cursor"], data["hasMore"]
    if has_more and cursor < n:
        if verbose:
            print(cursor)
        videos.extend(
            get_videos_for_hashtag_id(
                hashtag_id=hashtag_id, n=n, cursor=cursor, verbose=verbose
            )
        )
    return videos


def get_videos_for_hashtag(
    hashtag: str, n: int, cursor: int = 0, verbose: bool = True
) -> list[dict[str, Any]]:
    hashtag_id = get_hashtag_id(hashtag)
    return get_videos_for_hashtag_id(hashtag_id, n=n, cursor=cursor, verbose=verbose)


def get_video_history_for_hashtag(
    hashtag: str, n: int, verbose: bool = True
) -> pd.DataFrame:
    """
    Get video history for a hashtag.
    Returns a time series of views and posts.
    Views are computed by summing the views of all videos that were posted on a given day -- that is, the views do not correspond to the dates when the videos were actually viewed. It is recommended to just use posts, or comments (see `get_comment_history_for_hashtag`).
    """
    videos = get_videos_for_hashtag(hashtag, n=n, verbose=verbose)
    df = pd.DataFrame(
        {
            "date": [datetime.fromtimestamp(video["create_time"]) for video in videos],
            "id": [video["video_id"] for video in videos],
            "title": [video["title"] for video in videos],
            "views": [video["play_count"] for video in videos],
        }
    )
    df["date"] = pd.to_datetime(df["date"])
    df = df.sort_values("date")
    ts = (
        df.resample("1D", on="date")
        .agg(
            {
                "views": "sum",
                "id": "count",
            }
        )
        .rename(columns={"id": "posts"})
    )
    # fill days without any posts with zeros
    ts = ts.reindex(pd.date_range(start=ts.index.min(), end=ts.index.max())).fillna(0)
    return ts


def get_comments_for_video(
    video_id: str, n: int, cursor: int = 0
) -> list[dict[str, Any]]:
    url = "https://tiktok-scraper7.p.rapidapi.com/comment/list"
    query = {
        "url": video_id,  # the endpoint's "url" parameter is passed the video id here
        "count": 50,  # max: 50 (?)
        "cursor": cursor,
    }
    response = get(url, headers=headers, params=query)
    data = response.json()["data"]
    comments, cursor, has_more = data["comments"], data["cursor"], data["hasMore"]
    if has_more and cursor < n:
        comments.extend(get_comments_for_video(video_id, n=n, cursor=cursor))
    return comments


def get_comment_history_for_hashtag(
    hashtag: str, n_posts: int, n_comments: int, verbose: bool = True
) -> pd.DataFrame:
    """Get a weekly time series of comment counts on videos for a hashtag."""
    videos = get_videos_for_hashtag(hashtag, n=n_posts, verbose=verbose)
    comments = [
        get_comments_for_video(video["video_id"], n=n_comments)
        for video in tqdm(videos)
        if video["comment_count"] > 0
    ]
    # flatten the per-video comment lists
    comments = [comment for video_comments in comments for comment in video_comments]
    comments_df = pd.DataFrame(
        {
            "date": [
                datetime.fromtimestamp(comment["create_time"]) for comment in comments
            ],
            "text": [comment["text"] for comment in comments],
            "video_id": [comment["video_id"] for comment in comments],
        }
    )
    ts = (
        comments_df.resample("1W", on="date")
        .agg(
            {
                "text": "count",
            }
        )
        .rename(columns={"text": "comments"})
    )
    # fill missing weeks with zeros (weekly freq to match the resampled index)
    ts = ts.reindex(
        pd.date_range(start=ts.index.min(), end=ts.index.max(), freq="W")
    ).fillna(0)
    return ts
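Taken together, a hedged usage sketch of the loader above (hashtag names and counts are illustrative; a valid RAPIDAPI_KEY must be set in the environment):

from media_impact_monitor.data_loaders.social_media.tiktok import (
    get_hashtag_suggestions,
    get_video_history_for_hashtag,
    get_comment_history_for_hashtag,
)

# explore which hashtags co-occur with a keyword search
print(get_hashtag_suggestions("climate change").most_common(10))

# daily posts (and summed views) for one hashtag
videos_ts = get_video_history_for_hashtag("climatechange", n=500)
print(videos_ts["posts"].tail())

# weekly comment counts across the hashtag's recent videos
comments_ts = get_comment_history_for_hashtag("climatechange", n_posts=50, n_comments=100)
print(comments_ts.tail())

As the docstring notes, posts (or comments) are the more meaningful series, since views are attributed to a video's publication date rather than to when the views occurred.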
111 changes: 111 additions & 0 deletions (new test file for the TikTok data loader)
@@ -0,0 +1,111 @@
import pytest
import pandas as pd
from datetime import datetime, timedelta
from collections import Counter

from media_impact_monitor.data_loaders.social_media.tiktok import (
    get_videos_for_keywords,
    get_hashtag_suggestions,
    get_hashtag_id,
    get_videos_for_hashtag_id,
    get_videos_for_hashtag,
    get_video_history_for_hashtag,
    get_comments_for_video,
    get_comment_history_for_hashtag,
)


@pytest.mark.slow
def test_get_videos_for_keywords():
    videos = get_videos_for_keywords("climate change", n=50)
    assert len(videos) > 0
    assert isinstance(videos[0], dict)
    assert "title" in videos[0]
    assert "video_id" in videos[0]


@pytest.mark.slow
def test_get_hashtag_suggestions():
    suggestions = get_hashtag_suggestions("climate change")
    assert len(suggestions) > 0
    assert isinstance(suggestions, Counter)


@pytest.mark.slow
def test_get_hashtag_id():
    hashtag_id = get_hashtag_id("climatechange")
    assert isinstance(hashtag_id, str)
    assert len(hashtag_id) > 0


@pytest.mark.slow
def test_get_videos_for_hashtag_id():
    hashtag_id = get_hashtag_id("climatechange")
    videos = get_videos_for_hashtag_id(hashtag_id, n=50)
    assert len(videos) > 0
    assert isinstance(videos[0], dict)
    assert "title" in videos[0]
    assert "video_id" in videos[0]


@pytest.mark.slow
def test_get_videos_for_hashtag():
    videos = get_videos_for_hashtag("climatechange", n=50)
    assert len(videos) > 0
    assert isinstance(videos[0], dict)
    assert "title" in videos[0]
    assert "video_id" in videos[0]


@pytest.mark.slow
def test_get_video_history_for_hashtag():
    history = get_video_history_for_hashtag("climatechange", n=100)
    assert isinstance(history, pd.DataFrame)
    assert len(history) > 0
    assert "views" in history.columns
    assert "posts" in history.columns


@pytest.mark.slow
def test_get_comments_for_video():
    videos = get_videos_for_hashtag("climatechange", n=1)
    video_id = videos[0]["video_id"]
    comments = get_comments_for_video(video_id, n=50)
    assert len(comments) > 0
    assert isinstance(comments[0], dict)
    assert "text" in comments[0]


@pytest.mark.slow
def test_get_comment_history_for_hashtag():
    history = get_comment_history_for_hashtag(
        "climatechange", n_posts=10, n_comments=10
    )
    assert isinstance(history, pd.DataFrame)
    assert len(history) > 0
    assert "comments" in history.columns


@pytest.mark.slow
def test_data_freshness():
    videos = get_videos_for_hashtag("climatechange", n=50)
    latest_video_date = max(
        datetime.fromtimestamp(video["create_time"]) for video in videos
    )
    assert latest_video_date >= datetime.now() - timedelta(
        days=7
    ), "No recent videos found"


@pytest.mark.slow
def test_video_content():
    videos = get_videos_for_keywords("climate change", n=50)
    climate_related_words = [
        "climate",
        "environment",
        "global warming",
        "sustainability",
    ]
    assert any(
        any(word in video["title"].lower() for word in climate_related_words)
        for video in videos
    ), "No climate-related content found in video titles"
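All of these tests exercise the live RapidAPI endpoints, hence the slow marker; assuming the marker is registered in the project's pytest configuration, they can be selected with pytest -m slow and excluded with pytest -m "not slow".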
14 changes: 13 additions & 1 deletion backend-python/media_impact_monitor/trends/keyword_trend.py
@@ -5,6 +5,9 @@
     get_mediacloud_counts,
 )
 from media_impact_monitor.data_loaders.news_print.genios import get_genios_counts
+from media_impact_monitor.data_loaders.social_media.tiktok import (
+    get_video_history_for_hashtag,
+)
 from media_impact_monitor.data_loaders.web.google_trends import get_google_trends_counts
 from media_impact_monitor.types_ import TrendSearch
 from media_impact_monitor.util.paths import src
@@ -31,6 +34,8 @@ def get_keyword_trend(q: TrendSearch) -> tuple[pd.DataFrame | None, list[str]]:
                 )
             case "web_google":
                 ds = get_google_trends_counts(query=query, end_date=q.end_date)
+            case "social_tiktok":
+                ds = get_video_history_for_hashtag(query, n=1000, verbose=True)["posts"]
             case _:
                 raise ValueError(f"Unsupported media source: {q.media_source}")
         dss[topic] = ds
@@ -65,7 +70,14 @@ def topic_queries(media_source: str) -> dict[str, str]:
         #     media_source,
         # ),
     }
-    if media_source != "web_google":
+    if media_source == "social_tiktok":
+        keyword_queries = {
+            "climate activism": "climateprotest",  # TODO: improve
+            "climate policy": "climateaction",  # TODO: improve
+            "climate science": "climatechange",  # TODO: improve
+            "climate crisis framing": "climatecrisis",  # TODO: improve
+        }
+    elif media_source != "web_google":
        keyword_queries["climate activism"] = xs_with_ys(
            keywords["climate_science"]
            + keywords["climate_policy"]
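For illustration, a sketch of what the social_tiktok branch above yields per topic: a daily posts series per hashtag, combined here into one DataFrame (the surrounding get_keyword_trend plumbing is elided):

import pandas as pd

from media_impact_monitor.data_loaders.social_media.tiktok import (
    get_video_history_for_hashtag,
)

queries = {
    "climate activism": "climateprotest",
    "climate science": "climatechange",
}
trend = pd.DataFrame(
    {
        topic: get_video_history_for_hashtag(hashtag, n=1000, verbose=True)["posts"]
        for topic, hashtag in queries.items()
    }
).fillna(0)  # days missing for one hashtag become 0
print(trend.tail())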
2 changes: 1 addition & 1 deletion backend-python/media_impact_monitor/types_.py
@@ -9,7 +9,7 @@
 # FIXME: consider renaming "Topic" to "Issue" to avoid confusion with topics within the issue (like science or policy)
 Topic = Literal["climate_change"]
 Query = str  # for now, just a single keyword
-MediaSource = Literal["news_online", "news_print", "web_google"]
+MediaSource = Literal["news_online", "news_print", "web_google", "social_tiktok"]
 
 StartDateField = Field(
     default=date(2020, 1, 1),
2 changes: 2 additions & 0 deletions backend-python/media_impact_monitor/util/env.py
@@ -18,6 +18,7 @@
 DATAFORSEO_PASSWORD = environ["DATAFORSEO_PASSWORD"]
 BUNDESTAG_API_KEY = environ["BUNDESTAG_API_KEY"]
 SENTRY_DSN = environ["SENTRY_DSN"]
+RAPIDAPI_KEY = environ["RAPIDAPI_KEY"]
 AI_TREND_RESOLUTION = float(environ.get("AI_TREND_RESOLUTION", 0.01))
 
 assert ACLED_EMAIL
@@ -31,3 +32,4 @@
 assert DATAFORSEO_PASSWORD
 assert BUNDESTAG_API_KEY
 assert SENTRY_DSN
+assert RAPIDAPI_KEY