Merge pull request #225 from SocialChangeLab/dev
Dev
Showing 12 changed files with 351 additions and 15 deletions.
```diff
@@ -1,13 +1,23 @@
+# protest data
+ACLED_EMAIL=
+ACLED_KEY=
+# media data
 MEDIACLOUD_API_TOKEN=
 ZENROWS_API_KEY=
+# google trends data
+DATAFORSEO_EMAIL=
+DATAFORSEO_PASSWORD=
+# tiktok data
+RAPIDAPI_KEY=
+# bundestag data
+BUNDESTAG_API_KEY=
 # ai
 AZURE_API_BASE=
 AZURE_API_VERSION=
 AZURE_API_KEY=
-BUNDESTAG_API_KEY=
-DATAFORSEO_EMAIL=
-DATAFORSEO_PASSWORD=
-PORT=
 # logging
 SENTRY_DSN=
-AI_TREND_RESOLUTION=0.01 # fraction of all articles that should be downloaded and ai-coded for the sentiment and topic trends
+# port where the server should run
+PORT=
+# fraction of all articles that should be downloaded and ai-coded for the sentiment and topic trends
+AI_TREND_RESOLUTION=0.01
```
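For context, a minimal sketch of how such a template is typically consumed at startup. This assumes python-dotenv; the project's actual `media_impact_monitor.util.env` module is not shown in this diff, so the names and defaults below are illustrative:

```python
# Hypothetical sketch of reading the template above; the real
# media_impact_monitor.util.env module is not part of this diff.
import os

from dotenv import load_dotenv  # assumes python-dotenv is installed

load_dotenv()  # copy variables from a local .env file into os.environ

RAPIDAPI_KEY = os.environ["RAPIDAPI_KEY"]  # required by the TikTok loader below
SENTRY_DSN = os.getenv("SENTRY_DSN")  # optional, e.g. for error logging
# Fraction of all articles to download and AI-code for sentiment/topic trends.
AI_TREND_RESOLUTION = float(os.getenv("AI_TREND_RESOLUTION", "0.01"))
```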
backend-python/media_impact_monitor/data_loaders/social_media/tiktok.py (new file, 171 additions)
```python
import re
from collections import Counter
from datetime import datetime
from typing import Any

import pandas as pd
from tqdm.auto import tqdm

from media_impact_monitor.util.cache import get
from media_impact_monitor.util.env import RAPIDAPI_KEY

headers = {
    "x-rapidapi-key": RAPIDAPI_KEY,
    "x-rapidapi-host": "tiktok-scraper7.p.rapidapi.com",
}


def get_videos_for_keywords(
    keywords: str, n: int, cursor: int = 0
) -> list[dict[str, Any]]:
    """
    Get videos for a given set of keywords.

    Problem: this returns max ~150 videos, even for very popular keywords.
    Use the hashtag query to get more videos.
    """
    url = "https://tiktok-scraper7.p.rapidapi.com/feed/search"
    query = {
        "keywords": keywords,
        "region": "us",  # location of the proxy server
        "count": 30,  # max: 30
        "cursor": cursor,
        "publish_time": "0",  # 0: all, 1: past 24h, 7: this week, 30: this month, 90: last 3 months, 180: last 6 months
        "sort_type": "0",  # 0: relevance, 1: like count, 3: date posted
    }
    response = get(url, headers=headers, params=query)
    # print(response.json())
    data = response.json()["data"]
    videos, cursor, has_more = data["videos"], data["cursor"], data["hasMore"]
    if has_more and cursor < n:
        videos.extend(get_videos_for_keywords(keywords=keywords, n=n, cursor=cursor))
    return videos


def get_hashtag_suggestions(keywords: str) -> Counter:
    videos = get_videos_for_keywords(keywords, n=100)
    titles = [video["title"] for video in videos]
    hashtags = [re.findall(r"#(\w+)", title) for title in titles]
    hashtags = [item for sublist in hashtags for item in sublist]
    hashtag_counts = Counter(hashtags)
    return hashtag_counts


def get_hashtag_id(hashtag: str) -> str:
    url = "https://tiktok-scraper7.p.rapidapi.com/challenge/info"
    querystring = {
        "challenge_name": hashtag,
    }
    response = get(url, headers=headers, params=querystring)
    return response.json()["data"]["id"]


def get_videos_for_hashtag_id(
    hashtag_id: str, n: int, cursor: int = 0, verbose: bool = True
) -> list[dict[str, Any]]:
    url = "https://tiktok-scraper7.p.rapidapi.com/challenge/posts"
    query = {
        "challenge_id": hashtag_id,
        "count": 20,  # max: 20
        "cursor": cursor,
    }
    response = get(url, headers=headers, params=query)
    data = response.json()["data"]
    videos, cursor, has_more = data["videos"], data["cursor"], data["hasMore"]
    if has_more and cursor < n:
        if verbose:
            print(cursor)
        videos.extend(
            get_videos_for_hashtag_id(
                hashtag_id=hashtag_id, n=n, cursor=cursor, verbose=verbose
            )
        )
    return videos


def get_videos_for_hashtag(
    hashtag: str, n: int, cursor: int = 0, verbose: bool = True
) -> list[dict[str, Any]]:
    hashtag_id = get_hashtag_id(hashtag)
    return get_videos_for_hashtag_id(hashtag_id, n=n, cursor=cursor, verbose=verbose)


def get_video_history_for_hashtag(
    hashtag: str, n: int, verbose: bool = True
) -> pd.DataFrame:
    """
    Get video history for a hashtag.

    Returns a time series of views and posts. Views are computed by summing
    the views of all videos that were posted on a given day -- that is, the
    views do not correspond to the dates when the videos were actually viewed.
    It is recommended to just use posts, or comments (see
    `get_comment_history_for_hashtag`).
    """
    videos = get_videos_for_hashtag(hashtag, n=n, verbose=verbose)
    df = pd.DataFrame(
        {
            "date": [datetime.fromtimestamp(video["create_time"]) for video in videos],
            "id": [video["video_id"] for video in videos],
            "title": [video["title"] for video in videos],
            "views": [video["play_count"] for video in videos],
        }
    )
    df["date"] = pd.to_datetime(df["date"])
    df = df.sort_values("date")
    ts = (
        df.resample("1D", on="date")
        .agg(
            {
                "views": "sum",
                "id": "count",
            }
        )
        .rename(columns={"id": "posts"})
    )
    ts = ts.reindex(pd.date_range(start=ts.index.min(), end=ts.index.max())).fillna(0)
    return ts


def get_comments_for_video(
    video_id: str, n: int, cursor: int = 0
) -> list[dict[str, Any]]:
    url = "https://tiktok-scraper7.p.rapidapi.com/comment/list"
    query = {
        "url": video_id,
        "count": 50,  # max: 50 (?)
        "cursor": cursor,
    }
    response = get(url, headers=headers, params=query)
    data = response.json()["data"]
    comments, cursor, has_more = data["comments"], data["cursor"], data["hasMore"]
    if has_more and cursor < n:
        comments.extend(get_comments_for_video(video_id, n=n, cursor=cursor))
    return comments


def get_comment_history_for_hashtag(
    hashtag: str, n_posts: int, n_comments: int, verbose: bool = True
) -> pd.DataFrame:
    videos = get_videos_for_hashtag(hashtag, n=n_posts, verbose=verbose)
    comments = [
        get_comments_for_video(video["video_id"], n=n_comments)
        for video in tqdm(videos)
        if video["comment_count"] > 0
    ]
    comments = [comment for video_comments in comments for comment in video_comments]
    comments_df = pd.DataFrame(
        {
            "date": [
                datetime.fromtimestamp(comment["create_time"]) for comment in comments
            ],
            "text": [comment["text"] for comment in comments],
            "video_id": [comment["video_id"] for comment in comments],
        }
    )
    ts = (
        comments_df.resample("1W", on="date")
        .agg(
            {
                "text": "count",
            }
        )
        .rename(columns={"text": "comments"})
    )
    # Weekly frequency to match the resample above; the daily default of
    # pd.date_range would interleave spurious all-zero rows between weeks.
    ts = ts.reindex(
        pd.date_range(start=ts.index.min(), end=ts.index.max(), freq="1W")
    ).fillna(0)
    return ts
```
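The loader pages through the RapidAPI endpoints by recursing on the returned cursor until `hasMore` is false or at least `n` items have been fetched. A usage sketch (the hashtag and counts are illustrative; this makes live, quota-consuming API calls and requires `RAPIDAPI_KEY` to be set):

```python
# Hypothetical usage sketch; makes live RapidAPI calls.
from media_impact_monitor.data_loaders.social_media.tiktok import (
    get_comment_history_for_hashtag,
    get_video_history_for_hashtag,
)

# Daily posts (and summed views) for #climatechange, fetching ~200 videos.
video_ts = get_video_history_for_hashtag("climatechange", n=200, verbose=False)
print(video_ts.tail())  # columns: views, posts

# Weekly comment counts; per the docstring, posts or comments are more
# trustworthy than views, which are attributed to each video's posting date.
comment_ts = get_comment_history_for_hashtag(
    "climatechange", n_posts=50, n_comments=100
)
print(comment_ts.tail())  # column: comments
```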
backend-python/media_impact_monitor/data_loaders/social_media/tiktok_test.py (new file, 111 additions)
```python
from collections import Counter
from datetime import datetime, timedelta

import pandas as pd
import pytest

from media_impact_monitor.data_loaders.social_media.tiktok import (
    get_videos_for_keywords,
    get_hashtag_suggestions,
    get_hashtag_id,
    get_videos_for_hashtag_id,
    get_videos_for_hashtag,
    get_video_history_for_hashtag,
    get_comments_for_video,
    get_comment_history_for_hashtag,
)


@pytest.mark.slow
def test_get_videos_for_keywords():
    videos = get_videos_for_keywords("climate change", n=50)
    assert len(videos) > 0
    assert isinstance(videos[0], dict)
    assert "title" in videos[0]
    assert "video_id" in videos[0]


@pytest.mark.slow
def test_get_hashtag_suggestions():
    suggestions = get_hashtag_suggestions("climate change")
    assert len(suggestions) > 0
    assert isinstance(suggestions, Counter)


@pytest.mark.slow
def test_get_hashtag_id():
    hashtag_id = get_hashtag_id("climatechange")
    assert isinstance(hashtag_id, str)
    assert len(hashtag_id) > 0


@pytest.mark.slow
def test_get_videos_for_hashtag_id():
    hashtag_id = get_hashtag_id("climatechange")
    videos = get_videos_for_hashtag_id(hashtag_id, n=50)
    assert len(videos) > 0
    assert isinstance(videos[0], dict)
    assert "title" in videos[0]
    assert "video_id" in videos[0]


@pytest.mark.slow
def test_get_videos_for_hashtag():
    videos = get_videos_for_hashtag("climatechange", n=50)
    assert len(videos) > 0
    assert isinstance(videos[0], dict)
    assert "title" in videos[0]
    assert "video_id" in videos[0]


@pytest.mark.slow
def test_get_video_history_for_hashtag():
    history = get_video_history_for_hashtag("climatechange", n=100)
    assert isinstance(history, pd.DataFrame)
    assert len(history) > 0
    assert "views" in history.columns
    assert "posts" in history.columns


@pytest.mark.slow
def test_get_comments_for_video():
    videos = get_videos_for_hashtag("climatechange", n=1)
    video_id = videos[0]["video_id"]
    comments = get_comments_for_video(video_id, n=50)
    assert len(comments) > 0
    assert isinstance(comments[0], dict)
    assert "text" in comments[0]


@pytest.mark.slow
def test_get_comment_history_for_hashtag():
    history = get_comment_history_for_hashtag(
        "climatechange", n_posts=10, n_comments=10
    )
    assert isinstance(history, pd.DataFrame)
    assert len(history) > 0
    assert "comments" in history.columns


@pytest.mark.slow
def test_data_freshness():
    videos = get_videos_for_hashtag("climatechange", n=50)
    latest_video_date = max(
        datetime.fromtimestamp(video["create_time"]) for video in videos
    )
    assert latest_video_date >= datetime.now() - timedelta(
        days=7
    ), "No recent videos found"


@pytest.mark.slow
def test_video_content():
    videos = get_videos_for_keywords("climate change", n=50)
    climate_related_words = [
        "climate",
        "environment",
        "global warming",
        "sustainability",
    ]
    assert any(
        any(word in video["title"].lower() for word in climate_related_words)
        for video in videos
    ), "No climate-related content found in video titles"
```
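Every test is marked `slow` because it hits the live API. For the marker to be recognized without warnings, it has to be registered somewhere; a sketch, assuming a `conftest.py` (the project's actual pytest configuration is not part of this diff):

```python
# Hypothetical conftest.py; the project's real pytest setup is not shown.
# Registering the marker lets `pytest -m slow` select these live-API tests
# and `pytest -m "not slow"` skip them, e.g. in CI.
def pytest_configure(config):
    config.addinivalue_line("markers", "slow: tests that call live external APIs")
```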