From 1659964cf2d452f24d9fa011a756f30745b08778 Mon Sep 17 00:00:00 2001 From: Matt Mollison Date: Fri, 26 Mar 2021 15:51:34 -0400 Subject: [PATCH 1/2] pull out twitter and instagram media urls --- thisishappening/utils/tweet_utils.py | 38 ++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/thisishappening/utils/tweet_utils.py b/thisishappening/utils/tweet_utils.py index 470365d..d7bba08 100644 --- a/thisishappening/utils/tweet_utils.py +++ b/thisishappening/utils/tweet_utils.py @@ -45,6 +45,7 @@ "\ufe0f", # Variation Selector-16 for emoji https://codepoints.net/U+FE0F ] +EXTERNAL_MEDIA_SOURCES = ["instagram"] nlp = en_core_web_sm.load(exclude=["parser", "ner"]) @@ -70,6 +71,7 @@ "place_country", "place_country_code", "place_type", + "media_urls", ], ) @@ -106,6 +108,37 @@ def get_tweet_body(status): return tweet_body +def get_media_urls(status): + media_urls = {} + + # Twitter native media are stored in extended_entities + key = "extended_entities" if "extended_entities" in status else "entities" + urls = [] + try: + for medium in status[key]["media"]: + urls.append(medium["media_url_https"]) + except KeyError: + logger.debug(f"No media in {key}") + if len(urls) > 0: + media_urls["twitter"] = urls + + # Other URLs are stored in entities + key = "entities" + for source in EXTERNAL_MEDIA_SOURCES: + urls = [] + try: + for url in status[key]["urls"]: + media_url = url.get("expanded_url") + if (media_url is not None) and (source in media_url): + urls.append(media_url) + except KeyError: + logger.debug(f"No urls in {key}") + if len(urls) > 0: + media_urls[source] = urls + + return media_urls + + def get_lon_lat(status): has_coords = False if status["coordinates"]: @@ -154,6 +187,10 @@ def get_tweet_info(status: Dict) -> Dict: place_country_code = status["place"].get("country_code") # Possible place_type values: country, admin, city, neighborhood, poi place_type = status["place"].get("place_type") + try: + media_urls = get_media_urls(status["extended_tweet"]) + except KeyError: + media_urls = get_media_urls(status) tweet_info = TweetInfo( status_id_str=status_id_str, @@ -175,6 +212,7 @@ def get_tweet_info(status: Dict) -> Dict: place_country=place_country, place_country_code=place_country_code, place_type=place_type, + media_urls=media_urls, ) return tweet_info From 840ed3e9a40e675d1112b62c93df1daf12b7f66b Mon Sep 17 00:00:00 2001 From: Matt Mollison Date: Fri, 26 Mar 2021 16:28:37 -0400 Subject: [PATCH 2/2] don't try/except --- thisishappening/utils/tweet_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/thisishappening/utils/tweet_utils.py b/thisishappening/utils/tweet_utils.py index d7bba08..3ff31f5 100644 --- a/thisishappening/utils/tweet_utils.py +++ b/thisishappening/utils/tweet_utils.py @@ -187,9 +187,9 @@ def get_tweet_info(status: Dict) -> Dict: place_country_code = status["place"].get("country_code") # Possible place_type values: country, admin, city, neighborhood, poi place_type = status["place"].get("place_type") - try: + if "extended_tweet" in status: media_urls = get_media_urls(status["extended_tweet"]) - except KeyError: + else: media_urls = get_media_urls(status) tweet_info = TweetInfo(