From cc027974817264005ab74da6cdaac1f182164b67 Mon Sep 17 00:00:00 2001 From: Felix Date: Sat, 24 Jul 2021 22:00:37 +0200 Subject: [PATCH 1/3] Allow downloading tweets by hashtag or cashtag --- nitter_scraper/nitter.py | 5 +++-- nitter_scraper/tweets.py | 21 +++++++++++++++------ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/nitter_scraper/nitter.py b/nitter_scraper/nitter.py index de62f14..fcf1d6a 100644 --- a/nitter_scraper/nitter.py +++ b/nitter_scraper/nitter.py @@ -113,7 +113,7 @@ def get_profile(self, username: str, not_found_ok: bool = False): """ return get_profile(username=username, not_found_ok=not_found_ok, address=self.address) - def get_tweets(self, username: str, pages: int = 25, break_on_tweet_id: Optional[int] = None): + def get_tweets(self, query_string: str, query_type: str, pages: int = 25, break_on_tweet_id: Optional[int] = None): """Gets the target users tweets This is a modified version of nitter_scraper.tweets.get_tweets(). @@ -133,7 +133,8 @@ def get_tweets(self, username: str, pages: int = 25, break_on_tweet_id: Optional """ return get_tweets( - username=username, + query_string=query_string, + query_type=query_type, pages=pages, break_on_tweet_id=break_on_tweet_id, address=self.address, diff --git a/nitter_scraper/tweets.py b/nitter_scraper/tweets.py index 1e02335..213565b 100644 --- a/nitter_scraper/tweets.py +++ b/nitter_scraper/tweets.py @@ -123,13 +123,14 @@ def timeline_parser(html): return html.find(".timeline", first=True) -def pagination_parser(timeline, address, username) -> str: +def pagination_parser(timeline, url) -> str: next_page = list(timeline.find(".show-more")[-1].links)[0] - return f"{address}/{username}{next_page}" + return f"{url}{next_page}" def get_tweets( - username: str, + query_string: str, + query_type: str = 'user', pages: int = 25, break_on_tweet_id: Optional[int] = None, address="https://nitter.net", @@ -137,7 +138,8 @@ def get_tweets( """Gets the target users tweets Args: - username: Targeted 
users username. + query_string: Targeted username, hashtag or cashtag. + query_type: Type of former paremeter. Either one of 'user', 'hashtag' or 'cashtag'. pages: Max number of pages to lookback starting from the latest tweet. break_on_tweet_id: Gives the ability to break out of a loop if a tweets id is found. address: The address to scrape from. The default is https://nitter.net which should @@ -147,7 +149,14 @@ def get_tweets( Tweet Objects """ - url = f"{address}/{username}" + if query_type == 'user': + url = f"{address}/{query_string}" + elif query_type == 'hashtag': + url = f"{address}/search?q=%23{query_string}" + elif query_type == 'cashtag': + url = f"{address}/search?q=${query_string}" + else: + raise ValueError(f"Unknown query_type '{query_type}'") session = HTMLSession() def gen_tweets(pages): @@ -157,7 +166,7 @@ def gen_tweets(pages): if response.status_code == 200: timeline = timeline_parser(response.html) - next_url = pagination_parser(timeline, address, username) + next_url = pagination_parser(timeline, url) timeline_items = timeline.find(".timeline-item") From e1811f0a62062f6af338ff171a61ffbc592e5116 Mon Sep 17 00:00:00 2001 From: Felix Date: Sun, 25 Jul 2021 19:15:53 +0200 Subject: [PATCH 2/3] Make query type implicit --- nitter_scraper/nitter.py | 5 ++--- nitter_scraper/tweets.py | 17 +++++++---------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/nitter_scraper/nitter.py b/nitter_scraper/nitter.py index fcf1d6a..54cc19a 100644 --- a/nitter_scraper/nitter.py +++ b/nitter_scraper/nitter.py @@ -113,7 +113,7 @@ def get_profile(self, username: str, not_found_ok: bool = False): """ return get_profile(username=username, not_found_ok=not_found_ok, address=self.address) - def get_tweets(self, query_string: str, query_type: str, pages: int = 25, break_on_tweet_id: Optional[int] = None): + def get_tweets(self, query_string: str, pages: int = 25, break_on_tweet_id: Optional[int] = None): """Gets the target users tweets This is a modified 
version of nitter_scraper.tweets.get_tweets(). @@ -121,7 +121,7 @@ def get_tweets(self, query_string: str, query_type: str, pages: int = 25, break_ address to scrape profile data. Args: - username: Targeted users username. + query_string: Hashtag if it starts with '#', cashtag if it starts with '$', username otherwise. pages: Max number of pages to lookback starting from the latest tweet. break_on_tweet_id: Gives the ability to break out of a loop if a tweets id is found. address: The address to scrape from. The default is https://nitter.net which should @@ -134,7 +134,6 @@ def get_tweets(self, query_string: str, query_type: str, pages: int = 25, break_ return get_tweets( query_string=query_string, - query_type=query_type, pages=pages, break_on_tweet_id=break_on_tweet_id, address=self.address, diff --git a/nitter_scraper/tweets.py b/nitter_scraper/tweets.py index 213565b..23efe3c 100644 --- a/nitter_scraper/tweets.py +++ b/nitter_scraper/tweets.py @@ -130,7 +130,6 @@ def pagination_parser(timeline, url) -> str: def get_tweets( query_string: str, - query_type: str = 'user', pages: int = 25, break_on_tweet_id: Optional[int] = None, address="https://nitter.net", @@ -138,8 +137,7 @@ def get_tweets( """Gets the target users tweets Args: - query_string: Targeted username, hashtag or cashtag. - query_type: Type of former paremeter. Either one of 'user', 'hashtag' or 'cashtag'. + query_string: Hashtag if it starts with '#', cashtag if it starts with '$', username otherwise. pages: Max number of pages to lookback starting from the latest tweet. break_on_tweet_id: Gives the ability to break out of a loop if a tweets id is found. address: The address to scrape from. 
The default is https://nitter.net which should @@ -149,16 +147,15 @@ def get_tweets( Tweet Objects """ - if query_type == 'user': - url = f"{address}/{query_string}" - elif query_type == 'hashtag': - url = f"{address}/search?q=%23{query_string}" - elif query_type == 'cashtag': - url = f"{address}/search?q=${query_string}" + if query_string.startswith('#'): + url = f"{address}/search?q=%23{query_string[1:]}" + elif query_string.startswith('$'): + url = f"{address}/search?q={query_string}" else: - raise ValueError(f"Unknown query_type '{query_type}'") + url = f"{address}/{query_string}" session = HTMLSession() + def gen_tweets(pages): response = session.get(url) From 73ca7b2a8551ed7e422fa80c1863235eb6797ff5 Mon Sep 17 00:00:00 2001 From: Felix Date: Sun, 25 Jul 2021 19:32:17 +0200 Subject: [PATCH 3/3] Add docs --- docs/content/examples.md | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/docs/content/examples.md b/docs/content/examples.md index 046c786..c894920 100644 --- a/docs/content/examples.md +++ b/docs/content/examples.md @@ -59,6 +59,34 @@ for user in users: ``` +### How to scrape tweets related to hashtag or cashtag. +```python +from pprint import pprint + +import nitter_scraper +from nitter_scraper import NitterScraper + +queries = ["#ToTheMoon", "$USDT"] + +print("Scraping with local nitter docker instance.") + +with NitterScraper(host="0.0.0.0", port=8008) as nitter: + for query in queries: + for tweet in nitter.get_tweets(query, pages=2): + print() + pprint(tweet.dict()) + print(tweet.json(indent=4)) + +print("Scraping from https://www.nitter.net.") + +for query in queries: + for tweet in nitter_scraper.get_tweets(query, pages=2): + print() + pprint(tweet.dict()) + print(tweet.json(indent=4)) + +``` + ### How to poll a users profile for the latest tweet. ```python import time