Allow downloading tweets by hashtag or cashtag #3

Open · wants to merge 3 commits into master

docs/content/examples.md (28 additions, 0 deletions)

````diff
@@ -59,6 +59,34 @@ for user in users:
 
 ```
+
+### How to scrape tweets related to a hashtag or cashtag.
+```python
+from pprint import pprint
+
+import nitter_scraper
+from nitter_scraper import NitterScraper
+
+queries = ["#ToTheMoon", "$USDT"]
+
+print("Scraping with local nitter docker instance.")
+
+with NitterScraper(host="0.0.0.0", port=8008) as nitter:
+    for query in queries:
+        for tweet in nitter.get_tweets(query, pages=2):
+            print()
+            pprint(tweet.dict())
+            print(tweet.json(indent=4))
+
+print("Scraping from https://www.nitter.net.")
+
+for query in queries:
+    for tweet in nitter_scraper.get_tweets(query, pages=2):
+        print()
+        pprint(tweet.dict())
+        print(tweet.json(indent=4))
+
+```
 
 ### How to poll a users profile for the latest tweet.
 ```python
 import time
````

nitter_scraper/nitter.py (3 additions, 3 deletions)

```diff
@@ -113,15 +113,15 @@ def get_profile(self, username: str, not_found_ok: bool = False):
"""
return get_profile(username=username, not_found_ok=not_found_ok, address=self.address)

def get_tweets(self, username: str, pages: int = 25, break_on_tweet_id: Optional[int] = None):
def get_tweets(self, query_string: str, pages: int = 25, break_on_tweet_id: Optional[int] = None):
"""Gets the target users tweets

This is a modified version of nitter_scraper.tweets.get_tweets().
This version automatically uses the address of the docker container as the primary
address to scrape profile data.

Args:
username: Targeted users username.
query_string: Hashtag, if starts with #, cashtag if starts with $, username otherwise
pages: Max number of pages to lookback starting from the latest tweet.
break_on_tweet_id: Gives the ability to break out of a loop if a tweets id is found.
address: The address to scrape from. The default is https://nitter.net which should
Expand All @@ -133,7 +133,7 @@ def get_tweets(self, username: str, pages: int = 25, break_on_tweet_id: Optional
"""

return get_tweets(
username=username,
query_string=query_string,
pages=pages,
break_on_tweet_id=break_on_tweet_id,
address=self.address,
Expand Down
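
After the rename, the wrapper's first argument is a query string rather than a username only. A minimal usage sketch, under the same assumptions as the docs example (a local Nitter docker instance on port 8008; the returned `Tweet` objects support `.dict()` and `.json()` as shown there):

```python
from nitter_scraper import NitterScraper

# The first positional argument is now a query: "#..." searches a hashtag,
# "$..." searches a cashtag, and anything else is treated as a username.
with NitterScraper(host="0.0.0.0", port=8008) as nitter:
    for tweet in nitter.get_tweets("#ToTheMoon", pages=1):
        print(tweet.json(indent=4))
```

Note that callers passing the old keyword argument `username=` would break with this rename; positional callers are unaffected.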

nitter_scraper/tweets.py (12 additions, 6 deletions)

```diff
@@ -123,21 +123,21 @@ def timeline_parser(html):
     return html.find(".timeline", first=True)
 
 
-def pagination_parser(timeline, address, username) -> str:
+def pagination_parser(timeline, url) -> str:
     next_page = list(timeline.find(".show-more")[-1].links)[0]
-    return f"{address}/{username}{next_page}"
+    return f"{url}{next_page}"
 
 
 def get_tweets(
-    username: str,
+    query_string: str,
     pages: int = 25,
     break_on_tweet_id: Optional[int] = None,
     address="https://nitter.net",
 ) -> Tweet:
     """Gets the target users tweets
 
     Args:
-        username: Targeted users username.
+        query_string: A hashtag if it starts with #, a cashtag if it starts with $, otherwise a username.
         pages: Max number of pages to lookback starting from the latest tweet.
         break_on_tweet_id: Gives the ability to break out of a loop if a tweets id is found.
         address: The address to scrape from. The default is https://nitter.net which should
@@ -147,17 +147,23 @@
         Tweet Objects
 
     """
-    url = f"{address}/{username}"
+    if query_string.startswith('#'):
+        url = f"{address}/search?q=%23{query_string[1:]}"
+    elif query_string.startswith('$'):
+        url = f"{address}/search?q={query_string}"
+    else:
+        url = f"{address}/{query_string}"
     session = HTMLSession()
 
+
     def gen_tweets(pages):
         response = session.get(url)
 
         while pages > 0:
             if response.status_code == 200:
                 timeline = timeline_parser(response.html)
 
-                next_url = pagination_parser(timeline, address, username)
+                next_url = pagination_parser(timeline, url)
 
                 timeline_items = timeline.find(".timeline-item")
 
```
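
For reference, a standalone sketch of the URL routing added above; `build_url` is a hypothetical helper written for illustration, not part of the library:

```python
# Hypothetical helper mirroring the routing added to get_tweets().
def build_url(address: str, query_string: str) -> str:
    if query_string.startswith("#"):
        # A literal "#" would start a URL fragment, so it is percent-encoded as %23.
        return f"{address}/search?q=%23{query_string[1:]}"
    elif query_string.startswith("$"):
        return f"{address}/search?q={query_string}"
    return f"{address}/{query_string}"  # plain usernames keep the old profile URL


assert build_url("https://nitter.net", "#ToTheMoon") == "https://nitter.net/search?q=%23ToTheMoon"
assert build_url("https://nitter.net", "$USDT") == "https://nitter.net/search?q=$USDT"
assert build_url("https://nitter.net", "some_user") == "https://nitter.net/some_user"  # placeholder user
```

Because `pagination_parser` now concatenates the `.show-more` link onto whichever `url` was built, search results and profile timelines paginate through the same code path.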