Skip to content

Commit

Permalink
Merge pull request #5 from humandecoded/async
Browse files Browse the repository at this point in the history
Further Improvements to Async

Of course, it's the awesome @humandecoded who got this done, and Twayback is now a Super Saiyan!
  • Loading branch information
Mennaruuk authored Feb 18, 2022
2 parents e2edc53 + 315dffc commit b460229
Showing 1 changed file with 22 additions and 25 deletions.
47 changes: 22 additions & 25 deletions twayback B/twayback.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
from pathlib import Path
import simplejson as json
from tqdm import tqdm as tqdm
# this import needs a different name because the name tqdm is already taken above
# it provides the progress bar for our async operations
from tqdm.asyncio import tqdm as asyncProgress
import colorama
from colorama import Fore, Back, Style
colorama.init(autoreset=True)
Expand All @@ -15,23 +18,30 @@
from requests import Session
session = Session()
# for async
from aiohttp import ClientSession
from aiohttp import ClientSession, TCPConnector
import asyncio

# checks the status of a given url
# Request `url` with a GET through the supplied aiohttp ClientSession and
# return the pair (real_url, status) read from the response object.
# NOTE(review): this text is a rendered diff with indentation stripped; this
# looks like the pre-change version of checkStatus, and a second definition
# taking a semaphore follows it -- confirm against the repository.
async def checkStatus(url, session: ClientSession):

# send the request and wait for the response
req = await session.request(method="GET", url=url)
# both values are read straight off the response
return req.real_url, req.status
# Fetch `url` with session.get while holding `sem`, and return the pair
# (real_url, status) read from the response.
#   url     -- the address to request
#   session -- aiohttp ClientSession used to send the request
#   sem     -- asyncio.Semaphore acquired for the duration of the request
async def checkStatus(url, session: ClientSession, sem: asyncio.Semaphore):

# wait for a free slot on the semaphore before sending the request
async with sem:
# the response is only used inside this `async with` block
async with session.get(url) as response:
return response.real_url, response.status


# controls our async event loop
# Run checkStatus for every URL in url_list concurrently and return the
# list of results.
# NOTE(review): this text is a rendered diff with indentation stripped; the
# lines below appear to interleave the removed implementation (a manually
# closed ClientSession plus asyncio.gather) with the added one (`async with`
# plus a semaphore) -- confirm against the repository before treating this
# as runnable code.
async def asyncStarter(url_list):
# this wraps our event loop and feeds the various urls to their async request function.
status_list = []
# headers passed to the ClientSession constructor
headers = {'user-agent':'Mozilla/5.0 (compatible; DuckDuckBot-Https/1.1; https://duckduckgo.com/duckduckbot)'}
session = ClientSession(headers=headers)
status_list = await asyncio.gather(*(checkStatus(u, session) for u in url_list))
await session.close()

# using a with statement seems to be working out better
async with ClientSession(headers=headers) as session:
# limit to 50 concurrent jobs
sem = asyncio.Semaphore(50)
# launch all the url checks concurrently as coroutines
# asyncProgress is tqdm.asyncio's tqdm, imported for the progress bar on async operations
status_list = await asyncProgress.gather(*(checkStatus(u, session, sem) for u in url_list))
# return a list of the results
return status_list
# On Windows, switch asyncio to the selector event loop policy.
if platform.system() == 'Windows':
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
Expand Down Expand Up @@ -121,24 +131,11 @@ async def asyncStarter(url_list):
headers = {'user-agent':'Mozilla/5.0 (compatible; DuckDuckBot-Https/1.1; https://duckduckgo.com/duckduckbot)'}

###############################################################################
# NOTE(review): this text is a rendered diff with indentation stripped; the
# batch loop below appears to be the removed code and the single
# asyncio.run call at the end its replacement -- confirm against the
# repository.
# this block takes 15 entries of our url list at a time,
# checks them asynchronously and adds the results to a list
# this is broken into chunks so we don't get blocked by twitter for too many requests
temp_list = []
for url in tqdm(twitter_url):
temp_list.append(url)
# once our list has 15 elements we go for it
if len(temp_list) == 15:
# get a list containing this batch of urls and their statuses
statuses = asyncio.run(asyncStarter(temp_list))
# brief pause before the next batch
time.sleep(.05)
# add to our master list
results = results + statuses
# reset our temp list
temp_list = []
# catch the last few elements of our list
statuses = asyncio.run(asyncStarter(temp_list))
results = results + statuses


# check the whole url list in a single asyncStarter run
results = asyncio.run(asyncStarter(twitter_url))

#####################################################################################################

for url, status_code in results:
Expand Down

0 comments on commit b460229

Please sign in to comment.