Merge pull request #5 from humandecoded/async

Further Improvements to Async

Of course, it's the awesome @humandecoded who got this done, and Twayback is now a Super Saiyan!
humandecoded · Feb 18, 2022 · b460229 · b460229
2 parents e2edc53 + 315dffc
commit b460229
Showing 1 changed file with 22 additions and 25 deletions.
diff --git a/twayback B/twayback.py b/twayback B/twayback.py
@@ -5,6 +5,9 @@
 from pathlib import Path
 import simplejson as json
 from tqdm import tqdm as tqdm
+# this import needs a different name because tqdm is already taken above
+# used for progress bar on our async operations
+from tqdm.asyncio import tqdm as asyncProgress
 import colorama
 from colorama import  Fore, Back, Style
 colorama.init(autoreset=True)
@@ -15,23 +18,30 @@
 from requests import Session
 session = Session()
 # for async
-from aiohttp import ClientSession
+from aiohttp import ClientSession, TCPConnector
 import asyncio
 
 # checks the status of a given url
-async def checkStatus(url, session: ClientSession):
-
-    req = await session.request(method="GET", url=url)
-    return req.real_url, req.status
+async def checkStatus(url, session: ClientSession, sem: asyncio.Semaphore):
+
+    async with sem:
+        async with session.get(url) as response:
+            return response.real_url, response.status
+
 
 # controls our async event loop
 async def asyncStarter(url_list):
     # this will wrap our event loop and feed the various urls to their async request function.
     status_list = []
     headers = {'user-agent':'Mozilla/5.0 (compatible; DuckDuckBot-Https/1.1; https://duckduckgo.com/duckduckbot)'}
-    session = ClientSession(headers=headers)
-    status_list = await asyncio.gather(*(checkStatus(u, session) for u in url_list))
-    await session.close()
+
+    # using a with statement seems to be working out better
+    async with ClientSession(headers=headers) as session:
+        # limit to 50 concurrent jobs
+        sem = asyncio.Semaphore(50)
+        # launch all the url checks concurrently as coroutines 
+        status_list = await asyncProgress.gather(*(checkStatus(u, session, sem) for u in url_list))
+    # return a list of the results    
     return status_list
 if platform.system() == 'Windows':
     asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
@@ -121,24 +131,11 @@ async def asyncStarter(url_list):
 headers = {'user-agent':'Mozilla/5.0 (compatible; DuckDuckBot-Https/1.1; https://duckduckgo.com/duckduckbot)'}
 
 ###############################################################################
-# this block will take 15 entries of our url list
 # check them asynchronously and add the results to a list
-# this is broken in to chunks so we don't get blocked by twitter for too many requests
-temp_list =  []
-for url in tqdm(twitter_url):
-    temp_list.append(url)
-    # after our list has 15 elements we go for it
-    if len(temp_list) == 15:
-        # get a list containing our ten urls and their statuses
-         statuses = asyncio.run(asyncStarter(temp_list))
-         time.sleep(.05)
-         # add to our master list
-         results = results + statuses
-         # reset our temp list
-         temp_list = []
-# catch the last few elements of our list
-statuses = asyncio.run(asyncStarter(temp_list))
-results = results + statuses
+
+
+results = asyncio.run(asyncStarter(twitter_url))
+
 #####################################################################################################
 
 for url, status_code in results: