Skip to content

Commit

Permalink
Merge pull request #3 from Kinuseka/experimental
Browse files Browse the repository at this point in the history
Adding a CF bypass.
  • Loading branch information
Kinuseka authored Jul 14, 2022
2 parents a3928dc + c875933 commit be9a4ab
Show file tree
Hide file tree
Showing 8 changed files with 562 additions and 14 deletions.
194 changes: 194 additions & 0 deletions Lib/NHentai_cf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
from logging import exception
from bs4 import BeautifulSoup
import re
import json
import yaml
import requests
import pickle

#I recommend reading into the source code of the nhentai website to get a better understanding of what my code really does

site_domain = "net"
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.103 Safari/537.36"}
#Optional
def CheckLink(data, digit=False):
    '''For MODDERS:
    This part is where you modify your OWN link checker to your own target site to scrape.
    Most of the time there wont be any major edits other than the website you want to check.

    Arguments:
        data  -- either a numeric gallery id (digit=True) or a URL string.
        digit -- when True, build and return the gallery URL for the id.
    Returns:
        str when digit=True, otherwise a (status, message) tuple where
        status 0 means the link is a valid nhentai gallery URL and 2 means it is not.
    '''
    if digit:
        # Numeric gallery id: build the direct gallery URL.
        return f"https://nhentai.{site_domain}/g/{data}"
    # Raw string so "\d" is a real regex escape; dots escaped so the pattern
    # matches a literal "nhentai.net" instead of any character.
    if re.search(rf"https?://nhentai\.{site_domain}/g/(\d+|/)", data.lower()):
        return (0, data)
    return (2, "Link is not nHentai")

#Main API
class Api:
    """Scrape metadata and image links for a single NHentai gallery.

    The gallery page is fetched through a requests.Session primed with the
    cookies and user-agent captured by the HSite.SiteCFBypass selenium helper,
    so that requests pass the Cloudflare challenge.
    """
    # Use INIT to initialize the needed data, for increased and faster loading times to other functions
    def __init__(self, data):
        '''
        argument 'data' should be a valid link to the target booklet (gallery)
        '''

        self.name = "NHentai" #Directory label
        #NHENTAI SITE FORTUNATELY HAS A DEDICATED JSON EMBEDDED INTO A SCRIPT FILE THAT YOU CAN USE TO GAIN INFORMATION FROM THE SITE.
        #DIFFERENT SITES MIGHT NOT HAVE A JSON FILE SO YOU WILL HAVE TO DO THE PROCESS MANUALLY

        #Initialize path
        # Paths to the pickled cookie jar / user-agent written by the bypass helper.
        self.cookie_path = HSite.cookie_path
        self.session_path = HSite.session_path
        self._data_level = data
        #Check token: run the selenium bypass if no usable cookies exist yet.
        self.__reload_cf_token()

        #Load Cookies
        s = requests.Session()
        self.__set_cookies(s)

        # Fetch the gallery page with one retry: a 503 is treated as a stale
        # Cloudflare clearance, so the token is refreshed before the 2nd try.
        caught_exception = None
        for t in range(0, 2):
            try:
                content = s.get(data)
                content.raise_for_status()
                break
            except requests.exceptions.HTTPError as e:
                http_code = e.response.status_code
                caught_exception = e
                if http_code == 404:
                    raise cferror.NotFound()
                elif http_code == 503:
                    #CF blocked us, update the token
                    #Recheck token
                    self.__reload_cf_token(reset=True)
                    self.__set_cookies(s)
            except requests.exceptions.ConnectionError as e:
                caught_exception = e
                caught_message = "There has been issues with trying to connect"
                # NOTE(review): caught_message is never passed along, and
                # NetworkError is declared with (code, content) parameters --
                # verify this bare call does not itself raise TypeError.
                raise cferror.NetworkError()

        else:
            # for/else: reached only when both attempts failed without break
            # (i.e. caught_exception is an HTTPError from the last attempt).
            caught_code = caught_exception.response.status_code
            caught_content = caught_exception.response.content
            caught_message = "There has been issues communicating with the server"
            if caught_code == 503:
                raise cferror.CloudflareBlocked(caught_code, caught_content)
            raise cferror.HTTPError(caught_code, caught_content, caught_message)

        page = content.content
        self.soup = BeautifulSoup(page, "html.parser")
        # Strip the JS wrapper around the embedded gallery JSON.
        # assumes the 3rd <script> tag holds "window._gallery = JSON.parse(...)"
        # -- TODO confirm this index is stable across page layouts.
        script = (self.soup.find_all("script")[2].contents[0]).strip().replace("window._gallery = JSON.parse(", "").replace(");", "")
        #IF THERE IS NO ERROR THEN PROCEED
        # Double-decode: the outer loads yields the JSON string literal that
        # JSON.parse received; the inner loads turns that into a dict.
        self.json = json.loads(json.loads(script))

        self.__preloader_pages()

    def Pages(self):
        "Total available pages count"
        Page = len(self.json["images"]["pages"])
        return Page

    def Tags(self):
        """For MODDERS:
        For better readability for humans or other programs, I recommend you use Json to serialize your data.
        """
        Tag = self.json["tags"]
        return Tag

    def Title(self):
        # English title as embedded in the gallery JSON.
        title = self.json["title"]["english"]
        return title

    def Direct_link(self, value):
        """For MODDERS:
        This function is only used to RETURN a valid direct link to the targeted image.
        The variable 'value' is the episode/page of the certain image to return.
        """
        # "t" is the per-page image-type code from the gallery JSON
        # (j=jpg, p=png, g=gif).
        data = self.preloaded_data[value-1]
        file = data["t"]
        if file == "j":
            extension = "jpg"
        elif file == "p":
            extension = "png"
        elif file == "g":
            extension = "gif"
        else:
            print("WARNING AT PAGE: %s\nUNIDENTIFIED FORMAT DETECTED REPORT THIS BUG\nautoset: jpg" % value)
            extension = "jpg"
        media_id = self.json["media_id"]
        url = "https://i.nhentai.net/galleries/%s/%s.%s" % (media_id, value, extension)
        #url = "https://t.dogehls.xyz/galleries/%s/%s.%s" % (media_id, value, extension)
        return url

    def __preloader_pages(self):
        # Normalize the per-page metadata into a list indexed by page-1.
        dict_data = self.json["images"]["pages"]
        data = []
        try:
            for v in range(self.Pages()):
                # NOTE(review): keys start at "2" (v+2) -- this looks like an
                # off-by-one (page keys usually start at 1); confirm against
                # the dict-shaped payloads this branch is meant to handle.
                data.append(dict_data[f"{v+2}"])
        except TypeError as e:
            # "pages" is already a list (the usual nhentai.net shape): use as-is.
            data = dict_data
        self.preloaded_data = data

    def __reload_cf_token(self, reset=False):
        # Run the selenium-based Cloudflare bypass when no cookie file exists,
        # or unconditionally when reset=True (stale cookies are deleted first).
        cookieStatus = HSite.SiteCFBypass.cookie_available()
        if not cookieStatus[0] or reset:
            NHS = HSite.SiteCFBypass(self._data_level)
            if reset:
                HSite.SiteCFBypass.delete_cookies()
            NHS.start()
            NHS.join()

    def __set_cookies(self, session):
        # Load the pickled selenium cookies and user-agent into the session.
        # NOTE(review): the two open() handles are never closed explicitly.
        cookies = pickle.load(open(self.cookie_path, "rb"))
        selenium_headers = pickle.load(open(self.session_path, "rb"))
        session.headers.update({"user-agent": selenium_headers})
        for cookie in cookies:
            session.cookies.set(cookie['name'], cookie['value'], domain=cookie['domain'])

class Iterdata:
    """File iterator used to automatically detect gallery codes inside a text file.

    Numbers may be separated by commas and/or whitespace. Runs longer than six
    digits are split into 6-character chunks (gallery ids are at most 6 digits).
    Use as a context manager; iterating yields each code as a string.
    """
    def __init__(self, data):
        self.available = True  # Indicates the feature is available; False if none.
        self.data = data       # Path to the input text file.
        self._index = -1       # Cursor for __next__; -1 means "before first item".
        self.temptxt = []      # Extracted codes, filled by __enter__.

    def __iter__(self):
        return self

    def __enter__(self):
        self.txt_line = open(self.data, "r")
        # Reset state so re-entering does not accumulate codes from a prior pass.
        self.temptxt = []
        self._index = -1
        for rawline in self.txt_line:
            for token in rawline.replace(",", " ").split():
                if not token.isdigit():
                    continue
                if len(token) > 6:
                    # Bug fix: previously the oversized token itself was ALSO
                    # appended after its 6-char chunks, injecting an invalid
                    # code; now only the chunks are kept.
                    self.temptxt.extend(re.findall(r".{1,6}", token))
                else:
                    self.temptxt.append(token)
        return self

    def __next__(self):
        self._index += 1
        if self._index >= len(self.temptxt):
            raise StopIteration
        return self.temptxt[self._index]

    def __reversed__(self):
        return self.temptxt[::-1]

    def __exit__(self, tp, v, tb):
        self.txt_line.close()


if __name__ == "__main__":
    # NOTE(review): explicit relative imports ("..essentials") fail when a file
    # is executed directly as a script (there is no parent package), so this
    # branch looks inverted relative to the 'else' branch -- confirm the
    # intended run mode for this module.
    from ..essentials import HSite
    from ..essentials.Errors import exception as cferror
else:
    # Imported as a module from the project root: absolute package imports.
    from essentials import HSite
    from essentials.Errors import exception as cferror
2 changes: 1 addition & 1 deletion Process.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,4 +296,4 @@ def progress(self):
VolatileData = VData()

if __name__ == "__main__":
Download(input(),1,"Downloads")
pass
30 changes: 26 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,35 @@ A python script that collects data from NHentai.net.
>Anyio (Trio Backend)
>Trio
>pyyaml
>undetected-chromedriver
```
**Repositories:**


>[Undetected Chromedriver](https://github.com/ultrafunkamsterdam/undetected-chromedriver)

**Supported Sites at the moment**
```
• NHentai [Mirror sites: .to])
• NHentai.net[Mirror sites: Nhentai.to])
```

**Note:**
Mirror download is enabled by default in case the official site is not available. If you prefer
to disable this, set "mirror_available" to `false` in config.json
## Run:
`python Start_download.py [args -n/--nuke-code]`
**ex:**
`python Start_download.py -n 401084`




## Note:
> Bypassing cloudflare requires a modified selenium which does not support headless mode. If you are not on a
desktop environment, you are better off setting `cf_bypass` to `false` in config.json.

> Mirror server is enabled by default in case the official site is not available. If you prefer
to disable this, set "mirror_available" to `false` in config.json

> The Mirror server is slightly outdated compared to the official site. Some titles might not be available, especially newer releases like 40000+,
though they are continually updated as time goes on.


28 changes: 20 additions & 8 deletions Start_download.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#Standard Library
from cmath import inf
import sys
import os
import time
Expand All @@ -24,11 +23,17 @@
#Custom library
import Process
from essentials import updater as Updater
from essentials.Errors import exception as cferror

#EDIT THE MAXIMUM AMOUNT OF DOWNLOAD PROCESS HAPPENING AT THE SAME TIME
#LOWER VALUE: SLOWER, MORE STABLE (BEST IN SLOW NETWORK CONDITIONS)
#HIGHER VALUE: FASTER, LESS STABLE (BEST IN FAST NETWORK CONDITIONS

methods = [
"cfbypass",
"mirror"
]

class SortData:
AcquiredTags = None
AcquiredPage = None
Expand Down Expand Up @@ -56,7 +61,7 @@ def main(args):
Dir_name = Api.name

logger.info("Found: %s pages" % AcquiredPage)
logger.info('Title name: "%s"' % TitleName)
logger.info('Title name: %s' % TitleName)
logger.info("Acquiring direct links")
AcquiredLinks = Api.Link_Page(AcquiredPage)
logger.info("Total of: %s links is loaded" % len(AcquiredLinks))
Expand Down Expand Up @@ -229,8 +234,8 @@ def getSystemInfo(logtype):
# Create handlers
File = FileName()
c_handler = logging.StreamHandler(sys.stdout)
o_handler = logging.FileHandler(File)
f_handler = logging.FileHandler(File)
o_handler = logging.FileHandler(File,mode="a",encoding='utf-8')
f_handler = logging.FileHandler(File,mode="a",encoding='utf-8')
c_handler.setLevel(logging.INFO)
o_handler.setLevel(logging.INFO)
f_handler.setLevel(logging.INFO)
Expand All @@ -250,6 +255,7 @@ def getSystemInfo(logtype):
max_process_open = sconfig(1)
API_DATA_CONFIG = sconfig(2)
API_MIRROR_ACCOMPLISHED = False
API_CF_ACCOMPLISHED = False
EMERGENCY = 255
verbose = False
info = '''
Expand Down Expand Up @@ -295,16 +301,18 @@ def callers():
logger.error("This method is not available for the current module")
else:
main(args.nukecode)
except urllib.error.HTTPError as e:
except (urllib.error.HTTPError, cferror.NotFound, cferror.NetworkError) as e:
#ONLY OCCURS WHEN THERE IS NO RESULTS
if e.code == 404:
logger.error("The content you are looking for is not found")
else:
logger.error("HTTP Error Code: %s" % e.code)
if API_DATA_CONFIG["mirror_available"] and not API_MIRROR_ACCOMPLISHED:
if API_DATA_CONFIG["cf_bypass"] and not API_CF_ACCOMPLISHED:
return 102
elif API_DATA_CONFIG["mirror_available"] and not API_MIRROR_ACCOMPLISHED:
return 101
sys.exit(1)
except urllib.error.URLError as error:
except (urllib.error.URLError, cferror.NetworkError) as error:
logger.error("A connection error has occured")
loggon.exception("Exception catched: %s" % sys.exc_info()[0])
sys.exit(1)
Expand Down Expand Up @@ -334,7 +342,11 @@ def callers():
sys.exit()
while True:
exit_code = callers()
if exit_code == 101:
if exit_code == 102 and not API_CF_ACCOMPLISHED:
logger.info("Trying to bypass CF")
API_DATA_CONFIG["module_name"] = f'{API_DATA_CONFIG["module_name"]}_cf'
API_CF_ACCOMPLISHED = True
elif exit_code == 101 and not API_MIRROR_ACCOMPLISHED:
logger.info("Mirror server enabled, trying mirror server.")
API_DATA_CONFIG["module_name"] = f'{API_DATA_CONFIG["module_name"]}_mirror'
API_MIRROR_ACCOMPLISHED = True
Expand Down
1 change: 1 addition & 0 deletions config.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"semaphore":10,
"Api": {
"module_name": "NHentai",
"cf_bypass": true,
"mirror_available": true
}
}
Expand Down
55 changes: 55 additions & 0 deletions essentials/Errors/exception.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@

class NotFound(Exception):
    """Raised when the requested gallery does not exist (HTTP 404)."""

    def __init__(self, *args, **kwargs):
        # Mirror the HTTP status on the instance so handlers can inspect it.
        self.code = 404
        # Fall back to the default message when raised without arguments.
        message_args = args if args else ('Not Found',)
        super().__init__(*message_args, **kwargs)

class NetworkError(Exception):
    """Raised on connection-level failures (DNS failure, refused connection, etc.).

    Bug fix: 'code' and 'content' now default to None so the exception can be
    raised bare -- callers (e.g. the Api retry loop in NHentai_cf) invoke
    NetworkError() with no arguments, which previously raised TypeError
    because both parameters were required.

    Attributes:
        code    -- HTTP/transport status if known, else None.
        content -- raw response body if available, else None.
    """
    def __init__(self, code=None, content=None, *args, **kwargs):
        default_message = 'There has been a connection error'
        self.code = code
        self.content = content

        if args:
            # ... pass them to the super constructor
            super().__init__(*args, **kwargs)
        else:  # else, the exception was raised without a message ...
            # ... pass the default message to the super constructor
            super().__init__(default_message, **kwargs)

class HTTPError(Exception):
    """Raised for unexpected HTTP status codes from the target site."""

    def __init__(self, code, content, *args, **kwargs):
        # Expose the status and raw body so callers can log/inspect them.
        self.code = code
        self.content = content
        # Use the caller-supplied message when present, else the default.
        message_args = args if args else ('There has been an HTTP error',)
        super().__init__(*message_args, **kwargs)

class CloudflareBlocked(Exception):
    """Raised when Cloudflare keeps blocking requests after a token refresh."""

    def __init__(self, code, content, *args, **kwargs):
        # Expose the status and raw body so callers can log/inspect them.
        self.code = code
        self.content = content
        # Use the caller-supplied message when present, else the default.
        message_args = args if args else ('CF has persistently blocked us, please report this issue.',)
        super().__init__(*message_args, **kwargs)

Loading

0 comments on commit be9a4ab

Please sign in to comment.