Merge pull request #40 from openzim/first_fixes

First fixes for proper operation and "bare minimal" quality
openzim · Oct 25, 2024 · 5e3b2e5 · 5e3b2e5
2 parents fe46316 + fb31a56
commit 5e3b2e5
Show file tree

Hide file tree

Showing 5 changed files with 49 additions and 12 deletions.
diff --git a/.gitignore b/.gitignore
@@ -102,3 +102,9 @@ venv.bak/
 
 # mypy
 .mypy_cache/
+
+# Local files
+chefdata
+restore
+storage
+.ricecookerfilecache
diff --git a/README.md b/README.md
@@ -43,3 +43,5 @@ into a format that can be imported into Kolibri Studio.
      
 ## MathJax
 MathJax files must be in a upper level folder i.e ../ or will raise an error. 
+
+Version 2.7.5 should be used as of Sept. 2024 and must therefore be placed in ../MathJax-2.7.5
diff --git a/requirements.txt b/requirements.txt
@@ -4,3 +4,4 @@ markdown2
 GitPython
 numpy
 xxhash
+yt-dlp
diff --git a/sushichef.py b/sushichef.py
@@ -14,7 +14,7 @@
 from collections import OrderedDict
 
 import xxhash
-import youtube_dl
+import yt_dlp
 import requests
 from bs4 import BeautifulSoup
 from le_utils.constants import licenses, content_kinds, file_formats
@@ -30,7 +30,7 @@
 from ricecooker.utils.jsontrees import write_tree_to_json_tree, SUBTITLES_FILE
 from ricecooker.utils.zip import create_predictable_zip
 
-from utils import get_name_from_url, build_path
+from utils import remove_src_set, get_name_from_url, build_path
 from utils import file_exists, remove_links
 from utils import remove_iframes
 from utils import link_to_text, remove_scripts
@@ -180,6 +180,14 @@ def run(self):
         section = soup.find("section", class_="mt-content-container")
         section_div = section.find("div", class_="noindex")
         for tag_a in section_div.find_all("a"):
+            if not (tag_a.text or tag_a.attrs.get("title")):
+                # Ignore links without text / title, they correspond to the
+                # image and have already been found with the title
+                continue
+            if "mt-listing-detailed-subpage-title" in tag_a.get("class", []):
+                # Ignore subpages found on college, these subpages will be
+                # fetched later when exploring the course
+                continue
             yield tag_a
 
 
@@ -429,6 +437,10 @@ def save_thumbnail(url, title):
 
 class CourseIndex(object):
     def __init__(self, title, url, visited_urls=None):
+        if not title:
+            raise Exception("CourseIndex title cannot be None or empty")
+        if not url:
+            raise Exception("CourseIndex url cannot be None or empty")
         self.source_id = url
         self.title = title
         self.lang = "en"
@@ -438,8 +450,8 @@ def __init__(self, title, url, visited_urls=None):
         self.author()
         self._thumbnail = None
         self.visited_urls = visited_urls if visited_urls is not None else set([])
-        LOGGER.info("----- Course Index title: " + self.title)
-        LOGGER.info("-----    url: " + self.source_id)
+        LOGGER.info(f"----- Course Index title: {self.title}")
+        LOGGER.info(f"-----    url: {self.source_id}")
 
     def to_soup(self, loadjs=False):
         document = download(self.source_id, loadjs=loadjs)
@@ -695,6 +707,7 @@ def clean(self, content):
         remove_links(content)
         remove_iframes(content)
         remove_scripts(content)
+        remove_src_set(content)
         return content
 
     def to_soup(self):
@@ -774,7 +787,7 @@ def mathjax(self):
             return "".join([str(s) for s in scripts])
 
     def mathjax_dependences(self, filepath):
-        mathajax_path = "../MathJax/"
+        mathajax_path = "../MathJax-2.7.5/"
         dependences = [
             "config/TeX-AMS_HTML.js",
             "jax/input/TeX/config.js",
@@ -1162,17 +1175,17 @@ def get_video_info(self, download_to=None, subtitles=True):
             "noplaylist": True,
         }
 
-        with youtube_dl.YoutubeDL(ydl_options) as ydl:
+        with yt_dlp.YoutubeDL(ydl_options) as ydl:
             try:
                 ydl.add_default_info_extractors()
                 info = ydl.extract_info(
                     self.source_id, download=(download_to is not None)
                 )
                 return info
             except (
-                youtube_dl.utils.DownloadError,
-                youtube_dl.utils.ContentTooShortError,
-                youtube_dl.utils.ExtractorError,
+                yt_dlp.utils.DownloadError,
+                yt_dlp.utils.ContentTooShortError,
+                yt_dlp.utils.ExtractorError,
             ) as e:
                 LOGGER.info("An error occured " + str(e))
                 LOGGER.info(self.source_id)
@@ -1225,9 +1238,9 @@ def download(self, download=True, base_path=None):
                 LOGGER.info("Download retry")
                 time.sleep(0.8)
             except (
-                youtube_dl.utils.DownloadError,
-                youtube_dl.utils.ContentTooShortError,
-                youtube_dl.utils.ExtractorError,
+                yt_dlp.utils.DownloadError,
+                yt_dlp.utils.ContentTooShortError,
+                yt_dlp.utils.ExtractorError,
                 OSError,
             ) as e:
                 LOGGER.info(

diff --git a/utils.py b/utils.py
@@ -3,7 +3,9 @@
 import os
 from pathlib import Path
 from bs4 import Tag
+import re
 
+REFERENCE_REGEX = re.compile(r".*#\d+$")  # raw string: URLs ending in "#<digits>" (reference links)
 
 def dir_exists(filepath):
     file_ = Path(filepath)
@@ -143,6 +145,19 @@ def link_to_text(content):
                 url = tag["href"]
                 if url.endswith(".pdf"):
                     pass
+                elif REFERENCE_REGEX.match(url):
+                    # we just remove links for references which are already in
+                    # document, even if the reference is in another course, see
+                    # https://github.com/openzim/librechef/issues/36
+                    pass
                 elif url.startswith("http") or url.startswith("/"):
                     tag.wrap(span)
                     span.insert(1, " (" + url + ")")
+
+
+def remove_src_set(content):
+    # Drop the `srcset` attribute from every <img> found under `content`.
+    if content is not None:
+        for image in content.find_all("img"):
+            # pop() with a default is a no-op when the attribute is absent
+            image.attrs.pop("srcset", None)