Skip to content
This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

Commit

Permalink
Merge pull request #40 from openzim/first_fixes
Browse files Browse the repository at this point in the history
First fixes for proper operation and "bare minimal" quality
  • Loading branch information
benoit74 authored Oct 25, 2024
2 parents fe46316 + fb31a56 commit 5e3b2e5
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 12 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,9 @@ venv.bak/

# mypy
.mypy_cache/

# Local files
chefdata
restore
storage
.ricecookerfilecache
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,5 @@ into a format that can be imported into Kolibri Studio.
## MathJax
MathJax files must be in an upper-level folder (i.e. `../`); otherwise an error will be raised.

As of Sept. 2024, version 2.7.5 should be used, and hence placed in `../MathJax-2.7.5`.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ markdown2
GitPython
numpy
xxhash
yt-dlp
37 changes: 25 additions & 12 deletions sushichef.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from collections import OrderedDict

import xxhash
import youtube_dl
import yt_dlp
import requests
from bs4 import BeautifulSoup
from le_utils.constants import licenses, content_kinds, file_formats
Expand All @@ -30,7 +30,7 @@
from ricecooker.utils.jsontrees import write_tree_to_json_tree, SUBTITLES_FILE
from ricecooker.utils.zip import create_predictable_zip

from utils import get_name_from_url, build_path
from utils import remove_src_set, get_name_from_url, build_path
from utils import file_exists, remove_links
from utils import remove_iframes
from utils import link_to_text, remove_scripts
Expand Down Expand Up @@ -180,6 +180,14 @@ def run(self):
section = soup.find("section", class_="mt-content-container")
section_div = section.find("div", class_="noindex")
for tag_a in section_div.find_all("a"):
if not (tag_a.text or tag_a.attrs.get("title")):
# Ignore links without text / title, they correspond to the
# image and have already been found with the title
continue
if "mt-listing-detailed-subpage-title" in tag_a.get("class", []):
# Ignore subpages found on college, these subpages will be
# fetched later when exploring the course
continue
yield tag_a


Expand Down Expand Up @@ -429,6 +437,10 @@ def save_thumbnail(url, title):

class CourseIndex(object):
def __init__(self, title, url, visited_urls=None):
if not title:
raise Exception("CourseIndex title cannot be None or empty")
if not url:
raise Exception("CourseIndex url cannot be None or empty")
self.source_id = url
self.title = title
self.lang = "en"
Expand All @@ -438,8 +450,8 @@ def __init__(self, title, url, visited_urls=None):
self.author()
self._thumbnail = None
self.visited_urls = visited_urls if visited_urls is not None else set([])
LOGGER.info("----- Course Index title: " + self.title)
LOGGER.info("----- url: " + self.source_id)
LOGGER.info(f"----- Course Index title: {self.title}")
LOGGER.info(f"----- url: {self.source_id}")

def to_soup(self, loadjs=False):
document = download(self.source_id, loadjs=loadjs)
Expand Down Expand Up @@ -695,6 +707,7 @@ def clean(self, content):
remove_links(content)
remove_iframes(content)
remove_scripts(content)
remove_src_set(content)
return content

def to_soup(self):
Expand Down Expand Up @@ -774,7 +787,7 @@ def mathjax(self):
return "".join([str(s) for s in scripts])

def mathjax_dependences(self, filepath):
mathajax_path = "../MathJax/"
mathajax_path = "../MathJax-2.7.5/"
dependences = [
"config/TeX-AMS_HTML.js",
"jax/input/TeX/config.js",
Expand Down Expand Up @@ -1162,17 +1175,17 @@ def get_video_info(self, download_to=None, subtitles=True):
"noplaylist": True,
}

with youtube_dl.YoutubeDL(ydl_options) as ydl:
with yt_dlp.YoutubeDL(ydl_options) as ydl:
try:
ydl.add_default_info_extractors()
info = ydl.extract_info(
self.source_id, download=(download_to is not None)
)
return info
except (
youtube_dl.utils.DownloadError,
youtube_dl.utils.ContentTooShortError,
youtube_dl.utils.ExtractorError,
yt_dlp.utils.DownloadError,
yt_dlp.utils.ContentTooShortError,
yt_dlp.utils.ExtractorError,
) as e:
LOGGER.info("An error occured " + str(e))
LOGGER.info(self.source_id)
Expand Down Expand Up @@ -1225,9 +1238,9 @@ def download(self, download=True, base_path=None):
LOGGER.info("Download retry")
time.sleep(0.8)
except (
youtube_dl.utils.DownloadError,
youtube_dl.utils.ContentTooShortError,
youtube_dl.utils.ExtractorError,
yt_dlp.utils.DownloadError,
yt_dlp.utils.ContentTooShortError,
yt_dlp.utils.ExtractorError,
OSError,
) as e:
LOGGER.info(
Expand Down
15 changes: 15 additions & 0 deletions utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
import os
from pathlib import Path
from bs4 import Tag
import re

# Matches URLs whose fragment is purely numeric (e.g. "page#12"), i.e. links
# pointing at a numbered reference. Written as a raw string so that "\d" is
# not treated as an (invalid) string escape sequence.
REFERENCE_REGEX = re.compile(r".*#\d+$")

def dir_exists(filepath):
file_ = Path(filepath)
Expand Down Expand Up @@ -143,6 +145,19 @@ def link_to_text(content):
url = tag["href"]
if url.endswith(".pdf"):
pass
elif REFERENCE_REGEX.match(url):
# we just remove links for references which are already in
# document, even if the reference is in another course, see
# https://github.com/openzim/librechef/issues/36
pass
elif url.startswith("http") or url.startswith("/"):
tag.wrap(span)
span.insert(1, " (" + url + ")")


def remove_src_set(content):
    """Remove the ``srcset`` attribute from every ``<img>`` tag in *content*.

    Matching tags are modified in place and nothing is returned.

    Args:
        content: Object exposing ``find_all`` (presumably a BeautifulSoup
            tag/soup -- confirm against callers), or ``None``, in which
            case the function does nothing.
    """
    if content is None:
        return
    for img_tag in content.find_all("img"):
        # Only delete when present so tags without the attribute are untouched.
        if "srcset" in img_tag.attrs:
            del img_tag["srcset"]

0 comments on commit 5e3b2e5

Please sign in to comment.