Ignore subpages found in college pages to not scrape / include them twice
openzim · Sep 24, 2024 · 0de6805 · 0de6805
1 parent d581446
commit 0de6805
Showing 1 changed file with 6 additions and 2 deletions.
diff --git a/sushichef.py b/sushichef.py
@@ -181,8 +181,12 @@ def run(self):
         section_div = section.find("div", class_="noindex")
         for tag_a in section_div.find_all("a"):
             if not (tag_a.text or tag_a.attrs.get("title")):
-                # Ignore links without text / title, they correspond to the image
-                # and have already been found with the title
+                # Ignore links without text / title, they correspond to the
+                # image and have already been found with the title
+                continue
+            if "mt-listing-detailed-subpage-title" in tag_a.get("class", []):
+                # Ignore subpages found on college, these subpages will be
+                # fetched later when exploring the course
                 continue
             yield tag_a