
Commit

add sitemap reader (#160)
* update read website as documents function

* update webdocsreader to include sitemap reading

* update naming

* resolve issues

* update document reading function
SeeknnDestroy committed Nov 29, 2023
1 parent 7477af0 commit 161b783
Showing 4 changed files with 147 additions and 74 deletions.
39 changes: 31 additions & 8 deletions autollm/utils/document_reading.py
@@ -11,8 +11,8 @@
 from autollm.utils.logging import logger
 from autollm.utils.markdown_reader import MarkdownReader
 from autollm.utils.pdf_reader import LangchainPDFReader
-from autollm.utils.web_docs_reader import WebDocsReader
-from autollm.utils.web_page_reader import WebPageReader
+from autollm.utils.webpage_reader import WebPageReader
+from autollm.utils.website_reader import WebSiteReader
 
 
 def read_files_as_documents(
@@ -118,18 +118,41 @@ def read_github_repo_as_documents(
     return documents
 
 
-def read_website_as_documents(url: str) -> List[Document]:
+def read_website_as_documents(
+        parent_url: Optional[str] = None,
+        sitemap_url: Optional[str] = None,
+        include_filter_str: Optional[str] = None,
+        exclude_filter_str: Optional[str] = None) -> List[Document]:
     """
-    Read documents from a website with all its child pages using the WebDocsReader.
+    Read documents from a website or a sitemap.
 
     Parameters:
-        url (str): The starting URL from which to scrape documents.
+        parent_url (str, optional): The starting URL from which to scrape documents.
+        sitemap_url (str, optional): The URL of the sitemap to process.
+        include_filter_str (str, optional): Filter string to include certain URLs.
+        exclude_filter_str (str, optional): Filter string to exclude certain URLs.
 
     Returns:
-        List[Document]: A list of Document objects containing content and metadata from the web pages.
+        List[Document]: A list of Document objects containing content and metadata.
+
+    Raises:
+        ValueError: If neither parent_url nor sitemap_url is provided, or if both are provided.
     """
-    reader = WebDocsReader()
-    documents = reader.load_data(url)
+    if (parent_url is None and sitemap_url is None) or (
+            parent_url is not None and sitemap_url is not None):
+        raise ValueError("Please provide either parent_url or sitemap_url, not both or none.")
+
+    reader = WebSiteReader()
+    if parent_url:
+        documents = reader.load_data(
+            parent_url=parent_url,
+            include_filter_str=include_filter_str,
+            exclude_filter_str=exclude_filter_str)
+    else:
+        documents = reader.load_data(
+            sitemap_url=sitemap_url,
+            include_filter_str=include_filter_str,
+            exclude_filter_str=exclude_filter_str)
 
     return documents
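For illustration, a minimal usage sketch of the updated function (the URLs are hypothetical; autollm is assumed to be importable):

from autollm.utils.document_reading import read_website_as_documents

# Crawl a site starting from its root page
docs = read_website_as_documents(parent_url="https://docs.example.com")

# Or enumerate pages from a sitemap, keeping only URLs that contain "/en/"
docs = read_website_as_documents(
    sitemap_url="https://docs.example.com/sitemap.xml",
    include_filter_str="/en/")

# Passing both parent_url and sitemap_url (or neither) raises ValueError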


66 changes: 0 additions & 66 deletions autollm/utils/web_docs_reader.py

This file was deleted.

autollm/utils/web_page_reader.py → autollm/utils/webpage_reader.py
File renamed without changes.
116 changes: 116 additions & 0 deletions autollm/utils/website_reader.py
@@ -0,0 +1,116 @@
import xml.etree.ElementTree as ET
from typing import List, Optional
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from llama_index.schema import Document
from tqdm import tqdm

from autollm.utils.constants import WEBPAGE_READER_TIMEOUT
from autollm.utils.logging import logger
from autollm.utils.webpage_reader import WebPageReader


class WebSiteReader:

    def __init__(self):
        # Tracks URLs already seen during a crawl so pages are not revisited
        self.visited_links = set()

    def _fetch_and_parse_sitemap(
            self,
            sitemap_url: str,
            include_filter_str: Optional[str] = None,
            exclude_filter_str: Optional[str] = None) -> List[str]:
        """Fetches and parses the sitemap, returning URLs."""
        try:
            response = requests.get(sitemap_url, timeout=WEBPAGE_READER_TIMEOUT)
            response.raise_for_status()
            # Parse the raw bytes: ET.fromstring() rejects str input that carries
            # an XML encoding declaration, which sitemaps typically include
            sitemap_content = response.content
        except requests.RequestException as e:
            logger.error(f"Error fetching sitemap: {e}")
            return []

        return self._extract_urls_from_sitemap(sitemap_content, include_filter_str, exclude_filter_str)

    def _extract_urls_from_sitemap(
            self, sitemap_content: bytes, include_filter_str: Optional[str],
            exclude_filter_str: Optional[str]) -> List[str]:
        """Extracts URLs from sitemap content."""
        sitemap = ET.fromstring(sitemap_content)
        urls = []

        # <url>/<loc> entries live in the standard sitemap namespace
        for url in sitemap.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}url"):
            location = url.find("{http://www.sitemaps.org/schemas/sitemap/0.9}loc").text
            # Skip locations that fail the include filter or match the exclude filter
            if (include_filter_str and include_filter_str not in location) or (
                    exclude_filter_str and exclude_filter_str in location):
                continue
            urls.append(location)

        return urls
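To illustrate what the namespace-qualified findall above matches, a small sketch with a hypothetical sitemap fragment (passed as bytes, since ET.fromstring rejects str input that carries an encoding declaration):

sitemap_xml = b"""<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    <url><loc>https://docs.example.com/en/intro</loc></url>
    <url><loc>https://docs.example.com/fr/intro</loc></url>
</urlset>"""

reader = WebSiteReader()
# include_filter_str="/en/" keeps only the English page
print(reader._extract_urls_from_sitemap(sitemap_xml, "/en/", None))
# ['https://docs.example.com/en/intro']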

    def _get_child_links_recursive(self, url: str) -> None:
        parsed_url = urlparse(url)
        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
        current_path = parsed_url.path

        response = requests.get(url, timeout=WEBPAGE_READER_TIMEOUT)  # timeout in seconds
        if response.status_code != 200:
            logger.warning(f"Failed to fetch the website: {response.status_code}")
            return

        soup = BeautifulSoup(response.text, "html.parser")
        all_links = [link.get("href") for link in soup.find_all("a")]

        # Normalize links and filter out external links and anchors
        child_links = set()
        for link in all_links:
            # Skip any None or empty hrefs, anchors, and self-links
            if not link or link.startswith('#') or link == current_path:
                continue
            # Convert relative links to absolute
            full_link = urljoin(base_url, link)
            # Add to set if the link is internal
            if urlparse(full_link).netloc == parsed_url.netloc:
                child_links.add(full_link)

        # Recurse into each unvisited child link, recording it as visited first
        for link in child_links:
            if link not in self.visited_links:
                self.visited_links.add(link)
                self._get_child_links_recursive(link)
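The normalization step above relies on standard urllib behavior; a short sketch with hypothetical URLs:

from urllib.parse import urljoin, urlparse

base_url = "https://docs.example.com"
# Relative hrefs resolve against the base URL
print(urljoin(base_url, "/guide/install"))  # https://docs.example.com/guide/install
# Absolute hrefs pass through urljoin unchanged...
print(urljoin(base_url, "https://other.com/page"))  # https://other.com/page
# ...and are then dropped by the netloc check, since the hosts differ
print(urlparse("https://other.com/page").netloc == urlparse(base_url).netloc)  # False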

    def _get_all_urls(self, url: str) -> List[str]:
        self.visited_links = set()
        self._get_child_links_recursive(url)
        urls = [link for link in self.visited_links if urlparse(link).netloc == urlparse(url).netloc]
        return urls

    def load_data(
            self,
            parent_url: Optional[str] = None,
            sitemap_url: Optional[str] = None,
            include_filter_str: Optional[str] = None,
            exclude_filter_str: Optional[str] = None) -> List[Document]:
        """Loads data from either a standard URL or a sitemap URL."""
        if sitemap_url:
            logger.info(f"Fetching and parsing sitemap {sitemap_url}..")
            all_urls = self._fetch_and_parse_sitemap(sitemap_url, include_filter_str, exclude_filter_str)
        elif parent_url:
            logger.info(f"Parsing child pages of {parent_url}..")
            # _get_all_urls resets visited_links, crawls from parent_url,
            # and returns the collected URLs
            all_urls = self._get_all_urls(parent_url)
        else:
            raise ValueError("Either sitemap_url or parent_url must be provided.")

        web_reader = WebPageReader()
        documents = []

        logger.info(f"Total URLs to process: {len(all_urls)}")

        # Track pages processed in this call so each URL is loaded only once;
        # checking against self.visited_links here would skip every crawled URL
        processed_urls = set()
        for u in tqdm(all_urls, desc="Processing URLs"):
            if u not in processed_urls:
                processed_urls.add(u)
                documents.extend(web_reader.load_data(u))

        return documents
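A quick sketch of driving the reader directly (hypothetical URLs; network access required):

reader = WebSiteReader()

# Crawl every internal page reachable from the parent URL
docs = reader.load_data(parent_url="https://docs.example.com")

# Or load the pages listed in a sitemap, skipping anything under /archive/
docs = reader.load_data(
    sitemap_url="https://docs.example.com/sitemap.xml",
    exclude_filter_str="/archive/")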
