This repository has been archived by the owner on Sep 12, 2024. It is now read-only.

Commit

resolve issues
SeeknnDestroy committed Nov 28, 2023
1 parent a894442 commit 4ae43cb
Showing 1 changed file with 29 additions and 13 deletions.
42 changes: 29 additions & 13 deletions autollm/utils/website_reader.py
@@ -14,29 +14,38 @@

 class WebSiteReader:
 
-    def __init__(self, sitemap_url: Optional[str] = None):
+    def __init__(self):
         self.visited_links = set()
-        self.sitemap_url = sitemap_url
 
-    def _fetch_and_parse_sitemap(self) -> List[str]:
+    def _fetch_and_parse_sitemap(
+            self,
+            sitemap_url: str,
+            include_filter_str: Optional[str] = None,
+            exclude_filter_str: Optional[str] = None) -> List[str]:
         """Fetches and parses the sitemap, returning URLs."""
         try:
-            response = requests.get(self.sitemap_url)
+            response = requests.get(sitemap_url)
             response.raise_for_status()
             sitemap_content = response.text
         except requests.RequestException as e:
             logger.error(f"Error fetching sitemap: {e}")
             return []
 
-        return self._extract_urls_from_sitemap(sitemap_content)
+        return self._extract_urls_from_sitemap(sitemap_content, include_filter_str, exclude_filter_str)
 
-    def _extract_urls_from_sitemap(self, sitemap_content: str) -> List[str]:
+    def _extract_urls_from_sitemap(
+            self, sitemap_content: str, include_filter_str: Optional[str],
+            exclude_filter_str: Optional[str]) -> List[str]:
         """Extracts URLs from sitemap content."""
         sitemap = ET.fromstring(sitemap_content)
         urls = []
 
         for url in sitemap.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}url"):
             location = url.find("{http://www.sitemaps.org/schemas/sitemap/0.9}loc").text
+            if (include_filter_str and
+                    include_filter_str not in location) or (exclude_filter_str and
+                                                            exclude_filter_str in location):
+                continue
             urls.append(location)
 
         return urls
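
For reference, the include/exclude logic added in this hunk is plain substring filtering. A minimal, self-contained sketch of the same predicate (the filter_urls helper and the example URLs are illustrative, not part of the commit):

from typing import List, Optional

def filter_urls(urls: List[str],
                include_filter_str: Optional[str] = None,
                exclude_filter_str: Optional[str] = None) -> List[str]:
    # Keep a URL only if it contains include_filter_str (when given)
    # and does not contain exclude_filter_str (when given).
    kept = []
    for location in urls:
        if (include_filter_str and include_filter_str not in location) or (
                exclude_filter_str and exclude_filter_str in location):
            continue
        kept.append(location)
    return kept

urls = [
    "https://example.com/docs/intro",
    "https://example.com/blog/launch",
    "https://example.com/docs/api",
]
print(filter_urls(urls, include_filter_str="/docs/"))
# -> ['https://example.com/docs/intro', 'https://example.com/docs/api']
print(filter_urls(urls, exclude_filter_str="/blog/"))
# -> ['https://example.com/docs/intro', 'https://example.com/docs/api']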
@@ -78,21 +87,28 @@ def _get_all_urls(self, url):
         urls = [link for link in self.visited_links if urlparse(link).netloc == urlparse(url).netloc]
         return urls
 
-    def load_data(self, url: str) -> List[Document]:
+    def load_data(
+            self,
+            parent_url: str,
+            sitemap_url: str,
+            include_filter_str: str = None,
+            exclude_filter_str: str = None) -> List[Document]:
         """Loads data from either a standard URL or a sitemap URL."""
-        if self.sitemap_url:
-            logger.info(f"Fetching and parsing sitemap {self.sitemap_url}..")
-            urls_to_process = self._fetch_and_parse_sitemap()
+        if sitemap_url:
+            logger.info(f"Fetching and parsing sitemap {sitemap_url}..")
+            all_urls = self._fetch_and_parse_sitemap(sitemap_url, include_filter_str, exclude_filter_str)
+        elif parent_url:
+            logger.info(f"Parsing child pages of {parent_url}..")
+            all_urls = self._get_child_links_recursive(parent_url)
         else:
-            logger.info(f"Listing child pages of {url}..")
-            urls_to_process = self._get_child_links_recursive(url)
+            raise ValueError("Either sitemap_url or parent_url must be provided.")
 
         web_reader = WebPageReader()
         documents = []
 
-        logger.info(f"Total URLs to process: {len(urls_to_process)}")
+        logger.info(f"Total URLs to process: {len(all_urls)}")
 
-        for u in tqdm(urls_to_process, desc="Processing URLs"):
+        for u in tqdm(all_urls, desc="Processing URLs"):
             if u not in self.visited_links:
                 self.visited_links.add(u)
                 documents.extend(web_reader.load_data(u))
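A minimal usage sketch of the reworked load_data signature, assuming WebSiteReader is importable from autollm.utils.website_reader (module path taken from the file header above; the example.com URLs are placeholders):

from autollm.utils.website_reader import WebSiteReader

reader = WebSiteReader()

# Sitemap-driven load: keep only URLs containing "/docs/",
# skip anything containing "/blog/".
docs = reader.load_data(
    parent_url=None,
    sitemap_url="https://example.com/sitemap.xml",
    include_filter_str="/docs/",
    exclude_filter_str="/blog/",
)

# Recursive crawl of child pages when no sitemap is available.
docs = reader.load_data(
    parent_url="https://example.com/docs/",
    sitemap_url=None,
)

Since parent_url and sitemap_url are positional parameters without defaults, each call supplies both and passes None for the unused one; passing None for both now raises ValueError rather than silently crawling nothing.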
