fix: handling sitemap entries better

chartotu19 committed Jan 6, 2025
1 parent 68038f1 commit dab3f3e
Showing 3 changed files with 335 additions and 67 deletions.
7 changes: 3 additions & 4 deletions llms_txt_action/entrypoint.py
@@ -9,8 +9,8 @@

 from .utils import (
     concatenate_markdown_files,
-    convert_html_to_markdown,
     generate_docs_structure,
+    html_folder_to_markdown,
 )

 logging.basicConfig(level=logging.INFO)
@@ -54,7 +54,7 @@ def generate_documentation(  # noqa: PLR0913
     logger.info("Starting Generation at folder - %s", docs_dir)

     logger.info("Generating MD files for all HTML files at folder - %s", docs_dir)
-    markdown_files = convert_html_to_markdown(docs_dir)
+    markdown_files = html_folder_to_markdown(docs_dir)

     # Set defaults if None
     skip_md_files = False if skip_md_files is None else skip_md_files
@@ -77,8 +77,7 @@ def generate_documentation(  # noqa: PLR0913
         )
     except FileNotFoundError:
         logger.exception(
-            "Could not find sitemap file at %s",
-            f"{docs_dir}/{sitemap_path}",
+            "Failed to generate llms.txt file",
         )
         raise

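For orientation, a minimal usage sketch of the renamed helper (the docs folder path and printed values are hypothetical; html_folder_to_markdown is defined in utils.py below):

from llms_txt_action.utils import html_folder_to_markdown

# Recursively convert docs/**/*.html to Markdown and collect the generated paths.
markdown_files = html_folder_to_markdown("docs")
print(markdown_files)  # e.g. ["docs/index.md", "docs/configuration/azure.md"]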
143 changes: 93 additions & 50 deletions llms_txt_action/utils.py
@@ -5,7 +5,6 @@
 import os
 import re
 from pathlib import Path
-from urllib.parse import urlparse

 from defusedxml import ElementTree as ET  # noqa: N817
 from docling.datamodel.base_models import ConversionStatus
@@ -16,7 +15,6 @@
 logger = logging.getLogger(__name__)


-# %%
 def html_to_markdown(input_file: Path) -> str:
     """Converts HTML content to Markdown.
@@ -42,7 +40,7 @@ def html_to_markdown(input_file: Path) -> str:
         raise RuntimeError(msg)


-def convert_html_to_markdown(input_path: str) -> list:
+def html_folder_to_markdown(input_path: str) -> list:
     """Recursively converts all HTML files in the given directory.
     to Markdown files and collects the paths of the generated Markdown files.
@@ -106,10 +104,7 @@ def convert_html_to_markdown(input_path: str) -> list:
     return markdown_files


-# %%
-
-
-def summarize_page(content: str, model_name: str) -> str:
+def generate_summary(content: str, model_name: str) -> str:
     """Summarize the page content using the model.

     This would analyze the page content and generate a summary.
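As a usage sketch only (the model name is a placeholder; with no provider API key configured, the function falls back to the page's largest heading, per the next hunk):

from llms_txt_action.utils import generate_summary

markdown_page = "# Azure Setup\n\nSteps to configure credentials..."
print(generate_summary(markdown_page, model_name="gpt-4o-mini"))
# With an API key set: a model-written summary.
# Without one: the top heading, "Azure Setup".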
Expand Down Expand Up @@ -140,10 +135,10 @@ def summarize_page(content: str, model_name: str) -> str:
return response.choices[0].message.content
# Extract largest heading from markdown content if present
logger.info("No model API key found, using heading as summary")
return extract_heading(content)
return _extract_heading(content)


def extract_heading(content: str) -> str:
def _extract_heading(content: str) -> str:
"""Extract the largest heading upto h3 from the given content."""
heading_match = re.search(r"^#{1,3}\s+(.+)$", content, re.MULTILINE)
logger.info("Heading match: %s", heading_match)
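A small self-contained sketch of that heading fallback (the content string is hypothetical):

import re

content = "Intro paragraph.\n\n## Getting Started\n\nMore text."
match = re.search(r"^#{1,3}\s+(.+)$", content, re.MULTILINE)
print(match.group(1) if match else "")  # -> "Getting Started"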
@@ -152,13 +147,91 @@ def extract_heading(content: str) -> str:
         return ""


+def _extract_site_url(root: ET) -> str:
+    """Extract the site URL from the sitemap.xml file by finding common prefix."""
+    ns = {"ns": root.tag.split("}")[0].strip("{")}
+    urls = [url.find("ns:loc", ns).text for url in root.findall(".//ns:url", ns)]
+    if not urls:
+        msg = "No URLs found in sitemap"
+        raise ValueError(msg)
+
+    # Find common prefix among all URLs
+    shortest = min(urls, key=len)
+    for i, char in enumerate(shortest):
+        if any(url[i] != char for url in urls):
+            return shortest[:i]
+
+    return shortest
+
+
+def _convert_url_to_file_path(
+    url: str,
+    site_url: str,
+    docs_dir: str,
+    locale_length: int = 2,
+) -> str:
+    """Convert the URL to a file path.
+
+    Strips site_url prefix and checks various path patterns to find existing file.
+    Returns empty string if no file exists.
+
+    Args:
+    ----
+        url (str): Full URL to convert
+        site_url (str): Base site URL to strip
+        docs_dir (str): Path to the directory containing the documentation
+        locale_length (int): Length of the locale directory
+
+    Returns:
+    -------
+        Relative file path if found, empty string otherwise
+
+    """
+    # Strip site URL prefix
+    if not url.startswith(site_url):
+        return ""
+    if url.endswith("/"):
+        url = url + "index.html"
+    path = url[len(site_url.rstrip("/")) :].strip("/")
+    # Handle different URL patterns
+    if path in {"", "index.html"}:
+        file_path = "index.md"
+    elif path.endswith(".html"):
+        file_path = f"{path[:-5]}.md"
+    else:
+        file_path = f"{path}/index.md"
+    # Try original path
+    if Path(f"{docs_dir}/{file_path}").exists():
+        return file_path
+
+    # Try without "latest/" suffix
+    if "latest/" in file_path:
+        no_latest = file_path.replace("latest/", "")
+        if Path(f"{docs_dir}/{no_latest}").exists():
+            return no_latest
+
+    # Try without 2-letter locale directory
+    parts = file_path.split("/")
+    if len(parts) > 1 and len(parts[0]) == locale_length:
+        no_locale = "/".join(parts[1:])
+        if Path(f"{docs_dir}/{no_locale}").exists():
+            return no_locale
+
+    return ""
+
+
 def generate_docs_structure(
     docs_dir: str,
     sitemap_path: str,
     model_name: str,
 ) -> str:
     """Generate a documentation structure from a sitemap.xml file.
+
+    first, extract site url.
+    then for each url, convert to file path.
+    then for each file path, read the file and summarize it.
+    then create a markdown link entry.

     Args:
     ----
         docs_dir (str): Path to the directory containing the documentation
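A rough sketch of the fallback resolution the new helper performs (the temporary directory and URLs are illustrative; _convert_url_to_file_path is internal to utils.py):

import tempfile
from pathlib import Path

from llms_txt_action.utils import _convert_url_to_file_path

with tempfile.TemporaryDirectory() as docs_dir:
    # Lay out the files the resolver probes for.
    Path(docs_dir, "index.md").touch()
    Path(docs_dir, "configuration").mkdir()
    Path(docs_dir, "configuration", "azure.md").touch()

    site_url = "https://test.com/"
    # Trailing slash -> index.html -> index.md
    print(_convert_url_to_file_path("https://test.com/", site_url, docs_dir))
    # -> "index.md"
    # .html page -> .md sibling
    print(_convert_url_to_file_path("https://test.com/configuration/azure.html", site_url, docs_dir))
    # -> "configuration/azure.md"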
@@ -174,6 +247,7 @@ def generate_docs_structure(
     if not Path(f"{docs_dir}/{sitemap_path}").exists():
         msg = f"The sitemap file {docs_dir}/{sitemap_path} does not exist."
         raise FileNotFoundError(msg)
+
     tree = ET.parse(f"{docs_dir}/{sitemap_path}")
     root = tree.getroot()

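For context, a minimal sketch of the sitemap shape being parsed here and what the common-prefix extraction from the previous hunk yields (URLs are hypothetical):

from defusedxml import ElementTree as ET

SITEMAP = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://test.com/</loc></url>
  <url><loc>https://test.com/configuration/azure.html</loc></url>
</urlset>"""

root = ET.fromstring(SITEMAP)
# _extract_site_url reads each <loc> under <url> and returns the common
# prefix of the URLs -- here "https://test.com/".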
@@ -183,59 +257,28 @@ def generate_docs_structure(
     # Start building the markdown content
     content = ["# Docs\n"]

+    site_url = _extract_site_url(root)
     # Process each URL in the sitemap
     for url in root.findall(".//ns:url", ns):
         loc = url.find("ns:loc", ns).text
-        """
-        This doesnt call all cases. let me give more examples that needs to be handled.
-        https://test.com/ -> index.md
-        https://test.com/index.html -> index.md
-        https://test.com/configuration/ -> configuration/index.md
-        https://test.com/configuration/azure/ -> configuration/azure/index.md
-        https://test.comen/configuration/auzre.html -> configuration/azure.md
-        """
-        # Convert URL to file path
-        parsed_url = urlparse(loc)
-        path = parsed_url.path.strip("/")
-
-        # Handle different URL patterns
-        if path in {"", "index.html"}:
-            file_path = "index.md"
-        elif path.endswith(".html"):
-            # Remove .html and convert to .md
-            file_path = f"{path[:-5]}.md"
-        else:
-            # For paths ending in / or no extension, append index.md
-            file_path = f"{path}/index.md"
-        # Generate a summary for the page
         try:
+            logger.info("Processing %s", loc)
+            file_path = _convert_url_to_file_path(loc, site_url, docs_dir)
+            logger.info("found file path: %s for %s", file_path, loc)
             with Path(f"{docs_dir}/{file_path}").open() as f:
                 markdown_content = f.read()
+
+            summary = generate_summary(markdown_content, model_name)
+            page_title = loc.rstrip("/").split("/")[-1].replace("-", " ").title()
+            content.append(f"- [{page_title}]({loc}): {summary}")
         except FileNotFoundError:
-            # Try without locale path by removing first directory if it's 2 characters
-            file_path_parts = file_path.split("/")
-
-            file_path_no_locale = (
-                "/".join(file_path_parts[1:])
-                if len(file_path_parts) > 1 and len(file_path_parts[0]) == 2  # noqa: PLR2004
-                else file_path
-            )
-            with Path(f"{docs_dir}/{file_path_no_locale}").open() as f:
-                markdown_content = f.read()
-        summary = summarize_page(markdown_content, model_name)
-
-        # Create the markdown link entry
-        page_title = loc.rstrip("/").split("/")[-1].replace("-", " ").title()
-        content.append(f"- [{page_title}]({loc}): {summary}")
+            logger.info("File not found: %s", file_path)
+            continue

     # Join all lines with newlines
     return "\n".join(content)


-# %%
-
-
 def concatenate_markdown_files(markdown_files: list, output_file: str):
     """Concatenates multiple markdown files into a single file.

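Taken together, generate_docs_structure now emits a flat link list under a "# Docs" heading; an illustrative (hypothetical) fragment of the resulting llms.txt section:

# Docs

- [Azure](https://test.com/configuration/azure/): one-line summary from generate_summary (or the page's top heading when no model key is set)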