diff --git a/llms_txt_action/entrypoint.py b/llms_txt_action/entrypoint.py
index 6ac2a8d..3b1a0c2 100644
--- a/llms_txt_action/entrypoint.py
+++ b/llms_txt_action/entrypoint.py
@@ -9,8 +9,8 @@
from .utils import (
concatenate_markdown_files,
- convert_html_to_markdown,
generate_docs_structure,
+ html_folder_to_markdown,
)
logging.basicConfig(level=logging.INFO)
@@ -54,7 +54,7 @@ def generate_documentation( # noqa: PLR0913
logger.info("Starting Generation at folder - %s", docs_dir)
logger.info("Generating MD files for all HTML files at folder - %s", docs_dir)
- markdown_files = convert_html_to_markdown(docs_dir)
+ markdown_files = html_folder_to_markdown(docs_dir)
# Set defaults if None
skip_md_files = False if skip_md_files is None else skip_md_files
@@ -77,10 +77,8 @@ def generate_documentation( # noqa: PLR0913
)
except FileNotFoundError:
logger.exception(
- "Could not find sitemap file at %s",
- f"{docs_dir}/{sitemap_path}",
+ "Failed to generate llms.txt file",
)
- raise
if not skip_llms_full_txt:
concatenate_markdown_files(
diff --git a/llms_txt_action/utils.py b/llms_txt_action/utils.py
index 4c5228c..d57da7a 100644
--- a/llms_txt_action/utils.py
+++ b/llms_txt_action/utils.py
@@ -5,7 +5,6 @@
import os
import re
from pathlib import Path
-from urllib.parse import urlparse
+from xml.etree.ElementTree import Element  # noqa: S405 - typing only
from defusedxml import ElementTree as ET # noqa: N817
from docling.datamodel.base_models import ConversionStatus
@@ -16,7 +15,6 @@
logger = logging.getLogger(__name__)
-# %%
def html_to_markdown(input_file: Path) -> str:
"""Converts HTML content to Markdown.
@@ -42,7 +40,7 @@ def html_to_markdown(input_file: Path) -> str:
raise RuntimeError(msg)
-def convert_html_to_markdown(input_path: str) -> list:
+def html_folder_to_markdown(input_path: str) -> list:
"""Recursively converts all HTML files in the given directory.
to Markdown files and collects the paths of the generated Markdown files.
@@ -106,10 +104,7 @@ def convert_html_to_markdown(input_path: str) -> list:
return markdown_files
-# %%
-
-
-def summarize_page(content: str, model_name: str) -> str:
+def generate_summary(content: str, model_name: str) -> str:
"""Summarize the page content using the model.
This would analyze the page content and generate a summary.
@@ -140,10 +135,10 @@ def summarize_page(content: str, model_name: str) -> str:
return response.choices[0].message.content
# Extract largest heading from markdown content if present
logger.info("No model API key found, using heading as summary")
- return extract_heading(content)
+ return _extract_heading(content)
-def extract_heading(content: str) -> str:
+def _extract_heading(content: str) -> str:
"""Extract the largest heading upto h3 from the given content."""
heading_match = re.search(r"^#{1,3}\s+(.+)$", content, re.MULTILINE)
logger.info("Heading match: %s", heading_match)
@@ -152,6 +147,79 @@ def extract_heading(content: str) -> str:
return ""
+def _extract_site_url(root: Element) -> str:
+ """Extract the site URL from the sitemap.xml file by finding common prefix."""
+ ns = {"ns": root.tag.split("}")[0].strip("{")}
+ urls = [url.find("ns:loc", ns).text for url in root.findall(".//ns:url", ns)]
+ if not urls:
+ msg = "No URLs found in sitemap"
+ raise ValueError(msg)
+
+ # Find common prefix among all URLs
+ shortest = min(urls, key=len)
+ for i, char in enumerate(shortest):
+ if any(url[i] != char for url in urls):
+ return shortest[:i]
+
+ return shortest
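+
+# Illustration (hypothetical URLs): for sitemap entries
+# https://example.com/a/ and https://example.com/b/, the scan above stops at
+# the first diverging character and returns "https://example.com/".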
+
+
+def _convert_url_to_file_path(
+ url: str,
+ site_url: str,
+ docs_dir: str,
+ locale_length: int = 2,
+) -> str:
+ """Convert the URL to a file path.
+
+ Strips site_url prefix and checks various path patterns to find existing file.
+ Returns empty string if no file exists.
+
+ Args:
+ ----
+ url (str): Full URL to convert
+ site_url (str): Base site URL to strip
+ docs_dir (str): Path to the directory containing the documentation
+ locale_length (int): Length of the locale directory
+
+ Returns:
+ -------
+ Relative file path if found, empty string otherwise
+
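+    Example:
+    -------
+        A hypothetical docs tree where en/about.md exists:
+
+        _convert_url_to_file_path(
+            "https://example.com/en/about.html",
+            "https://example.com/",
+            "docs",
+        )  # -> "en/about.md" (or "about.md" without the locale dir)
+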
+ """
+ # Strip site URL prefix
+ if not url.startswith(site_url):
+ return ""
+ if url.endswith("/"):
+ url = url + "index.html"
+ path = url[len(site_url.rstrip("/")) :].strip("/")
+ # Handle different URL patterns
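+    # e.g. path "" or "index.html" -> "index.md"; "docs.html" -> "docs.md";
+    # extensionless "guide" -> "guide/index.md"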
+ if path in {"", "index.html"}:
+ file_path = "index.md"
+ elif path.endswith(".html"):
+ file_path = f"{path[:-5]}.md"
+ else:
+ file_path = f"{path}/index.md"
+ # Try original path
+ if Path(f"{docs_dir}/{file_path}").exists():
+ return file_path
+
+ # Try without "latest/" suffix
+ if "latest/" in file_path:
+ no_latest = file_path.replace("latest/", "")
+ if Path(f"{docs_dir}/{no_latest}").exists():
+ return no_latest
+
+ # Try without 2-letter locale directory
+ parts = file_path.split("/")
+ if len(parts) > 1 and len(parts[0]) == locale_length:
+ no_locale = "/".join(parts[1:])
+ if Path(f"{docs_dir}/{no_locale}").exists():
+ return no_locale
+
+ return ""
+
+
def generate_docs_structure(
docs_dir: str,
sitemap_path: str,
@@ -159,6 +227,11 @@ def generate_docs_structure(
) -> str:
"""Generate a documentation structure from a sitemap.xml file.
+    First, extract the site URL from the sitemap. Then, for each URL,
+    resolve the corresponding file path, read and summarize the file,
+    and emit a markdown link entry (see the example below).
+
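+    Example entry (hypothetical URL and summary):
+    - [Getting Started](https://example.com/getting-started/): Install guide.
+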
Args:
----
docs_dir (str): Path to the directory containing the documentation
@@ -174,6 +247,7 @@ def generate_docs_structure(
if not Path(f"{docs_dir}/{sitemap_path}").exists():
msg = f"The sitemap file {docs_dir}/{sitemap_path} does not exist."
raise FileNotFoundError(msg)
+
tree = ET.parse(f"{docs_dir}/{sitemap_path}")
root = tree.getroot()
@@ -183,59 +257,28 @@ def generate_docs_structure(
# Start building the markdown content
content = ["# Docs\n"]
+ site_url = _extract_site_url(root)
# Process each URL in the sitemap
for url in root.findall(".//ns:url", ns):
loc = url.find("ns:loc", ns).text
- """
- This doesnt call all cases. let me give more examples that needs to be handled.
-
- https://test.com/ -> index.md
- https://test.com/index.html -> index.md
- https://test.com/configuration/ -> configuration/index.md
- https://test.com/configuration/azure/ -> configuration/azure/index.md
- https://test.comen/configuration/auzre.html -> configuration/azure.md
- """
- # Convert URL to file path
- parsed_url = urlparse(loc)
- path = parsed_url.path.strip("/")
-
- # Handle different URL patterns
- if path in {"", "index.html"}:
- file_path = "index.md"
- elif path.endswith(".html"):
- # Remove .html and convert to .md
- file_path = f"{path[:-5]}.md"
- else:
- # For paths ending in / or no extension, append index.md
- file_path = f"{path}/index.md"
- # Generate a summary for the page
try:
+ logger.info("Processing %s", loc)
+ file_path = _convert_url_to_file_path(loc, site_url, docs_dir)
+ logger.info("found file path: %s for %s", file_path, loc)
with Path(f"{docs_dir}/{file_path}").open() as f:
markdown_content = f.read()
+
+ summary = generate_summary(markdown_content, model_name)
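+            # Title from the URL slug,
+            # e.g. ".../getting-started/" -> "Getting Started"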
+ page_title = loc.rstrip("/").split("/")[-1].replace("-", " ").title()
+ content.append(f"- [{page_title}]({loc}): {summary}")
except FileNotFoundError:
-            # Try without locale path by removing first directory if it's 2 characters
- file_path_parts = file_path.split("/")
-
- file_path_no_locale = (
- "/".join(file_path_parts[1:])
- if len(file_path_parts) > 1 and len(file_path_parts[0]) == 2 # noqa: PLR2004
- else file_path
- )
- with Path(f"{docs_dir}/{file_path_no_locale}").open() as f:
- markdown_content = f.read()
- summary = summarize_page(markdown_content, model_name)
-
- # Create the markdown link entry
- page_title = loc.rstrip("/").split("/")[-1].replace("-", " ").title()
- content.append(f"- [{page_title}]({loc}): {summary}")
-
+ logger.info("File not found: %s", file_path)
+ continue
# Join all lines with newlines
return "\n".join(content)
-# %%
-
-
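+# Hypothetical usage: concatenate_markdown_files(markdown_files, "docs/llms-full.txt")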
def concatenate_markdown_files(markdown_files: list, output_file: str):
"""Concatenates multiple markdown files into a single file.
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 0b0079d..3f167c7 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,18 +1,21 @@
"""Unit tests for the llms_txt_action.utils module."""
-# ruff: noqa: S101
+# ruff: noqa: S101, S314, E501
+import xml.etree.ElementTree as ET
from unittest.mock import Mock, patch
import pytest
from docling.datamodel.base_models import ConversionStatus
from llms_txt_action.utils import (
+ _convert_url_to_file_path,
+ _extract_heading,
+ _extract_site_url,
concatenate_markdown_files,
- convert_html_to_markdown,
- extract_heading,
generate_docs_structure,
+ generate_summary,
+ html_folder_to_markdown,
html_to_markdown,
- summarize_page,
)
@@ -91,7 +94,7 @@ def test_convert_html_to_markdown_success(tmp_path, sample_html_file):
input_dir.mkdir()
sample_html_file.rename(input_dir / "test.html")
- result = convert_html_to_markdown(str(input_dir))
+ result = html_folder_to_markdown(str(input_dir))
assert len(result) == 1
assert result[0].suffix == ".md"
@@ -102,7 +105,7 @@ def test_convert_html_to_markdown_invalid_path():
ValueError,
match="The input path nonexistent_path is not a directory.",
):
- convert_html_to_markdown("nonexistent_path")
+ html_folder_to_markdown("nonexistent_path")
def test_convert_html_to_markdown_handles_conversion_failure(
@@ -117,7 +120,7 @@ def test_convert_html_to_markdown_handles_conversion_failure(
input_dir.mkdir()
sample_html_file.rename(input_dir / "test.html")
- result = convert_html_to_markdown(str(input_dir))
+ result = html_folder_to_markdown(str(input_dir))
assert len(result) == 0
@@ -132,7 +135,7 @@ def test_summarize_page_with_model():
mock_response.choices = [Mock(message=Mock(content="Test summary"))]
mock_completion.return_value = mock_response
- result = summarize_page("Test content", "gpt-3.5-turbo")
+ result = generate_summary("Test content", "gpt-3.5-turbo")
assert result == "Test summary"
@@ -140,10 +143,10 @@ def test_summarize_page_without_model():
"""Test summarize page without model API key."""
with (
patch.dict("os.environ", clear=True),
- patch("llms_txt_action.utils.extract_heading") as mock_extract,
+ patch("llms_txt_action.utils._extract_heading") as mock_extract,
):
mock_extract.return_value = "Test Heading"
- result = summarize_page("# Test Heading\nContent", "gpt-3.5-turbo")
+ result = generate_summary("# Test Heading\nContent", "gpt-3.5-turbo")
assert result == "Test Heading"
@@ -151,19 +154,19 @@ def test_summarize_page_without_model():
def test_extract_heading_with_h1():
"""Test extract heading with h1."""
content = "# Main Heading\nContent"
- assert extract_heading(content) == "Main Heading"
+ assert _extract_heading(content) == "Main Heading"
def test_extract_heading_with_h2():
"""Test extract heading with h2."""
content = "## Secondary Heading\nContent"
- assert extract_heading(content) == "Secondary Heading"
+ assert _extract_heading(content) == "Secondary Heading"
def test_extract_heading_no_heading():
"""Test extract heading with no heading."""
content = "Just content"
- assert extract_heading(content) == ""
+ assert _extract_heading(content) == ""
# Tests for generate_docs_structure
@@ -233,3 +236,214 @@ def test_concatenate_markdown_files_empty_list(tmp_path):
concatenate_markdown_files([], output_file)
assert output_file.exists()
assert output_file.read_text() == ""
+
+
+def test_extract_site_url_common_prefix():
+ """Test extracting site URL with common prefix."""
+    xml_content = """
+    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+        <url><loc>https://example.com/page1</loc></url>
+        <url><loc>https://example.com/page2</loc></url>
+        <url><loc>https://example.com/subdir/page3</loc></url>
+    </urlset>
+    """
+ root = ET.fromstring(xml_content)
+ assert _extract_site_url(root) == "https://example.com/"
+
+
+def test_extract_site_url_common_prefix_with_subdir():
+ """Test extracting site URL with common prefix."""
+    xml_content = """
+    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+        <url><loc>https://example.com/page1/</loc></url>
+        <url><loc>https://example.com/page1/subdir/</loc></url>
+        <url><loc>https://example.com/page1/subdir2/</loc></url>
+    </urlset>
+    """
+ root = ET.fromstring(xml_content)
+ assert _extract_site_url(root) == "https://example.com/page1/"
+
+
+def test_extract_site_url_empty_sitemap():
+ """Test extracting site URL from empty sitemap."""
+    xml_content = """
+    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+    </urlset>
+    """
+ root = ET.fromstring(xml_content)
+ with pytest.raises(ValueError, match="No URLs found in sitemap"):
+ _extract_site_url(root)
+
+
+def test_extract_site_url_single_url():
+ """Test extracting site URL with single URL."""
+    xml_content = """
+    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+        <url><loc>https://example.com/</loc></url>
+    </urlset>
+    """
+ root = ET.fromstring(xml_content)
+ assert _extract_site_url(root) == "https://example.com/"
+
+
+@pytest.fixture
+def setup_mock_files(tmp_path):
+ """Create mock files for testing."""
+ # Create regular markdown files
+ (tmp_path / "index.md").touch()
+ (tmp_path / "docs.md").touch()
+
+ # Create nested structure
+ nested_dir = tmp_path / "nested"
+ nested_dir.mkdir()
+ (nested_dir / "index.md").touch()
+
+ # Create structure with latest directory
+ latest_dir = tmp_path / "latest"
+ latest_dir.mkdir()
+ (latest_dir / "guide.md").touch()
+ (tmp_path / "guide.md").touch() # Same file without latest/
+
+ # Create structure with locale
+ en_dir = tmp_path / "en"
+ en_dir.mkdir()
+ (en_dir / "about.md").touch()
+ (tmp_path / "about.md").touch() # Same file without locale
+
+ (tmp_path / "non_locale.md").touch()
+ (nested_dir / "non_locale.md").touch()
+
+ return tmp_path
+
+
+def test_basic_conversion(setup_mock_files, monkeypatch):
+ """Test basic URL to file path conversion."""
+ monkeypatch.chdir(setup_mock_files)
+
+ site_url = "https://example.com/"
+
+ # Test index page
+ assert (
+ _convert_url_to_file_path("https://example.com/", site_url, setup_mock_files)
+ == "index.md"
+ )
+ assert (
+ _convert_url_to_file_path(
+ "https://example.com/index.html",
+ site_url,
+ setup_mock_files,
+ )
+ == "index.md"
+ )
+
+ # Test HTML extension
+ assert (
+ _convert_url_to_file_path(
+ "https://example.com/docs.html",
+ site_url,
+ setup_mock_files,
+ )
+ == "docs.md"
+ )
+
+
+def test_nested_paths(setup_mock_files, monkeypatch):
+ """Test nested directory paths."""
+ monkeypatch.chdir(setup_mock_files)
+
+ site_url = "https://example.com/"
+
+ # Test nested directory
+ assert (
+ _convert_url_to_file_path(
+ "https://example.com/nested/",
+ site_url,
+ setup_mock_files,
+ )
+ == "nested/index.md"
+ )
+
+
+def test_latest_directory(setup_mock_files, monkeypatch):
+ """Test paths with 'latest' directory."""
+ monkeypatch.chdir(setup_mock_files)
+
+ site_url = "https://example.com/"
+
+ # Test with and without latest/
+ assert (
+ _convert_url_to_file_path(
+ "https://example.com/latest/guide.html",
+ site_url,
+ setup_mock_files,
+ )
+ == "latest/guide.md"
+ )
+ assert (
+ _convert_url_to_file_path(
+ "https://example.com/latest/index.html",
+ site_url,
+ setup_mock_files,
+ )
+ == "index.md"
+ )
+
+
+def test_locale_directory(setup_mock_files, monkeypatch):
+ """Test paths with locale directory."""
+ monkeypatch.chdir(setup_mock_files)
+
+ site_url = "https://example.com/"
+
+ # Test with and without locale
+ assert (
+ _convert_url_to_file_path(
+ "https://example.com/en/about.html",
+ site_url,
+ setup_mock_files,
+ )
+ == "en/about.md"
+ )
+ assert (
+ _convert_url_to_file_path(
+ "https://example.com/en/non_locale.html",
+ site_url,
+ setup_mock_files,
+ )
+ == "non_locale.md"
+ )
+ assert (
+ _convert_url_to_file_path(
+ "https://example.com/en/nested/non_locale.html",
+ site_url,
+ setup_mock_files,
+ )
+ == "nested/non_locale.md"
+ )
+
+
+def test_invalid_urls(setup_mock_files, monkeypatch):
+ """Test invalid URLs and non-existent files."""
+ monkeypatch.chdir(setup_mock_files)
+
+ site_url = "https://example.com/"
+
+ # Test URL that doesn't match site_url
+ assert (
+ _convert_url_to_file_path(
+ "https://different.com/page.html",
+ site_url,
+ setup_mock_files,
+ )
+ == ""
+ )
+
+ # Test non-existent file
+ assert (
+ _convert_url_to_file_path(
+ "https://example.com/nonexistent.html",
+ site_url,
+ setup_mock_files,
+ )
+ == ""
+ )