From 9cd48cdb977b3cffd8a37b9ceb21f09364c6b84d Mon Sep 17 00:00:00 2001 From: Selvam Palanimalai Date: Mon, 6 Jan 2025 13:51:24 -0500 Subject: [PATCH 1/2] fix: handling sitemap entries better --- llms_txt_action/entrypoint.py | 8 +- llms_txt_action/utils.py | 143 ++++++++++++------- tests/test_utils.py | 252 ++++++++++++++++++++++++++++++++-- 3 files changed, 335 insertions(+), 68 deletions(-) diff --git a/llms_txt_action/entrypoint.py b/llms_txt_action/entrypoint.py index 6ac2a8d..3b1a0c2 100644 --- a/llms_txt_action/entrypoint.py +++ b/llms_txt_action/entrypoint.py @@ -9,8 +9,8 @@ from .utils import ( concatenate_markdown_files, - convert_html_to_markdown, generate_docs_structure, + html_folder_to_markdown, ) logging.basicConfig(level=logging.INFO) @@ -54,7 +54,7 @@ def generate_documentation( # noqa: PLR0913 logger.info("Starting Generation at folder - %s", docs_dir) logger.info("Generating MD files for all HTML files at folder - %s", docs_dir) - markdown_files = convert_html_to_markdown(docs_dir) + markdown_files = html_folder_to_markdown(docs_dir) # Set defaults if None skip_md_files = False if skip_md_files is None else skip_md_files @@ -77,10 +77,8 @@ def generate_documentation( # noqa: PLR0913 ) except FileNotFoundError: logger.exception( - "Could not find sitemap file at %s", - f"{docs_dir}/{sitemap_path}", + "Failed to generate llms.txt file", ) - raise if not skip_llms_full_txt: concatenate_markdown_files( diff --git a/llms_txt_action/utils.py b/llms_txt_action/utils.py index 4c5228c..d57da7a 100644 --- a/llms_txt_action/utils.py +++ b/llms_txt_action/utils.py @@ -5,7 +5,6 @@ import os import re from pathlib import Path -from urllib.parse import urlparse from defusedxml import ElementTree as ET # noqa: N817 from docling.datamodel.base_models import ConversionStatus @@ -16,7 +15,6 @@ logger = logging.getLogger(__name__) -# %% def html_to_markdown(input_file: Path) -> str: """Converts HTML content to Markdown. @@ -42,7 +40,7 @@ def html_to_markdown(input_file: Path) -> str: raise RuntimeError(msg) -def convert_html_to_markdown(input_path: str) -> list: +def html_folder_to_markdown(input_path: str) -> list: """Recursively converts all HTML files in the given directory. to Markdown files and collects the paths of the generated Markdown files. @@ -106,10 +104,7 @@ def convert_html_to_markdown(input_path: str) -> list: return markdown_files -# %% - - -def summarize_page(content: str, model_name: str) -> str: +def generate_summary(content: str, model_name: str) -> str: """Summarize the page content using the model. This would analyze the page content and generate a summary. @@ -140,10 +135,10 @@ def summarize_page(content: str, model_name: str) -> str: return response.choices[0].message.content # Extract largest heading from markdown content if present logger.info("No model API key found, using heading as summary") - return extract_heading(content) + return _extract_heading(content) -def extract_heading(content: str) -> str: +def _extract_heading(content: str) -> str: """Extract the largest heading upto h3 from the given content.""" heading_match = re.search(r"^#{1,3}\s+(.+)$", content, re.MULTILINE) logger.info("Heading match: %s", heading_match) @@ -152,6 +147,79 @@ def extract_heading(content: str) -> str: return "" +def _extract_site_url(root: ET) -> str: + """Extract the site URL from the sitemap.xml file by finding common prefix.""" + ns = {"ns": root.tag.split("}")[0].strip("{")} + urls = [url.find("ns:loc", ns).text for url in root.findall(".//ns:url", ns)] + if not urls: + msg = "No URLs found in sitemap" + raise ValueError(msg) + + # Find common prefix among all URLs + shortest = min(urls, key=len) + for i, char in enumerate(shortest): + if any(url[i] != char for url in urls): + return shortest[:i] + + return shortest + + +def _convert_url_to_file_path( + url: str, + site_url: str, + docs_dir: str, + locale_length: int = 2, +) -> str: + """Convert the URL to a file path. + + Strips site_url prefix and checks various path patterns to find existing file. + Returns empty string if no file exists. + + Args: + ---- + url (str): Full URL to convert + site_url (str): Base site URL to strip + docs_dir (str): Path to the directory containing the documentation + locale_length (int): Length of the locale directory + + Returns: + ------- + Relative file path if found, empty string otherwise + + """ + # Strip site URL prefix + if not url.startswith(site_url): + return "" + if url.endswith("/"): + url = url + "index.html" + path = url[len(site_url.rstrip("/")) :].strip("/") + # Handle different URL patterns + if path in {"", "index.html"}: + file_path = "index.md" + elif path.endswith(".html"): + file_path = f"{path[:-5]}.md" + else: + file_path = f"{path}/index.md" + # Try original path + if Path(f"{docs_dir}/{file_path}").exists(): + return file_path + + # Try without "latest/" suffix + if "latest/" in file_path: + no_latest = file_path.replace("latest/", "") + if Path(f"{docs_dir}/{no_latest}").exists(): + return no_latest + + # Try without 2-letter locale directory + parts = file_path.split("/") + if len(parts) > 1 and len(parts[0]) == locale_length: + no_locale = "/".join(parts[1:]) + if Path(f"{docs_dir}/{no_locale}").exists(): + return no_locale + + return "" + + def generate_docs_structure( docs_dir: str, sitemap_path: str, @@ -159,6 +227,11 @@ def generate_docs_structure( ) -> str: """Generate a documentation structure from a sitemap.xml file. + first, extract site url. + then for each url, convert to file path. + then for each file path, read the file and summarize it. + then create a markdown link entry. + Args: ---- docs_dir (str): Path to the directory containing the documentation @@ -174,6 +247,7 @@ def generate_docs_structure( if not Path(f"{docs_dir}/{sitemap_path}").exists(): msg = f"The sitemap file {docs_dir}/{sitemap_path} does not exist." raise FileNotFoundError(msg) + tree = ET.parse(f"{docs_dir}/{sitemap_path}") root = tree.getroot() @@ -183,59 +257,28 @@ def generate_docs_structure( # Start building the markdown content content = ["# Docs\n"] + site_url = _extract_site_url(root) # Process each URL in the sitemap for url in root.findall(".//ns:url", ns): loc = url.find("ns:loc", ns).text - """ - This doesnt call all cases. let me give more examples that needs to be handled. - - https://test.com/ -> index.md - https://test.com/index.html -> index.md - https://test.com/configuration/ -> configuration/index.md - https://test.com/configuration/azure/ -> configuration/azure/index.md - https://test.comen/configuration/auzre.html -> configuration/azure.md - """ - # Convert URL to file path - parsed_url = urlparse(loc) - path = parsed_url.path.strip("/") - - # Handle different URL patterns - if path in {"", "index.html"}: - file_path = "index.md" - elif path.endswith(".html"): - # Remove .html and convert to .md - file_path = f"{path[:-5]}.md" - else: - # For paths ending in / or no extension, append index.md - file_path = f"{path}/index.md" - # Generate a summary for the page try: + logger.info("Processing %s", loc) + file_path = _convert_url_to_file_path(loc, site_url, docs_dir) + logger.info("found file path: %s for %s", file_path, loc) with Path(f"{docs_dir}/{file_path}").open() as f: markdown_content = f.read() + + summary = generate_summary(markdown_content, model_name) + page_title = loc.rstrip("/").split("/")[-1].replace("-", " ").title() + content.append(f"- [{page_title}]({loc}): {summary}") except FileNotFoundError: # Try without locale path by removing first directory if it's 2 characters - file_path_parts = file_path.split("/") - - file_path_no_locale = ( - "/".join(file_path_parts[1:]) - if len(file_path_parts) > 1 and len(file_path_parts[0]) == 2 # noqa: PLR2004 - else file_path - ) - with Path(f"{docs_dir}/{file_path_no_locale}").open() as f: - markdown_content = f.read() - summary = summarize_page(markdown_content, model_name) - - # Create the markdown link entry - page_title = loc.rstrip("/").split("/")[-1].replace("-", " ").title() - content.append(f"- [{page_title}]({loc}): {summary}") - + logger.info("File not found: %s", file_path) + continue # Join all lines with newlines return "\n".join(content) -# %% - - def concatenate_markdown_files(markdown_files: list, output_file: str): """Concatenates multiple markdown files into a single file. diff --git a/tests/test_utils.py b/tests/test_utils.py index 0b0079d..7b31c6a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,18 +1,21 @@ """Unit tests for the llms_txt_action.utils module.""" -# ruff: noqa: S101 +# ruff: noqa: S101, S314, E501 +import xml.etree.ElementTree as ET from unittest.mock import Mock, patch import pytest from docling.datamodel.base_models import ConversionStatus from llms_txt_action.utils import ( + _convert_url_to_file_path, + _extract_heading, + _extract_site_url, concatenate_markdown_files, - convert_html_to_markdown, - extract_heading, generate_docs_structure, + generate_summary, + html_folder_to_markdown, html_to_markdown, - summarize_page, ) @@ -91,7 +94,7 @@ def test_convert_html_to_markdown_success(tmp_path, sample_html_file): input_dir.mkdir() sample_html_file.rename(input_dir / "test.html") - result = convert_html_to_markdown(str(input_dir)) + result = html_folder_to_markdown(str(input_dir)) assert len(result) == 1 assert result[0].suffix == ".md" @@ -102,7 +105,7 @@ def test_convert_html_to_markdown_invalid_path(): ValueError, match="The input path nonexistent_path is not a directory.", ): - convert_html_to_markdown("nonexistent_path") + html_folder_to_markdown("nonexistent_path") def test_convert_html_to_markdown_handles_conversion_failure( @@ -117,7 +120,7 @@ def test_convert_html_to_markdown_handles_conversion_failure( input_dir.mkdir() sample_html_file.rename(input_dir / "test.html") - result = convert_html_to_markdown(str(input_dir)) + result = html_folder_to_markdown(str(input_dir)) assert len(result) == 0 @@ -132,7 +135,7 @@ def test_summarize_page_with_model(): mock_response.choices = [Mock(message=Mock(content="Test summary"))] mock_completion.return_value = mock_response - result = summarize_page("Test content", "gpt-3.5-turbo") + result = generate_summary("Test content", "gpt-3.5-turbo") assert result == "Test summary" @@ -140,10 +143,10 @@ def test_summarize_page_without_model(): """Test summarize page without model API key.""" with ( patch.dict("os.environ", clear=True), - patch("llms_txt_action.utils.extract_heading") as mock_extract, + patch("llms_txt_action.utils._extract_heading") as mock_extract, ): mock_extract.return_value = "Test Heading" - result = summarize_page("# Test Heading\nContent", "gpt-3.5-turbo") + result = generate_summary("# Test Heading\nContent", "gpt-3.5-turbo") assert result == "Test Heading" @@ -151,19 +154,19 @@ def test_summarize_page_without_model(): def test_extract_heading_with_h1(): """Test extract heading with h1.""" content = "# Main Heading\nContent" - assert extract_heading(content) == "Main Heading" + assert _extract_heading(content) == "Main Heading" def test_extract_heading_with_h2(): """Test extract heading with h2.""" content = "## Secondary Heading\nContent" - assert extract_heading(content) == "Secondary Heading" + assert _extract_heading(content) == "Secondary Heading" def test_extract_heading_no_heading(): """Test extract heading with no heading.""" content = "Just content" - assert extract_heading(content) == "" + assert _extract_heading(content) == "" # Tests for generate_docs_structure @@ -233,3 +236,226 @@ def test_concatenate_markdown_files_empty_list(tmp_path): concatenate_markdown_files([], output_file) assert output_file.exists() assert output_file.read_text() == "" + + +def test_extract_site_url_common_prefix(): + """Test extracting site URL with common prefix.""" + xml_content = """ + + https://example.com/page1 + https://example.com/page2 + https://example.com/subdir/page3 + + """ + root = ET.fromstring(xml_content) + assert _extract_site_url(root) == "https://example.com/" + + +def test_extract_site_url_common_prefix_with_subdir(): + """Test extracting site URL with common prefix.""" + xml_content = """ + + https://example.com/page1/ + https://example.com/page1/subdir/ + https://example.com/page1/subdir2/ + + """ + root = ET.fromstring(xml_content) + assert _extract_site_url(root) == "https://example.com/page1/" + + +def test_extract_site_url_empty_sitemap(): + """Test extracting site URL from empty sitemap.""" + xml_content = """ + + + """ + root = ET.fromstring(xml_content) + with pytest.raises(ValueError, match="No URLs found in sitemap"): + _extract_site_url(root) + + +def test_extract_site_url_single_url(): + """Test extracting site URL with single URL.""" + xml_content = """ + + https://example.com/ + + """ + root = ET.fromstring(xml_content) + assert _extract_site_url(root) == "https://example.com/" + + +def test_extract_site_url_different_domains(): + """Test extracting site URL with different domains.""" + xml_content = """ + + https://example.com/page1 + https://different.com/page2 + + """ + root = ET.fromstring(xml_content) + assert _extract_site_url(root) == "https://" + + +@pytest.fixture +def setup_mock_files(tmp_path): + """Create mock files for testing.""" + # Create regular markdown files + (tmp_path / "index.md").touch() + (tmp_path / "docs.md").touch() + + # Create nested structure + nested_dir = tmp_path / "nested" + nested_dir.mkdir() + (nested_dir / "index.md").touch() + + # Create structure with latest directory + latest_dir = tmp_path / "latest" + latest_dir.mkdir() + (latest_dir / "guide.md").touch() + (tmp_path / "guide.md").touch() # Same file without latest/ + + # Create structure with locale + en_dir = tmp_path / "en" + en_dir.mkdir() + (en_dir / "about.md").touch() + (tmp_path / "about.md").touch() # Same file without locale + + (tmp_path / "non_locale.md").touch() + (nested_dir / "non_locale.md").touch() + + return tmp_path + + +def test_basic_conversion(setup_mock_files, monkeypatch): + """Test basic URL to file path conversion.""" + monkeypatch.chdir(setup_mock_files) + + site_url = "https://example.com/" + + # Test index page + assert ( + _convert_url_to_file_path("https://example.com/", site_url, setup_mock_files) + == "index.md" + ) + assert ( + _convert_url_to_file_path( + "https://example.com/index.html", + site_url, + setup_mock_files, + ) + == "index.md" + ) + + # Test HTML extension + assert ( + _convert_url_to_file_path( + "https://example.com/docs.html", + site_url, + setup_mock_files, + ) + == "docs.md" + ) + + +def test_nested_paths(setup_mock_files, monkeypatch): + """Test nested directory paths.""" + monkeypatch.chdir(setup_mock_files) + + site_url = "https://example.com/" + + # Test nested directory + assert ( + _convert_url_to_file_path( + "https://example.com/nested/", + site_url, + setup_mock_files, + ) + == "nested/index.md" + ) + + +def test_latest_directory(setup_mock_files, monkeypatch): + """Test paths with 'latest' directory.""" + monkeypatch.chdir(setup_mock_files) + + site_url = "https://example.com/" + + # Test with and without latest/ + assert ( + _convert_url_to_file_path( + "https://example.com/latest/guide.html", + site_url, + setup_mock_files, + ) + == "latest/guide.md" + ) + assert ( + _convert_url_to_file_path( + "https://example.com/latest/index.html", + site_url, + setup_mock_files, + ) + == "index.md" + ) + + +def test_locale_directory(setup_mock_files, monkeypatch): + """Test paths with locale directory.""" + monkeypatch.chdir(setup_mock_files) + + site_url = "https://example.com/" + + # Test with and without locale + assert ( + _convert_url_to_file_path( + "https://example.com/en/about.html", + site_url, + setup_mock_files, + ) + == "en/about.md" + ) + assert ( + _convert_url_to_file_path( + "https://example.com/en/non_locale.html", + site_url, + setup_mock_files, + ) + == "non_locale.md" + ) + assert ( + _convert_url_to_file_path( + "https://example.com/en/nested/non_locale.html", + site_url, + setup_mock_files, + ) + == "nested/non_locale.md" + ) + + +def test_invalid_urls(setup_mock_files, monkeypatch): + """Test invalid URLs and non-existent files.""" + monkeypatch.chdir(setup_mock_files) + + site_url = "https://example.com/" + + # Test URL that doesn't match site_url + assert ( + _convert_url_to_file_path( + "https://different.com/page.html", + site_url, + setup_mock_files, + ) + == "" + ) + + # Test non-existent file + assert ( + _convert_url_to_file_path( + "https://example.com/nonexistent.html", + site_url, + setup_mock_files, + ) + == "" + ) From 7efba405fd2feb61757c777018d3673a4c0cde11 Mon Sep 17 00:00:00 2001 From: Selvam Palanimalai Date: Mon, 6 Jan 2025 21:56:42 -0500 Subject: [PATCH 2/2] fix: removing useless test --- tests/test_utils.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 7b31c6a..3f167c7 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -286,18 +286,6 @@ def test_extract_site_url_single_url(): assert _extract_site_url(root) == "https://example.com/" -def test_extract_site_url_different_domains(): - """Test extracting site URL with different domains.""" - xml_content = """ - - https://example.com/page1 - https://different.com/page2 - - """ - root = ET.fromstring(xml_content) - assert _extract_site_url(root) == "https://" - - @pytest.fixture def setup_mock_files(tmp_path): """Create mock files for testing."""