diff --git a/llms_txt_action/entrypoint.py b/llms_txt_action/entrypoint.py
index 6ac2a8d..3b1a0c2 100644
--- a/llms_txt_action/entrypoint.py
+++ b/llms_txt_action/entrypoint.py
@@ -9,8 +9,8 @@
 from .utils import (
     concatenate_markdown_files,
-    convert_html_to_markdown,
     generate_docs_structure,
+    html_folder_to_markdown,
 )
 
 logging.basicConfig(level=logging.INFO)
@@ -54,7 +54,7 @@ def generate_documentation(  # noqa: PLR0913
     logger.info("Starting Generation at folder - %s", docs_dir)
 
     logger.info("Generating MD files for all HTML files at folder - %s", docs_dir)
-    markdown_files = convert_html_to_markdown(docs_dir)
+    markdown_files = html_folder_to_markdown(docs_dir)
 
     # Set defaults if None
     skip_md_files = False if skip_md_files is None else skip_md_files
@@ -77,10 +77,8 @@ def generate_documentation(  # noqa: PLR0913
         )
     except FileNotFoundError:
         logger.exception(
-            "Could not find sitemap file at %s",
-            f"{docs_dir}/{sitemap_path}",
+            "Failed to generate llms.txt file",
         )
-        raise
 
     if not skip_llms_full_txt:
         concatenate_markdown_files(
diff --git a/llms_txt_action/utils.py b/llms_txt_action/utils.py
index 4c5228c..d57da7a 100644
--- a/llms_txt_action/utils.py
+++ b/llms_txt_action/utils.py
@@ -5,7 +5,6 @@
 import os
 import re
 from pathlib import Path
-from urllib.parse import urlparse
 
 from defusedxml import ElementTree as ET  # noqa: N817
 from docling.datamodel.base_models import ConversionStatus
@@ -16,7 +15,6 @@
 logger = logging.getLogger(__name__)
 
 
-# %%
 def html_to_markdown(input_file: Path) -> str:
     """Converts HTML content to Markdown.
@@ -42,7 +40,7 @@ def html_to_markdown(input_file: Path) -> str:
         raise RuntimeError(msg)
 
 
-def convert_html_to_markdown(input_path: str) -> list:
+def html_folder_to_markdown(input_path: str) -> list:
     """Recursively converts all HTML files in the given directory.
 
     to Markdown files and collects the paths of the generated Markdown files.
@@ -106,10 +104,7 @@ def convert_html_to_markdown(input_path: str) -> list:
     return markdown_files
 
 
-# %%
-
-
-def summarize_page(content: str, model_name: str) -> str:
+def generate_summary(content: str, model_name: str) -> str:
     """Summarize the page content using the model.
 
     This would analyze the page content and generate a summary.
@@ -140,10 +135,10 @@ def summarize_page(content: str, model_name: str) -> str:
         return response.choices[0].message.content
     # Extract largest heading from markdown content if present
     logger.info("No model API key found, using heading as summary")
-    return extract_heading(content)
+    return _extract_heading(content)
 
 
-def extract_heading(content: str) -> str:
+def _extract_heading(content: str) -> str:
     """Extract the largest heading upto h3 from the given content."""
     heading_match = re.search(r"^#{1,3}\s+(.+)$", content, re.MULTILINE)
     logger.info("Heading match: %s", heading_match)
@@ -152,6 +147,79 @@ def extract_heading(content: str) -> str:
     return ""
 
 
+def _extract_site_url(root: ET) -> str:
+    """Extract the site URL from the sitemap.xml file by finding common prefix."""
+    ns = {"ns": root.tag.split("}")[0].strip("{")}
+    urls = [url.find("ns:loc", ns).text for url in root.findall(".//ns:url", ns)]
+    if not urls:
+        msg = "No URLs found in sitemap"
+        raise ValueError(msg)
+
+    # Find common prefix among all URLs
+    shortest = min(urls, key=len)
+    for i, char in enumerate(shortest):
+        if any(url[i] != char for url in urls):
+            return shortest[:i]
+
+    return shortest
+
+
+def _convert_url_to_file_path(
+    url: str,
+    site_url: str,
+    docs_dir: str,
+    locale_length: int = 2,
+) -> str:
+    """Convert the URL to a file path.
+
+    Strips the site_url prefix and checks various path patterns to find an
+    existing file. Returns an empty string if no file exists.
+
+    Args:
+    ----
+        url (str): Full URL to convert
+        site_url (str): Base site URL to strip
+        docs_dir (str): Path to the directory containing the documentation
+        locale_length (int): Length of the locale directory
+
+    Returns:
+    -------
+        Relative file path if found, empty string otherwise
+
+    """
+    # Strip site URL prefix
+    if not url.startswith(site_url):
+        return ""
+    if url.endswith("/"):
+        url = url + "index.html"
+    path = url[len(site_url.rstrip("/")) :].strip("/")
+
+    # Handle different URL patterns
+    if path in {"", "index.html"}:
+        file_path = "index.md"
+    elif path.endswith(".html"):
+        file_path = f"{path[:-5]}.md"
+    else:
+        file_path = f"{path}/index.md"
+
+    # Try original path
+    if Path(f"{docs_dir}/{file_path}").exists():
+        return file_path
+
+    # Try without "latest/" segment
+    if "latest/" in file_path:
+        no_latest = file_path.replace("latest/", "")
+        if Path(f"{docs_dir}/{no_latest}").exists():
+            return no_latest
+
+    # Try without 2-letter locale directory
+    parts = file_path.split("/")
+    if len(parts) > 1 and len(parts[0]) == locale_length:
+        no_locale = "/".join(parts[1:])
+        if Path(f"{docs_dir}/{no_locale}").exists():
+            return no_locale
+
+    return ""
+
+
 def generate_docs_structure(
     docs_dir: str,
     sitemap_path: str,
@@ -159,6 +227,11 @@ def generate_docs_structure(
 ) -> str:
     """Generate a documentation structure from a sitemap.xml file.
 
+    First, extract the site URL from the sitemap. Then, for each URL,
+    convert it to a file path, read and summarize the file, and create
+    a markdown link entry for it. URLs whose files cannot be found are
+    skipped.
+
     Args:
     ----
         docs_dir (str): Path to the directory containing the documentation
@@ -174,6 +247,7 @@ def generate_docs_structure(
     if not Path(f"{docs_dir}/{sitemap_path}").exists():
        msg = f"The sitemap file {docs_dir}/{sitemap_path} does not exist."
        raise FileNotFoundError(msg)
+
     tree = ET.parse(f"{docs_dir}/{sitemap_path}")
     root = tree.getroot()
@@ -183,59 +257,28 @@ def generate_docs_structure(
 
     # Start building the markdown content
     content = ["# Docs\n"]
+    site_url = _extract_site_url(root)
 
     # Process each URL in the sitemap
     for url in root.findall(".//ns:url", ns):
         loc = url.find("ns:loc", ns).text
-        """
-        This doesnt call all cases. let me give more examples that needs to be handled.
-
-        https://test.com/ -> index.md
-        https://test.com/index.html -> index.md
-        https://test.com/configuration/ -> configuration/index.md
-        https://test.com/configuration/azure/ -> configuration/azure/index.md
-        https://test.comen/configuration/auzre.html -> configuration/azure.md
-        """
-        # Convert URL to file path
-        parsed_url = urlparse(loc)
-        path = parsed_url.path.strip("/")
-
-        # Handle different URL patterns
-        if path in {"", "index.html"}:
-            file_path = "index.md"
-        elif path.endswith(".html"):
-            # Remove .html and convert to .md
-            file_path = f"{path[:-5]}.md"
-        else:
-            # For paths ending in / or no extension, append index.md
-            file_path = f"{path}/index.md"
-
         # Generate a summary for the page
         try:
+            logger.info("Processing %s", loc)
+            file_path = _convert_url_to_file_path(loc, site_url, docs_dir)
+            logger.info("found file path: %s for %s", file_path, loc)
             with Path(f"{docs_dir}/{file_path}").open() as f:
                 markdown_content = f.read()
+
+            summary = generate_summary(markdown_content, model_name)
+            page_title = loc.rstrip("/").split("/")[-1].replace("-", " ").title()
+            content.append(f"- [{page_title}]({loc}): {summary}")
         except FileNotFoundError:
-            # Try without locale path by removing first directory if it's 2 characters
-            file_path_parts = file_path.split("/")
-
-            file_path_no_locale = (
-                "/".join(file_path_parts[1:])
-                if len(file_path_parts) > 1 and len(file_path_parts[0]) == 2  # noqa: PLR2004
-                else file_path
-            )
-            with Path(f"{docs_dir}/{file_path_no_locale}").open() as f:
-                markdown_content = f.read()
-        summary = summarize_page(markdown_content, model_name)
-
-        # Create the markdown link entry
-        page_title = loc.rstrip("/").split("/")[-1].replace("-", " ").title()
-        content.append(f"- [{page_title}]({loc}): {summary}")
-
+            logger.info("File not found: %s", file_path)
+            continue
+
     # Join all lines with newlines
     return "\n".join(content)
 
 
-# %%
-
-
 def concatenate_markdown_files(markdown_files: list, output_file: str):
     """Concatenates multiple markdown files into a single file.
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 0b0079d..3f167c7 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,18 +1,21 @@
 """Unit tests for the llms_txt_action.utils module."""
-# ruff: noqa: S101
+# ruff: noqa: S101, S314, E501
 
+import xml.etree.ElementTree as ET
 from unittest.mock import Mock, patch
 
 import pytest
 from docling.datamodel.base_models import ConversionStatus
 
 from llms_txt_action.utils import (
+    _convert_url_to_file_path,
+    _extract_heading,
+    _extract_site_url,
     concatenate_markdown_files,
-    convert_html_to_markdown,
-    extract_heading,
     generate_docs_structure,
+    generate_summary,
+    html_folder_to_markdown,
     html_to_markdown,
-    summarize_page,
 )
@@ -91,7 +94,7 @@ def test_convert_html_to_markdown_success(tmp_path, sample_html_file):
     input_dir.mkdir()
     sample_html_file.rename(input_dir / "test.html")
 
-    result = convert_html_to_markdown(str(input_dir))
+    result = html_folder_to_markdown(str(input_dir))
 
     assert len(result) == 1
     assert result[0].suffix == ".md"
@@ -102,7 +105,7 @@ def test_convert_html_to_markdown_invalid_path():
         ValueError,
         match="The input path nonexistent_path is not a directory.",
     ):
-        convert_html_to_markdown("nonexistent_path")
+        html_folder_to_markdown("nonexistent_path")
 
 
 def test_convert_html_to_markdown_handles_conversion_failure(
@@ -117,7 +120,7 @@ def test_convert_html_to_markdown_handles_conversion_failure(
     input_dir.mkdir()
     sample_html_file.rename(input_dir / "test.html")
 
-    result = convert_html_to_markdown(str(input_dir))
+    result = html_folder_to_markdown(str(input_dir))
 
     assert len(result) == 0
 
@@ -132,7 +135,7 @@ def test_summarize_page_with_model():
         mock_response.choices = [Mock(message=Mock(content="Test summary"))]
         mock_completion.return_value = mock_response
 
-        result = summarize_page("Test content", "gpt-3.5-turbo")
+        result = generate_summary("Test content", "gpt-3.5-turbo")
         assert result == "Test summary"
 
 
@@ -140,10 +143,10 @@ def test_summarize_page_without_model():
     """Test summarize page without model API key."""
     with (
         patch.dict("os.environ", clear=True),
-        patch("llms_txt_action.utils.extract_heading") as mock_extract,
+        patch("llms_txt_action.utils._extract_heading") as mock_extract,
     ):
         mock_extract.return_value = "Test Heading"
-        result = summarize_page("# Test Heading\nContent", "gpt-3.5-turbo")
+        result = generate_summary("# Test Heading\nContent", "gpt-3.5-turbo")
         assert result == "Test Heading"
 
 
@@ -151,19 +154,19 @@ def test_summarize_page_without_model():
 def test_extract_heading_with_h1():
     """Test extract heading with h1."""
     content = "# Main Heading\nContent"
-    assert extract_heading(content) == "Main Heading"
+    assert _extract_heading(content) == "Main Heading"
 
 
 def test_extract_heading_with_h2():
     """Test extract heading with h2."""
     content = "## Secondary Heading\nContent"
-    assert extract_heading(content) == "Secondary Heading"
+    assert _extract_heading(content) == "Secondary Heading"
 
 
 def test_extract_heading_no_heading():
     """Test extract heading with no heading."""
     content = "Just content"
-    assert extract_heading(content) == ""
+    assert _extract_heading(content) == ""
 
 
 # Tests for generate_docs_structure
@@ -233,3 +236,214 @@ def test_concatenate_markdown_files_empty_list(tmp_path):
     concatenate_markdown_files([], output_file)
     assert output_file.exists()
     assert output_file.read_text() == ""
+
+
+def test_extract_site_url_common_prefix():
+    """Test extracting site URL with common prefix."""
+    xml_content = """
+    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+        <url><loc>https://example.com/page1</loc></url>
+        <url><loc>https://example.com/page2</loc></url>
+        <url><loc>https://example.com/subdir/page3</loc></url>
+    </urlset>
+    """
+    root = ET.fromstring(xml_content)
+    assert _extract_site_url(root) == "https://example.com/"
+
+
+def test_extract_site_url_common_prefix_with_subdir():
+    """Test extracting site URL with common prefix."""
+    xml_content = """
+    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+        <url><loc>https://example.com/page1/</loc></url>
+        <url><loc>https://example.com/page1/subdir/</loc></url>
+        <url><loc>https://example.com/page1/subdir2/</loc></url>
+    </urlset>
+    """
+    root = ET.fromstring(xml_content)
+    assert _extract_site_url(root) == "https://example.com/page1/"
+
+
+def test_extract_site_url_empty_sitemap():
+    """Test extracting site URL from empty sitemap."""
+    xml_content = """
+    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+    </urlset>
+    """
+    root = ET.fromstring(xml_content)
+    with pytest.raises(ValueError, match="No URLs found in sitemap"):
+        _extract_site_url(root)
+
+
+def test_extract_site_url_single_url():
+    """Test extracting site URL with single URL."""
+    xml_content = """
+    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+        <url><loc>https://example.com/</loc></url>
+    </urlset>
+    """
+    root = ET.fromstring(xml_content)
+    assert _extract_site_url(root) == "https://example.com/"
+
+
+@pytest.fixture
+def setup_mock_files(tmp_path):
+    """Create mock files for testing."""
+    # Create regular markdown files
+    (tmp_path / "index.md").touch()
+    (tmp_path / "docs.md").touch()
+
+    # Create nested structure
+    nested_dir = tmp_path / "nested"
+    nested_dir.mkdir()
+    (nested_dir / "index.md").touch()
+
+    # Create structure with latest directory
+    latest_dir = tmp_path / "latest"
+    latest_dir.mkdir()
+    (latest_dir / "guide.md").touch()
+    (tmp_path / "guide.md").touch()  # Same file without latest/
+
+    # Create structure with locale
+    en_dir = tmp_path / "en"
+    en_dir.mkdir()
+    (en_dir / "about.md").touch()
+    (tmp_path / "about.md").touch()  # Same file without locale
+
+    (tmp_path / "non_locale.md").touch()
+    (nested_dir / "non_locale.md").touch()
+
+    return tmp_path
+
+
+def test_basic_conversion(setup_mock_files, monkeypatch):
+    """Test basic URL to file path conversion."""
+    monkeypatch.chdir(setup_mock_files)
+
+    site_url = "https://example.com/"
+
+    # Test index page
+    assert (
+        _convert_url_to_file_path("https://example.com/", site_url, setup_mock_files)
+        == "index.md"
+    )
+    assert (
+        _convert_url_to_file_path(
+            "https://example.com/index.html",
+            site_url,
+            setup_mock_files,
+        )
+        == "index.md"
+    )
+
+    # Test HTML extension
+    assert (
+        _convert_url_to_file_path(
+            "https://example.com/docs.html",
+            site_url,
+            setup_mock_files,
+        )
+        == "docs.md"
+    )
+
+
+def test_nested_paths(setup_mock_files, monkeypatch):
+    """Test nested directory paths."""
+    monkeypatch.chdir(setup_mock_files)
+
+    site_url = "https://example.com/"
+
+    # Test nested directory
+    assert (
+        _convert_url_to_file_path(
+            "https://example.com/nested/",
+            site_url,
+            setup_mock_files,
+        )
+        == "nested/index.md"
+    )
+
+
+def test_latest_directory(setup_mock_files, monkeypatch):
+    """Test paths with 'latest' directory."""
+    monkeypatch.chdir(setup_mock_files)
+
+    site_url = "https://example.com/"
+
+    # Test with and without latest/
+    assert (
+        _convert_url_to_file_path(
+            "https://example.com/latest/guide.html",
+            site_url,
+            setup_mock_files,
+        )
+        == "latest/guide.md"
+    )
+    assert (
+        _convert_url_to_file_path(
+            "https://example.com/latest/index.html",
+            site_url,
+            setup_mock_files,
+        )
+        == "index.md"
+    )
+
+
+def test_locale_directory(setup_mock_files, monkeypatch):
+    """Test paths with locale directory."""
+    monkeypatch.chdir(setup_mock_files)
+
+    site_url = "https://example.com/"
+
+    # Test with and without locale
+    assert (
+        _convert_url_to_file_path(
+            "https://example.com/en/about.html",
+            site_url,
+            setup_mock_files,
+        )
+ == "en/about.md" + ) + assert ( + _convert_url_to_file_path( + "https://example.com/en/non_locale.html", + site_url, + setup_mock_files, + ) + == "non_locale.md" + ) + assert ( + _convert_url_to_file_path( + "https://example.com/en/nested/non_locale.html", + site_url, + setup_mock_files, + ) + == "nested/non_locale.md" + ) + + +def test_invalid_urls(setup_mock_files, monkeypatch): + """Test invalid URLs and non-existent files.""" + monkeypatch.chdir(setup_mock_files) + + site_url = "https://example.com/" + + # Test URL that doesn't match site_url + assert ( + _convert_url_to_file_path( + "https://different.com/page.html", + site_url, + setup_mock_files, + ) + == "" + ) + + # Test non-existent file + assert ( + _convert_url_to_file_path( + "https://example.com/nonexistent.html", + site_url, + setup_mock_files, + ) + == "" + )