bunkerweb/docs/hooks/llmstxt.py

169 lines
6.4 KiB
Python

"""MkDocs hook to generate llms.txt and llms-full.txt for AI agent consumption."""
from logging import getLogger
from pathlib import Path
from re import DOTALL, MULTILINE, compile as re_compile, split
# Module-level logger used by the hook for warnings and progress messages.
log = getLogger("mkdocs.hooks.llmstxt")

# Title written as the H1 of both generated files.
SITE_NAME = "BunkerWeb documentation"

# Project summary written as a blockquote right below the title.
DESCRIPTION = (
    "BunkerWeb is a next-generation, open-source Web Application Firewall (WAF). "
    "Based on NGINX under the hood, it protects web services to make them secure by default. "
    "It integrates seamlessly into existing environments (Linux, Docker, Swarm, Kubernetes) "
    "as a reverse proxy and is fully configurable via environment variables or an awesome web UI. "
    "Source: https://github.com/bunkerity/bunkerweb"
)

# Section heading -> {source markdown filename: one-line description}.
# Iteration order is the order in which sections and pages are emitted.
SECTIONS = {
    "Getting Started": {
        "index.md": "Introduction and overview of BunkerWeb",
        "concepts.md": "Core concepts — multisite, settings contexts, security modes",
        "quickstart-guide.md": "Quick start guide for first-time setup",
    },
    "Integration Guides": {
        "integrations.md": "Docker, Kubernetes, Swarm, Linux, and Ansible setup",
    },
    "Configuration Reference": {
        "features.md": "Complete settings reference — all plugins, all options",
    },
    "Advanced Usage": {
        "advanced.md": "Custom configs, headers, ModSecurity, PHP, streaming, and more",
    },
    "Web UI & API": {
        "web-ui.md": "Web UI usage guide",
        "api.md": "REST API documentation",
    },
    "Plugin System": {
        "plugins.md": "Writing and using external plugins",
    },
    "Operations": {
        "upgrading.md": "Version migration and upgrade guides",
        "troubleshooting.md": "Common issues and solutions",
    },
}
# Patterns to strip from markdown content for LLM consumption.
# These operate OUTSIDE fenced code blocks only (see _clean_markdown).
_STRIP_PATTERNS = [
re_compile(r"<figure[^>]*>.*?</figure>", DOTALL),
re_compile(r"<iframe[^>]*>.*?</iframe>", DOTALL),
re_compile(r"<img[^>]*>"),
]
# Image markdown: preserve alt text as [Image: description]
_IMAGE_RE = re_compile(r"!\[([^\]]*)\]\([^)]*\)(\{[^}]*\})?")
_COLLAPSE_BLANK_LINES = re_compile(r"\n{3,}")
# Convert relative .md links to absolute URLs
_RELATIVE_LINK_RE = re_compile(r"\]\((?!http)([a-zA-Z][^)]*?)\.md(#[^)]*?)?\)")
def _clean_markdown(content, base_url):
"""Remove images, iframes, and HTML blocks from markdown content.
Processes only text outside fenced code blocks to avoid corrupting code examples.
"""
# Split on fenced code block boundaries (``` or ~~~)
parts = split(r"(^```.*?^```|^~~~.*?^~~~)", content, flags=MULTILINE | DOTALL)
cleaned_parts = []
for i, part in enumerate(parts):
if i % 2 == 1:
# Inside a fenced code block — keep as-is
cleaned_parts.append(part)
else:
# Outside code blocks — apply stripping
for pattern in _STRIP_PATTERNS:
part = pattern.sub("", part)
# Preserve image alt text
part = _IMAGE_RE.sub(lambda m: f"[Image: {m.group(1)}]" if m.group(1) else "", part)
# Convert relative .md links to absolute
if base_url:
part = _RELATIVE_LINK_RE.sub(
lambda m: f"]({base_url}/{m.group(1).replace('.md', '')}/{m.group(2) or ''})",
part,
)
cleaned_parts.append(part)
return _COLLAPSE_BLANK_LINES.sub("\n\n", "".join(cleaned_parts)).strip()
def _get_page_title(content):
"""Extract the first H1 title from markdown content."""
for line in content.split("\n"):
if line.startswith("# "):
return line[2:].strip()
return None
def on_post_build(config, **kwargs):
    """Generate llms.txt and llms-full.txt after the build completes.

    For every page listed in ``SECTIONS`` that exists in the English docs
    directory, this:

    * adds a ``- [title](url): description`` entry to ``llms.txt``,
    * appends the cleaned page content to ``llms-full.txt``,
    * writes a cleaned ``index.md`` companion next to the page's HTML output.

    Missing source files are logged and skipped. A section in which no page
    could be read is omitted from both outputs instead of leaving an empty
    heading behind.

    Args:
        config: Build configuration; ``site_dir``, ``site_url``,
            ``config_file_path`` and ``docs_dir`` are read from it.
        **kwargs: Extra keyword arguments passed by the caller; unused.
    """
    site_dir = Path(config["site_dir"])
    base_url = (config.get("site_url") or "").rstrip("/")

    # Always read from the project-root docs/ directory (English source),
    # not config["docs_dir"] which the i18n plugin changes per locale.
    config_file = config.get("config_file_path")
    if config_file:
        docs_dir = Path(config_file).parent / "docs"
    else:
        docs_dir = Path(config["docs_dir"])

    # Both files start with the same title + summary header.
    lines = [f"# {SITE_NAME}\n", f"> {DESCRIPTION}\n"]
    full_parts = [f"# {SITE_NAME}\n", f"> {DESCRIPTION}\n"]

    for section_name, pages in SECTIONS.items():
        index_entries = []
        full_section_parts = []

        for filename, description in pages.items():
            src_path = docs_dir / filename
            # is_file() also rejects a directory that happens to carry the name.
            if not src_path.is_file():
                log.warning("llmstxt: Source file '%s' not found. Skipping.", filename)
                continue

            content = src_path.read_text(encoding="utf-8")

            # Strip only the trailing extension; ".md" elsewhere in the name stays.
            page_slug = filename[:-3] if filename.endswith(".md") else filename
            title = _get_page_title(content) or page_slug.replace("-", " ").title()

            # Page URL: filename without .md extension becomes directory/,
            # except the index page which lives at the site root.
            companion_rel = "index.md" if page_slug == "index" else f"{page_slug}/index.md"
            index_entries.append(f"- [{title}]({base_url}/{companion_rel}): {description}")

            # Clean content for full output
            cleaned = _clean_markdown(content, base_url)
            full_section_parts.append(cleaned)

            # Write per-page .md companion file next to the HTML output
            companion_path = site_dir / companion_rel
            companion_path.parent.mkdir(parents=True, exist_ok=True)
            companion_path.write_text(cleaned, encoding="utf-8")

        if not index_entries:
            # Nothing readable in this section: skip its heading entirely.
            continue

        lines.append(f"## {section_name}\n")
        lines.extend(index_entries)
        lines.append("")

        full_parts.append(f"# {section_name}\n")
        full_parts.append("\n\n".join(full_section_parts))
        full_parts.append("")

    # Write llms.txt
    llms_txt = site_dir / "llms.txt"
    llms_txt.write_text("\n".join(lines), encoding="utf-8")
    log.info("llmstxt: Generated %s", llms_txt)

    # Write llms-full.txt
    llms_full = site_dir / "llms-full.txt"
    llms_full.write_text("\n".join(full_parts), encoding="utf-8")
    log.info("llmstxt: Generated %s (%dKB)", llms_full, llms_full.stat().st_size // 1024)