mirror of
https://github.com/NVIDIA-NeMo/DataDesigner
synced 2026-05-24 09:48:29 +00:00
432 lines
16 KiB
Python
432 lines
16 KiB
Python
#!/usr/bin/env python3
|
|
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
|
|
"""Sync Fern authoring content into the CI-managed publish branch."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
import shutil
|
|
import sys
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
# Matches the navigation line that opens the "Dev Notes" section.
DEVNOTES_SECTION_RE = re.compile(r"^ - section:\s+Dev Notes\s*$")
# Matches the navigation line that opens the retired reference section.
# NOTE(review): the section title and the directory name below are assembled from
# string fragments, presumably so the literal names never appear verbatim in this
# file -- confirm the reason before simplifying.
RETIRED_REFERENCE_SECTION_RE = re.compile(r"^ - section:\s+" + re.escape("Code " + "Reference") + r"\s*$")
# Directory name removed from every `<version>/pages/` folder of the published archive.
RETIRED_REFERENCE_DIR = "code" + "_reference"
# Matches a nav `path: ./...` line. Groups: (1) indentation plus the `path: ` prefix,
# (2) the path after the leading `./`, (3) the remainder of the line.
NAV_PATH_RE = re.compile(r"^(\s*path:\s+)\./([^#\s]+)(.*)$")
# Matches a redirect `destination:` line that points below /nemo/datadesigner/ at a
# version slug (`v<digit>...`) or at `older-versions`; group 1 is that slug.
REDIRECT_VERSION_RE = re.compile(
    r'^\s*destination:\s+["\']/nemo/datadesigner/((?:v[0-9][^/"\']*)|older-versions)(?:/|["\'])'
)
# Matches a `slug:` line; group 1 is the slug value without surrounding quotes.
VERSION_SLUG_RE = re.compile(r"^\s*slug:\s+['\"]?([^'\"\s]+)")
# File and directory names that ignore_source() excludes when trees are copied.
SKIP_NAMES = {
    ".git",
    ".mypy_cache",
    ".notebook-cache",
    ".pytest_cache",
    ".ruff_cache",
    ".venv",
    "__pycache__",
    "dist",
    "site",
}
# Location of the publish metadata JSON, relative to the published root.
PUBLISH_METADATA_PATH = Path("fern/publish-metadata.json")
# Paths (relative to the repository root) that patch_devnotes() copies from the
# source checkout into the published checkout before patching the Dev Notes nav.
FERN_DEVNOTE_SUPPORT_PATHS = [
    "fern/assets",
    "fern/components/Authors.tsx",
    "fern/components/BlogCard.tsx",
    "fern/components/MetricsTable.tsx",
    "fern/components/TrajectoryViewer.tsx",
    "fern/components/devnotes",
    "fern/styles/authors.css",
    "fern/styles/blog-card.css",
    "fern/styles/metrics-table.css",
    "fern/styles/trajectory-viewer.css",
]
# Page paths (relative to a version's `pages/` directory) that
# remove_retired_reference_archive() overwrites in each `v*` archive with the copy
# from the source `latest/pages` tree, when the page exists in both places.
RETIRED_REFERENCE_CLEAN_PAGE_PATHS = [
    "concepts/columns.mdx",
    "concepts/custom_columns.mdx",
    "concepts/models/model-configs.mdx",
    "concepts/person_sampling.mdx",
    "concepts/security.mdx",
    "concepts/tool_use_and_mcp.mdx",
    "concepts/validators.mdx",
    "plugins/example.mdx",
    "plugins/overview.mdx",
]
|
|
|
|
|
|
class PublishedBranchError(RuntimeError):
    """Signal that the publish branch could not be synced or validated.

    ``main`` catches this exception, prints its message to stderr and returns
    exit status 1.
    """
|
|
|
|
|
|
def find_top_level_block(lines: list[str], name: str) -> tuple[int, int]:
    """Locate a top-level YAML block by key name.

    Args:
        lines: File content split into lines (line endings may be kept).
        name: Top-level key to look for, without the trailing colon.

    Returns:
        ``(start, end)`` where ``lines[start]`` is the ``name:`` header and
        ``end`` is the index of the next top-level key, or ``len(lines)`` when
        the block runs to the end of the file.

    Raises:
        PublishedBranchError: If no ``name:`` header line exists.
    """
    header = f"{name}:"
    # Compare the right-stripped line so CRLF endings, trailing whitespace and a
    # final line without a newline are all recognized as the header.
    start = next((i for i, line in enumerate(lines) if line.rstrip() == header), -1)
    if start == -1:
        raise PublishedBranchError(f"Missing top-level '{name}:' block")

    # The block ends at the next unindented `key:` line.
    for i in range(start + 1, len(lines)):
        if re.match(r"^[A-Za-z0-9_-]+:", lines[i]):
            return start, i
    return start, len(lines)
|
|
|
|
|
|
def versions_block(path: Path) -> list[str] | None:
    """Return the lines of the top-level ``versions:`` block in *path*.

    Returns ``None`` when the file does not exist or has no such block. Line
    endings are kept on the returned lines.
    """
    if path.exists():
        content = path.read_text().splitlines(keepends=True)
        try:
            first, last = find_top_level_block(content, "versions")
        except PublishedBranchError:
            return None
        else:
            return content[first:last]
    return None
|
|
|
|
|
|
def normalize_latest_display_name(block: list[str] | None) -> list[str] | None:
    """Return a copy of *block* whose ``latest`` entry is displayed as "Latest".

    The lines are scanned in order. The most recent ``- display-name:`` line
    seen before the first ``slug:`` line whose value is ``latest`` is replaced
    with ``- display-name: "Latest"``. ``None`` is passed through unchanged and
    the input list is never mutated.
    """
    if block is None:
        return None

    result = [*block]
    last_display_name: int | None = None
    for position, text in enumerate(block):
        if text.startswith("- display-name:"):
            last_display_name = position
            continue
        if last_display_name is None:
            # No display-name line seen yet, so there is nothing to rewrite.
            continue
        slug = VERSION_SLUG_RE.match(text)
        if slug is not None and slug.group(1) == "latest":
            result[last_display_name] = '- display-name: "Latest"\n'
            break
    return result
|
|
|
|
|
|
def restore_versions_block(path: Path, block: list[str] | None) -> None:
    """Replace the top-level ``versions:`` block of *path* with *block*.

    Does nothing when *block* is ``None``. Propagates the error raised by
    ``find_top_level_block`` when *path* has no ``versions:`` block.
    """
    if block is not None:
        content = path.read_text().splitlines(keepends=True)
        first, last = find_top_level_block(content, "versions")
        # Splice the preserved block in place of whatever the file contains.
        merged = content[:first] + list(block) + content[last:]
        path.write_text("".join(merged))
|
|
|
|
|
|
def required_redirect_slugs(path: Path) -> set[str]:
    """Collect the version slugs that redirect destinations in *path* point at.

    Every line matching ``REDIRECT_VERSION_RE`` contributes its captured slug.
    """
    candidates = map(REDIRECT_VERSION_RE.match, path.read_text().splitlines())
    return {found.group(1) for found in candidates if found}
|
|
|
|
|
|
def version_slugs(path: Path) -> set[str]:
    """Return the slugs declared inside the ``versions:`` block of *path*.

    The result is empty when the file or its ``versions:`` block is missing.
    """
    block = versions_block(path)
    if not block:
        return set()
    candidates = (VERSION_SLUG_RE.match(text) for text in block)
    return {found.group(1) for found in candidates if found}
|
|
|
|
|
|
def validate_redirect_targets(published_root: Path) -> None:
    """Check that every redirect in the published docs.yml targets a known version.

    Raises:
        PublishedBranchError: If a redirect destination names a version slug
            that the ``versions:`` block of the same file does not declare.
    """
    docs_yml = published_root.joinpath("fern", "docs.yml")
    unresolved = required_redirect_slugs(docs_yml).difference(version_slugs(docs_yml))
    if not unresolved:
        return
    formatted = ", ".join(sorted(unresolved))
    raise PublishedBranchError(
        f"Published Fern docs.yml is missing version entries required by redirects: {formatted}. "
        "Initialize docs-website with the historical Fern archive before publishing."
    )
|
|
|
|
|
|
def write_publish_metadata(published_root: Path, args: argparse.Namespace, action: str) -> None:
    """Write ``fern/publish-metadata.json`` describing this publish run.

    Nothing is written when none of the ``--metadata-*`` options were given.
    When at least one was given, the source repository, ref and sha are all
    required; the release tag and published branch are recorded only if set.

    Args:
        published_root: Root of the published checkout.
        args: Parsed CLI arguments carrying the ``metadata_*`` attributes.
        action: Label stored under the ``action`` key.

    Raises:
        PublishedBranchError: If some, but not all, required source fields are set.
    """
    repository = args.metadata_source_repository
    ref = args.metadata_source_ref
    sha = args.metadata_source_sha
    release_tag = args.metadata_release_tag
    published_branch = args.metadata_published_branch

    if not (repository or ref or sha or release_tag or published_branch):
        return

    required = {
        "metadata source repository": repository,
        "metadata source ref": ref,
        "metadata source sha": sha,
    }
    missing = [label for label, value in required.items() if not value]
    if missing:
        raise PublishedBranchError(f"Incomplete publish metadata; missing {', '.join(missing)}")

    metadata: dict[str, object] = {
        "schema_version": 1,
        "kind": "fern-docs-website",
        "action": action,
        "source": {"repository": repository, "ref": ref, "sha": sha},
    }
    # Optional fields are appended in a fixed order so the JSON layout is stable.
    for key, value in (("release_tag", release_tag), ("published_branch", published_branch)):
        if value:
            metadata[key] = value

    target = published_root / PUBLISH_METADATA_PATH
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(json.dumps(metadata, indent=2) + "\n")
|
|
|
|
|
|
def ignore_source(_dir: str, names: list[str]) -> set[str]:
    """Return the entries of *names* that must not be copied.

    Has the ``ignore`` callback signature of ``shutil.copytree``; the directory
    argument is unused because the decision depends on the entry name alone.
    """
    return SKIP_NAMES.intersection(names)
|
|
|
|
|
|
def copy_path(source: Path, target: Path) -> None:
    """Replace *target* with a copy of *source*.

    A missing *source* is a no-op. Whatever currently exists at *target* is
    removed first, parent directories are created as needed, and directories
    are copied recursively while skipping the names in ``SKIP_NAMES``.

    Args:
        source: File or directory to copy.
        target: Destination path; replaced if it already exists.
    """
    if not source.exists():
        return

    if target.is_dir() and not target.is_symlink():
        shutil.rmtree(target)
    elif target.is_symlink() or target.exists():
        # Symlinks (including dangling ones and links to directories) are removed
        # as links: shutil.rmtree refuses to operate on a symlink, and copying
        # onto a link would write through to whatever it points at.
        target.unlink()

    target.parent.mkdir(parents=True, exist_ok=True)
    if source.is_dir():
        shutil.copytree(source, target, ignore=ignore_source)
    else:
        shutil.copy2(source, target)
|
|
|
|
|
|
def clear_published_tree(root: Path) -> None:
    """Delete everything directly under *root* except its ``.git`` entry.

    *root* is created (with parents) when it does not exist yet.

    Args:
        root: Directory of the published checkout to empty.
    """
    root.mkdir(parents=True, exist_ok=True)
    for entry in root.iterdir():
        if entry.name == ".git":
            continue
        if entry.is_dir() and not entry.is_symlink():
            shutil.rmtree(entry)
        else:
            # Files and symlinks. A symlink to a directory reports is_dir() as
            # true, but shutil.rmtree refuses symlinks, so the link itself is
            # removed and its target is left alone.
            entry.unlink()
|
|
|
|
|
|
def merge_preserved_versions(source_versions: Path, published_versions: Path, preserved_versions: Path) -> None:
    """Copy preserved version entries that the source tree does not provide.

    Each direct child of *preserved_versions* is copied into
    *published_versions* unless *source_versions* has an entry with the same
    name. Nothing happens when *preserved_versions* does not exist.
    """
    if not preserved_versions.exists():
        return
    published_versions.mkdir(parents=True, exist_ok=True)
    for entry in preserved_versions.iterdir():
        shadowed_by_source = (source_versions / entry.name).exists()
        if not shadowed_by_source:
            copy_path(entry, published_versions / entry.name)
|
|
|
|
|
|
def _navigation_section_bounds(lines: list[str], section_re: re.Pattern[str]) -> tuple[int, int] | None:
    """Return the ``[start, end)`` line span of the section matched by *section_re*.

    The span starts at the first line matching *section_re* and runs up to, but
    not including, the next line that begins with ``" - "``, or to the end of
    *lines* when there is no such line. Returns ``None`` when nothing matches.
    """
    start = next((i for i, line in enumerate(lines) if section_re.match(line)), None)
    if start is None:
        return None
    # A later line beginning with " - " starts the next list entry written with
    # the same prefix as the section header, so the section stops before it.
    end = next((i for i in range(start + 1, len(lines)) if lines[i].startswith(" - ")), len(lines))
    return start, end


def extract_navigation_section(path: Path, section_re: re.Pattern[str]) -> list[str]:
    """Return the lines of the navigation section matched by *section_re*.

    Args:
        path: Navigation YAML file to read (decoded as UTF-8).
        section_re: Pattern identifying the section header line.

    Returns:
        The section's lines, header included, with line endings kept.

    Raises:
        PublishedBranchError: If no line of *path* matches *section_re*.
    """
    lines = path.read_text(encoding="utf-8").splitlines(keepends=True)
    bounds = _navigation_section_bounds(lines, section_re)
    if bounds is None:
        raise PublishedBranchError(f"Section not found in {path}")
    start, end = bounds
    return lines[start:end]


def replace_navigation_section(path: Path, section_re: re.Pattern[str], block: list[str]) -> None:
    """Replace the navigation section matched by *section_re* with *block*.

    Args:
        path: Navigation YAML file to rewrite in place (UTF-8).
        section_re: Pattern identifying the section header line.
        block: Replacement lines, each carrying its own line ending.

    Raises:
        PublishedBranchError: If no line of *path* matches *section_re*.
    """
    lines = path.read_text(encoding="utf-8").splitlines(keepends=True)
    bounds = _navigation_section_bounds(lines, section_re)
    if bounds is None:
        raise PublishedBranchError(f"Section not found in {path}")
    start, end = bounds
    lines[start:end] = block
    path.write_text("".join(lines), encoding="utf-8")


def remove_navigation_section(path: Path, section_re: re.Pattern[str]) -> None:
    """Delete the navigation section matched by *section_re*, if present.

    Unlike the extract/replace variants, a missing section is not an error and
    the file is left untouched.

    Args:
        path: Navigation YAML file to rewrite in place (UTF-8).
        section_re: Pattern identifying the section header line.
    """
    lines = path.read_text(encoding="utf-8").splitlines(keepends=True)
    bounds = _navigation_section_bounds(lines, section_re)
    if bounds is None:
        return
    start, end = bounds
    del lines[start:end]
    path.write_text("".join(lines), encoding="utf-8")
|
|
|
|
|
|
def remove_retired_reference_archive(source_root: Path, published_root: Path) -> None:
    """Strip the retired reference section and its pages from archived versions.

    Three passes over ``published_root/fern/versions``:

    1. Remove the retired section from every nav file except ``latest.yml``.
    2. Delete each ``<version>/pages/<retired dir>`` directory.
    3. For each ``v*/pages`` directory, overwrite the pages listed in
       ``RETIRED_REFERENCE_CLEAN_PAGE_PATHS`` with the copy from the source
       ``latest/pages`` tree when the page exists in both places.
    """
    versions_dir = published_root / "fern" / "versions"

    archived_navs = [nav for nav in versions_dir.glob("*.yml") if nav.name != "latest.yml"]
    for nav in sorted(archived_navs):
        remove_navigation_section(nav, RETIRED_REFERENCE_SECTION_RE)

    retired_dirs = sorted(versions_dir.glob(f"*/pages/{RETIRED_REFERENCE_DIR}"))
    for retired in retired_dirs:
        if retired.is_dir():
            shutil.rmtree(retired)

    latest_pages = source_root / "fern" / "versions" / "latest" / "pages"
    for pages_dir in sorted(versions_dir.glob("v*/pages")):
        for rel in RETIRED_REFERENCE_CLEAN_PAGE_PATHS:
            replacement = latest_pages / rel
            stale = pages_dir / rel
            if replacement.exists() and stale.exists():
                copy_path(replacement, stale)
|
|
|
|
|
|
def materialize_version_nav_pages(published_root: Path) -> None:
    """Give every frozen ``v*`` nav its own copy of the pages it references.

    For each ``fern/versions/v*.yml`` file, every ``path: ./<dir>/pages/...``
    entry is copied to ``./<slug>/pages/...`` (``<slug>`` being the nav file's
    stem) and the entry is rewritten to point there. A leading
    "Reuses shared pages" header comment is updated to say the pages are
    materialized. The nav file is rewritten only when something changed.

    Raises:
        PublishedBranchError: If a nav entry references a page that does not exist.
    """
    versions_dir = published_root / "fern" / "versions"
    for nav in sorted(versions_dir.glob("v*.yml")):
        slug = nav.stem
        nav_lines = nav.read_text().splitlines(keepends=True)
        dirty = False

        shared_header = f"# Frozen {slug} release nav. Reuses shared pages"
        if nav_lines and nav_lines[0].startswith(shared_header):
            nav_lines[0] = f"# Frozen {slug} release nav. Pages are materialized under ./{slug}/pages/.\n"
            dirty = True

        for position, text in enumerate(nav_lines):
            found = NAV_PATH_RE.match(text)
            if found is None:
                continue
            prefix, referenced, suffix = found.groups()
            rel_path = Path(referenced)
            parts = rel_path.parts
            # Only `<dir>/pages/<...>` entries are version pages; skip anything else.
            if not (len(parts) >= 3 and parts[1] == "pages"):
                continue

            target_rel = Path(slug, "pages", *parts[2:])
            source_file = versions_dir / rel_path
            target_file = versions_dir / target_rel
            if not source_file.exists():
                raise PublishedBranchError(f"{nav} references missing page {source_file}")
            if source_file != target_file:
                copy_path(source_file, target_file)
            nav_lines[position] = f"{prefix}./{target_rel.as_posix()}{suffix}\n"
            dirty = True

        if dirty:
            nav.write_text("".join(nav_lines))
|
|
|
|
|
|
def sync_source(args: argparse.Namespace) -> int:
    """Replace the published tree with the source tree, keeping archived versions.

    The published ``versions:`` block and ``fern/versions`` directory are
    captured first, the published tree is cleared (except ``.git``) and refilled
    from the source checkout, and the captured versions are merged back. The
    retired reference archive is then removed, version nav pages are
    materialized, the ``versions:`` block is restored, redirects are validated
    and publish metadata is written.

    Returns:
        ``0`` on success.

    Raises:
        PublishedBranchError: If the source checkout has no ``fern/docs.yml`` or
            any later step reports a problem.
    """
    source_root, published_root = Path(args.source_root), Path(args.published_root)
    source_fern = source_root / "fern"
    if not (source_fern / "docs.yml").exists():
        raise PublishedBranchError(f"Missing source Fern docs at {source_fern}")

    published_docs = published_root / "fern" / "docs.yml"
    published_versions = published_root / "fern" / "versions"
    # Capture the published versions block before the tree is wiped below.
    preserved_block = normalize_latest_display_name(versions_block(published_docs))

    with tempfile.TemporaryDirectory() as scratch:
        snapshot = Path(scratch, "versions")
        if published_versions.exists():
            shutil.copytree(published_versions, snapshot)

        clear_published_tree(published_root)
        shutil.copytree(source_root, published_root, dirs_exist_ok=True, ignore=ignore_source)
        merge_preserved_versions(source_fern / "versions", published_versions, snapshot)
        remove_retired_reference_archive(source_root, published_root)
        materialize_version_nav_pages(published_root)
        restore_versions_block(published_docs, preserved_block)
        validate_redirect_targets(published_root)
        write_publish_metadata(published_root, args, "release-snapshot")
    return 0
|
|
|
|
|
|
def extract_devnotes_block(path: Path) -> list[str]:
    """Return the lines of the "Dev Notes" navigation section in *path*."""
    devnotes_lines = extract_navigation_section(path, section_re=DEVNOTES_SECTION_RE)
    return devnotes_lines
|
|
|
|
|
|
def rewrite_devnotes_block(source_root: Path, published_root: Path, block: list[str]) -> list[str]:
    """Copy Dev Notes pages into the published tree and retarget their nav paths.

    Every ``path:`` line of *block* that points below a ``pages/devnotes/``
    directory has its page copied from ``<source_root>/fern/versions/<path>`` to
    ``<published_root>/fern/versions/latest/pages/devnotes/<remainder>`` and is
    rewritten to reference that location. All other lines are returned as-is.

    Args:
        source_root: Checkout holding the current Dev Notes pages.
        published_root: Published checkout that receives the pages.
        block: Lines of the Dev Notes navigation section.

    Returns:
        The navigation lines with Dev Notes paths rewritten.

    Raises:
        PublishedBranchError: If a referenced Dev Notes page is missing from
            the source checkout.
    """
    source_versions = source_root / "fern" / "versions"
    published_versions = published_root / "fern" / "versions"
    rewritten: list[str] = []
    for line in block:
        match = NAV_PATH_RE.match(line)
        if not match:
            rewritten.append(line)
            continue
        rel_path = Path(match.group(2))
        # One partition serves as both the guard and the split, so a path that only
        # resembles the marker (e.g. `latest/pages/devnotes.mdx`) passes through
        # unchanged instead of failing on a missing second half.
        _, marker, remainder = rel_path.as_posix().partition("pages/devnotes/")
        if not marker:
            rewritten.append(line)
            continue
        source_file = source_versions / rel_path
        if not source_file.exists():
            raise PublishedBranchError(f"Missing Dev Notes page referenced by {source_versions}: {rel_path}")
        target_rel = Path("latest/pages/devnotes") / remainder
        copy_path(source_file, published_versions / target_rel)
        rewritten.append(f"{match.group(1)}./{target_rel.as_posix()}{match.group(3)}\n")
    return rewritten
|
|
|
|
|
|
def replace_devnotes_block(path: Path, block: list[str]) -> None:
    """Overwrite the "Dev Notes" navigation section of *path* with *block*."""
    replace_navigation_section(path, section_re=DEVNOTES_SECTION_RE, block=block)
|
|
|
|
|
|
def patch_devnotes(args: argparse.Namespace) -> int:
    """Refresh only the Dev Notes content of an already published snapshot.

    Copies the Dev Notes support assets, copies the Dev Notes pages referenced
    by the source ``latest.yml``, replaces the Dev Notes section of the
    published ``latest.yml`` and writes publish metadata.

    Returns:
        ``0`` on success.

    Raises:
        PublishedBranchError: If either ``latest.yml`` is missing, or a later
            step reports a problem.
    """
    source_root = Path(args.source_root)
    published_root = Path(args.published_root)
    nav_rel = Path("fern", "versions", "latest.yml")
    source_nav = source_root / nav_rel
    target_nav = published_root / nav_rel

    if not source_nav.exists():
        raise PublishedBranchError(f"Missing {source_nav}")
    if not target_nav.exists():
        raise PublishedBranchError(f"Missing {target_nav}; publish a Fern release snapshot first")

    for support_path in FERN_DEVNOTE_SUPPORT_PATHS:
        copy_path(source_root / support_path, published_root / support_path)

    devnotes = extract_devnotes_block(source_nav)
    rewritten = rewrite_devnotes_block(source_root, published_root, devnotes)
    replace_devnotes_block(target_nav, rewritten)
    write_publish_metadata(published_root, args, "devnotes-patch")
    return 0
|
|
|
|
|
|
def add_metadata_args(parser: argparse.ArgumentParser) -> None:
    """Register the optional ``--metadata-*`` provenance options on *parser*."""
    options = (
        ("--metadata-source-repository", "Repository used to produce this published snapshot"),
        ("--metadata-source-ref", "Git ref used to produce this published snapshot"),
        ("--metadata-source-sha", "Git commit used to produce this published snapshot"),
        ("--metadata-release-tag", "Release tag represented by this published snapshot"),
        ("--metadata-published-branch", "Published branch updated by this snapshot"),
    )
    for flag, description in options:
        parser.add_argument(flag, help=description)
|
|
|
|
|
|
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser with the ``sync-source`` and ``patch-devnotes`` commands.

    Each subcommand requires ``--source-root`` and ``--published-root``, accepts
    the optional ``--metadata-*`` options, and stores its handler in ``func``.

    Returns:
        The configured argument parser.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    # `dest` names the required subcommand so that omitting it yields a regular
    # usage error; without a dest, affected Python releases fail with a TypeError
    # while formatting that error message.
    subparsers = parser.add_subparsers(dest="command", metavar="COMMAND", required=True)

    sync_parser = subparsers.add_parser("sync-source")
    sync_parser.add_argument("--source-root", required=True, help="Repository checkout with authoring content")
    sync_parser.add_argument("--published-root", required=True, help="docs-website checkout to update")
    add_metadata_args(sync_parser)
    sync_parser.set_defaults(func=sync_source)

    devnotes_parser = subparsers.add_parser("patch-devnotes")
    devnotes_parser.add_argument("--source-root", required=True, help="Repository checkout with latest Dev Notes")
    devnotes_parser.add_argument("--published-root", required=True, help="docs-website checkout to patch")
    add_metadata_args(devnotes_parser)
    devnotes_parser.set_defaults(func=patch_devnotes)
    return parser
|
|
|
|
|
|
def main() -> int:
    """Parse the command line, run the selected command and return its exit status.

    A ``PublishedBranchError`` is reported on stderr as ``ERROR: <message>`` and
    turned into exit status 1.
    """
    args = build_parser().parse_args()
    try:
        exit_code = args.func(args)
    except PublishedBranchError as exc:
        print(f"ERROR: {exc}", file=sys.stderr)
        return 1
    return exit_code
|
|
|
|
|
|
if __name__ == "__main__":
    # Exit the interpreter with the status code returned by main().
    sys.exit(main())
|