mirror of
https://github.com/NVIDIA-NeMo/DataDesigner
synced 2026-05-24 09:48:29 +00:00
* docs: prepare fern generated artifacts * docs: update fern migration artifacts * docs: leave colab notebooks unchanged * docs: add VLM recipe cards to Fern * docs: trim Dev Notes sidebar * docs: collapse older Dev Notes in sidebar * docs: add Fern publishing workflows * docs: gate Fern publishing on check * docs: restrict hosted previews for fork PRs * docs: clean Fern preview URL * docs: cancel stale preview runs * docs: clarify devnotes notebook reuse * docs: clean older versions route * docs: document Fern versioning conventions * docs: add Fern release version guard * docs: harden Fern release tag handling * ci: let docs preview continue after fern failure * ci: split docs preview deploy * docs: clarify fern make commands * ci: harden fern deploy workflows * docs: render preview notebooks without outputs * ci: keep docs preview deploy inline * docs: align notebook code highlighting * docs: show notebook snippet scrollbars * docs: isolate fern preview check failures * ci: align fern release docs behavior
201 lines
6.6 KiB
Python
201 lines
6.6 KiB
Python
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
"""Script to generate Colab-compatible notebooks from notebook source files.
|
|
|
|
This script processes jupytext percent-format Python files and:
|
|
1. Injects an "Open in Colab" badge as the first cell
|
|
2. Injects Colab-specific setup cells (pip install, API key from secrets)
|
|
3. Saves the result as .ipynb files in docs/colab_notebooks
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
from pathlib import Path
|
|
|
|
import jupytext
|
|
from nbformat import NotebookNode
|
|
from nbformat.v4 import new_code_cell, new_markdown_cell
|
|
|
|
# HTML snippet for the "Open in Colab" badge. ``{filename}`` is filled in with
# the generated notebook's file name via ``str.format``.
COLAB_BADGE_TEMPLATE = "".join(
    [
        '<a href="https://colab.research.google.com/github/NVIDIA-NeMo/DataDesigner',
        '/blob/main/docs/colab_notebooks/{filename}" target="_parent">',
        '<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>',
    ]
)

# Markdown text of the heading cell placed in front of the injected setup cells.
COLAB_SETUP_MARKDOWN = (
    "### ⚡ Colab Setup\n"
    "\n"
    "Run the cells below to install the dependencies and set up the API key. "
    "If you don't have an API key, you can generate one from "
    "[build.nvidia.com](https://build.nvidia.com).\n"
)

# Extra pip requirement specifiers, keyed by notebook source file name. The
# value is appended verbatim to the install command of that notebook.
ADDITIONAL_DEPENDENCIES = {
    "4-providing-images-as-context.py": '"pillow>=12.0.0,<13" "datasets>=4.0.0,<5"',
}

# Source of the install cell; ``%%capture`` must stay on the first line.
COLAB_INSTALL_CELL = "\n".join(
    (
        "%%capture",
        "!pip install -U data-designer",
    )
)

# Source of the cell that reads NVIDIA_API_KEY from Colab secrets and falls
# back to an interactive prompt when the secret is not defined.
COLAB_API_KEY_CELL = "\n".join(
    (
        "import getpass",
        "import os",
        "",
        "from google.colab import userdata",
        "",
        "try:",
        '    os.environ["NVIDIA_API_KEY"] = userdata.get("NVIDIA_API_KEY")',
        "except userdata.SecretNotFoundError:",
        '    os.environ["NVIDIA_API_KEY"] = getpass.getpass("Enter your NVIDIA API key: ")',
    )
)

# Cell-metadata key set to True on every cell this script injects.
COLAB_INJECT_METADATA = "nemo_colab_inject"
|
|
|
|
|
|
def mark_colab_injected(cell: NotebookNode) -> NotebookNode:
    """Flag a cell as one that exists only in the Colab variant.

    Args:
        cell: The cell to tag; it is modified in place.

    Returns:
        The same cell object, so the call can be nested inside an expression.
    """
    metadata = cell.metadata
    metadata[COLAB_INJECT_METADATA] = True
    return cell
|
|
|
|
|
|
def create_colab_setup_cells(additional_dependencies: str) -> list[NotebookNode]:
    """Build the Colab-only setup cells: heading, pip install, API key.

    Args:
        additional_dependencies: Text appended (after one space) to the pip
            install command. An empty string leaves the command unchanged.

    Returns:
        The three setup cells in display order, each marked as Colab-injected.
    """
    install_source = (
        f"{COLAB_INSTALL_CELL} {additional_dependencies}" if additional_dependencies else COLAB_INSTALL_CELL
    )
    return [
        mark_colab_injected(new_markdown_cell(source=COLAB_SETUP_MARKDOWN)),
        mark_colab_injected(new_code_cell(source=install_source)),
        mark_colab_injected(new_code_cell(source=COLAB_API_KEY_CELL)),
    ]
|
|
|
|
|
|
def find_import_section_index(cells: list[NotebookNode]) -> int:
    """Locate the cell in front of which the Colab setup cells belong.

    The preferred anchor is the first markdown cell whose text contains both
    "import" and "essentials" (case-insensitive).

    Args:
        cells: The notebook cells, in order.

    Returns:
        The index of the matching markdown cell. When no markdown cell
        matches, the index of the first code cell, or -1 if there is no code
        cell either.
    """
    fallback = -1
    for position, cell in enumerate(cells):
        kind = cell.get("cell_type")

        if kind == "code":
            # Only the first code cell is remembered as the fallback anchor.
            if fallback == -1:
                fallback = position
            continue

        if kind != "markdown":
            continue

        text = cell.get("source", "").lower()
        if "import" in text and "essentials" in text:
            return position

    return fallback
|
|
|
|
|
|
def process_notebook(notebook: NotebookNode, source_path: Path) -> NotebookNode:
    """Inject the Colab badge and setup cells into a notebook.

    The notebook is modified in place: its ``cells`` list is replaced by a new
    list that starts with the badge cell and has the setup cells spliced in.

    Args:
        notebook: The input notebook.
        source_path: Path of the notebook source file. Its name selects any
            additional dependencies and its stem names the linked ``.ipynb``.

    Returns:
        The same notebook object, now with the Colab cells injected.
    """
    original_cells = notebook.cells
    extra_deps = ADDITIONAL_DEPENDENCIES.get(source_path.name, "")

    # Setup cells go in front of the import section; with no anchor at all,
    # they go right after the first cell (the title).
    insert_at = find_import_section_index(original_cells)
    if insert_at == -1:
        insert_at = 1

    setup_cells = create_colab_setup_cells(extra_deps)
    badge_cell = mark_colab_injected(
        new_markdown_cell(source=COLAB_BADGE_TEMPLATE.format(filename=f"{source_path.stem}.ipynb"))
    )

    notebook.cells = [
        badge_cell,
        *original_cells[:insert_at],
        *setup_cells,
        *original_cells[insert_at:],
    ]
    return notebook
|
|
|
|
|
|
def generate_colab_notebook(source_path: Path, output_dir: Path) -> Path:
    """Convert one notebook source file into a Colab-ready ``.ipynb`` file.

    Args:
        source_path: Path to the jupytext percent-format Python source file.
        output_dir: Directory that receives the notebook; created if missing.

    Returns:
        Path of the written notebook (``<output_dir>/<source stem>.ipynb``).
    """
    colab_notebook = process_notebook(jupytext.read(source_path), source_path)

    # The output directory is created only once reading and processing succeeded.
    destination = output_dir / f"{source_path.stem}.ipynb"
    output_dir.mkdir(parents=True, exist_ok=True)

    write_config = {"metadata": {"jupytext": {"cell_metadata_filter": "-id"}}}
    jupytext.write(colab_notebook, destination, config=write_config)
    return destination
|
|
|
|
|
|
def main(argv: list[str] | None = None) -> None:
    """Command-line entry point: generate Colab notebooks from source files.

    Every selected source file is processed even if an earlier one fails, but
    a failure is no longer silent: after the loop the function exits with a
    non-zero status so that CI and ``make`` targets notice it.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) lets
            argparse read them from ``sys.argv``.

    Raises:
        SystemExit: With a summary message (exit status 1) when at least one
            notebook could not be generated; argparse also raises it for
            invalid arguments.
    """
    parser = argparse.ArgumentParser(description="Generate Colab-compatible notebooks from notebook source files.")
    parser.add_argument(
        "--source-dir",
        type=Path,
        default=Path("docs/notebook_source"),
        help="Directory containing notebook source files (default: docs/notebook_source)",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("docs/colab_notebooks"),
        help="Directory to save Colab notebooks (default: docs/colab_notebooks)",
    )
    parser.add_argument(
        "--files",
        nargs="*",
        help="Specific files to process (if not specified, process all .py files)",
    )

    args = parser.parse_args(argv)

    # Get list of source files
    if args.files:
        source_files = [args.source_dir / name for name in args.files]
    else:
        # Discover every .py file, skipping names that start with an underscore.
        source_files = sorted(
            candidate for candidate in args.source_dir.glob("*.py") if not candidate.name.startswith("_")
        )

    if not source_files:
        print(f"No source files found in {args.source_dir}")
        return

    print(f"📓 Generating Colab notebooks from {len(source_files)} source file(s)...")
    print(f" Source: {args.source_dir}")
    print(f" Output: {args.output_dir}")
    print()

    failed: list[str] = []
    for source_path in source_files:
        if not source_path.exists():
            print(f"⚠️ Skipping {source_path} (file not found)")
            continue

        try:
            output_path = generate_colab_notebook(source_path, args.output_dir)
        except Exception as e:
            # Keep going so one broken notebook does not block the others,
            # but remember the failure for the exit status.
            print(f"❌ {source_path.name}: {e}")
            failed.append(source_path.name)
        else:
            print(f"✅ {source_path.name} → {output_path.name}")

    print()
    if failed:
        # A string argument makes the interpreter print it to stderr and exit with status 1.
        raise SystemExit(f"Failed to generate {len(failed)} notebook(s): {', '.join(failed)}")

    print(f"✨ Colab notebooks saved to {args.output_dir}/")


if __name__ == "__main__":
    main()
|