DataDesigner/docs/scripts/generate_colab_notebooks.py
Johnny Greco d962c86843
fix: update example runner command with notebooks dep group (#204)
* update dep groups; use in makefile

* add quotes to packages in pip command
2026-01-13 11:49:31 -05:00

186 lines
5.9 KiB
Python

# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Script to generate Colab-compatible notebooks from notebook source files.
This script processes jupytext percent-format Python files and:
1. Injects Colab-specific setup cells (pip install, API key from secrets)
2. Places those cells immediately before the "Import the essentials" section
3. Saves the result as .ipynb files in docs/colab_notebooks
"""
from __future__ import annotations
import argparse
from pathlib import Path
import jupytext
from nbformat import NotebookNode
from nbformat.v4 import new_code_cell, new_markdown_cell
# Markdown shown above the injected setup cells.
COLAB_SETUP_MARKDOWN = """\
### ⚡ Colab Setup
Run the cells below to install the dependencies and set up the API key. If you don't have an API key, you can generate one from [build.nvidia.com](https://build.nvidia.com).
"""

# Extra pip requirement specifiers, keyed by notebook source file name.
# The value is appended verbatim to the pip install line, so each specifier
# carries its own shell quoting.
ADDITIONAL_DEPENDENCIES = {
    "4-providing-images-as-context.py": '"pillow>=12.0.0,<13" "datasets>=4.0.0,<5"',
}

# Base source of the install cell; extra dependencies are appended after it.
COLAB_INSTALL_CELL = """\
%%capture
!pip install -U data-designer"""

# Source of the cell that populates NVIDIA_API_KEY inside Colab.
# The try/except bodies must stay indented: this text is executed as Python
# by the notebook, and unindented bodies would raise IndentationError.
COLAB_API_KEY_CELL = """\
import getpass
import os
from google.colab import userdata
try:
    os.environ["NVIDIA_API_KEY"] = userdata.get("NVIDIA_API_KEY")
except userdata.SecretNotFoundError:
    os.environ["NVIDIA_API_KEY"] = getpass.getpass("Enter your NVIDIA API key: ")"""
def create_colab_setup_cells(additional_dependencies: str) -> list[NotebookNode]:
    """Build the Colab setup cells that get injected ahead of the imports.

    Args:
        additional_dependencies: Extra pip requirement specifiers to append to
            the install command, or an empty string for none.

    Returns:
        Three cells in order: the setup markdown, the pip install code cell,
        and the API key code cell.
    """
    # Only add the separating space when there is something to append.
    if additional_dependencies:
        pip_source = f"{COLAB_INSTALL_CELL} {additional_dependencies}"
    else:
        pip_source = COLAB_INSTALL_CELL

    return [
        new_markdown_cell(source=COLAB_SETUP_MARKDOWN),
        new_code_cell(source=pip_source),
        new_code_cell(source=COLAB_API_KEY_CELL),
    ]
def find_import_section_index(cells: list[NotebookNode]) -> int:
    """Locate the cell before which the Colab setup cells should be inserted.

    Args:
        cells: The notebook cells, in document order.

    Returns:
        The index of the first markdown cell whose text contains both
        "import" and "essentials" (case-insensitive). If no such cell exists,
        the index of the first code cell, or -1 when there is none.
    """
    fallback_index = -1
    for position, cell in enumerate(cells):
        kind = cell.get("cell_type")
        if kind == "markdown":
            text = cell.get("source", "").lower()
            if "import" in text and "essentials" in text:
                return position
        elif kind == "code" and fallback_index == -1:
            # Remember only the earliest code cell as the fallback target.
            fallback_index = position
    return fallback_index
def process_notebook(notebook: NotebookNode, source_path: Path) -> NotebookNode:
    """Inject the Colab setup cells into a notebook.

    Args:
        notebook: The input notebook; its ``cells`` attribute is replaced.
        source_path: Path of the notebook source file. Only its file name is
            used, to look up extra pip dependencies for that notebook.

    Returns:
        The same notebook object with the Colab setup cells inserted.
    """
    extra_packages = ADDITIONAL_DEPENDENCIES.get(source_path.name, "")
    existing_cells = notebook.cells

    insert_at = find_import_section_index(existing_cells)
    if insert_at == -1:
        # No import heading and no code cell: place setup after the first
        # (title) cell instead.
        insert_at = 1

    setup_cells = create_colab_setup_cells(extra_packages)
    before, after = existing_cells[:insert_at], existing_cells[insert_at:]
    notebook.cells = before + setup_cells + after
    return notebook
def generate_colab_notebook(source_path: Path, output_dir: Path) -> Path:
    """Convert one notebook source file into a Colab-ready ``.ipynb`` file.

    Args:
        source_path: Path to the jupytext percent-format Python source file.
        output_dir: Directory to write the notebook into; created (including
            parents) if it does not exist.

    Returns:
        Path of the written notebook: ``output_dir`` joined with the source
        file's stem and an ``.ipynb`` suffix.
    """
    colab_notebook = process_notebook(jupytext.read(source_path), source_path)

    output_dir.mkdir(parents=True, exist_ok=True)
    destination = output_dir / f"{source_path.stem}.ipynb"

    # NOTE(review): the "-id" cell_metadata_filter presumably keeps cell ids
    # out of the written file — confirm against the jupytext docs.
    write_config = {"metadata": {"jupytext": {"cell_metadata_filter": "-id"}}}
    jupytext.write(colab_notebook, destination, config=write_config)
    return destination
def main() -> None:
    """Parse command-line options and generate a Colab notebook per source file.

    Options:
        --source-dir: Directory containing notebook source files.
        --output-dir: Directory to save the generated notebooks.
        --files: Specific file names (relative to ``--source-dir``) to process;
            when omitted, every ``*.py`` file directly inside the source
            directory is processed, except names starting with an underscore.

    Missing files are skipped with a warning. An exception raised while
    generating one notebook is reported and the remaining files are still
    processed; the function returns normally in either case.
    """
    parser = argparse.ArgumentParser(description="Generate Colab-compatible notebooks from notebook source files.")
    parser.add_argument(
        "--source-dir",
        type=Path,
        default=Path("docs/notebook_source"),
        help="Directory containing notebook source files (default: docs/notebook_source)",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("docs/colab_notebooks"),
        help="Directory to save Colab notebooks (default: docs/colab_notebooks)",
    )
    parser.add_argument(
        "--files",
        nargs="*",
        help="Specific files to process (if not specified, process all .py files)",
    )
    args = parser.parse_args()

    # Get list of source files
    if args.files:
        source_files = [args.source_dir / f for f in args.files]
    else:
        source_files = sorted(args.source_dir.glob("*.py"))
        # Skip discovered files whose names start with an underscore; files
        # named explicitly via --files are not filtered.
        source_files = [f for f in source_files if not f.name.startswith("_")]

    if not source_files:
        print(f"No source files found in {args.source_dir}")
        return

    print(f"📓 Generating Colab notebooks from {len(source_files)} source file(s)...")
    print(f" Source: {args.source_dir}")
    print(f" Output: {args.output_dir}")
    print()

    for source_path in source_files:
        if not source_path.exists():
            print(f"⚠️ Skipping {source_path} (file not found)")
            continue
        try:
            output_path = generate_colab_notebook(source_path, args.output_dir)
            # Separate the two names so the line reads "source → output"
            # instead of running the file names together.
            print(f" ✓ {source_path.name} → {output_path.name}")
        except Exception as e:
            # Best-effort batch: report the failure and keep going.
            print(f" ✗ {source_path.name}: {e}")

    print()
    print(f"✨ Colab notebooks saved to {args.output_dir}/")
if __name__ == "__main__":
main()