mirror of
https://github.com/NVIDIA-NeMo/DataDesigner
synced 2026-05-24 09:48:29 +00:00
* perf: defer heavy imports to improve CLI startup time Move expensive imports (engine, models, controllers) out of the module-level import path so that data-designer --help and other non-generation commands no longer pay the full startup cost. Key changes: - Defer controller imports to inside command functions - Remove eager re-export chains from CLI package __init__ files - Move default-settings bootstrap into load_config_builder() and DataDesigner.__init__() instead of running at import time - Add lazy __getattr__ exports in interface/__init__.py - Replace module-level tokenizer init with cached lazy getter - Fix ModelProvider import to use config layer instead of engine - Update test mock paths to match new import locations Reduces CLI import-time from ~1.67s to ~0.46s. * perf: defer pandas/numpy in io_helpers and add config_list benchmark - Replace eager `from lazy_heavy_imports import pd, np` in io_helpers with module-level __getattr__ (for backwards-compatible external access / test mocks) and function-level imports in the 3 functions that actually use them (read_parquet_dataset, smart_load_dataframe, _convert_to_serializable). Importing io_helpers no longer triggers pandas/numpy loading. - Defer heavy imports in list and reset CLI commands into function bodies to avoid loading repositories, Rich, and prompt_toolkit at module import time. - Add `config_list` (data-designer config list) measurement to the CLI startup benchmark with isolated cold measurement in a separate venv and a --skip-config-list-check flag. - Update test mock paths to match new import locations. 
* Refine lazy import usage and TYPE_CHECKING cleanup * Run license header updater on PR-touched files * fix: update sqlfluff mock target for lazy imports in test_sql * perf: cache globals() in lazy __getattr__ to avoid repeated lookups Add globals() caching and explanatory comment to all three lazy __getattr__ implementations (lazy_heavy_imports, config/__init__, interface/__init__) so subsequent attribute accesses bypass __getattr__. * perf: lazy CLI command loading and deferred heavy import evaluations - Add LazyTyperGroup to defer command module loading until invocation, allowing module-level imports in all CLI command files - Split DataFrameSeedSource into seed_source_dataframe.py to isolate pandas dependency from other seed source classes - Move TypeVar/TypeAlias definitions (DataT, NumpyArray1dT, RadomStateT, EngineT) to TYPE_CHECKING blocks with runtime fallbacks - Wrap module-level constants in lru_cache (phone_number parquet data, jsonschema validator) to defer I/O and heavy imports to first use - Update test mock targets to patch at usage-site for module-level imports * refactor: use direct pandas import in seed_source_dataframe Drop lazy-loading for pandas in DataFrameSeedSource; use direct import for simplicity. * update lazy import pattern * update tests to use lazy import namespace Switch test modules to import data_designer.lazy_heavy_imports as lazy and reference heavy libraries through that namespace. This keeps heavy imports deferred during module import and aligns tests with the new lazy-import usage pattern. * tighten import perf test thresholds Document recent baseline timings and lower the allowed average import time and timeout so regressions are detected sooner. * document pandas import requirement Clarify that Pydantic needs DataFrame resolved at module load and that keeping the direct import preserves IDE typing support. 
* increase timeout time * use lazy pandas imports in visualization tests - replace direct pandas usage with lazy.pd in visualization tests to avoid eager imports - add TYPE_CHECKING pandas import and keep CLI controller imports sorted * fix lazy pandas runtime usage and preview mocks Switch sample-record handling to lazy pandas types so runtime paths no longer depend on TYPE_CHECKING imports. Align preview controller tests to patch the module-local DataDesigner symbol, preventing real engine invocation in save results scenarios.
300 lines
12 KiB
Python
300 lines
12 KiB
Python
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
|
|
from pathlib import Path
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
import data_designer.cli.utils.config_loader as config_loader_mod
|
|
from data_designer.cli.utils.config_loader import (
|
|
ConfigLoadError,
|
|
load_config_builder,
|
|
)
|
|
from data_designer.config.config_builder import DataDesignerConfigBuilder
|
|
|
|
|
|
@patch("data_designer.cli.utils.config_loader.DataDesignerConfigBuilder.from_config")
def test_load_config_builder_from_yaml(mock_from_config: MagicMock, tmp_path: Path) -> None:
    """A local ``.yaml`` path is forwarded to ``from_config`` and its result is returned."""
    expected_builder = MagicMock()
    mock_from_config.return_value = expected_builder

    config_path = tmp_path / "config.yaml"
    config_path.write_text("data_designer:\n columns: []\n")

    # The loader receives a string, but from_config is expected to get the Path.
    loaded = load_config_builder(str(config_path))

    assert loaded is expected_builder
    mock_from_config.assert_called_once_with(config_path)
|
|
|
|
|
|
@patch("data_designer.cli.utils.config_loader.DataDesignerConfigBuilder.from_config")
def test_load_config_builder_from_yml(mock_from_config: MagicMock, tmp_path: Path) -> None:
    """A local ``.yml`` path is forwarded to ``from_config`` and its result is returned."""
    expected_builder = MagicMock()
    mock_from_config.return_value = expected_builder

    config_path = tmp_path / "config.yml"
    config_path.write_text("data_designer:\n columns: []\n")

    # The loader receives a string, but from_config is expected to get the Path.
    loaded = load_config_builder(str(config_path))

    assert loaded is expected_builder
    mock_from_config.assert_called_once_with(config_path)
|
|
|
|
|
|
@patch("data_designer.cli.utils.config_loader.DataDesignerConfigBuilder.from_config")
def test_load_config_builder_from_json(mock_from_config: MagicMock, tmp_path: Path) -> None:
    """A local ``.json`` path is forwarded to ``from_config`` and its result is returned."""
    expected_builder = MagicMock()
    mock_from_config.return_value = expected_builder

    config_path = tmp_path / "config.json"
    config_path.write_text('{"data_designer": {"columns": []}}')

    # The loader receives a string, but from_config is expected to get the Path.
    loaded = load_config_builder(str(config_path))

    assert loaded is expected_builder
    mock_from_config.assert_called_once_with(config_path)
|
|
|
|
|
|
@patch("data_designer.cli.utils.config_loader.DataDesignerConfigBuilder.from_config")
def test_load_config_builder_from_yaml_url(mock_from_config: MagicMock) -> None:
    """A YAML URL is passed to ``from_config`` unchanged, as the original string."""
    expected_builder = MagicMock()
    mock_from_config.return_value = expected_builder
    remote_source = "https://example.com/config.yaml"

    loaded = load_config_builder(remote_source)

    assert loaded is expected_builder
    mock_from_config.assert_called_once_with(remote_source)
|
|
|
|
|
|
@patch("data_designer.cli.utils.config_loader.DataDesignerConfigBuilder.from_config")
def test_load_config_builder_from_json_url_with_query(mock_from_config: MagicMock) -> None:
    """A JSON URL carrying a query string is still accepted and passed to ``from_config`` as-is."""
    expected_builder = MagicMock()
    mock_from_config.return_value = expected_builder
    remote_source = "https://example.com/config.json?version=1"

    loaded = load_config_builder(remote_source)

    assert loaded is expected_builder
    mock_from_config.assert_called_once_with(remote_source)
|
|
|
|
|
|
def test_load_config_builder_from_python_module(tmp_path: Path) -> None:
    """A ``.py`` path is routed to ``_load_from_python_module`` and its result is returned."""
    module_path = tmp_path / "my_config.py"
    module_path.write_text("def load_config_builder(): pass\n")

    loader_target = "data_designer.cli.utils.config_loader._load_from_python_module"
    with patch(loader_target) as mock_load_py:
        expected_builder = MagicMock()
        mock_load_py.return_value = expected_builder

        loaded = load_config_builder(str(module_path))

        assert loaded is expected_builder
        mock_load_py.assert_called_once_with(module_path)
|
|
|
|
|
|
def test_load_config_builder_file_not_found() -> None:
    """A path that does not exist is reported as ``ConfigLoadError``."""
    missing_source = "/nonexistent/path/config.yaml"
    with pytest.raises(ConfigLoadError, match="Config source not found"):
        load_config_builder(missing_source)
|
|
|
|
|
|
def test_load_config_builder_not_a_file(tmp_path: Path) -> None:
    """Pointing the loader at a directory is reported as ``ConfigLoadError``."""
    directory_source = str(tmp_path)
    with pytest.raises(ConfigLoadError, match="Config source is not a file"):
        load_config_builder(directory_source)
|
|
|
|
|
|
def test_load_config_builder_unsupported_extension(tmp_path: Path) -> None:
    """An existing local file with a ``.txt`` suffix is rejected with ``ConfigLoadError``."""
    text_path = tmp_path / "config.txt"
    text_path.write_text("some content")
    source = str(text_path)

    with pytest.raises(ConfigLoadError, match="Unsupported file extension"):
        load_config_builder(source)
|
|
|
|
|
|
def test_load_config_builder_url_unsupported_extension() -> None:
    """A URL ending in ``.txt`` is rejected with ``ConfigLoadError``."""
    remote_source = "https://example.com/config.txt"
    with pytest.raises(ConfigLoadError, match="Unsupported file extension"):
        load_config_builder(remote_source)
|
|
|
|
|
|
def test_load_config_builder_remote_python_module_not_supported() -> None:
    """A URL ending in ``.py`` is refused with ``ConfigLoadError``."""
    remote_source = "https://example.com/config.py"
    with pytest.raises(ConfigLoadError, match="Remote Python config modules are not supported"):
        load_config_builder(remote_source)
|
|
|
|
|
|
def test_load_config_builder_url_no_extension() -> None:
    """A URL whose path has no suffix at all is rejected with ``ConfigLoadError``."""
    remote_source = "https://example.com/config"
    with pytest.raises(ConfigLoadError, match="Unsupported file extension"):
        load_config_builder(remote_source)
|
|
|
|
|
|
def test_load_config_builder_python_module_missing_function(tmp_path: Path) -> None:
    """A Python config that never defines ``load_config_builder`` raises ``ConfigLoadError``."""
    module_path = tmp_path / "no_func_config.py"
    module_path.write_text("x = 42\n")
    source = str(module_path)

    with pytest.raises(ConfigLoadError, match="does not define a 'load_config_builder\\(\\)' function"):
        load_config_builder(source)
|
|
|
|
|
|
def test_load_config_builder_python_module_wrong_return_type(tmp_path: Path) -> None:
    """A ``load_config_builder()`` that returns a dict instead of a builder raises ``ConfigLoadError``."""
    module_path = tmp_path / "wrong_type_config.py"
    module_path.write_text("def load_config_builder():\n return {'not': 'a builder'}\n")
    source = str(module_path)

    with pytest.raises(ConfigLoadError, match="returned dict, expected DataDesignerConfigBuilder"):
        load_config_builder(source)
|
|
|
|
|
|
def test_load_config_builder_python_module_syntax_error(tmp_path: Path) -> None:
    """A Python config containing a syntax error is surfaced as ``ConfigLoadError``."""
    module_path = tmp_path / "syntax_err_config.py"
    # Unterminated parameter list: the module cannot be compiled.
    module_path.write_text("def load_config_builder(\n")
    source = str(module_path)

    with pytest.raises(ConfigLoadError, match="Failed to execute Python module"):
        load_config_builder(source)
|
|
|
|
|
|
def test_load_config_builder_python_module_function_raises(tmp_path: Path) -> None:
    """An exception raised inside the module's ``load_config_builder()`` is wrapped in ``ConfigLoadError``."""
    module_path = tmp_path / "raising_config.py"
    module_path.write_text("def load_config_builder():\n raise ValueError('something went wrong')\n")
    source = str(module_path)

    with pytest.raises(ConfigLoadError, match="Error calling 'load_config_builder\\(\\)'"):
        load_config_builder(source)
|
|
|
|
|
|
def test_load_config_builder_python_module_not_callable(tmp_path: Path) -> None:
    """A module whose ``load_config_builder`` attribute is a plain string raises ``ConfigLoadError``."""
    module_path = tmp_path / "not_callable_config.py"
    module_path.write_text("load_config_builder = 'not a function'\n")
    source = str(module_path)

    with pytest.raises(ConfigLoadError, match="is not callable"):
        load_config_builder(source)
|
|
|
|
|
|
def test_load_config_builder_python_module_sibling_import(tmp_path: Path) -> None:
    """A Python config may import a module that lives next to it in the same directory."""
    # Sibling module that the config below imports by bare name.
    sibling_path = tmp_path / "helpers.py"
    sibling_path.write_text("DATASET_NAME = 'my_dataset'\n")

    module_source = (
        "from data_designer.config.config_builder import DataDesignerConfigBuilder\n"
        "from helpers import DATASET_NAME\n\n"
        "def load_config_builder():\n"
        " builder = DataDesignerConfigBuilder()\n"
        " builder._test_marker = DATASET_NAME\n"
        " return builder\n"
    )
    module_path = tmp_path / "my_config.py"
    module_path.write_text(module_source)

    loaded = load_config_builder(str(module_path))

    assert isinstance(loaded, DataDesignerConfigBuilder)
    # The marker only carries this value if the sibling import succeeded.
    assert loaded._test_marker == "my_dataset"
|
|
|
|
|
|
def test_load_config_builder_python_module_cleans_sys_path(tmp_path: Path) -> None:
    """Loading a Python config leaves the config's directory absent from ``sys.path``."""
    import sys

    module_source = (
        "from data_designer.config.config_builder import DataDesignerConfigBuilder\n\n"
        "def load_config_builder():\n"
        " return DataDesignerConfigBuilder()\n"
    )
    module_path = tmp_path / "clean_path_config.py"
    module_path.write_text(module_source)

    config_dir = str(tmp_path.resolve())
    # Precondition: the directory must not already be importable, or the check below proves nothing.
    assert config_dir not in sys.path

    load_config_builder(str(module_path))

    assert config_dir not in sys.path
|
|
|
|
|
|
def test_load_config_builder_invalid_yaml(tmp_path: Path) -> None:
    """A ``.yaml`` file with unparseable content raises ``ConfigLoadError``."""
    broken_path = tmp_path / "bad.yaml"
    broken_path.write_text(":\n - [\n")
    source = str(broken_path)

    with pytest.raises(ConfigLoadError, match="Failed to load config from"):
        load_config_builder(source)
|
|
|
|
|
|
def test_load_config_builder_invalid_json(tmp_path: Path) -> None:
    """A ``.json`` file with unparseable content raises ``ConfigLoadError``."""
    broken_path = tmp_path / "bad.json"
    broken_path.write_text(":\n - [\n")
    source = str(broken_path)

    with pytest.raises(ConfigLoadError, match="Failed to load config from"):
        load_config_builder(source)
|
|
|
|
|
|
@patch("data_designer.cli.utils.config_loader.DataDesignerConfigBuilder.from_config")
def test_load_config_builder_from_config_validation_error(mock_from_config: MagicMock, tmp_path: Path) -> None:
    """An exception raised by ``from_config`` for a parseable YAML file becomes ``ConfigLoadError``."""
    # from_config is mocked, so the failure is simulated rather than produced by real validation.
    mock_from_config.side_effect = Exception("Validation error")

    config_path = tmp_path / "bad_structure.yaml"
    config_path.write_text("data_designer:\n not_a_valid_field: true\n")
    source = str(config_path)

    with pytest.raises(ConfigLoadError, match="Failed to load config from"):
        load_config_builder(source)
|
|
|
|
|
|
def test_load_config_builder_non_dict_yaml(tmp_path: Path) -> None:
    """A YAML file whose top level is a list rather than a mapping raises ``ConfigLoadError``."""
    list_path = tmp_path / "list.yaml"
    list_path.write_text("- item1\n- item2\n")
    source = str(list_path)

    with pytest.raises(ConfigLoadError, match="Failed to load config from"):
        load_config_builder(source)
|
|
|
|
|
|
def test_load_config_builder_non_dict_json(tmp_path: Path) -> None:
    """A JSON file whose top level is an array rather than an object raises ``ConfigLoadError``."""
    list_path = tmp_path / "list.json"
    list_path.write_text('[{"name": "col1"}, {"name": "col2"}]')
    source = str(list_path)

    with pytest.raises(ConfigLoadError, match="Failed to load config from"):
        load_config_builder(source)
|
|
|
|
|
|
def test_load_config_builder_empty_json(tmp_path: Path) -> None:
    """A zero-length ``.json`` file raises ``ConfigLoadError``."""
    blank_path = tmp_path / "empty.json"
    blank_path.write_text("")
    source = str(blank_path)

    with pytest.raises(ConfigLoadError, match="Failed to load config from"):
        load_config_builder(source)
|
|
|
|
|
|
def test_load_config_builder_empty_yaml(tmp_path: Path) -> None:
    """A zero-length ``.yaml`` file raises ``ConfigLoadError``."""
    blank_path = tmp_path / "empty.yaml"
    blank_path.write_text("")
    source = str(blank_path)

    with pytest.raises(ConfigLoadError, match="Failed to load config from"):
        load_config_builder(source)
|
|
|
|
|
|
def test_ensure_default_model_settings_runs_once(monkeypatch: pytest.MonkeyPatch) -> None:
    """Two consecutive ``_ensure_default_model_settings`` calls trigger the resolver exactly once."""
    # Force the "not yet initialized" state; monkeypatch restores the flag after the test.
    monkeypatch.setattr(config_loader_mod, "_default_settings_initialized", False)

    resolver_target = "data_designer.cli.utils.config_loader.resolve_seed_default_model_settings"
    with patch(resolver_target) as mock_resolve:
        for _ in range(2):
            config_loader_mod._ensure_default_model_settings()

        mock_resolve.assert_called_once()
|