DataDesigner/packages/data-designer-engine/tests/engine/processing/test_utils.py
Johnny Greco 1439bbea7e
chore: Improve CLI startup with lazy heavy import cleanup (#330)
* perf: defer heavy imports to improve CLI startup time

Move expensive imports (engine, models, controllers) out of the module-level import path so that data-designer --help and other non-generation commands no longer pay the full startup cost.

Key changes:
- Defer controller imports to inside command functions
- Remove eager re-export chains from CLI package __init__ files
- Move default-settings bootstrap into load_config_builder() and DataDesigner.__init__() instead of running at import time
- Add lazy __getattr__ exports in interface/__init__.py
- Replace module-level tokenizer init with cached lazy getter
- Fix ModelProvider import to use config layer instead of engine
- Update test mock paths to match new import locations

Reduces CLI import-time from ~1.67s to ~0.46s.

* perf: defer pandas/numpy in io_helpers and add config_list benchmark

- Replace eager `from lazy_heavy_imports import pd, np` in io_helpers
  with module-level __getattr__ (for backwards-compatible external
  access / test mocks) and function-level imports in the 3 functions
  that actually use them (read_parquet_dataset, smart_load_dataframe,
  _convert_to_serializable). Importing io_helpers no longer triggers
  pandas/numpy loading.
- Defer heavy imports in list and reset CLI commands into function
  bodies to avoid loading repositories, Rich, and prompt_toolkit at
  module import time.
- Add `config_list` (data-designer config list) measurement to the
  CLI startup benchmark with isolated cold measurement in a separate
  venv and a --skip-config-list-check flag.
- Update test mock paths to match new import locations.

* Refine lazy import usage and TYPE_CHECKING cleanup

* Run license header updater on PR-touched files

* fix: update sqlfluff mock target for lazy imports in test_sql

* perf: cache globals() in lazy __getattr__ to avoid repeated lookups

Add globals() caching and explanatory comment to all three lazy
__getattr__ implementations (lazy_heavy_imports, config/__init__,
interface/__init__) so subsequent attribute accesses bypass __getattr__.

* perf: lazy CLI command loading and deferred heavy import evaluations

- Add LazyTyperGroup to defer command module loading until invocation, allowing module-level imports in all CLI command files

- Split DataFrameSeedSource into seed_source_dataframe.py to isolate pandas dependency from other seed source classes

- Move TypeVar/TypeAlias definitions (DataT, NumpyArray1dT, RadomStateT, EngineT) to TYPE_CHECKING blocks with runtime fallbacks

- Wrap module-level constants in lru_cache (phone_number parquet data, jsonschema validator) to defer I/O and heavy imports to first use

- Update test mock targets to patch at usage-site for module-level imports

* refactor: use direct pandas import in seed_source_dataframe

Drop lazy-loading for pandas in DataFrameSeedSource; use direct import
for simplicity.

* update lazy import pattern

* update tests to use lazy import namespace

Switch test modules to import data_designer.lazy_heavy_imports as lazy
and reference heavy libraries through that namespace. This keeps heavy
imports deferred during module import and aligns tests with the new
lazy-import usage pattern.

* tighten import perf test thresholds

Document recent baseline timings and lower the allowed average
import time and timeout so regressions are detected sooner.

* document pandas import requirement

Clarify that Pydantic needs DataFrame resolved at module load and
that keeping the direct import preserves IDE typing support.

* increase timeout

* use lazy pandas imports in visualization tests

- replace direct pandas usage with lazy.pd in visualization tests to avoid eager imports
- add TYPE_CHECKING pandas import and keep CLI controller imports sorted

* fix lazy pandas runtime usage and preview mocks

Switch sample-record handling to lazy pandas types so runtime paths no longer
depend on TYPE_CHECKING imports. Align preview controller tests to patch the
module-local DataDesigner symbol, preventing real engine invocation in save
results scenarios.
2026-02-18 16:24:15 -05:00

137 lines
5.2 KiB
Python

# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations
from unittest.mock import patch
import pytest
import data_designer.lazy_heavy_imports as lazy
from data_designer.engine.processing.utils import (
concat_datasets,
deserialize_json_values,
parse_list_string,
)
@pytest.fixture
def stub_sample_dataframes():
    """Provide three three-row frames keyed by name.

    ``df1`` and ``df2`` have disjoint column names; ``df_single`` holds only
    ``col1``.
    """
    first = lazy.pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]})
    second = lazy.pd.DataFrame({"col3": [4, 5, 6], "col4": ["d", "e", "f"]})
    single = lazy.pd.DataFrame({"col1": [1, 2, 3]})
    return {"df1": first, "df2": second, "df_single": single}
@pytest.fixture
def stub_overlapping_dataframes():
    """Provide two three-row frames that both define a ``col1`` column."""
    left = lazy.pd.DataFrame({"col1": [1, 2, 3]})
    right = lazy.pd.DataFrame({"col1": [4, 5, 6]})
    return {"df1": left, "df2": right}
@pytest.fixture
def stub_different_length_dataframes():
    """Provide two frames with distinct columns but unequal row counts (3 vs 2)."""
    three_rows = lazy.pd.DataFrame({"col1": [1, 2, 3]})
    two_rows = lazy.pd.DataFrame({"col2": [4, 5]})
    return {"df1": three_rows, "df2": two_rows}
@pytest.mark.parametrize(
    "test_case,dataframes_key,expected_result,expected_error",
    [
        (
            "concat_success",
            "stub_sample_dataframes",
            {"col1": [1, 2, 3], "col2": ["a", "b", "c"], "col3": [4, 5, 6], "col4": ["d", "e", "f"]},
            None,
        ),
        ("concat_single_dataset", "stub_sample_dataframes", {"col1": [1, 2, 3]}, None),
        ("overlapping_columns_error", "stub_overlapping_dataframes", None, ValueError),
        ("different_lengths_error", "stub_different_length_dataframes", None, ValueError),
    ],
)
def test_concat_datasets_scenarios(request, test_case, dataframes_key, expected_result, expected_error):
    """Exercise ``concat_datasets`` on success and error scenarios.

    Args:
        request: pytest's fixture request, used to resolve a fixture by name.
        test_case: Scenario label; also selects which frames are passed in.
        dataframes_key: Name of the fixture that supplies the input frames.
        expected_result: Column data of the expected frame, or ``None`` when
            an error is expected.
        expected_error: Exception type the call is expected to raise, or
            ``None`` for a successful concatenation.
    """
    # Resolve the fixture straight from its parametrized name. This avoids one
    # branch per fixture and means an unknown name fails with pytest's own
    # fixture-lookup error instead of an unbound ``datasets`` variable.
    dfs = request.getfixturevalue(dataframes_key)

    # Only the single-dataset scenario passes one frame; every other scenario
    # concatenates the "df1"/"df2" pair from its fixture.
    if test_case == "concat_single_dataset":
        datasets = [dfs["df_single"]]
    else:
        datasets = [dfs["df1"], dfs["df2"]]

    if expected_error is not None:
        with pytest.raises(expected_error):
            concat_datasets(datasets)
        return

    result = concat_datasets(datasets)
    lazy.pd.testing.assert_frame_equal(result, lazy.pd.DataFrame(expected_result))
@patch("data_designer.engine.processing.utils.logger", autospec=True)
def test_concat_datasets_logging(mock_logger, stub_sample_dataframes):
    """Check that concatenating two frames logs exactly one info message."""
    frames = [stub_sample_dataframes[key] for key in ("df1", "df2")]
    concat_datasets(frames)
    mock_logger.info.assert_called_once_with("(💾 + 💾) Concatenating 2 datasets")
@pytest.mark.parametrize(
    "test_case,input_data,expected_result",
    [
        # Plain strings: valid JSON is decoded, anything else is returned as-is.
        (
            "single_string_valid_json",
            '{"key": "value", "number": 42}',
            {"key": "value", "number": 42},
        ),
        (
            "single_string_invalid_json",
            '{"key": "value", "number": 42',
            '{"key": "value", "number": 42',
        ),
        # Lists: each element is handled individually.
        (
            "list_of_strings",
            ['{"a": 1}', '{"b": 2}', "invalid_json"],
            [{"a": 1}, {"b": 2}, "invalid_json"],
        ),
        (
            "list_with_nested_structures",
            ['{"a": 1}', [2, 3], {"c": "d"}],
            [{"a": 1}, [2, 3], {"c": "d"}],
        ),
        # Dicts: values are handled individually, including nested containers.
        (
            "dict_with_json_strings",
            {"json_str": '{"nested": "value"}', "regular_str": "not_json", "number": 42},
            {"json_str": {"nested": "value"}, "regular_str": "not_json", "number": 42},
        ),
        (
            "dict_with_nested_structures",
            {
                "json_str": '{"nested": "value"}',
                "nested_dict": {"inner": '{"deep": "value"}'},
                "nested_list": ['{"item": 1}', 2, 3],
            },
            {
                "json_str": {"nested": "value"},
                "nested_dict": {"inner": {"deep": "value"}},
                "nested_list": [{"item": 1}, 2, 3],
            },
        ),
        # Scalars and empty containers are expected back unchanged.
        ("non_string_non_dict_non_list", 42, 42),
        ("none", None, None),
        ("empty_string", "", ""),
        ("empty_list", [], []),
        ("empty_dict", {}, {}),
    ],
)
def test_deserialize_json_values_scenarios(test_case, input_data, expected_result):
    """Compare ``deserialize_json_values`` output with the expected value per case.

    ``test_case`` is a label for the scenario and is not used in the body.
    """
    assert deserialize_json_values(input_data) == expected_result
@pytest.mark.parametrize(
    "input_string,expected_result",
    [
        # JSON-style array
        ('["a", "b", "c"]', ["a", "b", "c"]),
        # JSON-style array whose items carry surrounding whitespace
        ('[" a ", " b", "c "]', ["a", "b", "c"]),
        # JSON-style array with a trailing comma
        ('["a", "b", "c",]', ["a", "b", "c"]),
        # Python-style list using single quotes
        ("['a', 'b', 'c']", ["a", "b", "c"]),
        # Python-style list with a trailing comma
        ("['a', 'b', 'c', ]", ["a", "b", "c"]),
        # Plain string with trailing whitespace becomes a one-item list
        ("simple string ", ["simple string"]),
    ],
)
def test_parse_list_string_scenarios(input_string, expected_result):
    """Compare ``parse_list_string`` output with the expected list per input."""
    assert parse_list_string(input_string) == expected_result