DataDesigner/tests/interface/test_data_designer.py
Johnny Greco f8c201e085
chore: update header script to check for diffs (#195)
* update script

* update headers

* refactor a bit and add test script

* update headers

* update for edge case

* update headers

* add step to get file creation date

* use git history to get copyright year

* generation type is printed with inference parameters

* fix unit test
2026-01-09 17:10:58 -05:00

381 lines
15 KiB
Python

# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from pathlib import Path
from unittest.mock import MagicMock, patch
import pandas as pd
import pytest
from data_designer.config.column_configs import SamplerColumnConfig
from data_designer.config.config_builder import DataDesignerConfigBuilder
from data_designer.config.dataset_builders import BuildStage
from data_designer.config.errors import InvalidConfigError
from data_designer.config.models import ModelProvider
from data_designer.config.processors import DropColumnsProcessorConfig
from data_designer.config.run_config import RunConfig
from data_designer.config.sampler_params import CategorySamplerParams, SamplerType
from data_designer.config.seed_source import HuggingFaceSeedSource
from data_designer.engine.secret_resolver import CompositeResolver, EnvironmentResolver, PlaintextResolver
from data_designer.interface.data_designer import DataDesigner
from data_designer.interface.errors import (
DataDesignerGenerationError,
DataDesignerProfilingError,
InvalidBufferValueError,
)
@pytest.fixture
def stub_artifact_path(tmp_path):
    """Return a path named ``artifacts`` beneath pytest's ``tmp_path``.

    Only the path is built here; the directory itself is not created.
    """
    artifacts_dir = tmp_path / "artifacts"
    return artifacts_dir
@pytest.fixture
def stub_managed_assets_path(tmp_path):
    """Create a ``managed-assets`` directory beneath ``tmp_path`` and return it."""
    assets_dir = tmp_path.joinpath("managed-assets")
    # exist_ok/parents keep the call from failing if the directory (or a parent) is already there.
    assets_dir.mkdir(parents=True, exist_ok=True)
    return assets_dir
@pytest.fixture
def stub_model_providers():
    """Return a one-element list containing a stub ``ModelProvider``."""
    stub_provider = ModelProvider(
        name="stub-model-provider",
        endpoint="https://api.stub-model-provider.com/v1",
        api_key="stub-model-provider-api-key",
    )
    return [stub_provider]
def test_init_with_custom_secret_resolver(stub_artifact_path, stub_model_providers):
    """DataDesigner can be constructed with an explicitly supplied secret resolver."""
    custom_resolver = PlaintextResolver()

    designer = DataDesigner(
        artifact_path=stub_artifact_path,
        model_providers=stub_model_providers,
        secret_resolver=custom_resolver,
    )

    assert designer is not None
def test_init_with_default_composite_secret_resolver(stub_artifact_path, stub_model_providers):
    """Without an explicit resolver, DataDesigner exposes a ``CompositeResolver``.

    The composite is expected to hold exactly two resolvers, in order:
    an ``EnvironmentResolver`` followed by a ``PlaintextResolver``.
    """
    designer = DataDesigner(artifact_path=stub_artifact_path, model_providers=stub_model_providers)
    assert designer is not None

    default_resolver = designer.secret_resolver
    assert isinstance(default_resolver, CompositeResolver)

    # Inspect the composite's children: both their count and their order matter.
    children = default_resolver.resolvers
    assert len(children) == 2
    assert isinstance(children[0], EnvironmentResolver)
    assert isinstance(children[1], PlaintextResolver)
def test_init_with_string_path(stub_artifact_path, stub_model_providers):
    """A ``str`` artifact path is accepted and stored as a ``Path``."""
    path_as_text = str(stub_artifact_path)

    designer = DataDesigner(artifact_path=path_as_text, model_providers=stub_model_providers)

    assert designer is not None
    assert isinstance(designer._artifact_path, Path)
def test_init_with_path_object(stub_artifact_path, stub_model_providers):
    """A ``Path`` artifact path is accepted by the DataDesigner constructor."""
    init_kwargs = {
        "artifact_path": stub_artifact_path,
        "model_providers": stub_model_providers,
    }
    designer = DataDesigner(**init_kwargs)
    assert designer is not None
def test_buffer_size_setting_persists(stub_artifact_path, stub_model_providers):
    """Each ``set_buffer_size`` call is reflected in ``_buffer_size``."""
    data_designer = DataDesigner(artifact_path=stub_artifact_path, model_providers=stub_model_providers)

    # Set two different sizes back to back; the stored value must follow the latest call.
    for requested_size in (500, 750):
        data_designer.set_buffer_size(requested_size)
        assert data_designer._buffer_size == requested_size
def test_set_buffer_size_raises_error_for_invalid_buffer_size(stub_artifact_path, stub_model_providers):
    """``set_buffer_size(0)`` raises ``InvalidBufferValueError``."""
    data_designer = DataDesigner(artifact_path=stub_artifact_path, model_providers=stub_model_providers)

    # ``match`` is applied as a regular-expression search against the error text.
    expected_message = "Buffer size must be greater than 0."
    with pytest.raises(InvalidBufferValueError, match=expected_message):
        data_designer.set_buffer_size(0)
def test_run_config_setting_persists(stub_artifact_path, stub_model_providers):
    """``set_run_config`` values are reflected in ``_run_config``.

    Checks the default values first, then two successive overrides.
    """
    data_designer = DataDesigner(artifact_path=stub_artifact_path, model_providers=stub_model_providers)

    # Defaults, before any explicit configuration.
    initial = data_designer._run_config
    assert initial.disable_early_shutdown is False
    assert initial.shutdown_error_rate == 0.5
    assert initial.shutdown_error_window == 10

    # First override: early shutdown disabled.
    first_override = RunConfig(
        disable_early_shutdown=True,
        shutdown_error_rate=0.8,
        shutdown_error_window=25,
    )
    data_designer.set_run_config(first_override)
    after_first = data_designer._run_config
    assert after_first.disable_early_shutdown is True
    assert after_first.shutdown_error_rate == 1.0  # normalized when disabled
    assert after_first.shutdown_error_window == 25

    # Second override: early shutdown re-enabled with new thresholds.
    second_override = RunConfig(
        disable_early_shutdown=False,
        shutdown_error_rate=0.3,
        shutdown_error_window=5,
    )
    data_designer.set_run_config(second_override)
    after_second = data_designer._run_config
    assert after_second.disable_early_shutdown is False
    assert after_second.shutdown_error_rate == 0.3
    assert after_second.shutdown_error_window == 5
def test_run_config_normalizes_error_rate_when_disabled(stub_artifact_path, stub_model_providers):
    """``shutdown_error_rate`` reads back as 1.0 once early shutdown is disabled."""
    data_designer = DataDesigner(artifact_path=stub_artifact_path, model_providers=stub_model_providers)

    # Early shutdown enabled: the configured rate is kept as given.
    enabled_config = RunConfig(disable_early_shutdown=False, shutdown_error_rate=0.7)
    data_designer.set_run_config(enabled_config)
    assert data_designer._run_config.shutdown_error_rate == 0.7

    # Early shutdown disabled: the same configured rate is normalized to 1.0.
    disabled_config = RunConfig(disable_early_shutdown=True, shutdown_error_rate=0.7)
    data_designer.set_run_config(disabled_config)
    assert data_designer._run_config.shutdown_error_rate == 1.0
def test_create_dataset_e2e_using_only_sampler_columns(
    stub_sampler_only_config_builder, stub_artifact_path, stub_model_providers, stub_managed_assets_path
):
    """Run ``create`` end to end with a sampler-only config and inspect the results.

    Verifies the generated dataset's row count and column set, then exercises
    sample-record display and analysis report rendering to confirm they run
    without raising.
    """
    expected_columns = {column.name for column in stub_sampler_only_config_builder.get_column_configs()}
    num_records = 3

    data_designer = DataDesigner(
        artifact_path=stub_artifact_path,
        model_providers=stub_model_providers,
        secret_resolver=PlaintextResolver(),
        managed_assets_path=stub_managed_assets_path,
    )
    results = data_designer.create(stub_sampler_only_config_builder, num_records=num_records)

    dataset = results.load_dataset()
    assert len(dataset) == num_records
    assert set(dataset.columns) == expected_columns

    # Call display more times than there are records; it must not raise.
    display_calls = num_records + 2
    for _ in range(display_calls):
        results.display_sample_record()

    analysis = results.load_analysis()
    assert analysis.target_num_records == num_records

    # Rendering the report must not raise.
    analysis.to_report()
def test_create_raises_error_when_builder_fails(
    stub_artifact_path, stub_model_providers, stub_sampler_only_config_builder, stub_managed_assets_path
):
    """``create`` raises ``DataDesignerGenerationError`` when ``builder.build`` fails.

    The dataset builder factory is patched so that ``build`` raises a
    ``RuntimeError``; the test expects that failure to surface as a
    ``DataDesignerGenerationError`` whose message includes the original text.
    """
    data_designer = DataDesigner(
        artifact_path=stub_artifact_path,
        model_providers=stub_model_providers,
        secret_resolver=PlaintextResolver(),
        managed_assets_path=stub_managed_assets_path,
    )

    with patch.object(data_designer, "_create_dataset_builder") as mock_builder_method:
        # Replace the real builder with a mock whose build() always fails.
        mock_builder = MagicMock()
        mock_builder.build.side_effect = RuntimeError("Builder failed")
        mock_builder_method.return_value = mock_builder

        with pytest.raises(DataDesignerGenerationError, match="🛑 Error generating dataset: Builder failed"):
            data_designer.create(stub_sampler_only_config_builder, num_records=3)
def test_create_raises_error_when_profiler_fails(
    stub_artifact_path, stub_model_providers, stub_sampler_only_config_builder, stub_managed_assets_path
):
    """``create`` raises ``DataDesignerProfilingError`` when ``profiler.profile_dataset`` fails.

    The builder is mocked to succeed and the profiler is mocked to raise a
    ``ValueError``; the test expects that failure to surface as a
    ``DataDesignerProfilingError`` whose message includes the original text.
    """
    data_designer = DataDesigner(
        artifact_path=stub_artifact_path,
        model_providers=stub_model_providers,
        secret_resolver=PlaintextResolver(),
        managed_assets_path=stub_managed_assets_path,
    )

    with (
        patch.object(data_designer, "_create_dataset_builder") as mock_builder_method,
        patch.object(data_designer, "_create_dataset_profiler") as mock_profiler_method,
    ):
        # Mock builder to succeed and hand back a small dataset for profiling.
        mock_builder = MagicMock()
        mock_builder.build.return_value = None
        mock_builder.artifact_storage.load_dataset_with_dropped_columns.return_value = pd.DataFrame(
            {"col": [1, 2, 3]}
        )
        mock_builder_method.return_value = mock_builder

        # Mock profiler to fail.
        mock_profiler = MagicMock()
        mock_profiler.profile_dataset.side_effect = ValueError("Profiler failed")
        mock_profiler_method.return_value = mock_profiler

        with pytest.raises(DataDesignerProfilingError, match="🛑 Error profiling dataset: Profiler failed"):
            data_designer.create(stub_sampler_only_config_builder, num_records=3)
def test_preview_raises_error_when_builder_fails(
    stub_artifact_path, stub_model_providers, stub_sampler_only_config_builder, stub_managed_assets_path
):
    """``preview`` raises ``DataDesignerGenerationError`` when ``builder.build_preview`` fails.

    The dataset builder factory is patched so that ``build_preview`` raises a
    ``RuntimeError``; the test expects that failure to surface as a
    ``DataDesignerGenerationError`` whose message includes the original text.
    """
    data_designer = DataDesigner(
        artifact_path=stub_artifact_path,
        model_providers=stub_model_providers,
        secret_resolver=PlaintextResolver(),
        managed_assets_path=stub_managed_assets_path,
    )

    with patch.object(data_designer, "_create_dataset_builder") as mock_builder_method:
        # Replace the real builder with a mock whose build_preview() always fails.
        mock_builder = MagicMock()
        mock_builder.build_preview.side_effect = RuntimeError("Builder preview failed")
        mock_builder_method.return_value = mock_builder

        with pytest.raises(
            DataDesignerGenerationError, match="🛑 Error generating preview dataset: Builder preview failed"
        ):
            data_designer.preview(stub_sampler_only_config_builder, num_records=3)
def test_preview_raises_error_when_profiler_fails(
    stub_artifact_path, stub_model_providers, stub_sampler_only_config_builder, stub_managed_assets_path
):
    """``preview`` raises ``DataDesignerProfilingError`` when ``profiler.profile_dataset`` fails.

    The builder's preview steps are mocked to succeed and the profiler is
    mocked to raise a ``ValueError``; the test expects that failure to surface
    as a ``DataDesignerProfilingError`` whose message includes the original text.
    """
    data_designer = DataDesigner(
        artifact_path=stub_artifact_path,
        model_providers=stub_model_providers,
        secret_resolver=PlaintextResolver(),
        managed_assets_path=stub_managed_assets_path,
    )

    with (
        patch.object(data_designer, "_create_dataset_builder") as mock_builder_method,
        patch.object(data_designer, "_create_dataset_profiler") as mock_profiler_method,
    ):
        # Mock builder to succeed: both preview steps return a small DataFrame.
        mock_builder = MagicMock()
        mock_builder.build_preview.return_value = pd.DataFrame({"col": [1, 2, 3]})
        mock_builder.process_preview.return_value = pd.DataFrame({"col": [1, 2, 3]})
        mock_builder_method.return_value = mock_builder

        # Mock profiler to fail.
        mock_profiler = MagicMock()
        mock_profiler.profile_dataset.side_effect = ValueError("Profiler failed in preview")
        mock_profiler_method.return_value = mock_profiler

        with pytest.raises(
            DataDesignerProfilingError, match="🛑 Error profiling preview dataset: Profiler failed in preview"
        ):
            data_designer.preview(stub_sampler_only_config_builder, num_records=3)
def test_preview_with_dropped_columns(
    stub_artifact_path, stub_model_providers, stub_model_configs, stub_managed_assets_path
):
    """Preview output and analysis agree on which columns a drop processor removed.

    Builds a config with three sampler columns and a POST_BATCH processor that
    drops ``category``. The test then checks that ``category`` is absent from
    both the preview dataset and the per-column statistics, while still being
    listed in ``side_effect_column_names``.
    """
    config_builder = DataDesignerConfigBuilder(model_configs=stub_model_configs)

    # Each spec becomes one SamplerColumnConfig, added in this order.
    sampler_specs = (
        {
            "name": "uuid",
            "sampler_type": "uuid",
            "params": {"prefix": "id_", "short_form": True, "uppercase": False},
        },
        {"name": "category", "sampler_type": "category", "params": {"values": ["a", "b", "c"]}},
        {"name": "uniform", "sampler_type": "uniform", "params": {"low": 1, "high": 100}},
    )
    for spec in sampler_specs:
        config_builder.add_column(SamplerColumnConfig(**spec))

    drop_processor = DropColumnsProcessorConfig(
        name="drop_columns_processor", build_stage=BuildStage.POST_BATCH, column_names=["category"]
    )
    config_builder.add_processor(drop_processor)

    data_designer = DataDesigner(
        artifact_path=stub_artifact_path,
        model_providers=stub_model_providers,
        secret_resolver=PlaintextResolver(),
        managed_assets_path=stub_managed_assets_path,
    )

    num_records = 5
    preview_results = data_designer.preview(config_builder, num_records=num_records)

    # The dataset keeps the surviving columns and loses the dropped one.
    preview_dataset = preview_results.dataset
    dataset_columns = preview_dataset.columns
    assert "category" not in dataset_columns, "Dropped column 'category' should not be in preview dataset"
    assert "uuid" in dataset_columns, "Column 'uuid' should be in preview dataset"
    assert "uniform" in dataset_columns, "Column 'uniform' should be in preview dataset"
    assert len(preview_dataset) == num_records, f"Preview dataset should have {num_records} records"

    # The analysis statistics must mirror the dataset's column set.
    analysis = preview_results.analysis
    assert analysis is not None, "Analysis should be generated"

    analyzed_columns = [stat.column_name for stat in analysis.column_statistics]
    assert "uuid" in analyzed_columns, "Column 'uuid' should be in analysis"
    assert "uniform" in analyzed_columns, "Column 'uniform' should be in analysis"
    assert "category" not in analyzed_columns, "Dropped column 'category' should not be in analysis statistics"

    # The dropped column is still recorded as a side-effect column.
    assert analysis.side_effect_column_names is not None, "Side effect column names should be tracked"
    assert "category" in analysis.side_effect_column_names, (
        "Dropped column 'category' should be tracked in side_effect_column_names"
    )
def test_validate_raises_error_when_seed_collides(
    stub_artifact_path,
    stub_model_providers,
    stub_model_configs,
    stub_managed_assets_path,
    stub_seed_reader,
):
    """``validate`` raises ``InvalidConfigError`` for a seed dataset plus a ``city`` column.

    NOTE(review): judging by the test name, ``city`` presumably collides with a
    column supplied by ``stub_seed_reader``; that fixture is defined outside
    this file, so confirm against its definition.
    """
    config_builder = DataDesignerConfigBuilder(model_configs=stub_model_configs)

    seed_source = HuggingFaceSeedSource(path="hf://datasets/test/data.csv")
    config_builder.with_seed_dataset(seed_source)

    city_column = SamplerColumnConfig(
        name="city",
        sampler_type=SamplerType.CATEGORY,
        params=CategorySamplerParams(values=["new york", "los angeles"]),
    )
    config_builder.add_column(city_column)

    data_designer = DataDesigner(
        artifact_path=stub_artifact_path,
        model_providers=stub_model_providers,
        secret_resolver=PlaintextResolver(),
        managed_assets_path=stub_managed_assets_path,
        seed_readers=[stub_seed_reader],
    )

    with pytest.raises(InvalidConfigError):
        data_designer.validate(config_builder)