mirror of
https://github.com/NVIDIA-NeMo/DataDesigner
synced 2026-05-24 09:48:29 +00:00
* separate column configs and types * create plugin object * create plugin manager * fix config integration * make base task registry raise on collision false by default * update registry test after raise on collision default update * make analysis work using general stats calculation * default -> builtin * use entry point approach instead * rewire using plugin helpers * add env var to disable plugins * fix tests * update plugin manager tests * add tests for plugin helpers * update license headers * add emoji * not using the pm in the builder code * Update src/data_designer/plugins/manager.py Co-authored-by: Nabin Mulepati <nmulepati@nvidia.com> * Update src/data_designer/plugins/manager.py Co-authored-by: Nabin Mulepati <nmulepati@nvidia.com> * Update src/data_designer/plugins/manager.py Co-authored-by: Nabin Mulepati <nmulepati@nvidia.com> * merge plugin registry into the manager * small pr feedback * client side plugin manager * builtin -> default; move adding plugins to registry * update method names to better match what they do * use register verb for consistency with other registries * thread safety updates; make discover private --------- Co-authored-by: Nabin Mulepati <nmulepati@nvidia.com>
67 lines
2.8 KiB
Python
67 lines
2.8 KiB
Python
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
|
|
import pandas as pd
|
|
from pydantic import ValidationError
|
|
import pytest
|
|
|
|
from data_designer.config.column_configs import SamplerColumnConfig
|
|
from data_designer.config.column_types import DataDesignerColumnType
|
|
from data_designer.config.sampler_params import SamplerType
|
|
from data_designer.engine.analysis.column_profilers.base import (
|
|
ColumnConfigWithDataFrame,
|
|
ColumnProfilerMetadata,
|
|
)
|
|
|
|
|
|
def test_column_config_with_dataframe_valid_column_config_with_dataframe():
    """Wrap a config whose column is present in the DataFrame.

    Checks that the wrapper exposes the config's name and that the wrapped
    DataFrame still holds the column with its original values.
    """
    column_name = "test_column"
    sampler_config = SamplerColumnConfig(name=column_name, sampler_type=SamplerType.CATEGORY, params={})
    frame = pd.DataFrame({column_name: [1, 2, 3]})

    wrapped = ColumnConfigWithDataFrame(column_config=sampler_config, df=frame)

    assert wrapped.column_config.name == "test_column"
    assert "test_column" in wrapped.df.columns
    assert wrapped.df["test_column"].tolist() == [1, 2, 3]
|
|
|
|
|
|
def test_column_config_with_dataframe_column_not_found_validation_error():
    """Expect a ValidationError when the DataFrame lacks the configured column."""
    sampler_config = SamplerColumnConfig(name="test_column", sampler_type=SamplerType.CATEGORY, params={})
    # The frame deliberately carries a different column name than the config.
    mismatched_frame = pd.DataFrame({"other_column": [1, 2, 3]})
    expected_message = "Column 'test_column' not found in DataFrame"

    with pytest.raises(ValidationError, match=expected_message):
        ColumnConfigWithDataFrame(column_config=sampler_config, df=mismatched_frame)
|
|
|
|
|
|
def test_column_config_with_dataframe_pyarrow_backend_conversion():
    """Check that every dtype of the wrapped DataFrame is a ``pd.ArrowDtype``."""
    sampler_config = SamplerColumnConfig(name="test_column", sampler_type=SamplerType.CATEGORY, params={})
    frame = pd.DataFrame({"test_column": [1, 2, 3]})

    wrapped = ColumnConfigWithDataFrame(column_config=sampler_config, df=frame)

    # No column of the wrapped frame may carry a non-Arrow dtype.
    assert not any(not isinstance(dtype, pd.ArrowDtype) for dtype in wrapped.df.dtypes)
|
|
|
|
|
|
def test_column_config_with_dataframe_as_tuple_method():
    """Check that ``as_tuple`` unpacks into the config and a DataFrame with the same values."""
    sampler_config = SamplerColumnConfig(name="test_column", sampler_type=SamplerType.CATEGORY, params={})
    frame = pd.DataFrame({"test_column": [1, 2, 3]})

    wrapped = ColumnConfigWithDataFrame(column_config=sampler_config, df=frame)
    returned_config, returned_frame = wrapped.as_tuple()

    assert returned_config == sampler_config

    # Compare plain Python lists so the check is independent of the dtype backend.
    returned_values = returned_frame["test_column"].tolist()
    source_values = frame["test_column"].tolist()
    assert returned_values == source_values
|
|
|
|
|
|
def test_column_profiler_metadata_creation():
    """Build a ``ColumnProfilerMetadata`` and check its fields read back as given."""
    expected_types = [DataDesignerColumnType.SAMPLER, DataDesignerColumnType.LLM_TEXT]
    creation_kwargs = {
        "name": "test_profiler",
        "description": "Test profiler",
        # Pass a copy so the comparison below is against an independent list.
        "applicable_column_types": list(expected_types),
        "required_resources": None,
    }

    metadata = ColumnProfilerMetadata(**creation_kwargs)

    assert metadata.name == "test_profiler"
    assert metadata.description == "Test profiler"
    assert metadata.applicable_column_types == expected_types
|