DataDesigner/tests_e2e/tests/test_e2e.py
Andre Manoel 982ce79ca9
feat: add processor plugin support (#299)
* feat: add processor plugin support

Add PluginType.PROCESSOR to the plugin system, enabling third-party
processor plugins via entry points. Includes a demo plugin package
with RegexFilterProcessor (process_before_batch) and
SemanticDedupProcessor (process_after_generation).

- Add PluginType.PROCESSOR with processor_type discriminator
- Create processor_types.py for ProcessorConfigT with plugin injection
- Register plugin processors in engine ProcessorRegistry
- Use RLock in PluginRegistry to prevent deadlocks during discovery
- Add demo package: data-designer-demo-processors
- Update processor and plugin documentation

* test: add processor plugin registration test

Verify that processor plugins from PluginRegistry are picked up
by create_default_processor_registry and registered correctly.

* test: simplify processor plugin registration test

* move ProcessorConfig to base and convert demo to e2e test

- Move ProcessorConfig from processors.py to config.base to guard
  against circular deps (alongside SingleColumnConfig)
- Delete demo/ directory with regex_filter and semantic_dedup plugins
- Add regex_filter as an e2e processor plugin test in tests_e2e/

* move plan to plans/299/
2026-02-25 16:40:01 -03:00

102 lines
3.2 KiB
Python

# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from pathlib import Path
import pandas as pd
import data_designer.config as dd
from data_designer.interface import DataDesigner
from data_designer_e2e_tests.plugins.column_generator.config import DemoColumnGeneratorConfig
from data_designer_e2e_tests.plugins.regex_filter.config import RegexFilterProcessorConfig
from data_designer_e2e_tests.plugins.seed_reader.config import DemoSeedSource
def test_column_generator_plugin() -> None:
    """Preview a config that uses the demo column-generator plugin.

    A ``DemoColumnGeneratorConfig`` column named ``upper`` is added with the
    text ``"hello world"``; the test passes when the set of previewed values
    in that column is exactly ``{"HELLO WORLD"}``.
    """
    designer = DataDesigner()
    builder = dd.DataDesignerConfigBuilder()

    # Temporary workaround for https://github.com/NVIDIA-NeMo/DataDesigner/issues/4:
    # a sampler column has to be present even though its values are never checked.
    placeholder_column = dd.SamplerColumnConfig(
        name="irrelevant",
        sampler_type=dd.SamplerType.CATEGORY,
        params=dd.CategorySamplerParams(values=["irrelevant"]),
    )
    builder.add_column(placeholder_column)

    # The plugin-provided column under test.
    plugin_column = DemoColumnGeneratorConfig(name="upper", text="hello world")
    builder.add_column(plugin_column)

    result = designer.preview(builder)
    assert set(result.dataset["upper"].values) == {"HELLO WORLD"}
def test_seed_reader_plugin() -> None:
    """Preview a config seeded through the demo seed-reader plugin.

    The seed source is a ``DemoSeedSource`` given this file's directory and
    the filename ``test_seed.csv``. An expression column joins the
    ``first_name`` and ``last_name`` fields with `` + ``, and the test checks
    the resulting set of ``full_name`` values.
    """
    tests_dir = Path(__file__).parent
    designer = DataDesigner()
    builder = dd.DataDesignerConfigBuilder()

    seed_source = DemoSeedSource(directory=str(tests_dir), filename="test_seed.csv")
    builder.with_seed_dataset(seed_source)

    # Temporary workaround for https://github.com/NVIDIA-NeMo/DataDesigner/issues/4:
    # a sampler column has to be present even though its values are never checked.
    placeholder_column = dd.SamplerColumnConfig(
        name="irrelevant",
        sampler_type=dd.SamplerType.CATEGORY,
        params=dd.CategorySamplerParams(values=["irrelevant"]),
    )
    builder.add_column(placeholder_column)

    # Derived column that references the seed fields by name.
    full_name_column = dd.ExpressionColumnConfig(
        name="full_name",
        expr="{{ first_name }} + {{ last_name }}",
    )
    builder.add_column(full_name_column)

    result = designer.preview(builder)
    expected_names = {"John + Coltrane", "Miles + Davis", "Bill + Evans"}
    assert set(result.dataset["full_name"].values) == expected_names
def test_processor_plugin() -> None:
    """Preview a config that applies the regex-filter processor plugin.

    The in-memory seed frame has four rows whose ``category`` alternates
    between ``"keep"`` and ``"drop"``. A ``RegexFilterProcessorConfig`` with
    the pattern ``^keep$`` on that column is registered, and the preview must
    be non-empty with every ``category`` value equal to ``"keep"``.
    """
    seed_columns = {
        "category": ["keep", "drop", "keep", "drop"],
        "value": ["a", "b", "c", "d"],
    }
    seed_frame = pd.DataFrame(seed_columns)

    designer = DataDesigner()
    builder = dd.DataDesignerConfigBuilder()
    builder.with_seed_dataset(dd.DataFrameSeedSource(df=seed_frame))

    # NOTE(review): same placeholder sampler column as the other plugin tests in
    # this file; presumably the same temporary workaround applies. TODO confirm.
    placeholder_column = dd.SamplerColumnConfig(
        name="irrelevant",
        sampler_type=dd.SamplerType.CATEGORY,
        params=dd.CategorySamplerParams(values=["irrelevant"]),
    )
    builder.add_column(placeholder_column)

    # The plugin-provided processor under test.
    keep_filter = RegexFilterProcessorConfig(
        name="keep_only",
        column="category",
        pattern="^keep$",
    )
    builder.add_processor(keep_filter)

    result = designer.preview(builder)

    # Check non-emptiness first so the all() below cannot pass vacuously.
    assert len(result.dataset) > 0
    categories = result.dataset["category"].values
    assert all(category == "keep" for category in categories)