mirror of
https://github.com/NVIDIA-NeMo/DataDesigner
synced 2026-05-24 09:48:29 +00:00
* update script * update headers * refactor a bit and add test script * update headers * update for edge case * update headers * add step to get file creation date * use git history to get copyright year * generation type is printed with inference parameters * fix unit test
318 lines
11 KiB
Python
318 lines
11 KiB
Python
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
|
|
import os
|
|
import tarfile
|
|
import tempfile
|
|
import textwrap
|
|
|
|
import pandas as pd
|
|
import pytest
|
|
import yaml
|
|
|
|
from data_designer.config.analysis.column_statistics import GeneralColumnStatistics
|
|
from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
|
|
from data_designer.config.column_configs import SamplerColumnConfig
|
|
from data_designer.config.config_builder import DataDesignerConfigBuilder
|
|
from data_designer.config.data_designer_config import DataDesignerConfig
|
|
from data_designer.config.models import ChatCompletionInferenceParams, ModelConfig, ModelProvider
|
|
from data_designer.config.seed_source import HuggingFaceSeedSource
|
|
from data_designer.engine.resources.seed_reader import SeedReader
|
|
|
|
|
|
@pytest.fixture
def stub_data_designer_config_str() -> str:
    """Return a YAML document with a sample Data Designer configuration.

    The document declares one model config (alias ``my_own_code_model``), a
    seed source of type ``hf``, a list of sampler / llm-text / llm-code /
    validation / llm-judge columns, and one constraint on the ``age`` column.
    """
    # NOTE(review): the nesting of the YAML below was reconstructed from a view of
    # this file whose leading whitespace had been stripped -- confirm against upstream.
    return """
model_configs:
  - alias: my_own_code_model
    model: openai/meta/llama-3.3-70b-instruct
    inference_parameters:
      temperature:
        distribution_type: uniform
        params:
          low: 0.5
          high: 0.9
      top_p:
        distribution_type: manual
        params:
          values: [0.1, 0.2, 0.33]
          weights: [0.3, 0.2, 0.50]

seed_config:
  source:
    seed_type: hf
    path: datasets/test-repo/testing/data.csv
  sampling_strategy: shuffle

columns:
  - name: code_id
    sampler_type: uuid
    column_type: sampler
    params:
      prefix: code_
      short_form: true
      uppercase: true
  - name: age
    sampler_type: uniform
    column_type: sampler
    params:
      low: 35
      high: 88
  - name: domain
    sampler_type: category
    column_type: sampler
    params:
      values: [Healthcare, Finance, Education, Government]
  - name: topic
    sampler_type: category
    column_type: sampler
    params:
      values: [Web Development, Data Science, Machine Learning, Cloud Computing]
  - name: text
    column_type: llm-text
    prompt: Write a description of python code in topic {topic} and domain {domain}
    model_alias: my_own_code_model
  - name: code
    column_type: llm-code
    prompt: Write Python code that will be paired with the following prompt {text}
    model_alias: my_own_code_model
    code_lang: python
  - name: code_validation_result
    column_type: validation
    target_columns:
      - code
    validator_type: code
    validator_params:
      code_lang: python
  - name: code_judge_result
    model_alias: my_own_code_model
    column_type: llm-judge
    prompt: You are an expert in Python programming and make appropriate judgement on the quality of the code.
    scores:
      - name: Pythonic
        description: Pythonic Code and Best Practices (Does the code follow Python conventions and best practices?)
        options:
          "4": The code exemplifies Pythonic principles, making excellent use of Python-specific constructs, standard library modules and programming idioms; follows all relevant PEPs.
          "3": The code closely follows Python conventions and adheres to many best practices; good use of Python-specific constructs, standard library modules and programming idioms.
          "2": The code generally follows Python conventions but has room for better alignment with Pythonic practices.
          "1": The code loosely follows Python conventions, with several deviations from best practices.
          "0": The code does not follow Python conventions or best practices, using non-Pythonic approaches.
      - name: Readability
        description: Readability and Maintainability (Is the Python code easy to understand and maintain?)
        options:
          "4": The code is excellently formatted, follows PEP 8 guidelines, is elegantly concise and clear, uses meaningful variable names, ensuring high readability and ease of maintenance; organizes complex logic well. Docstrings are given in a Google Docstring format.
          "3": The code is well-formatted in the sense of code-as-documentation, making it relatively easy to understand and maintain; uses descriptive names and organizes logic clearly.
          "2": The code is somewhat readable with basic formatting and some comments, but improvements are needed; needs better use of descriptive names and organization.
          "1": The code has minimal formatting, making it hard to understand; lacks meaningful names and organization.
          "0": The code is unreadable, with no attempt at formatting or description.

constraints:
  - target_column: age
    operator: "lt"
    rhs: 65
"""
|
|
|
|
|
|
@pytest.fixture
def stub_data_designer_builder_config_str(stub_data_designer_config_str: str) -> str:
    """Return the config YAML nested under a top-level ``data_designer`` key.

    Args:
        stub_data_designer_config_str: YAML text to place under ``data_designer``.

    Returns:
        A YAML document whose single top-level key is ``data_designer``.
    """
    # Indenting the inner document is what makes it a child of ``data_designer``.
    nested_config = textwrap.indent(stub_data_designer_config_str, prefix=" ")
    return "\ndata_designer:\n" + nested_config + "\n"
|
|
|
|
|
|
@pytest.fixture
def stub_data_designer_config(stub_data_designer_config_str: str) -> DataDesignerConfig:
    """Parse the YAML config text and validate it into a ``DataDesignerConfig``.

    Args:
        stub_data_designer_config_str: YAML text of the configuration.

    Returns:
        The object produced by ``DataDesignerConfig.model_validate``.
    """
    parsed_yaml = yaml.safe_load(stub_data_designer_config_str)
    validated_config = DataDesignerConfig.model_validate(parsed_yaml)
    return validated_config
|
|
|
|
|
|
@pytest.fixture
def stub_model_configs() -> list[ModelConfig]:
    """Return a one-element list holding a ``stub-model`` model config."""
    inference_params = ChatCompletionInferenceParams(
        temperature=0.9,
        top_p=0.9,
        max_tokens=2048,
    )
    stub_model = ModelConfig(
        alias="stub-model",
        model="stub-model",
        inference_parameters=inference_params,
    )
    return [stub_model]
|
|
|
|
|
|
@pytest.fixture
def stub_model_providers() -> list[ModelProvider]:
    """Return a one-element list holding the ``provider-1`` model provider."""
    provider = ModelProvider(
        name="provider-1",
        endpoint="https://api.provider-1.com/v1",
        api_key="PROVIDER_1_API_KEY",
    )
    return [provider]
|
|
|
|
|
|
@pytest.fixture
def stub_empty_builder(stub_model_configs: list[ModelConfig]) -> DataDesignerConfigBuilder:
    """Return a config builder created with only the stub model configs.

    Args:
        stub_model_configs: Model configs passed to the builder constructor.
    """
    empty_builder = DataDesignerConfigBuilder(model_configs=stub_model_configs)
    return empty_builder
|
|
|
|
|
|
@pytest.fixture
def stub_complete_builder(stub_data_designer_builder_config_str: str) -> DataDesignerConfigBuilder:
    """Return a config builder created from the builder-level YAML text.

    Args:
        stub_data_designer_builder_config_str: YAML text handed to
            ``DataDesignerConfigBuilder.from_config``.
    """
    complete_builder = DataDesignerConfigBuilder.from_config(config=stub_data_designer_builder_config_str)
    return complete_builder
|
|
|
|
|
|
@pytest.fixture
def stub_dataframe():
    """Return a 12-row ``pandas.DataFrame`` of sample person records.

    Columns, in order: ``name``, ``age``, ``city``, ``state``, ``zip``,
    ``email``, ``phone`` and ``address``. Only ``age`` holds integers; every
    other column holds strings.
    """
    names = ["John", "Jane", "Jim", "Jill", "Mike", "Mary", "Mark", "Martha", "Alex", "Alice", "Bob", "Bella"]
    ages = [25, 30, 35, 40, 45, 50, 55, 60, 22, 28, 65, 38]
    cities = [
        "New York",
        "Los Angeles",
        "Chicago",
        "Houston",
        "Miami",
        "Seattle",
        "San Francisco",
        "Boston",
        "Denver",
        "Austin",
        "Portland",
        "Atlanta",
    ]
    states = ["NY", "CA", "IL", "TX", "FL", "WA", "CA", "MA", "CO", "TX", "OR", "GA"]
    # Zip codes are kept as strings so leading zeros (e.g. "02101") survive.
    zip_codes = [
        "10001",
        "90001",
        "60601",
        "77001",
        "33101",
        "98101",
        "94101",
        "02101",
        "80201",
        "73301",
        "97201",
        "30301",
    ]
    emails = [
        "john@example.com",
        "jane@example.com",
        "jim@example.com",
        "jill@example.com",
        "mike@example.com",
        "mary@example.com",
        "mark@example.com",
        "martha@example.com",
        "alex.smith@example.co.uk",
        "alice.wu@example.ca",
        "bob.martin@example.org",
        "bella.rossi@example.it",
    ]
    phones = [
        "123-456-7890",
        "213-555-1234",
        "312-222-3333",
        "713-444-5555",
        "305-888-9999",
        "206-777-8888",
        "415-999-0000",
        "617-111-2222",
        "+44 20 7946 0958",
        "+1-416-555-0199",
        "+39 06 6982 1234",
        "+49 30 123456",
    ]
    addresses = [
        "123 Main St",
        "456 Oak Ave",
        "789 Pine Rd",
        "101 Maple Blvd",
        "202 Elm St",
        "303 Cedar Ave",
        "404 Spruce Dr",
        "505 Birch Ln",
        "12 Baker St",
        "88 King St W",
        "Via Roma 1",
        "Unter den Linden 5",
    ]

    # Dict insertion order fixes the column order of the resulting frame.
    columns = {
        "name": names,
        "age": ages,
        "city": cities,
        "state": states,
        "zip": zip_codes,
        "email": emails,
        "phone": phones,
        "address": addresses,
    }
    return pd.DataFrame(columns)
|
|
|
|
|
|
@pytest.fixture
def stub_dataset_tar_file():
    """Yield an open binary handle to an archive containing two parquet files.

    Two small frames are written as ``dataset/dataset-001.parquet`` and
    ``dataset/dataset-002.parquet`` inside a temporary directory, packed into
    ``dataset.tar``, and the archive is opened in ``"rb"`` mode for the test.
    The handle is closed and the temporary directory removed once the
    generator resumes after the ``yield``.
    """
    parquet_frames = {
        "dataset-001.parquet": pd.DataFrame({"id": ["1", "2"], "name": ["test", "sample"]}),
        "dataset-002.parquet": pd.DataFrame({"id": ["3", "4"], "name": ["data", "example"]}),
    }

    with tempfile.TemporaryDirectory() as temp_dir:
        # Write real parquet files so the archive holds valid data.
        dataset_dir = f"{temp_dir}/dataset"
        os.makedirs(dataset_dir, exist_ok=True)
        for file_name, frame in parquet_frames.items():
            frame.to_parquet(f"{dataset_dir}/{file_name}", index=False)

        archive_path = f"{temp_dir}/dataset.tar"
        # The archive is written gzip-compressed ("w:gz") even though it is named ".tar".
        with tarfile.open(archive_path, "w:gz") as archive:
            for file_name in parquet_frames:
                archive.add(f"{dataset_dir}/{file_name}", arcname=f"dataset/{file_name}")

        with open(archive_path, "rb") as tar_file:
            yield tar_file
|
|
|
|
|
|
@pytest.fixture
def stub_dataset_profiler_results():
    """Return ``DatasetProfilerResults`` holding statistics for one string column.

    The result reports 1 record against a target of 100 and carries neither
    side-effect column names nor column profiles.
    """
    return DatasetProfilerResults(
        num_records=1,
        target_num_records=100,
        column_statistics=[
            GeneralColumnStatistics(
                column_name="some",
                num_records=1,
                num_unique=1,
                num_null=0,
                pyarrow_dtype="string",
                simple_dtype="string",
            )
        ],
        side_effect_column_names=None,
        column_profiles=None,
    )
|
|
|
|
|
|
@pytest.fixture
def stub_sampler_only_config_builder(stub_model_configs: list[ModelConfig]) -> DataDesignerConfigBuilder:
    """Return a config builder holding three sampler columns and nothing else.

    Args:
        stub_model_configs: Model configs passed to the builder constructor.

    Returns:
        A builder with ``uuid``, ``category`` and ``uniform`` sampler columns,
        added in that order.
    """
    # Each column is named after its sampler type, so one key serves as both.
    sampler_params = {
        "uuid": {"prefix": "code_", "short_form": True, "uppercase": True},
        "category": {"values": ["a", "b", "c"]},
        "uniform": {"low": 1, "high": 100},
    }

    sampler_builder = DataDesignerConfigBuilder(model_configs=stub_model_configs)
    for sampler_type, params in sampler_params.items():
        sampler_builder.add_column(SamplerColumnConfig(name=sampler_type, sampler_type=sampler_type, params=params))
    return sampler_builder
|
|
|
|
|
|
class StubHuggingFaceSeedReader(SeedReader[HuggingFaceSeedSource]):
    """Seed reader stub for tests that returns fixed values instead of reading data."""

    def get_column_names(self) -> list[str]:
        """Return the fixed column names ``age`` and ``city``."""
        column_names = ["age", "city"]
        return column_names

    def get_dataset_uri(self) -> str:
        """Return a placeholder string in place of a real dataset URI."""
        placeholder_uri = "unused in these tests"
        return placeholder_uri

    def create_duckdb_connection(self):
        """Create no connection; always return ``None``."""
        return None
|
|
|
|
|
|
@pytest.fixture
def stub_seed_reader():
    """Return a fresh ``StubHuggingFaceSeedReader`` instance."""
    seed_reader = StubHuggingFaceSeedReader()
    return seed_reader
|