DataDesigner/tests/conftest.py
Johnny Greco f8c201e085
chore: update header script to check for diffs (#195)
* update script

* update headers

* refactor a bit and add test script

* update headers

* update for edge case

* update headers

* add step to get file creation date

* use git history to get copyright year

* generation type is printed with inference parameters

* fix unit test
2026-01-09 17:10:58 -05:00

318 lines
11 KiB
Python

# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
import os
import tarfile
import tempfile
import textwrap
import pandas as pd
import pytest
import yaml
from data_designer.config.analysis.column_statistics import GeneralColumnStatistics
from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults
from data_designer.config.column_configs import SamplerColumnConfig
from data_designer.config.config_builder import DataDesignerConfigBuilder
from data_designer.config.data_designer_config import DataDesignerConfig
from data_designer.config.models import ChatCompletionInferenceParams, ModelConfig, ModelProvider
from data_designer.config.seed_source import HuggingFaceSeedSource
from data_designer.engine.resources.seed_reader import SeedReader
@pytest.fixture
def stub_data_designer_config_str() -> str:
    """Return a YAML document describing a complete stub Data Designer config.

    The document declares one model config (``my_own_code_model``), a seed
    config with an ``hf`` source, four sampler columns, ``llm-text``,
    ``llm-code``, ``validation`` and ``llm-judge`` columns, and one constraint
    on ``age``.

    Returns:
        The YAML text. It starts with a newline and its top-level keys begin
        at column 0.
    """
    # YAML block structure is defined by indentation, so the nesting below is
    # part of the data: list items and the keys that belong to them must be
    # indented under their parent key.
    return """
model_configs:
  - alias: my_own_code_model
    model: openai/meta/llama-3.3-70b-instruct
    inference_parameters:
      temperature:
        distribution_type: uniform
        params:
          low: 0.5
          high: 0.9
      top_p:
        distribution_type: manual
        params:
          values: [0.1, 0.2, 0.33]
          weights: [0.3, 0.2, 0.50]
seed_config:
  source:
    seed_type: hf
    path: datasets/test-repo/testing/data.csv
  sampling_strategy: shuffle
columns:
  - name: code_id
    sampler_type: uuid
    column_type: sampler
    params:
      prefix: code_
      short_form: true
      uppercase: true
  - name: age
    sampler_type: uniform
    column_type: sampler
    params:
      low: 35
      high: 88
  - name: domain
    sampler_type: category
    column_type: sampler
    params:
      values: [Healthcare, Finance, Education, Government]
  - name: topic
    sampler_type: category
    column_type: sampler
    params:
      values: [Web Development, Data Science, Machine Learning, Cloud Computing]
  - name: text
    column_type: llm-text
    prompt: Write a description of python code in topic {topic} and domain {domain}
    model_alias: my_own_code_model
  - name: code
    column_type: llm-code
    prompt: Write Python code that will be paired with the following prompt {text}
    model_alias: my_own_code_model
    code_lang: python
  - name: code_validation_result
    column_type: validation
    target_columns:
      - code
    validator_type: code
    validator_params:
      code_lang: python
  - name: code_judge_result
    model_alias: my_own_code_model
    column_type: llm-judge
    prompt: You are an expert in Python programming and make appropriate judgement on the quality of the code.
    scores:
      - name: Pythonic
        description: Pythonic Code and Best Practices (Does the code follow Python conventions and best practices?)
        options:
          "4": The code exemplifies Pythonic principles, making excellent use of Python-specific constructs, standard library modules and programming idioms; follows all relevant PEPs.
          "3": The code closely follows Python conventions and adheres to many best practices; good use of Python-specific constructs, standard library modules and programming idioms.
          "2": The code generally follows Python conventions but has room for better alignment with Pythonic practices.
          "1": The code loosely follows Python conventions, with several deviations from best practices.
          "0": The code does not follow Python conventions or best practices, using non-Pythonic approaches.
      - name: Readability
        description: Readability and Maintainability (Is the Python code easy to understand and maintain?)
        options:
          "4": The code is excellently formatted, follows PEP 8 guidelines, is elegantly concise and clear, uses meaningful variable names, ensuring high readability and ease of maintenance; organizes complex logic well. Docstrings are given in a Google Docstring format.
          "3": The code is well-formatted in the sense of code-as-documentation, making it relatively easy to understand and maintain; uses descriptive names and organizes logic clearly.
          "2": The code is somewhat readable with basic formatting and some comments, but improvements are needed; needs better use of descriptive names and organization.
          "1": The code has minimal formatting, making it hard to understand; lacks meaningful names and organization.
          "0": The code is unreadable, with no attempt at formatting or description.
constraints:
  - target_column: age
    operator: "lt"
    rhs: 65
"""
@pytest.fixture
def stub_data_designer_builder_config_str(stub_data_designer_config_str: str) -> str:
    """Wrap the stub config YAML beneath a top-level ``data_designer`` key.

    Args:
        stub_data_designer_config_str: YAML text of the bare config.

    Returns:
        YAML text in which the config, shifted right with ``textwrap.indent``,
        follows a ``data_designer:`` line.
    """
    nested_config = textwrap.indent(stub_data_designer_config_str, prefix=" ")
    # The key and the interpolated body both start at column 0 of the literal;
    # the indentation of the nested document comes only from ``textwrap.indent``.
    return f"""
data_designer:
{nested_config}
"""
@pytest.fixture
def stub_data_designer_config(stub_data_designer_config_str: str) -> DataDesignerConfig:
    """Parse the stub YAML and validate it into a ``DataDesignerConfig``.

    Args:
        stub_data_designer_config_str: YAML text of the stub config.

    Returns:
        The result of ``DataDesignerConfig.model_validate`` on the parsed YAML.
    """
    return DataDesignerConfig.model_validate(yaml.safe_load(stub_data_designer_config_str))
@pytest.fixture
def stub_model_configs() -> list[ModelConfig]:
    """Return a one-element list holding a ``stub-model`` model config.

    The config uses chat-completion inference parameters with
    ``temperature=0.9``, ``top_p=0.9`` and ``max_tokens=2048``.
    """
    inference_params = ChatCompletionInferenceParams(temperature=0.9, top_p=0.9, max_tokens=2048)
    stub_model = ModelConfig(
        alias="stub-model",
        model="stub-model",
        inference_parameters=inference_params,
    )
    return [stub_model]
@pytest.fixture
def stub_model_providers() -> list[ModelProvider]:
    """Return a one-element list holding the ``provider-1`` model provider."""
    provider = ModelProvider(
        name="provider-1",
        endpoint="https://api.provider-1.com/v1",
        api_key="PROVIDER_1_API_KEY",
    )
    return [provider]
@pytest.fixture
def stub_empty_builder(stub_model_configs: list[ModelConfig]) -> DataDesignerConfigBuilder:
    """Return a config builder that was given only the stub model configs.

    Args:
        stub_model_configs: Model configs passed to the builder constructor.
    """
    empty_builder = DataDesignerConfigBuilder(model_configs=stub_model_configs)
    return empty_builder
@pytest.fixture
def stub_complete_builder(stub_data_designer_builder_config_str: str) -> DataDesignerConfigBuilder:
    """Return a config builder created from the wrapped stub YAML config.

    Args:
        stub_data_designer_builder_config_str: YAML text passed as ``config``
            to ``DataDesignerConfigBuilder.from_config``.
    """
    complete_builder = DataDesignerConfigBuilder.from_config(config=stub_data_designer_builder_config_str)
    return complete_builder
@pytest.fixture
def stub_dataframe():
    """Return a 12-row DataFrame of contact-style records.

    Columns, in order: ``name``, ``age``, ``city``, ``state``, ``zip``,
    ``email``, ``phone`` and ``address``. ``age`` holds integers; every other
    column holds strings (``zip`` included, so ``"02101"`` keeps its leading
    zero). The last four rows use more varied email domains and
    international-format phone numbers.
    """
    # Column name -> values; every list has exactly 12 entries.
    columns = {
        "name": ["John", "Jane", "Jim", "Jill", "Mike", "Mary", "Mark", "Martha", "Alex", "Alice", "Bob", "Bella"],
        "age": [25, 30, 35, 40, 45, 50, 55, 60, 22, 28, 65, 38],
        "city": [
            "New York",
            "Los Angeles",
            "Chicago",
            "Houston",
            "Miami",
            "Seattle",
            "San Francisco",
            "Boston",
            "Denver",
            "Austin",
            "Portland",
            "Atlanta",
        ],
        "state": ["NY", "CA", "IL", "TX", "FL", "WA", "CA", "MA", "CO", "TX", "OR", "GA"],
        "zip": [
            "10001",
            "90001",
            "60601",
            "77001",
            "33101",
            "98101",
            "94101",
            "02101",
            "80201",
            "73301",
            "97201",
            "30301",
        ],
        "email": [
            "john@example.com",
            "jane@example.com",
            "jim@example.com",
            "jill@example.com",
            "mike@example.com",
            "mary@example.com",
            "mark@example.com",
            "martha@example.com",
            "alex.smith@example.co.uk",
            "alice.wu@example.ca",
            "bob.martin@example.org",
            "bella.rossi@example.it",
        ],
        "phone": [
            "123-456-7890",
            "213-555-1234",
            "312-222-3333",
            "713-444-5555",
            "305-888-9999",
            "206-777-8888",
            "415-999-0000",
            "617-111-2222",
            "+44 20 7946 0958",
            "+1-416-555-0199",
            "+39 06 6982 1234",
            "+49 30 123456",
        ],
        "address": [
            "123 Main St",
            "456 Oak Ave",
            "789 Pine Rd",
            "101 Maple Blvd",
            "202 Elm St",
            "303 Cedar Ave",
            "404 Spruce Dr",
            "505 Birch Ln",
            "12 Baker St",
            "88 King St W",
            "Via Roma 1",
            "Unter den Linden 5",
        ],
    }
    return pd.DataFrame(columns)
@pytest.fixture
def stub_dataset_tar_file():
    """Yield an open binary handle to a small gzip-compressed tar archive.

    The archive is written to ``dataset.tar`` inside a temporary directory and
    contains two parquet files, stored as ``dataset/dataset-001.parquet`` and
    ``dataset/dataset-002.parquet``. Both the handle and the temporary
    directory are released when the fixture is torn down, so the handle is
    only usable while the requesting test runs.
    """
    # Parquet file name -> real tabular content, so the files are valid parquet.
    shards = {
        "dataset-001.parquet": pd.DataFrame({"id": ["1", "2"], "name": ["test", "sample"]}),
        "dataset-002.parquet": pd.DataFrame({"id": ["3", "4"], "name": ["data", "example"]}),
    }
    with tempfile.TemporaryDirectory() as temp_dir:
        dataset_dir = f"{temp_dir}/dataset"
        os.makedirs(dataset_dir, exist_ok=True)
        for shard_name, frame in shards.items():
            frame.to_parquet(f"{dataset_dir}/{shard_name}", index=False)

        archive_path = f"{temp_dir}/dataset.tar"
        with tarfile.open(archive_path, "w:gz") as archive:
            for shard_name in shards:
                # arcname keeps the temporary directory out of the member path.
                archive.add(f"{dataset_dir}/{shard_name}", arcname=f"dataset/{shard_name}")

        with open(archive_path, "rb") as tar_file:
            yield tar_file
@pytest.fixture
def stub_dataset_profiler_results():
    """Return ``DatasetProfilerResults`` with a single string-column statistic.

    The result reports 1 record out of a target of 100, general statistics for
    one column named ``some``, and ``None`` for both the side-effect column
    names and the column profiles.
    """
    return DatasetProfilerResults(
        num_records=1,
        target_num_records=100,
        column_statistics=[
            GeneralColumnStatistics(
                column_name="some",
                num_records=1,
                num_unique=1,
                num_null=0,
                pyarrow_dtype="string",
                simple_dtype="string",
            )
        ],
        side_effect_column_names=None,
        column_profiles=None,
    )
@pytest.fixture
def stub_sampler_only_config_builder(stub_model_configs: list[ModelConfig]) -> DataDesignerConfigBuilder:
    """Return a config builder holding three sampler columns and nothing else.

    Args:
        stub_model_configs: Model configs passed to the builder constructor.

    Returns:
        A builder to which ``uuid``, ``category`` and ``uniform`` sampler
        columns were added, in that order.
    """
    # (sampler type, sampler params); each column is named after its sampler type.
    sampler_specs = [
        ("uuid", {"prefix": "code_", "short_form": True, "uppercase": True}),
        ("category", {"values": ["a", "b", "c"]}),
        ("uniform", {"low": 1, "high": 100}),
    ]
    builder = DataDesignerConfigBuilder(model_configs=stub_model_configs)
    for sampler_type, sampler_params in sampler_specs:
        builder.add_column(SamplerColumnConfig(name=sampler_type, sampler_type=sampler_type, params=sampler_params))
    return builder
class StubHuggingFaceSeedReader(SeedReader[HuggingFaceSeedSource]):
    """Minimal ``SeedReader`` for ``HuggingFaceSeedSource`` that returns canned values."""

    def get_column_names(self) -> list[str]:
        """Return the fixed column names ``age`` and ``city``."""
        return ["age", "city"]

    def get_dataset_uri(self) -> str:
        """Return a placeholder string instead of a real dataset URI."""
        return "unused in these tests"

    def create_duckdb_connection(self):
        """Do nothing and return ``None``; no connection is created."""
        return None
@pytest.fixture
def stub_seed_reader():
    """Return a new ``StubHuggingFaceSeedReader`` instance."""
    reader = StubHuggingFaceSeedReader()
    return reader