mirror of
https://github.com/NVIDIA-NeMo/DataDesigner
synced 2026-05-24 09:48:29 +00:00
278 lines
8.2 KiB
Python
278 lines
8.2 KiB
Python
# ---
|
||
# jupyter:
|
||
# jupytext:
|
||
# text_representation:
|
||
# extension: .py
|
||
# format_name: percent
|
||
# format_version: '1.3'
|
||
# jupytext_version: 1.18.1
|
||
# kernelspec:
|
||
# display_name: .venv
|
||
# language: python
|
||
# name: python3
|
||
# ---
|
||
|
||
# %% [markdown]
|
||
# # 🎨 Data Designer Tutorial: Seeding Synthetic Data Generation with an External Dataset
|
||
#
|
||
# #### 📚 What you'll learn
|
||
#
|
||
# In this notebook, we will demonstrate how to seed synthetic data generation in Data Designer with an external dataset.
|
||
#
|
||
# If this is your first time using Data Designer, we recommend starting with the [first notebook](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/1-the-basics/) in this tutorial series.
|
||
#
|
||
|
||
# %% [markdown]
|
||
# ### 📦 Import Data Designer
|
||
#
|
||
# - `data_designer.config` provides access to the configuration API.
|
||
#
|
||
# - `DataDesigner` is the main interface for data generation.
|
||
#
|
||
|
||
# %%
|
||
import data_designer.config as dd
|
||
from data_designer.interface import DataDesigner
|
||
|
||
# %% [markdown]
|
||
# ### ⚙️ Initialize the Data Designer interface
|
||
#
|
||
# - `DataDesigner` is the main object responsible for managing the data generation process.
|
||
#
|
||
# - When initialized without arguments, the [default model providers](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/models/default-model-settings/) are used.
|
||
#
|
||
|
||
# %%
|
||
# No arguments are passed, so the default model providers are used.
data_designer = DataDesigner()
|
||
|
||
# %% [markdown]
|
||
# ### 🎛️ Define model configurations
|
||
#
|
||
# - Each `ModelConfig` defines a model that can be used during the generation process.
|
||
#
|
||
# - The "model alias" is used to reference the model in the Data Designer config (as we will see below).
|
||
#
|
||
# - The "model provider" is the external service that hosts the model (see the [model config](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/models/default-model-settings/) docs for more details).
|
||
#
|
||
# - By default, we use [build.nvidia.com](https://build.nvidia.com/models) as the model provider.
|
||
#
|
||
|
||
# %%
|
||
# Provider name, as set in the model provider configuration.
MODEL_PROVIDER = "nvidia"

# Model identifier, taken from build.nvidia.com.
MODEL_ID = "nvidia/nemotron-3-nano-30b-a3b"

# Alias that column configs use to refer to this model; chosen to be
# descriptive for our use case.
MODEL_ALIAS = "nemotron-nano-v3"

# A single model is configured for this tutorial.
model_configs = [
    dd.ModelConfig(
        provider=MODEL_PROVIDER,
        model=MODEL_ID,
        alias=MODEL_ALIAS,
        inference_parameters=dd.ChatCompletionInferenceParams(
            max_tokens=2048,
            temperature=1.0,
            top_p=1.0,
            # Sets `enable_thinking` to False in the chat template kwargs.
            extra_body={"chat_template_kwargs": {"enable_thinking": False}},
        ),
    ),
]
|
||
|
||
# %% [markdown]
|
||
# ### 🏗️ Initialize the Data Designer Config Builder
|
||
#
|
||
# - The Data Designer config defines the dataset schema and generation process.
|
||
#
|
||
# - The config builder provides an intuitive interface for building this configuration.
|
||
#
|
||
# - The list of model configs is provided to the builder at initialization.
|
||
#
|
||
|
||
# %%
|
||
# The builder is created with the model configs defined above; the seed dataset and columns are added to it in later cells.
config_builder = dd.DataDesignerConfigBuilder(model_configs=model_configs)
|
||
|
||
# %% [markdown]
|
||
# ## 🏥 Prepare a seed dataset
|
||
#
|
||
# - For this notebook, we'll create a synthetic dataset of patient notes.
|
||
#
|
||
# - We will _seed_ the generation process with a [symptom-to-diagnosis dataset](https://huggingface.co/datasets/gretelai/symptom_to_diagnosis).
|
||
#
|
||
# - The cell below downloads a CSV copy of this dataset from GitHub into the current working directory.
|
||
#
|
||
# <br>
|
||
#
|
||
# > 🌱 **Why use a seed dataset?**
|
||
# >
|
||
# > - Seed datasets let you steer the generation process by providing context that is specific to your use case.
|
||
# >
|
||
# > - Seed datasets are also an excellent way to inject real-world diversity into your synthetic data.
|
||
# >
|
||
# > - During generation, prompt templates can reference any of the seed dataset fields.
|
||
#
|
||
|
||
# %%
|
||
# Fetch the sample seed dataset (a CSV file) from GitHub.
import urllib.request

url = "https://raw.githubusercontent.com/NVIDIA/GenerativeAIExamples/refs/heads/main/nemo/NeMo-Data-Designer/data/gretelai_symptom_to_diagnosis.csv"

# urlretrieve returns a (local path, headers) pair; only the path is used below.
local_filename, _ = urllib.request.urlretrieve(url, filename="gretelai_symptom_to_diagnosis.csv")

# Seed data is handed to the config builder as a reference object that points
# at the downloaded local file.
seed_source = dd.LocalFileSeedSource(path=local_filename)
config_builder.with_seed_dataset(seed_source)
|
||
|
||
# %% [markdown]
|
||
# ## 🎨 Designing our synthetic patient notes dataset
|
||
#
|
||
# - The prompt template can reference fields from our seed dataset:
|
||
# - `{{ diagnosis }}` - the medical diagnosis from the seed data
|
||
# - `{{ patient_summary }}` - the symptom description from the seed data
|
||
#
|
||
|
||
# %%
|
||
# --- Person samplers: one for the patient, one for the doctor. ---
# Their fields are referenced from the expression columns and the prompt below.
config_builder.add_column(
    dd.SamplerColumnConfig(
        name="patient_sampler",
        sampler_type=dd.SamplerType.PERSON_FROM_FAKER,
        params=dd.PersonFromFakerSamplerParams(),
    )
)
config_builder.add_column(
    dd.SamplerColumnConfig(
        name="doctor_sampler",
        sampler_type=dd.SamplerType.PERSON_FROM_FAKER,
        params=dd.PersonFromFakerSamplerParams(),
    )
)

# --- Patient identifier: UUID sampler configured with a "PT-" prefix. ---
config_builder.add_column(
    dd.SamplerColumnConfig(
        name="patient_id",
        sampler_type=dd.SamplerType.UUID,
        params=dd.UUIDSamplerParams(prefix="PT-", short_form=True, uppercase=True),
    )
)

# --- Flat patient columns derived from the patient sampler's fields. ---
config_builder.add_column(
    dd.ExpressionColumnConfig(
        name="first_name",
        expr="{{ patient_sampler.first_name }}",
    )
)
config_builder.add_column(
    dd.ExpressionColumnConfig(
        name="last_name",
        expr="{{ patient_sampler.last_name }}",
    )
)
config_builder.add_column(
    dd.ExpressionColumnConfig(
        name="dob",
        expr="{{ patient_sampler.birth_date }}",
    )
)

# --- Dates: symptom onset is sampled within 2024. ---
config_builder.add_column(
    dd.SamplerColumnConfig(
        name="symptom_onset_date",
        sampler_type=dd.SamplerType.DATETIME,
        params=dd.DatetimeSamplerParams(
            start="2024-01-01",
            end="2024-12-31",
        ),
    )
)

# The visit date is sampled as a time delta (dt_min=1, dt_max=30) relative to
# the symptom onset column.
config_builder.add_column(
    dd.SamplerColumnConfig(
        name="date_of_visit",
        sampler_type=dd.SamplerType.TIMEDELTA,
        params=dd.TimeDeltaSamplerParams(
            dt_min=1,
            dt_max=30,
            reference_column_name="symptom_onset_date",
        ),
    )
)

# --- Physician display name built from the doctor sampler. ---
config_builder.add_column(
    dd.ExpressionColumnConfig(
        name="physician",
        expr="Dr. {{ doctor_sampler.last_name }}",
    )
)

# --- LLM-generated notes. ---
# The prompt combines columns defined above with the seed dataset fields
# `diagnosis` and `patient_summary`.
config_builder.add_column(
    dd.LLMTextColumnConfig(
        name="physician_notes",
        model_alias=MODEL_ALIAS,
        prompt="""\
You are a primary-care physician who just had an appointment with {{ first_name }} {{ last_name }},
who has been struggling with symptoms from {{ diagnosis }} since {{ symptom_onset_date }}.
The date of today's visit is {{ date_of_visit }}.

{{ patient_summary }}

Write careful notes about your visit with {{ first_name }},
as Dr. {{ doctor_sampler.first_name }} {{ doctor_sampler.last_name }}.

Format the notes as a busy doctor might.
Respond with only the notes, no other text.
""",
    )
)

# Validate the assembled configuration before generating any data.
data_designer.validate(config_builder)
|
||
|
||
# %% [markdown]
|
||
# ### 🔁 Iteration is key – preview the dataset!
|
||
#
|
||
# 1. Use the `preview` method to generate a sample of records quickly.
|
||
#
|
||
# 2. Inspect the results for quality and format issues.
|
||
#
|
||
# 3. Adjust column configurations, prompts, or parameters as needed.
|
||
#
|
||
# 4. Re-run the preview until satisfied.
|
||
#
|
||
|
||
# %%
|
||
# Generate a small preview (2 records) from the current config for quick inspection.
preview = data_designer.preview(config_builder, num_records=2)
|
||
|
||
# %%
|
||
# Run this cell multiple times to cycle through the 2 preview records.
|
||
# Called for the record it displays; nothing is assigned from the call.
preview.display_sample_record()
|
||
|
||
# %%
|
||
# The preview dataset is available as a pandas DataFrame.
|
||
# Bare trailing expression: the notebook renders its value as the cell output.
preview.dataset
|
||
|
||
# %% [markdown]
|
||
# ### 📊 Analyze the generated data
|
||
#
|
||
# - Data Designer automatically generates a basic statistical analysis of the generated data.
|
||
#
|
||
# - This analysis is available via the `analysis` property of generation result objects.
|
||
#
|
||
|
||
# %%
|
||
# Print the analysis as a table.
|
||
# The analysis is read from the preview result's `analysis` property.
preview.analysis.to_report()
|
||
|
||
# %% [markdown]
|
||
# ### 🆙 Scale up!
|
||
#
|
||
# - Happy with your preview data?
|
||
#
|
||
# - Use the `create` method to submit larger Data Designer generation jobs.
|
||
#
|
||
|
||
# %%
|
||
# Reuses the same config_builder that was previewed; requests 10 records under the dataset name "tutorial-3".
results = data_designer.create(config_builder, num_records=10, dataset_name="tutorial-3")
|
||
|
||
# %%
|
||
# Load the generated dataset as a pandas DataFrame.
|
||
# `results` is the object returned by `data_designer.create(...)` in the previous cell.
dataset = results.load_dataset()
|
||
|
||
# Trailing expression: the notebook renders the first rows as the cell output.
dataset.head()
|
||
|
||
# %%
|
||
# Load the analysis results into memory.
|
||
# The preview's analysis is read from a property; for a created dataset it is loaded explicitly.
analysis = results.load_analysis()
|
||
|
||
analysis.to_report()
|
||
|
||
# %% [markdown]
|
||
# ## ⏭️ Next Steps
|
||
#
|
||
# Check out the following notebooks to learn more about:
|
||
#
|
||
# - [Providing images as context](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/4-providing-images-as-context/)
|
||
#
|
||
# - [Generating images](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/5-generating-images/)
|
||
#
|