From 7181db3eb7074f97d4e7ffbf06e46a4a1ff12e7a Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Thu, 15 Jan 2026 14:51:54 -0700 Subject: [PATCH] chore: lazy 3rd party imports (#222) --- .gitignore | 3 + AGENTS.md | 145 +++++++++++++++++- Makefile | 49 +++++- src/data_designer/__init__.py | 2 + src/data_designer/cli/__init__.py | 2 + src/data_designer/cli/commands/download.py | 2 + src/data_designer/cli/commands/list.py | 2 + src/data_designer/cli/commands/models.py | 2 + src/data_designer/cli/commands/providers.py | 2 + src/data_designer/cli/commands/reset.py | 2 + src/data_designer/cli/controllers/__init__.py | 2 + .../cli/controllers/download_controller.py | 2 + .../cli/controllers/model_controller.py | 7 +- .../cli/controllers/provider_controller.py | 7 +- src/data_designer/cli/forms/__init__.py | 2 + src/data_designer/cli/forms/builder.py | 2 + src/data_designer/cli/forms/field.py | 2 + src/data_designer/cli/forms/form.py | 2 + src/data_designer/cli/forms/model_builder.py | 2 + .../cli/forms/provider_builder.py | 2 + src/data_designer/cli/main.py | 2 + .../cli/repositories/__init__.py | 2 + src/data_designer/cli/repositories/base.py | 2 + .../cli/repositories/model_repository.py | 2 + .../cli/repositories/persona_repository.py | 2 + .../cli/repositories/provider_repository.py | 2 + src/data_designer/cli/services/__init__.py | 2 + .../cli/services/download_service.py | 2 + .../cli/services/model_service.py | 2 + .../cli/services/provider_service.py | 2 + src/data_designer/cli/ui.py | 2 + src/data_designer/cli/utils.py | 2 + .../config/analysis/column_profilers.py | 2 + .../config/analysis/column_statistics.py | 13 +- .../config/analysis/dataset_profiler.py | 12 +- .../config/analysis/utils/errors.py | 2 + .../config/analysis/utils/reporting.py | 2 +- src/data_designer/config/column_configs.py | 2 + src/data_designer/config/column_types.py | 1 + src/data_designer/config/dataset_builders.py | 2 + .../config/default_model_settings.py | 1 + 
src/data_designer/config/errors.py | 2 + src/data_designer/config/exports.py | 2 + src/data_designer/config/interface.py | 5 +- src/data_designer/config/models.py | 9 +- src/data_designer/config/preview_results.py | 6 +- src/data_designer/config/processors.py | 2 + src/data_designer/config/run_config.py | 2 + .../config/sampler_constraints.py | 2 + src/data_designer/config/sampler_params.py | 11 +- src/data_designer/config/seed.py | 2 + src/data_designer/config/seed_source.py | 11 +- src/data_designer/config/seed_source_types.py | 2 + src/data_designer/config/utils/constants.py | 2 + src/data_designer/config/utils/errors.py | 2 + src/data_designer/config/utils/info.py | 2 + src/data_designer/config/utils/io_helpers.py | 11 +- .../config/utils/numerical_helpers.py | 2 + .../config/utils/type_helpers.py | 2 + .../config/utils/visualization.py | 6 +- src/data_designer/config/validator_params.py | 2 + .../engine/analysis/column_profilers/base.py | 6 +- .../column_profilers/judge_score_profiler.py | 15 +- .../analysis/column_profilers/registry.py | 2 + .../engine/analysis/column_statistics.py | 7 +- .../engine/analysis/dataset_profiler.py | 10 +- src/data_designer/engine/analysis/errors.py | 2 + .../utils/column_statistics_calculations.py | 11 +- .../analysis/utils/judge_score_processing.py | 10 +- .../column_generators/generators/base.py | 6 +- .../column_generators/generators/embedding.py | 1 + .../generators/expression.py | 7 +- .../generators/llm_completion.py | 2 + .../column_generators/generators/samplers.py | 8 +- .../generators/seed_dataset.py | 10 +- .../generators/validation.py | 7 +- .../engine/column_generators/registry.py | 2 + .../engine/column_generators/utils/errors.py | 2 + .../utils/generator_classification.py | 2 + .../utils/judge_score_factory.py | 2 + .../utils/prompt_renderer.py | 2 + src/data_designer/engine/compiler.py | 2 + src/data_designer/engine/configurable_task.py | 10 +- .../dataset_builders/artifact_storage.py | 8 +- 
.../dataset_builders/column_wise_builder.py | 6 +- .../engine/dataset_builders/errors.py | 2 + .../dataset_builders/multi_column_configs.py | 2 + .../dataset_builders/utils/config_compiler.py | 2 + .../engine/dataset_builders/utils/dag.py | 9 +- .../utils/dataset_batch_manager.py | 12 +- .../engine/dataset_builders/utils/errors.py | 2 + src/data_designer/engine/errors.py | 2 + src/data_designer/engine/model_provider.py | 2 + src/data_designer/engine/models/errors.py | 52 +++---- src/data_designer/engine/models/facade.py | 21 +-- src/data_designer/engine/models/factory.py | 42 +++++ .../engine/models/litellm_overrides.py | 27 ++-- .../engine/models/parsers/errors.py | 2 + .../engine/models/parsers/parser.py | 4 +- .../engine/models/parsers/postprocessors.py | 1 + .../engine/models/parsers/tag_parsers.py | 2 + .../engine/models/parsers/types.py | 2 + .../engine/models/recipes/base.py | 2 + .../engine/models/recipes/response_recipes.py | 2 + src/data_designer/engine/models/registry.py | 29 ++-- src/data_designer/engine/models/telemetry.py | 8 +- .../engine/processing/ginja/ast.py | 2 + .../engine/processing/ginja/environment.py | 2 + .../engine/processing/ginja/exceptions.py | 2 + .../engine/processing/ginja/record.py | 2 + .../processing/gsonschema/exceptions.py | 11 +- .../gsonschema/schema_transformers.py | 2 + .../engine/processing/gsonschema/types.py | 2 + .../processing/gsonschema/validators.py | 16 +- .../processing/processors/drop_columns.py | 9 +- .../engine/processing/processors/registry.py | 2 + .../processing/processors/schema_transform.py | 9 +- src/data_designer/engine/processing/utils.py | 10 +- src/data_designer/engine/registry/base.py | 2 + .../engine/registry/data_designer_registry.py | 2 + src/data_designer/engine/registry/errors.py | 2 + .../resources/managed_dataset_generator.py | 8 +- .../resources/managed_dataset_repository.py | 13 +- .../engine/resources/managed_storage.py | 2 + .../engine/resources/resource_provider.py | 9 +- 
.../engine/resources/seed_reader.py | 9 +- .../engine/sampling_gen/column.py | 2 + .../engine/sampling_gen/constraints.py | 12 +- .../engine/sampling_gen/data_sources/base.py | 19 ++- .../sampling_gen/data_sources/errors.py | 2 + .../sampling_gen/data_sources/sources.py | 49 +++--- .../entities/dataset_based_person_fields.py | 4 +- .../entities/email_address_utils.py | 2 + .../engine/sampling_gen/entities/errors.py | 2 + .../entities/national_id_utils.py | 2 + .../engine/sampling_gen/entities/person.py | 2 + .../sampling_gen/entities/phone_number.py | 9 +- .../engine/sampling_gen/errors.py | 2 + .../engine/sampling_gen/generator.py | 9 +- .../engine/sampling_gen/jinja_utils.py | 10 +- .../engine/sampling_gen/people_gen.py | 14 +- .../engine/sampling_gen/person_constants.py | 2 + .../engine/sampling_gen/schema.py | 6 +- .../engine/sampling_gen/schema_builder.py | 2 + .../engine/sampling_gen/utils.py | 10 +- src/data_designer/engine/secret_resolver.py | 2 + .../engine/validators/__init__.py | 2 + src/data_designer/engine/validators/base.py | 2 + .../engine/validators/local_callable.py | 9 +- src/data_designer/engine/validators/python.py | 8 +- src/data_designer/engine/validators/remote.py | 10 +- src/data_designer/engine/validators/sql.py | 11 +- src/data_designer/errors.py | 2 + src/data_designer/essentials/__init__.py | 2 + src/data_designer/interface/data_designer.py | 19 ++- src/data_designer/interface/errors.py | 2 + src/data_designer/interface/results.py | 7 +- src/data_designer/lazy_heavy_imports.py | 54 +++++++ src/data_designer/logging.py | 2 + src/data_designer/plugins/__init__.py | 2 + src/data_designer/plugins/errors.py | 2 + src/data_designer/plugins/registry.py | 2 + src/data_designer/plugins/testing/__init__.py | 2 + src/data_designer/plugins/testing/stubs.py | 2 + src/data_designer/plugins/testing/utils.py | 2 + tests/engine/models/conftest.py | 3 +- tests/engine/models/test_model_registry.py | 5 +- .../resources/test_resource_provider.py | 12 +- 
tests/test_import_perf.py | 64 ++++++++ 169 files changed, 1003 insertions(+), 255 deletions(-) create mode 100644 src/data_designer/engine/models/factory.py create mode 100644 src/data_designer/lazy_heavy_imports.py create mode 100644 tests/test_import_perf.py diff --git a/.gitignore b/.gitignore index 4863e65c..bfcd1fd9 100644 --- a/.gitignore +++ b/.gitignore @@ -93,3 +93,6 @@ docs/notebook_source/*.csv docs/**/artifacts/ tests_e2e/uv.lock + +# Performance profiling +perf_*.txt diff --git a/AGENTS.md b/AGENTS.md index 10cb9c13..6c2368d4 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -158,12 +158,13 @@ Type annotations are REQUIRED for all code in this project. This is strictly enf ### Import Style - **ALWAYS** use absolute imports, never relative imports -- Place imports at module level, not inside functions +- Place imports at module level, not inside functions (exception: when it is unavoidable for performance reasons) - Import sorting is handled by `ruff`'s `isort` - imports should be grouped and sorted: 1. Standard library imports - 2. Third-party imports + 2. Third-party imports (use `lazy_heavy_imports` for heavy libraries) 3. First-party imports (`data_designer`) - Use standard import conventions (enforced by `ICN`) +- See [Lazy Loading and TYPE_CHECKING](#lazy-loading-and-type_checking) section for optimization guidelines ```python # Good @@ -184,6 +185,146 @@ Type annotations are REQUIRED for all code in this project. This is strictly enf path = Path(filename) ``` +### Lazy Loading and TYPE_CHECKING + +This project uses lazy loading for heavy third-party dependencies to optimize import performance. 
+ +#### When to Use Lazy Loading + +**Heavy third-party libraries** (>100ms import cost) should be lazy-loaded via `lazy_heavy_imports.py`: + +```python +# ❌ Don't import directly +import pandas as pd +import numpy as np + +# ✅ Use lazy loading with IDE support +from typing import TYPE_CHECKING +from data_designer.lazy_heavy_imports import pd, np + +if TYPE_CHECKING: + import pandas as pd # For IDE autocomplete and type hints + import numpy as np +``` + +This pattern provides: +- Runtime lazy loading (fast startup) +- Full IDE support (autocomplete, type hints) +- Type checker validation + +**See [lazy_heavy_imports.py](src/data_designer/lazy_heavy_imports.py) for the current list of lazy-loaded libraries.** + +#### Adding New Heavy Dependencies + +If you add a new dependency with significant import cost (>100ms): + +1. **Add to `lazy_heavy_imports.py`:** + ```python + _LAZY_IMPORTS = { + # ... existing entries ... + "your_lib": "your_library_name", + } + ``` + +2. **Update imports across codebase:** + ```python + from typing import TYPE_CHECKING + from data_designer.lazy_heavy_imports import your_lib + + if TYPE_CHECKING: + import your_library_name as your_lib # For IDE support + ``` + +3. **Verify with performance test:** + ```bash + make perf-import CLEAN=1 + ``` + +#### Using TYPE_CHECKING Blocks + +`TYPE_CHECKING` blocks defer imports that are only needed for type hints, preventing circular dependencies and reducing import time. 
+ +**For internal data_designer imports:** + +```python +from __future__ import annotations # Always include at top + +from typing import TYPE_CHECKING + +# Runtime imports +from pathlib import Path +from data_designer.config.base import ConfigBase + +if TYPE_CHECKING: + # Type-only imports - only visible to type checkers + from data_designer.engine.models.facade import ModelFacade + +def get_model(model: ModelFacade) -> str: + return model.name +``` + +**For lazy-loaded libraries (see pattern in "When to Use Lazy Loading" above):** +- Import from `lazy_heavy_imports` for runtime +- Add full import in `TYPE_CHECKING` block for IDE support + +**Rules for TYPE_CHECKING:** + +✅ **DO put in TYPE_CHECKING:** +- Internal `data_designer` imports used **only** in type hints +- Imports that would cause circular dependencies +- **Full imports of lazy-loaded libraries for IDE support** (e.g., `import pandas as pd` in addition to runtime `from data_designer.lazy_heavy_imports import pd`) + +❌ **DON'T put in TYPE_CHECKING:** +- **Standard library imports** (`Path`, `Any`, `Callable`, `Literal`, `TypeAlias`, etc.) +- **Pydantic model types** used in field definitions (needed at runtime for validation) +- **Types used in discriminated unions** (Pydantic needs them at runtime) +- **Any import used at runtime** (instantiation, method calls, base classes, etc.) 
+ +**Examples:** + +```python +# ✅ CORRECT - Lazy-loaded library with IDE support +from typing import TYPE_CHECKING +from data_designer.lazy_heavy_imports import pd + +if TYPE_CHECKING: + import pandas as pd # IDE gets full type hints + +def load_data(path: str) -> pd.DataFrame: # IDE understands pd.DataFrame + return pd.read_csv(path) + +# ✅ CORRECT - Standard library NOT in TYPE_CHECKING +from pathlib import Path +from typing import Any + +def process_file(path: Path) -> Any: + return path.read_text() + +# ✅ CORRECT - Internal type-only import +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from data_designer.engine.models.facade import ModelFacade + +def get_model(model: ModelFacade) -> str: # Only used in type hint + return model.name + +# ❌ INCORRECT - Pydantic field type in TYPE_CHECKING +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from data_designer.config.models import ModelConfig # Wrong! + +class MyConfig(BaseModel): + model: ModelConfig # Pydantic needs this at runtime! + +# ✅ CORRECT - Pydantic field type at runtime +from data_designer.config.models import ModelConfig + +class MyConfig(BaseModel): + model: ModelConfig +``` + ### Naming Conventions (PEP 8) Follow PEP 8 naming conventions: diff --git a/Makefile b/Makefile index dcd33940..f6837cdc 100644 --- a/Makefile +++ b/Makefile @@ -45,14 +45,25 @@ help: @echo " check-license-headers - Check if all files have license headers" @echo " update-license-headers - Add license headers to all files" @echo "" + @echo "⚡ Performance:" + @echo " perf-import - Profile import time and show summary" + @echo " perf-import CLEAN=1 - Clean cache, then profile import time" + @echo " perf-import NOFILE=1 - Profile without writing to file (for CI)" + @echo "" @echo "═════════════════════════════════════════════════════════════" @echo "💡 Tip: Run 'make ' to execute any command above" @echo "" -clean: - @echo "🧹 Cleaning up coverage reports and cache files..." 
- rm -rf htmlcov .coverage .pytest_cache +clean-pycache: + @echo "🧹 Cleaning up Python cache files..." find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true + find . -type f -name "*.pyc" -delete 2>/dev/null || true + @echo "✅ Cache cleaned!" + +clean: clean-pycache + @echo "🧹 Cleaning up coverage reports and test cache..." + rm -rf htmlcov .coverage .pytest_cache + @echo "✅ Cleaned!" coverage: @echo "📊 Running tests with coverage analysis..." @@ -168,4 +179,34 @@ install-dev-notebooks: $(call install-pre-commit-hooks) @echo "✅ Dev + notebooks installation complete!" -.PHONY: clean coverage format format-check lint lint-fix test test-e2e test-run-tutorials test-run-recipes test-run-all-examples check-license-headers update-license-headers check-all check-all-fix install install-dev install-dev-notebooks generate-colab-notebooks +perf-import: +ifdef CLEAN + @$(MAKE) clean-pycache +endif + @echo "⚡ Profiling import time for data_designer.essentials..." +ifdef NOFILE + @PERF_OUTPUT=$$(uv run python -X importtime -c "import data_designer.essentials" 2>&1); \ + echo "$$PERF_OUTPUT"; \ + echo ""; \ + echo "Summary:"; \ + echo "$$PERF_OUTPUT" | tail -1 | awk '{printf " Total: %.3fs\n", $$5/1000000}'; \ + echo ""; \ + echo "💡 Top 10 slowest imports:"; \ + printf "%-12s %-12s %s\n" "Self (s)" "Cumulative (s)" "Module"; \ + printf "%-12s %-12s %s\n" "--------" "--------------" "------"; \ + echo "$$PERF_OUTPUT" | grep "import time:" | sort -rn -k5 | head -10 | awk '{printf "%-12.3f %-12.3f %s", $$3/1000000, $$5/1000000, $$7; for(i=8;i<=NF;i++) printf " %s", $$i; printf "\n"}' +else + @PERF_FILE="perf_import_$$(date +%Y%m%d_%H%M%S).txt"; \ + uv run python -X importtime -c "import data_designer.essentials" > "$$PERF_FILE" 2>&1; \ + echo "📊 Import profile saved to $$PERF_FILE"; \ + echo ""; \ + echo "Summary:"; \ + tail -1 "$$PERF_FILE" | awk '{printf " Total: %.3fs\n", $$5/1000000}'; \ + echo ""; \ + echo "💡 Top 10 slowest imports:"; \ + printf "%-12s %-12s 
%s\n" "Self (s)" "Cumulative (s)" "Module"; \ + printf "%-12s %-12s %s\n" "--------" "--------------" "------"; \ + grep "import time:" "$$PERF_FILE" | sort -rn -k5 | head -10 | awk '{printf "%-12.3f %-12.3f %s", $$3/1000000, $$5/1000000, $$7; for(i=8;i<=NF;i++) printf " %s", $$i; printf "\n"}' +endif + +.PHONY: clean clean-pycache coverage format format-check lint lint-fix test test-e2e test-run-tutorials test-run-recipes test-run-all-examples check-license-headers update-license-headers check-all check-all-fix install install-dev install-dev-notebooks generate-colab-notebooks perf-import diff --git a/src/data_designer/__init__.py b/src/data_designer/__init__.py index 4df9cc97..250004cb 100644 --- a/src/data_designer/__init__.py +++ b/src/data_designer/__init__.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + try: from data_designer._version import __version__ except ImportError: diff --git a/src/data_designer/cli/__init__.py b/src/data_designer/cli/__init__.py index c11e4047..aab869e6 100644 --- a/src/data_designer/cli/__init__.py +++ b/src/data_designer/cli/__init__.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.cli.main import app, main __all__ = ["app", "main"] diff --git a/src/data_designer/cli/commands/download.py b/src/data_designer/cli/commands/download.py index c6d46650..0984e700 100644 --- a/src/data_designer/cli/commands/download.py +++ b/src/data_designer/cli/commands/download.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import typer from data_designer.cli.controllers.download_controller import DownloadController diff --git a/src/data_designer/cli/commands/list.py b/src/data_designer/cli/commands/list.py index 5eb9ba8e..a5991352 100644 --- a/src/data_designer/cli/commands/list.py +++ b/src/data_designer/cli/commands/list.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from rich.table import Table from data_designer.cli.repositories.model_repository import ModelRepository diff --git a/src/data_designer/cli/commands/models.py b/src/data_designer/cli/commands/models.py index 7486cc99..5475a4e0 100644 --- a/src/data_designer/cli/commands/models.py +++ b/src/data_designer/cli/commands/models.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.cli.controllers.model_controller import ModelController from data_designer.config.utils.constants import DATA_DESIGNER_HOME diff --git a/src/data_designer/cli/commands/providers.py b/src/data_designer/cli/commands/providers.py index c1f22442..5d1a0bb3 100644 --- a/src/data_designer/cli/commands/providers.py +++ b/src/data_designer/cli/commands/providers.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.cli.controllers.provider_controller import ProviderController from data_designer.config.utils.constants import DATA_DESIGNER_HOME diff --git a/src/data_designer/cli/commands/reset.py b/src/data_designer/cli/commands/reset.py index 51a63936..fd49394c 100644 --- a/src/data_designer/cli/commands/reset.py +++ b/src/data_designer/cli/commands/reset.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import typer from data_designer.cli.repositories.model_repository import ModelRepository diff --git a/src/data_designer/cli/controllers/__init__.py b/src/data_designer/cli/controllers/__init__.py index e4dbe53d..8deb0cfa 100644 --- a/src/data_designer/cli/controllers/__init__.py +++ b/src/data_designer/cli/controllers/__init__.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.cli.controllers.download_controller import DownloadController from data_designer.cli.controllers.model_controller import ModelController from data_designer.cli.controllers.provider_controller import ProviderController diff --git a/src/data_designer/cli/controllers/download_controller.py b/src/data_designer/cli/controllers/download_controller.py index 4b4e1009..706323df 100644 --- a/src/data_designer/cli/controllers/download_controller.py +++ b/src/data_designer/cli/controllers/download_controller.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import subprocess from pathlib import Path diff --git a/src/data_designer/cli/controllers/model_controller.py b/src/data_designer/cli/controllers/model_controller.py index 6302cfc6..24a0379e 100644 --- a/src/data_designer/cli/controllers/model_controller.py +++ b/src/data_designer/cli/controllers/model_controller.py @@ -1,7 +1,10 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from pathlib import Path +from typing import TYPE_CHECKING from data_designer.cli.forms.model_builder import ModelFormBuilder from data_designer.cli.repositories.model_repository import ModelRepository @@ -20,7 +23,9 @@ from data_designer.cli.ui import ( print_warning, select_with_arrows, ) -from data_designer.config.models import ModelConfig + +if TYPE_CHECKING: + from data_designer.config.models import ModelConfig class ModelController: diff --git a/src/data_designer/cli/controllers/provider_controller.py b/src/data_designer/cli/controllers/provider_controller.py index edec7cac..94e7e03c 100644 --- a/src/data_designer/cli/controllers/provider_controller.py +++ b/src/data_designer/cli/controllers/provider_controller.py @@ -1,8 +1,11 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import copy from pathlib import Path +from typing import TYPE_CHECKING from data_designer.cli.forms.provider_builder import ProviderFormBuilder from data_designer.cli.repositories.model_repository import ModelRepository @@ -20,7 +23,9 @@ from data_designer.cli.ui import ( print_warning, select_with_arrows, ) -from data_designer.engine.model_provider import ModelProvider + +if TYPE_CHECKING: + from data_designer.engine.model_provider import ModelProvider class ProviderController: diff --git a/src/data_designer/cli/forms/__init__.py b/src/data_designer/cli/forms/__init__.py index 6f8f1683..acc27afa 100644 --- a/src/data_designer/cli/forms/__init__.py +++ b/src/data_designer/cli/forms/__init__.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.cli.forms.builder import FormBuilder from data_designer.cli.forms.field import Field, NumericField, SelectField, TextField, ValidationError from data_designer.cli.forms.form import Form diff --git a/src/data_designer/cli/forms/builder.py b/src/data_designer/cli/forms/builder.py index 75503550..5ab3a0d8 100644 --- a/src/data_designer/cli/forms/builder.py +++ b/src/data_designer/cli/forms/builder.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from abc import ABC, abstractmethod from typing import Any, Generic, TypeVar diff --git a/src/data_designer/cli/forms/field.py b/src/data_designer/cli/forms/field.py index 7aba17d9..fbdaf949 100644 --- a/src/data_designer/cli/forms/field.py +++ b/src/data_designer/cli/forms/field.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from abc import ABC, abstractmethod from collections.abc import Callable from typing import Any, Generic, TypeVar diff --git a/src/data_designer/cli/forms/form.py b/src/data_designer/cli/forms/form.py index 800c9e44..c8c9349b 100644 --- a/src/data_designer/cli/forms/form.py +++ b/src/data_designer/cli/forms/form.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from typing import Any from data_designer.cli.forms.field import Field diff --git a/src/data_designer/cli/forms/model_builder.py b/src/data_designer/cli/forms/model_builder.py index f2cd2d5e..e1d637df 100644 --- a/src/data_designer/cli/forms/model_builder.py +++ b/src/data_designer/cli/forms/model_builder.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from typing import Any from data_designer.cli.forms.builder import FormBuilder diff --git a/src/data_designer/cli/forms/provider_builder.py b/src/data_designer/cli/forms/provider_builder.py index 1d321105..01120b16 100644 --- a/src/data_designer/cli/forms/provider_builder.py +++ b/src/data_designer/cli/forms/provider_builder.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from typing import Any from data_designer.cli.forms.builder import FormBuilder diff --git a/src/data_designer/cli/main.py b/src/data_designer/cli/main.py index 72be5b5c..420a5c9a 100644 --- a/src/data_designer/cli/main.py +++ b/src/data_designer/cli/main.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import typer from data_designer.cli.commands import download, models, providers, reset diff --git a/src/data_designer/cli/repositories/__init__.py b/src/data_designer/cli/repositories/__init__.py index 0977aa41..6c59c0a3 100644 --- a/src/data_designer/cli/repositories/__init__.py +++ b/src/data_designer/cli/repositories/__init__.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.cli.repositories.base import ConfigRepository from data_designer.cli.repositories.model_repository import ModelRepository from data_designer.cli.repositories.provider_repository import ProviderRepository diff --git a/src/data_designer/cli/repositories/base.py b/src/data_designer/cli/repositories/base.py index 6c8f29c3..f1b5088f 100644 --- a/src/data_designer/cli/repositories/base.py +++ b/src/data_designer/cli/repositories/base.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from abc import ABC, abstractmethod from pathlib import Path from typing import Generic, TypeVar diff --git a/src/data_designer/cli/repositories/model_repository.py b/src/data_designer/cli/repositories/model_repository.py index 7a22214b..0285cda6 100644 --- a/src/data_designer/cli/repositories/model_repository.py +++ b/src/data_designer/cli/repositories/model_repository.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from pathlib import Path from pydantic import BaseModel diff --git a/src/data_designer/cli/repositories/persona_repository.py b/src/data_designer/cli/repositories/persona_repository.py index 5fced12f..d1ec32f6 100644 --- a/src/data_designer/cli/repositories/persona_repository.py +++ b/src/data_designer/cli/repositories/persona_repository.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from pydantic import BaseModel from data_designer.config.utils.constants import ( diff --git a/src/data_designer/cli/repositories/provider_repository.py b/src/data_designer/cli/repositories/provider_repository.py index 59432853..a2b692fb 100644 --- a/src/data_designer/cli/repositories/provider_repository.py +++ b/src/data_designer/cli/repositories/provider_repository.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from pathlib import Path from pydantic import BaseModel diff --git a/src/data_designer/cli/services/__init__.py b/src/data_designer/cli/services/__init__.py index 1a1e2972..62ddf891 100644 --- a/src/data_designer/cli/services/__init__.py +++ b/src/data_designer/cli/services/__init__.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.cli.services.download_service import DownloadService from data_designer.cli.services.model_service import ModelService from data_designer.cli.services.provider_service import ProviderService diff --git a/src/data_designer/cli/services/download_service.py b/src/data_designer/cli/services/download_service.py index be47f1a4..be1594be 100644 --- a/src/data_designer/cli/services/download_service.py +++ b/src/data_designer/cli/services/download_service.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import glob import shutil import subprocess diff --git a/src/data_designer/cli/services/model_service.py b/src/data_designer/cli/services/model_service.py index 8bb98237..1ec73802 100644 --- a/src/data_designer/cli/services/model_service.py +++ b/src/data_designer/cli/services/model_service.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.cli.repositories.model_repository import ModelConfigRegistry, ModelRepository from data_designer.config.models import ModelConfig diff --git a/src/data_designer/cli/services/provider_service.py b/src/data_designer/cli/services/provider_service.py index 8a4f3bdd..02bb05da 100644 --- a/src/data_designer/cli/services/provider_service.py +++ b/src/data_designer/cli/services/provider_service.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.cli.repositories.provider_repository import ModelProviderRegistry, ProviderRepository from data_designer.config.models import ModelProvider diff --git a/src/data_designer/cli/ui.py b/src/data_designer/cli/ui.py index 0d7fe4c0..2b565a4c 100644 --- a/src/data_designer/cli/ui.py +++ b/src/data_designer/cli/ui.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from collections.abc import Callable from prompt_toolkit import Application, prompt diff --git a/src/data_designer/cli/utils.py b/src/data_designer/cli/utils.py index 7c5cb400..93a716f4 100644 --- a/src/data_designer/cli/utils.py +++ b/src/data_designer/cli/utils.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import shutil import subprocess diff --git a/src/data_designer/config/analysis/column_profilers.py b/src/data_designer/config/analysis/column_profilers.py index 772aa9e9..f175570c 100644 --- a/src/data_designer/config/analysis/column_profilers.py +++ b/src/data_designer/config/analysis/column_profilers.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from abc import ABC from enum import Enum diff --git a/src/data_designer/config/analysis/column_statistics.py b/src/data_designer/config/analysis/column_statistics.py index f89a34d4..844e9659 100644 --- a/src/data_designer/config/analysis/column_statistics.py +++ b/src/data_designer/config/analysis/column_statistics.py @@ -5,9 +5,8 @@ from __future__ import annotations from abc import ABC, abstractmethod from enum import Enum -from typing import Any, Literal +from typing import TYPE_CHECKING, Any, Literal -from pandas import Series from pydantic import BaseModel, ConfigDict, create_model, field_validator, model_validator from typing_extensions import Self, TypeAlias @@ -15,8 +14,12 @@ from data_designer.config.column_types import DataDesignerColumnType from data_designer.config.sampler_params import SamplerType from data_designer.config.utils.constants import EPSILON from data_designer.config.utils.numerical_helpers import is_float, is_int, prepare_number_for_reporting +from data_designer.lazy_heavy_imports import pd from data_designer.plugin_manager import PluginManager +if TYPE_CHECKING: + import pandas as pd + class MissingValue(str, Enum): CALCULATION_FAILED = "--" @@ -314,7 +317,7 @@ class CategoricalHistogramData(BaseModel): return self @classmethod - def from_series(cls, series: Series) -> Self: + def from_series(cls, series: pd.Series) -> Self: counts = series.value_counts() return cls(categories=counts.index.tolist(), counts=counts.tolist()) @@ -337,7 +340,7 @@ class CategoricalDistribution(BaseModel): return str(v) if not is_int(v) else prepare_number_for_reporting(v, int) @classmethod - def from_series(cls, series: Series) -> Self: + def from_series(cls, series: pd.Series) -> Self: counts = series.value_counts() return cls( most_common_value=counts.index[0], @@ -368,7 +371,7 @@ class NumericalDistribution(BaseModel): return prepare_number_for_reporting(v, int if is_int(v) else 
float) @classmethod - def from_series(cls, series: Series) -> Self: + def from_series(cls, series: pd.Series) -> Self: return cls( min=series.min(skipna=True), max=series.max(skipna=True), diff --git a/src/data_designer/config/analysis/dataset_profiler.py b/src/data_designer/config/analysis/dataset_profiler.py index bf797d30..c4ff5b96 100644 --- a/src/data_designer/config/analysis/dataset_profiler.py +++ b/src/data_designer/config/analysis/dataset_profiler.py @@ -1,19 +1,25 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from functools import cached_property from pathlib import Path -from typing import Annotated +from typing import TYPE_CHECKING, Annotated from pydantic import BaseModel, Field, field_validator from data_designer.config.analysis.column_profilers import ColumnProfilerResultsT from data_designer.config.analysis.column_statistics import ColumnStatisticsT -from data_designer.config.analysis.utils.reporting import ReportSection, generate_analysis_report -from data_designer.config.column_types import DataDesignerColumnType, get_column_display_order +from data_designer.config.analysis.utils.reporting import generate_analysis_report +from data_designer.config.column_types import get_column_display_order from data_designer.config.utils.constants import EPSILON from data_designer.config.utils.numerical_helpers import prepare_number_for_reporting +if TYPE_CHECKING: + from data_designer.config.analysis.utils.reporting import ReportSection + from data_designer.config.column_types import DataDesignerColumnType + class DatasetProfilerResults(BaseModel): """Container for complete dataset profiling and analysis results. 
diff --git a/src/data_designer/config/analysis/utils/errors.py b/src/data_designer/config/analysis/utils/errors.py index 88d6e5e1..fd760e0d 100644 --- a/src/data_designer/config/analysis/utils/errors.py +++ b/src/data_designer/config/analysis/utils/errors.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.errors import DataDesignerError diff --git a/src/data_designer/config/analysis/utils/reporting.py b/src/data_designer/config/analysis/utils/reporting.py index 3b4d823d..fadaa89e 100644 --- a/src/data_designer/config/analysis/utils/reporting.py +++ b/src/data_designer/config/analysis/utils/reporting.py @@ -14,7 +14,6 @@ from rich.rule import Rule from rich.table import Column, Table from rich.text import Text -from data_designer.config.analysis.column_statistics import CategoricalHistogramData from data_designer.config.analysis.utils.errors import AnalysisReportError from data_designer.config.column_types import ( DataDesignerColumnType, @@ -29,6 +28,7 @@ from data_designer.config.utils.visualization import ( ) if TYPE_CHECKING: + from data_designer.config.analysis.column_statistics import CategoricalHistogramData from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults HEADER_STYLE = "dim" diff --git a/src/data_designer/config/column_configs.py b/src/data_designer/config/column_configs.py index 3fa099a5..ca773633 100644 --- a/src/data_designer/config/column_configs.py +++ b/src/data_designer/config/column_configs.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from abc import ABC, abstractmethod from typing import Annotated, Literal diff --git a/src/data_designer/config/column_types.py b/src/data_designer/config/column_types.py index 2d52bf94..33805bfe 100644 --- a/src/data_designer/config/column_types.py +++ b/src/data_designer/config/column_types.py @@ -1,6 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations from typing_extensions import TypeAlias diff --git a/src/data_designer/config/dataset_builders.py b/src/data_designer/config/dataset_builders.py index 2cadc6c0..bbfbb2fb 100644 --- a/src/data_designer/config/dataset_builders.py +++ b/src/data_designer/config/dataset_builders.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from enum import Enum diff --git a/src/data_designer/config/default_model_settings.py b/src/data_designer/config/default_model_settings.py index d60b7937..918287e5 100644 --- a/src/data_designer/config/default_model_settings.py +++ b/src/data_designer/config/default_model_settings.py @@ -1,6 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations import logging import os diff --git a/src/data_designer/config/errors.py b/src/data_designer/config/errors.py index acbe2097..2778a1ba 100644 --- a/src/data_designer/config/errors.py +++ b/src/data_designer/config/errors.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.errors import DataDesignerError diff --git a/src/data_designer/config/exports.py b/src/data_designer/config/exports.py index 77cac5de..21ce8d6f 100644 --- a/src/data_designer/config/exports.py +++ b/src/data_designer/config/exports.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.config.analysis.column_profilers import JudgeScoreProfilerConfig from data_designer.config.column_configs import ( EmbeddingColumnConfig, diff --git a/src/data_designer/config/interface.py b/src/data_designer/config/interface.py index 054232a7..3eadd5d0 100644 --- a/src/data_designer/config/interface.py +++ b/src/data_designer/config/interface.py @@ -6,13 +6,14 @@ from __future__ import annotations from abc import ABC, abstractmethod from typing import TYPE_CHECKING, Generic, Protocol, TypeVar -import pandas as pd - from data_designer.config.models import ModelConfig, ModelProvider from data_designer.config.utils.constants import DEFAULT_NUM_RECORDS from data_designer.config.utils.info import InterfaceInfo +from data_designer.lazy_heavy_imports import pd if TYPE_CHECKING: + import pandas as pd + from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults from data_designer.config.config_builder import DataDesignerConfigBuilder from data_designer.config.preview_results import PreviewResults diff --git a/src/data_designer/config/models.py b/src/data_designer/config/models.py index 4ab7107e..eb08ed41 100644 --- a/src/data_designer/config/models.py +++ b/src/data_designer/config/models.py @@ -1,13 +1,14 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import logging from abc import ABC, abstractmethod from enum import Enum from pathlib import Path -from typing import Annotated, Any, Generic, Literal, TypeVar +from typing import TYPE_CHECKING, Annotated, Any, Generic, Literal, TypeVar -import numpy as np from pydantic import BaseModel, Field, field_validator, model_validator from typing_extensions import Self, TypeAlias @@ -20,6 +21,10 @@ from data_designer.config.utils.constants import ( MIN_TOP_P, ) from data_designer.config.utils.io_helpers import smart_load_yaml +from data_designer.lazy_heavy_imports import np + +if TYPE_CHECKING: + import numpy as np logger = logging.getLogger(__name__) diff --git a/src/data_designer/config/preview_results.py b/src/data_designer/config/preview_results.py index 4fba22f8..6df1a8d2 100644 --- a/src/data_designer/config/preview_results.py +++ b/src/data_designer/config/preview_results.py @@ -3,12 +3,16 @@ from __future__ import annotations -import pandas as pd +from typing import TYPE_CHECKING from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults from data_designer.config.config_builder import DataDesignerConfigBuilder from data_designer.config.dataset_metadata import DatasetMetadata from data_designer.config.utils.visualization import WithRecordSamplerMixin +from data_designer.lazy_heavy_imports import pd + +if TYPE_CHECKING: + import pandas as pd class PreviewResults(WithRecordSamplerMixin): diff --git a/src/data_designer/config/processors.py b/src/data_designer/config/processors.py index 58fa97a7..db7bb9ce 100644 --- a/src/data_designer/config/processors.py +++ b/src/data_designer/config/processors.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import json from abc import ABC from enum import Enum diff --git a/src/data_designer/config/run_config.py b/src/data_designer/config/run_config.py index d1123070..daa9f994 100644 --- a/src/data_designer/config/run_config.py +++ b/src/data_designer/config/run_config.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from pydantic import Field, model_validator from typing_extensions import Self diff --git a/src/data_designer/config/sampler_constraints.py b/src/data_designer/config/sampler_constraints.py index 5e387350..86dc2c09 100644 --- a/src/data_designer/config/sampler_constraints.py +++ b/src/data_designer/config/sampler_constraints.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from abc import ABC, abstractmethod from enum import Enum diff --git a/src/data_designer/config/sampler_params.py b/src/data_designer/config/sampler_params.py index dab08e93..936dace0 100644 --- a/src/data_designer/config/sampler_params.py +++ b/src/data_designer/config/sampler_params.py @@ -1,10 +1,11 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -from enum import Enum -from typing import Literal +from __future__ import annotations + +from enum import Enum +from typing import TYPE_CHECKING, Literal -import pandas as pd from pydantic import Field, field_validator, model_validator from typing_extensions import Self, TypeAlias @@ -16,6 +17,10 @@ from data_designer.config.utils.constants import ( MAX_AGE, MIN_AGE, ) +from data_designer.lazy_heavy_imports import pd + +if TYPE_CHECKING: + import pandas as pd class SamplerType(str, Enum): diff --git a/src/data_designer/config/seed.py b/src/data_designer/config/seed.py index 8ab83be3..bdd9dae2 100644 --- a/src/data_designer/config/seed.py +++ b/src/data_designer/config/seed.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from enum import Enum from pydantic import Field, model_validator diff --git a/src/data_designer/config/seed_source.py b/src/data_designer/config/seed_source.py index 522dd5e5..124fcfe0 100644 --- a/src/data_designer/config/seed_source.py +++ b/src/data_designer/config/seed_source.py @@ -1,10 +1,11 @@ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -from abc import ABC -from typing import Literal +from __future__ import annotations + +from abc import ABC +from typing import TYPE_CHECKING, Literal -import pandas as pd from pydantic import BaseModel, ConfigDict, Field, field_validator from pydantic.json_schema import SkipJsonSchema from typing_extensions import Self @@ -14,6 +15,10 @@ from data_designer.config.utils.io_helpers import ( validate_dataset_file_path, validate_path_contains_files_of_type, ) +from data_designer.lazy_heavy_imports import pd + +if TYPE_CHECKING: + import pandas as pd class SeedSource(BaseModel, ABC): diff --git a/src/data_designer/config/seed_source_types.py b/src/data_designer/config/seed_source_types.py index 34d01a40..b7897a3a 100644 --- a/src/data_designer/config/seed_source_types.py +++ b/src/data_designer/config/seed_source_types.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from typing import Annotated from pydantic import Field diff --git a/src/data_designer/config/utils/constants.py b/src/data_designer/config/utils/constants.py index 19f20a4e..8662d476 100644 --- a/src/data_designer/config/utils/constants.py +++ b/src/data_designer/config/utils/constants.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import os from enum import Enum from pathlib import Path diff --git a/src/data_designer/config/utils/errors.py b/src/data_designer/config/utils/errors.py index 32c584cb..aa56c2b7 100644 --- a/src/data_designer/config/utils/errors.py +++ b/src/data_designer/config/utils/errors.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.errors import DataDesignerError diff --git a/src/data_designer/config/utils/info.py b/src/data_designer/config/utils/info.py index 82ae7c61..9817f50d 100644 --- a/src/data_designer/config/utils/info.py +++ b/src/data_designer/config/utils/info.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from abc import ABC, abstractmethod from enum import Enum from typing import Literal, TypeVar diff --git a/src/data_designer/config/utils/io_helpers.py b/src/data_designer/config/utils/io_helpers.py index 7576736b..7a97c9c4 100644 --- a/src/data_designer/config/utils/io_helpers.py +++ b/src/data_designer/config/utils/io_helpers.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import json import logging import os @@ -8,13 +10,16 @@ from datetime import date, datetime, timedelta from decimal import Decimal from numbers import Number from pathlib import Path -from typing import Any +from typing import TYPE_CHECKING, Any -import numpy as np -import pandas as pd import yaml from data_designer.config.errors import InvalidFileFormatError, InvalidFilePathError +from data_designer.lazy_heavy_imports import np, pd + +if TYPE_CHECKING: + import numpy as np + import pandas as pd logger = logging.getLogger(__name__) diff --git a/src/data_designer/config/utils/numerical_helpers.py b/src/data_designer/config/utils/numerical_helpers.py index 5fc218b0..7fcd3dd2 100644 --- a/src/data_designer/config/utils/numerical_helpers.py +++ b/src/data_designer/config/utils/numerical_helpers.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import numbers from numbers import Number from typing import Any diff --git a/src/data_designer/config/utils/type_helpers.py b/src/data_designer/config/utils/type_helpers.py index c7c89d05..8e2acb54 100644 --- a/src/data_designer/config/utils/type_helpers.py +++ b/src/data_designer/config/utils/type_helpers.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import inspect from enum import Enum from typing import Any, Literal, get_args, get_origin diff --git a/src/data_designer/config/utils/visualization.py b/src/data_designer/config/utils/visualization.py index 4edd2a32..7e5c79a9 100644 --- a/src/data_designer/config/utils/visualization.py +++ b/src/data_designer/config/utils/visualization.py @@ -10,8 +10,6 @@ from enum import Enum from functools import cached_property from typing import TYPE_CHECKING, Any -import numpy as np -import pandas as pd from rich.console import Console, Group from rich.padding import Padding from rich.panel import Panel @@ -28,8 +26,12 @@ from data_designer.config.sampler_params import SamplerType from data_designer.config.utils.code_lang import code_lang_to_syntax_lexer from data_designer.config.utils.constants import NVIDIA_API_KEY_ENV_VAR_NAME, OPENAI_API_KEY_ENV_VAR_NAME from data_designer.config.utils.errors import DatasetSampleDisplayError +from data_designer.lazy_heavy_imports import np, pd if TYPE_CHECKING: + import numpy as np + import pandas as pd + from data_designer.config.config_builder import DataDesignerConfigBuilder from data_designer.config.dataset_metadata import DatasetMetadata diff --git a/src/data_designer/config/validator_params.py b/src/data_designer/config/validator_params.py index e3d5131a..e08c3186 100644 --- a/src/data_designer/config/validator_params.py +++ 
b/src/data_designer/config/validator_params.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from enum import Enum from typing import Any diff --git a/src/data_designer/engine/analysis/column_profilers/base.py b/src/data_designer/engine/analysis/column_profilers/base.py index 1b19db1b..a97585a7 100644 --- a/src/data_designer/engine/analysis/column_profilers/base.py +++ b/src/data_designer/engine/analysis/column_profilers/base.py @@ -5,8 +5,8 @@ from __future__ import annotations import logging from abc import ABC, abstractmethod +from typing import TYPE_CHECKING -import pandas as pd from pydantic import BaseModel, model_validator from typing_extensions import Self @@ -14,6 +14,10 @@ from data_designer.config.base import ConfigBase from data_designer.config.column_configs import SingleColumnConfig from data_designer.config.column_types import DataDesignerColumnType from data_designer.engine.configurable_task import ConfigurableTask, TaskConfigT +from data_designer.lazy_heavy_imports import pd + +if TYPE_CHECKING: + import pandas as pd logger = logging.getLogger(__name__) diff --git a/src/data_designer/engine/analysis/column_profilers/judge_score_profiler.py b/src/data_designer/engine/analysis/column_profilers/judge_score_profiler.py index 7b37c9ba..437ad9a7 100644 --- a/src/data_designer/engine/analysis/column_profilers/judge_score_profiler.py +++ b/src/data_designer/engine/analysis/column_profilers/judge_score_profiler.py @@ -5,19 +5,16 @@ from __future__ import annotations import logging import random +from typing import TYPE_CHECKING from data_designer.config.analysis.column_profilers import ( JudgeScoreProfilerConfig, JudgeScoreProfilerResults, - JudgeScoreSample, JudgeScoreSummary, ) from data_designer.config.analysis.column_statistics import ( - CategoricalDistribution, - CategoricalHistogramData, 
ColumnDistributionType, MissingValue, - NumericalDistribution, ) from data_designer.config.column_types import DataDesignerColumnType from data_designer.engine.analysis.column_profilers.base import ColumnConfigWithDataFrame, ColumnProfiler @@ -25,9 +22,17 @@ from data_designer.engine.analysis.utils.judge_score_processing import ( extract_judge_score_distributions, sample_scores_and_reasoning, ) -from data_designer.engine.models.facade import ModelFacade from data_designer.engine.models.recipes.response_recipes import TextResponseRecipe +if TYPE_CHECKING: + from data_designer.config.analysis.column_profilers import JudgeScoreSample + from data_designer.config.analysis.column_statistics import ( + CategoricalDistribution, + CategoricalHistogramData, + NumericalDistribution, + ) + from data_designer.engine.models.facade import ModelFacade + logger = logging.getLogger(__name__) diff --git a/src/data_designer/engine/analysis/column_profilers/registry.py b/src/data_designer/engine/analysis/column_profilers/registry.py index 7fb6529b..e0bcac86 100644 --- a/src/data_designer/engine/analysis/column_profilers/registry.py +++ b/src/data_designer/engine/analysis/column_profilers/registry.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.config.analysis.column_profilers import ColumnProfilerType from data_designer.config.base import ConfigBase from data_designer.engine.analysis.column_profilers.base import ColumnProfiler diff --git a/src/data_designer/engine/analysis/column_statistics.py b/src/data_designer/engine/analysis/column_statistics.py index 9d6cd23b..62dfb80e 100644 --- a/src/data_designer/engine/analysis/column_statistics.py +++ b/src/data_designer/engine/analysis/column_statistics.py @@ -4,9 +4,8 @@ from __future__ import annotations import logging -from typing import Any, TypeAlias +from typing import TYPE_CHECKING, Any, TypeAlias -import pandas as pd from pydantic import BaseModel from typing_extensions import Self @@ -25,6 +24,10 @@ from data_designer.engine.analysis.utils.column_statistics_calculations import ( calculate_token_stats, calculate_validation_column_info, ) +from data_designer.lazy_heavy_imports import pd + +if TYPE_CHECKING: + import pandas as pd logger = logging.getLogger(__name__) diff --git a/src/data_designer/engine/analysis/dataset_profiler.py b/src/data_designer/engine/analysis/dataset_profiler.py index d1f48bed..898edbea 100644 --- a/src/data_designer/engine/analysis/dataset_profiler.py +++ b/src/data_designer/engine/analysis/dataset_profiler.py @@ -1,12 +1,13 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import logging from collections.abc import Sequence from functools import cached_property +from typing import TYPE_CHECKING -import pandas as pd -import pyarrow as pa from pydantic import Field, field_validator from data_designer.config.analysis.column_profilers import ColumnProfilerConfigT @@ -21,6 +22,11 @@ from data_designer.engine.analysis.utils.column_statistics_calculations import h from data_designer.engine.dataset_builders.multi_column_configs import DatasetBuilderColumnConfigT, MultiColumnConfig from data_designer.engine.registry.data_designer_registry import DataDesignerRegistry from data_designer.engine.resources.resource_provider import ResourceProvider +from data_designer.lazy_heavy_imports import pa, pd + +if TYPE_CHECKING: + import pandas as pd + import pyarrow as pa logger = logging.getLogger(__name__) diff --git a/src/data_designer/engine/analysis/errors.py b/src/data_designer/engine/analysis/errors.py index e99cfac7..84c7afdd 100644 --- a/src/data_designer/engine/analysis/errors.py +++ b/src/data_designer/engine/analysis/errors.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.errors import DataDesignerError diff --git a/src/data_designer/engine/analysis/utils/column_statistics_calculations.py b/src/data_designer/engine/analysis/utils/column_statistics_calculations.py index 10bce2e6..8dae819d 100644 --- a/src/data_designer/engine/analysis/utils/column_statistics_calculations.py +++ b/src/data_designer/engine/analysis/utils/column_statistics_calculations.py @@ -5,11 +5,8 @@ from __future__ import annotations import logging from numbers import Number -from typing import Any +from typing import TYPE_CHECKING, Any -import numpy as np -import pandas as pd -import pyarrow as pa import tiktoken from data_designer.config.analysis.column_statistics import ( @@ -26,6 +23,12 @@ from data_designer.engine.column_generators.utils.prompt_renderer import ( RecordBasedPromptRenderer, create_response_recipe, ) +from data_designer.lazy_heavy_imports import np, pa, pd + +if TYPE_CHECKING: + import numpy as np + import pandas as pd + import pyarrow as pa RANDOM_SEED = 42 MAX_PROMPT_SAMPLE_SIZE = 1000 diff --git a/src/data_designer/engine/analysis/utils/judge_score_processing.py b/src/data_designer/engine/analysis/utils/judge_score_processing.py index 21d13358..99ce1992 100644 --- a/src/data_designer/engine/analysis/utils/judge_score_processing.py +++ b/src/data_designer/engine/analysis/utils/judge_score_processing.py @@ -1,11 +1,11 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import logging from collections import defaultdict -from typing import Any - -import pandas as pd +from typing import TYPE_CHECKING, Any from data_designer.config.analysis.column_profilers import JudgeScoreDistributions, JudgeScoreSample from data_designer.config.analysis.column_statistics import ( @@ -15,6 +15,10 @@ from data_designer.config.analysis.column_statistics import ( NumericalDistribution, ) from data_designer.config.column_configs import LLMJudgeColumnConfig +from data_designer.lazy_heavy_imports import pd + +if TYPE_CHECKING: + import pandas as pd logger = logging.getLogger(__name__) diff --git a/src/data_designer/engine/column_generators/generators/base.py b/src/data_designer/engine/column_generators/generators/base.py index 617d35fd..862061c3 100644 --- a/src/data_designer/engine/column_generators/generators/base.py +++ b/src/data_designer/engine/column_generators/generators/base.py @@ -9,16 +9,16 @@ from abc import ABC, abstractmethod from enum import Enum from typing import TYPE_CHECKING, overload -import pandas as pd - from data_designer.engine.configurable_task import ConfigurableTask, DataT, TaskConfigT +from data_designer.lazy_heavy_imports import pd if TYPE_CHECKING: + import pandas as pd + from data_designer.config.models import BaseInferenceParams, ModelConfig from data_designer.engine.models.facade import ModelFacade from data_designer.engine.models.registry import ModelRegistry - logger = logging.getLogger(__name__) diff --git a/src/data_designer/engine/column_generators/generators/embedding.py b/src/data_designer/engine/column_generators/generators/embedding.py index 22b08930..83b13ffd 100644 --- a/src/data_designer/engine/column_generators/generators/embedding.py +++ b/src/data_designer/engine/column_generators/generators/embedding.py @@ -1,6 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations from pydantic import BaseModel, computed_field diff --git a/src/data_designer/engine/column_generators/generators/expression.py b/src/data_designer/engine/column_generators/generators/expression.py index 41f58853..98c8fa7b 100644 --- a/src/data_designer/engine/column_generators/generators/expression.py +++ b/src/data_designer/engine/column_generators/generators/expression.py @@ -4,14 +4,17 @@ from __future__ import annotations import logging - -import pandas as pd +from typing import TYPE_CHECKING from data_designer.config.column_configs import ExpressionColumnConfig from data_designer.engine.column_generators.generators.base import ColumnGeneratorFullColumn from data_designer.engine.column_generators.utils.errors import ExpressionTemplateRenderError from data_designer.engine.processing.ginja.environment import WithJinja2UserTemplateRendering from data_designer.engine.processing.utils import deserialize_json_values +from data_designer.lazy_heavy_imports import pd + +if TYPE_CHECKING: + import pandas as pd logger = logging.getLogger(__name__) diff --git a/src/data_designer/engine/column_generators/generators/llm_completion.py b/src/data_designer/engine/column_generators/generators/llm_completion.py index 85248108..c35cf443 100644 --- a/src/data_designer/engine/column_generators/generators/llm_completion.py +++ b/src/data_designer/engine/column_generators/generators/llm_completion.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import functools import logging diff --git a/src/data_designer/engine/column_generators/generators/samplers.py b/src/data_designer/engine/column_generators/generators/samplers.py index 1cceafd0..de18598a 100644 --- a/src/data_designer/engine/column_generators/generators/samplers.py +++ b/src/data_designer/engine/column_generators/generators/samplers.py @@ -6,9 +6,7 @@ from __future__ import annotations import logging import random from functools import partial -from typing import Callable - -import pandas as pd +from typing import TYPE_CHECKING, Callable from data_designer.config.utils.constants import LOCALES_WITH_MANAGED_DATASETS from data_designer.engine.column_generators.generators.base import FromScratchColumnGenerator, GenerationStrategy @@ -18,6 +16,10 @@ from data_designer.engine.resources.managed_dataset_generator import ManagedData from data_designer.engine.sampling_gen.data_sources.sources import SamplerType from data_designer.engine.sampling_gen.entities.person import load_person_data_sampler from data_designer.engine.sampling_gen.generator import DatasetGenerator as SamplingDatasetGenerator +from data_designer.lazy_heavy_imports import pd + +if TYPE_CHECKING: + import pandas as pd logger = logging.getLogger(__name__) diff --git a/src/data_designer/engine/column_generators/generators/seed_dataset.py b/src/data_designer/engine/column_generators/generators/seed_dataset.py index d228ee00..64193aee 100644 --- a/src/data_designer/engine/column_generators/generators/seed_dataset.py +++ b/src/data_designer/engine/column_generators/generators/seed_dataset.py @@ -1,20 +1,22 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 - from __future__ import annotations import functools import logging - -import duckdb -import pandas as pd +from typing import TYPE_CHECKING from data_designer.config.seed import IndexRange, PartitionBlock, SamplingStrategy from data_designer.engine.column_generators.generators.base import FromScratchColumnGenerator, GenerationStrategy from data_designer.engine.column_generators.utils.errors import SeedDatasetError from data_designer.engine.dataset_builders.multi_column_configs import SeedDatasetMultiColumnConfig from data_designer.engine.processing.utils import concat_datasets +from data_designer.lazy_heavy_imports import duckdb, pd + +if TYPE_CHECKING: + import duckdb + import pandas as pd MAX_ZERO_RECORD_RESPONSE_FACTOR = 2 diff --git a/src/data_designer/engine/column_generators/generators/validation.py b/src/data_designer/engine/column_generators/generators/validation.py index ed1ac3e7..1308efd4 100644 --- a/src/data_designer/engine/column_generators/generators/validation.py +++ b/src/data_designer/engine/column_generators/generators/validation.py @@ -4,8 +4,7 @@ from __future__ import annotations import logging - -import pandas as pd +from typing import TYPE_CHECKING from data_designer.config.column_configs import ValidationColumnConfig from data_designer.config.errors import InvalidConfigError @@ -22,6 +21,10 @@ from data_designer.engine.validators import ( SQLValidator, ValidationResult, ) +from data_designer.lazy_heavy_imports import pd + +if TYPE_CHECKING: + import pandas as pd logger = logging.getLogger(__name__) diff --git a/src/data_designer/engine/column_generators/registry.py b/src/data_designer/engine/column_generators/registry.py index 06e9f07a..0f4d3297 100644 --- a/src/data_designer/engine/column_generators/registry.py +++ b/src/data_designer/engine/column_generators/registry.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.config.base import ConfigBase from data_designer.config.column_configs import ( EmbeddingColumnConfig, diff --git a/src/data_designer/engine/column_generators/utils/errors.py b/src/data_designer/engine/column_generators/utils/errors.py index 70b814d3..ae5b9112 100644 --- a/src/data_designer/engine/column_generators/utils/errors.py +++ b/src/data_designer/engine/column_generators/utils/errors.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.engine.errors import DataDesignerError diff --git a/src/data_designer/engine/column_generators/utils/generator_classification.py b/src/data_designer/engine/column_generators/utils/generator_classification.py index 9339b378..2e082779 100644 --- a/src/data_designer/engine/column_generators/utils/generator_classification.py +++ b/src/data_designer/engine/column_generators/utils/generator_classification.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.config.column_types import DataDesignerColumnType from data_designer.config.utils.type_helpers import resolve_string_enum from data_designer.engine.column_generators.generators.base import ColumnGeneratorWithModelRegistry diff --git a/src/data_designer/engine/column_generators/utils/judge_score_factory.py b/src/data_designer/engine/column_generators/utils/judge_score_factory.py index 782b1983..0ee030d0 100644 --- a/src/data_designer/engine/column_generators/utils/judge_score_factory.py +++ b/src/data_designer/engine/column_generators/utils/judge_score_factory.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from enum import Enum from pydantic import BaseModel, ConfigDict, Field, create_model diff --git a/src/data_designer/engine/column_generators/utils/prompt_renderer.py b/src/data_designer/engine/column_generators/utils/prompt_renderer.py index be701ae8..ee23ef2f 100644 --- a/src/data_designer/engine/column_generators/utils/prompt_renderer.py +++ b/src/data_designer/engine/column_generators/utils/prompt_renderer.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import json import logging diff --git a/src/data_designer/engine/compiler.py b/src/data_designer/engine/compiler.py index 921206ed..d3972b14 100644 --- a/src/data_designer/engine/compiler.py +++ b/src/data_designer/engine/compiler.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import logging from data_designer.config.column_configs import SeedDatasetColumnConfig diff --git a/src/data_designer/engine/configurable_task.py b/src/data_designer/engine/configurable_task.py index d93405db..2d64f9fc 100644 --- a/src/data_designer/engine/configurable_task.py +++ b/src/data_designer/engine/configurable_task.py @@ -1,15 +1,19 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from abc import ABC from pathlib import Path -from typing import Generic, TypeVar, get_origin - -import pandas as pd +from typing import TYPE_CHECKING, Generic, TypeVar, get_origin from data_designer.config.base import ConfigBase from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage from data_designer.engine.resources.resource_provider import ResourceProvider +from data_designer.lazy_heavy_imports import pd + +if TYPE_CHECKING: + import pandas as pd DataT = TypeVar("DataT", dict, pd.DataFrame) TaskConfigT = TypeVar("ConfigT", bound=ConfigBase) diff --git a/src/data_designer/engine/dataset_builders/artifact_storage.py b/src/data_designer/engine/dataset_builders/artifact_storage.py index 8f096bae..234bd04a 100644 --- a/src/data_designer/engine/dataset_builders/artifact_storage.py +++ b/src/data_designer/engine/dataset_builders/artifact_storage.py @@ -1,19 +1,25 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import json import logging import shutil from datetime import datetime from functools import cached_property from pathlib import Path +from typing import TYPE_CHECKING -import pandas as pd from pydantic import BaseModel, field_validator, model_validator from data_designer.config.utils.io_helpers import read_parquet_dataset from data_designer.config.utils.type_helpers import StrEnum, resolve_string_enum from data_designer.engine.dataset_builders.errors import ArtifactStorageError +from data_designer.lazy_heavy_imports import pd + +if TYPE_CHECKING: + import pandas as pd logger = logging.getLogger(__name__) diff --git a/src/data_designer/engine/dataset_builders/column_wise_builder.py b/src/data_designer/engine/dataset_builders/column_wise_builder.py index c6af0164..eedbac31 100644 --- a/src/data_designer/engine/dataset_builders/column_wise_builder.py +++ b/src/data_designer/engine/dataset_builders/column_wise_builder.py @@ -12,8 +12,6 @@ import uuid from pathlib import Path from typing import TYPE_CHECKING, Callable -import pandas as pd - from data_designer.config.column_types import ColumnConfigT from data_designer.config.dataset_builders import BuildStage from data_designer.config.processors import ( @@ -40,14 +38,16 @@ from data_designer.engine.processing.processors.base import Processor from data_designer.engine.processing.processors.drop_columns import DropColumnsProcessor from data_designer.engine.registry.data_designer_registry import DataDesignerRegistry from data_designer.engine.resources.resource_provider import ResourceProvider +from data_designer.lazy_heavy_imports import pd if TYPE_CHECKING: + import pandas as pd + from data_designer.engine.column_generators.generators.base import ColumnGeneratorWithModelRegistry from data_designer.engine.models.usage import ModelUsageStats logger = logging.getLogger(__name__) - _CLIENT_VERSION: str = importlib.metadata.version("data_designer") 
diff --git a/src/data_designer/engine/dataset_builders/errors.py b/src/data_designer/engine/dataset_builders/errors.py index 96d3cef3..819ce3eb 100644 --- a/src/data_designer/engine/dataset_builders/errors.py +++ b/src/data_designer/engine/dataset_builders/errors.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.engine.errors import DataDesignerError diff --git a/src/data_designer/engine/dataset_builders/multi_column_configs.py b/src/data_designer/engine/dataset_builders/multi_column_configs.py index 54dc67ae..8605ff9a 100644 --- a/src/data_designer/engine/dataset_builders/multi_column_configs.py +++ b/src/data_designer/engine/dataset_builders/multi_column_configs.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from abc import ABC from typing import TypeAlias diff --git a/src/data_designer/engine/dataset_builders/utils/config_compiler.py b/src/data_designer/engine/dataset_builders/utils/config_compiler.py index b2bace77..19f82827 100644 --- a/src/data_designer/engine/dataset_builders/utils/config_compiler.py +++ b/src/data_designer/engine/dataset_builders/utils/config_compiler.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.config.column_types import DataDesignerColumnType from data_designer.config.data_designer_config import DataDesignerConfig from data_designer.config.processors import ProcessorConfig diff --git a/src/data_designer/engine/dataset_builders/utils/dag.py b/src/data_designer/engine/dataset_builders/utils/dag.py index e187f39f..9592ea88 100644 --- a/src/data_designer/engine/dataset_builders/utils/dag.py +++ b/src/data_designer/engine/dataset_builders/utils/dag.py @@ -1,13 +1,18 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -import logging +from __future__ import annotations -import networkx as nx +import logging +from typing import TYPE_CHECKING from data_designer.config.column_types import ColumnConfigT from data_designer.engine.column_generators.utils.generator_classification import column_type_used_in_execution_dag from data_designer.engine.dataset_builders.utils.errors import DAGCircularDependencyError +from data_designer.lazy_heavy_imports import nx + +if TYPE_CHECKING: + import networkx as nx logger = logging.getLogger(__name__) diff --git a/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py b/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py index 176f4d60..9c9554a7 100644 --- a/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +++ b/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py @@ -1,16 +1,20 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import logging import shutil from pathlib import Path -from typing import Callable, Container, Iterator - -import pandas as pd -import pyarrow.parquet as pq +from typing import TYPE_CHECKING, Callable, Container, Iterator from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage, BatchStage from data_designer.engine.dataset_builders.utils.errors import DatasetBatchManagementError +from data_designer.lazy_heavy_imports import pd, pq + +if TYPE_CHECKING: + import pandas as pd + import pyarrow.parquet as pq logger = logging.getLogger(__name__) diff --git a/src/data_designer/engine/dataset_builders/utils/errors.py b/src/data_designer/engine/dataset_builders/utils/errors.py index 8f22fc03..4cf59697 100644 --- a/src/data_designer/engine/dataset_builders/utils/errors.py +++ b/src/data_designer/engine/dataset_builders/utils/errors.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.engine.errors import DataDesignerError diff --git a/src/data_designer/engine/errors.py b/src/data_designer/engine/errors.py index 2ed1dd18..3aee0544 100644 --- a/src/data_designer/engine/errors.py +++ b/src/data_designer/engine/errors.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from pydantic import BaseModel, Field from data_designer.errors import DataDesignerError diff --git a/src/data_designer/engine/model_provider.py b/src/data_designer/engine/model_provider.py index 02b51ca6..85bf8bbe 100644 --- a/src/data_designer/engine/model_provider.py +++ b/src/data_designer/engine/model_provider.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from functools import cached_property from pydantic import BaseModel, field_validator, model_validator diff --git a/src/data_designer/engine/models/errors.py b/src/data_designer/engine/models/errors.py index 6603b6ce..3e1ddf01 100644 --- a/src/data_designer/engine/models/errors.py +++ b/src/data_designer/engine/models/errors.py @@ -6,25 +6,15 @@ from __future__ import annotations import logging from collections.abc import Callable from functools import wraps -from typing import Any +from typing import TYPE_CHECKING, Any -from litellm.exceptions import ( - APIConnectionError, - APIError, - AuthenticationError, - BadRequestError, - ContextWindowExceededError, - InternalServerError, - NotFoundError, - PermissionDeniedError, - RateLimitError, - Timeout, - UnprocessableEntityError, - UnsupportedParamsError, -) from pydantic import BaseModel from data_designer.engine.errors import DataDesignerError +from data_designer.lazy_heavy_imports import litellm + +if TYPE_CHECKING: + import litellm logger = logging.getLogger(__name__) @@ -132,10 +122,10 @@ def handle_llm_exceptions( err_msg_parser = DownstreamLLMExceptionMessageParser(model_name, model_provider_name, purpose) match exception: # Common errors that can come from LiteLLM - case APIError(): + case litellm.exceptions.APIError(): raise err_msg_parser.parse_api_error(exception, authentication_error) from None - case APIConnectionError(): + 
case litellm.exceptions.APIConnectionError(): raise ModelAPIConnectionError( FormattedLLMErrorMessage( cause=f"Connection to model {model_name!r} hosted on model provider {model_provider_name!r} failed while {purpose}.", @@ -143,13 +133,13 @@ def handle_llm_exceptions( ) ) from None - case AuthenticationError(): + case litellm.exceptions.AuthenticationError(): raise ModelAuthenticationError(authentication_error) from None - case ContextWindowExceededError(): + case litellm.exceptions.ContextWindowExceededError(): raise err_msg_parser.parse_context_window_exceeded_error(exception) from None - case UnsupportedParamsError(): + case litellm.exceptions.UnsupportedParamsError(): raise ModelUnsupportedParamsError( FormattedLLMErrorMessage( cause=f"One or more of the parameters you provided were found to be unsupported by model {model_name!r} while {purpose}.", @@ -157,10 +147,10 @@ def handle_llm_exceptions( ) ) from None - case BadRequestError(): + case litellm.exceptions.BadRequestError(): raise err_msg_parser.parse_bad_request_error(exception) from None - case InternalServerError(): + case litellm.exceptions.InternalServerError(): raise ModelInternalServerError( FormattedLLMErrorMessage( cause=f"Model {model_name!r} is currently experiencing internal server issues while {purpose}.", @@ -168,7 +158,7 @@ def handle_llm_exceptions( ) ) from None - case NotFoundError(): + case litellm.exceptions.NotFoundError(): raise ModelNotFoundError( FormattedLLMErrorMessage( cause=f"The specified model {model_name!r} could not be found while {purpose}.", @@ -176,7 +166,7 @@ def handle_llm_exceptions( ) ) from None - case PermissionDeniedError(): + case litellm.exceptions.PermissionDeniedError(): raise ModelPermissionDeniedError( FormattedLLMErrorMessage( cause=f"Your API key was found to lack the necessary permissions to use model {model_name!r} while {purpose}.", @@ -184,7 +174,7 @@ def handle_llm_exceptions( ) ) from None - case RateLimitError(): + case 
litellm.exceptions.RateLimitError(): raise ModelRateLimitError( FormattedLLMErrorMessage( cause=f"You have exceeded the rate limit for model {model_name!r} while {purpose}.", @@ -192,7 +182,7 @@ def handle_llm_exceptions( ) ) from None - case Timeout(): + case litellm.exceptions.Timeout(): raise ModelTimeoutError( FormattedLLMErrorMessage( cause=f"The request to model {model_name!r} timed out while {purpose}.", @@ -200,7 +190,7 @@ def handle_llm_exceptions( ) ) from None - case UnprocessableEntityError(): + case litellm.exceptions.UnprocessableEntityError(): raise ModelUnprocessableEntityError( FormattedLLMErrorMessage( cause=f"The request to model {model_name!r} failed despite correct request format while {purpose}.", @@ -264,7 +254,7 @@ class DownstreamLLMExceptionMessageParser: self.model_provider_name = model_provider_name self.purpose = purpose - def parse_bad_request_error(self, exception: BadRequestError) -> DataDesignerError: + def parse_bad_request_error(self, exception: litellm.exceptions.BadRequestError) -> DataDesignerError: err_msg = FormattedLLMErrorMessage( cause=f"The request for model {self.model_name!r} was found to be malformed or missing required parameters while {self.purpose}.", solution="Check your request parameters and try again.", @@ -276,7 +266,9 @@ class DownstreamLLMExceptionMessageParser: ) return ModelBadRequestError(err_msg) - def parse_context_window_exceeded_error(self, exception: ContextWindowExceededError) -> DataDesignerError: + def parse_context_window_exceeded_error( + self, exception: litellm.exceptions.ContextWindowExceededError + ) -> DataDesignerError: cause = f"The input data for model '{self.model_name}' was found to exceed its supported context width while {self.purpose}." 
try: if "OpenAIException - This model's maximum context length is " in str(exception): @@ -295,7 +287,7 @@ class DownstreamLLMExceptionMessageParser: ) def parse_api_error( - self, exception: InternalServerError, auth_error_msg: FormattedLLMErrorMessage + self, exception: litellm.exceptions.InternalServerError, auth_error_msg: FormattedLLMErrorMessage ) -> DataDesignerError: if "Error code: 403" in str(exception): return ModelAuthenticationError(auth_error_msg) diff --git a/src/data_designer/engine/models/facade.py b/src/data_designer/engine/models/facade.py index b5640e29..103ab83c 100644 --- a/src/data_designer/engine/models/facade.py +++ b/src/data_designer/engine/models/facade.py @@ -6,10 +6,7 @@ from __future__ import annotations import logging from collections.abc import Callable from copy import deepcopy -from typing import Any - -from litellm.types.router import DeploymentTypedDict, LiteLLM_Params -from litellm.types.utils import EmbeddingResponse, ModelResponse +from typing import TYPE_CHECKING, Any from data_designer.config.models import GenerationType, ModelConfig, ModelProvider from data_designer.engine.model_provider import ModelProviderRegistry @@ -23,6 +20,10 @@ from data_designer.engine.models.parsers.errors import ParserException from data_designer.engine.models.usage import ModelUsageStats, RequestUsageStats, TokenUsageStats from data_designer.engine.models.utils import prompt_to_messages, str_to_message from data_designer.engine.secret_resolver import SecretResolver +from data_designer.lazy_heavy_imports import litellm + +if TYPE_CHECKING: + import litellm logger = logging.getLogger(__name__) @@ -65,7 +66,9 @@ class ModelFacade: def usage_stats(self) -> ModelUsageStats: return self._usage_stats - def completion(self, messages: list[dict[str, str]], skip_usage_tracking: bool = False, **kwargs) -> ModelResponse: + def completion( + self, messages: list[dict[str, str]], skip_usage_tracking: bool = False, **kwargs + ) -> litellm.ModelResponse: 
logger.debug( f"Prompting model {self.model_name!r}...", extra={"model": self.model_name, "messages": messages}, @@ -236,14 +239,14 @@ class ModelFacade: ) from exc return output_obj, reasoning_trace - def _get_litellm_deployment(self, model_config: ModelConfig) -> DeploymentTypedDict: + def _get_litellm_deployment(self, model_config: ModelConfig) -> litellm.DeploymentTypedDict: provider = self._model_provider_registry.get_provider(model_config.provider) api_key = None if provider.api_key: api_key = self._secret_resolver.resolve(provider.api_key) api_key = api_key or "not-used-but-required" - litellm_params = LiteLLM_Params( + litellm_params = litellm.LiteLLM_Params( model=f"{provider.provider_type}/{model_config.model}", api_base=provider.endpoint, api_key=api_key, @@ -253,7 +256,7 @@ class ModelFacade: "litellm_params": litellm_params.model_dump(), } - def _track_usage(self, response: ModelResponse | None) -> None: + def _track_usage(self, response: litellm.types.utils.ModelResponse | None) -> None: if response is None: self._usage_stats.extend(request_usage=RequestUsageStats(successful_requests=0, failed_requests=1)) return @@ -270,7 +273,7 @@ class ModelFacade: request_usage=RequestUsageStats(successful_requests=1, failed_requests=0), ) - def _track_usage_from_embedding(self, response: EmbeddingResponse | None) -> None: + def _track_usage_from_embedding(self, response: litellm.types.utils.EmbeddingResponse | None) -> None: if response is None: self._usage_stats.extend(request_usage=RequestUsageStats(successful_requests=0, failed_requests=1)) return diff --git a/src/data_designer/engine/models/factory.py b/src/data_designer/engine/models/factory.py new file mode 100644 index 00000000..5f9b30ae --- /dev/null +++ b/src/data_designer/engine/models/factory.py @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from data_designer.config.models import ModelConfig +from data_designer.engine.model_provider import ModelProviderRegistry +from data_designer.engine.secret_resolver import SecretResolver + +if TYPE_CHECKING: + from data_designer.engine.models.registry import ModelRegistry + + +def create_model_registry( + *, + model_configs: list[ModelConfig] | None = None, + secret_resolver: SecretResolver, + model_provider_registry: ModelProviderRegistry, +) -> ModelRegistry: + """Factory function for creating a ModelRegistry instance. + + Heavy dependencies (litellm, httpx) are deferred until this function is called. + This is a factory function pattern - imports inside factories are idiomatic Python + for lazy initialization. + """ + from data_designer.engine.models.facade import ModelFacade + from data_designer.engine.models.litellm_overrides import apply_litellm_patches + from data_designer.engine.models.registry import ModelRegistry + + apply_litellm_patches() + + def model_facade_factory(model_config, secret_resolver, model_provider_registry): + return ModelFacade(model_config, secret_resolver, model_provider_registry) + + return ModelRegistry( + model_configs=model_configs, + secret_resolver=secret_resolver, + model_provider_registry=model_provider_registry, + model_facade_factory=model_facade_factory, + ) diff --git a/src/data_designer/engine/models/litellm_overrides.py b/src/data_designer/engine/models/litellm_overrides.py index 35685bc6..eab141df 100644 --- a/src/data_designer/engine/models/litellm_overrides.py +++ b/src/data_designer/engine/models/litellm_overrides.py @@ -5,21 +5,26 @@ from __future__ import annotations import random import threading +from typing import TYPE_CHECKING -import httpx -import litellm -from litellm import RetryPolicy -from litellm.caching.in_memory_cache import InMemoryCache -from 
litellm.litellm_core_utils.logging_callback_manager import LoggingCallbackManager -from litellm.router import Router from pydantic import BaseModel, Field from typing_extensions import override +from data_designer.lazy_heavy_imports import httpx, litellm from data_designer.logging import quiet_noisy_logger +if TYPE_CHECKING: + import httpx + import litellm + DEFAULT_MAX_CALLBACKS = 1000 +def _get_logging_callback_manager(): + """Lazy accessor for LoggingCallbackManager to avoid loading litellm at import time.""" + return litellm.litellm_core_utils.logging_callback_manager.LoggingCallbackManager + + class LiteLLMRouterDefaultKwargs(BaseModel): ## Number of seconds to wait initially after a connection ## failure. @@ -35,15 +40,15 @@ class LiteLLMRouterDefaultKwargs(BaseModel): ## Sets the default retry policy, including the number ## of retries to use in particular scenarios. - retry_policy: RetryPolicy = Field( - default_factory=lambda: RetryPolicy( + retry_policy: litellm.RetryPolicy = Field( + default_factory=lambda: litellm.RetryPolicy( RateLimitErrorRetries=3, TimeoutErrorRetries=3, ) ) -class ThreadSafeCache(InMemoryCache): +class ThreadSafeCache(litellm.caching.in_memory_cache.InMemoryCache): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -78,7 +83,7 @@ class ThreadSafeCache(InMemoryCache): super().flush_cache() -class CustomRouter(Router): +class CustomRouter(litellm.router.Router): def __init__( self, *args, @@ -155,7 +160,7 @@ def apply_litellm_patches(): litellm.in_memory_llm_clients_cache = ThreadSafeCache() # Workaround for the litellm issue described in https://github.com/BerriAI/litellm/issues/9792 - LoggingCallbackManager.MAX_CALLBACKS = DEFAULT_MAX_CALLBACKS + _get_logging_callback_manager().MAX_CALLBACKS = DEFAULT_MAX_CALLBACKS quiet_noisy_logger("httpx") quiet_noisy_logger("LiteLLM") diff --git a/src/data_designer/engine/models/parsers/errors.py b/src/data_designer/engine/models/parsers/errors.py index 9bbed1e9..7d1db351 
100644 --- a/src/data_designer/engine/models/parsers/errors.py +++ b/src/data_designer/engine/models/parsers/errors.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + class ParserException(Exception): """Identifies errors resulting from generic parser errors. diff --git a/src/data_designer/engine/models/parsers/parser.py b/src/data_designer/engine/models/parsers/parser.py index 7c56a463..18e95bac 100644 --- a/src/data_designer/engine/models/parsers/parser.py +++ b/src/data_designer/engine/models/parsers/parser.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from functools import reduce import marko @@ -80,13 +82,11 @@ class LLMResponseParser: code: str syntax: Optional[str] = None - class CodeBlockParser: def __call__(self, element: _Element) -> CodeBlock: # Implementation details... return CodeBlock(code=element.text, syntax=element.get("class")) - parser = LLMResponseParser( tag_parsers={ "pre.code": CodeBlockParser(), diff --git a/src/data_designer/engine/models/parsers/postprocessors.py b/src/data_designer/engine/models/parsers/postprocessors.py index b9f935b8..4635562c 100644 --- a/src/data_designer/engine/models/parsers/postprocessors.py +++ b/src/data_designer/engine/models/parsers/postprocessors.py @@ -1,6 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations import json_repair from pydantic import BaseModel, ValidationError diff --git a/src/data_designer/engine/models/parsers/tag_parsers.py b/src/data_designer/engine/models/parsers/tag_parsers.py index 559fe1fc..ba09eed0 100644 --- a/src/data_designer/engine/models/parsers/tag_parsers.py +++ b/src/data_designer/engine/models/parsers/tag_parsers.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from lxml.etree import _Element from data_designer.engine.models.parsers.types import CodeBlock, TextBlock diff --git a/src/data_designer/engine/models/parsers/types.py b/src/data_designer/engine/models/parsers/types.py index 73de1b53..81575ef9 100644 --- a/src/data_designer/engine/models/parsers/types.py +++ b/src/data_designer/engine/models/parsers/types.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from typing import Any, Protocol, runtime_checkable from lxml.etree import _Element diff --git a/src/data_designer/engine/models/recipes/base.py b/src/data_designer/engine/models/recipes/base.py index 27f5d489..ab3e313a 100644 --- a/src/data_designer/engine/models/recipes/base.py +++ b/src/data_designer/engine/models/recipes/base.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import abc from collections.abc import Callable from typing import Generic, TypeVar diff --git a/src/data_designer/engine/models/recipes/response_recipes.py b/src/data_designer/engine/models/recipes/response_recipes.py index cf0ce90c..deba050e 100644 --- a/src/data_designer/engine/models/recipes/response_recipes.py +++ b/src/data_designer/engine/models/recipes/response_recipes.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import json from collections.abc import Callable diff --git a/src/data_designer/engine/models/registry.py b/src/data_designer/engine/models/registry.py index c3cf7a7d..29baf012 100644 --- a/src/data_designer/engine/models/registry.py +++ b/src/data_designer/engine/models/registry.py @@ -4,14 +4,17 @@ from __future__ import annotations import logging +from collections.abc import Callable +from typing import TYPE_CHECKING from data_designer.config.models import GenerationType, ModelConfig from data_designer.engine.model_provider import ModelProvider, ModelProviderRegistry -from data_designer.engine.models.facade import ModelFacade -from data_designer.engine.models.litellm_overrides import apply_litellm_patches from data_designer.engine.models.usage import ModelUsageStats, RequestUsageStats, TokenUsageStats from data_designer.engine.secret_resolver import SecretResolver +if TYPE_CHECKING: + from data_designer.engine.models.facade import ModelFacade + logger = logging.getLogger(__name__) @@ -22,10 +25,12 @@ class ModelRegistry: secret_resolver: SecretResolver, model_provider_registry: ModelProviderRegistry, model_configs: list[ModelConfig] | None = None, + model_facade_factory: Callable[[ModelConfig, SecretResolver, ModelProviderRegistry], ModelFacade] | None = None, ): self._secret_resolver = secret_resolver 
self._model_provider_registry = model_provider_registry - self._model_configs = {} + self._model_facade_factory = model_facade_factory + self._model_configs: dict[str, ModelConfig] = {} self._models: dict[str, ModelFacade] = {} self._set_model_configs(model_configs) @@ -136,18 +141,6 @@ class ModelRegistry: # Models are now lazily initialized in get_model() when first requested def _get_model(self, model_config: ModelConfig) -> ModelFacade: - return ModelFacade(model_config, self._secret_resolver, self._model_provider_registry) - - -def create_model_registry( - *, - model_configs: list[ModelConfig] | None = None, - secret_resolver: SecretResolver, - model_provider_registry: ModelProviderRegistry, -) -> ModelRegistry: - apply_litellm_patches() - return ModelRegistry( - model_configs=model_configs, - secret_resolver=secret_resolver, - model_provider_registry=model_provider_registry, - ) + if self._model_facade_factory is None: + raise RuntimeError("ModelRegistry was not initialized with a model_facade_factory") + return self._model_facade_factory(model_config, self._secret_resolver, self._model_provider_registry) diff --git a/src/data_designer/engine/models/telemetry.py b/src/data_designer/engine/models/telemetry.py index 3da3dc89..93956612 100644 --- a/src/data_designer/engine/models/telemetry.py +++ b/src/data_designer/engine/models/telemetry.py @@ -18,11 +18,15 @@ import platform from dataclasses import dataclass from datetime import datetime, timezone from enum import Enum -from typing import Any, ClassVar +from typing import TYPE_CHECKING, Any, ClassVar -import httpx from pydantic import BaseModel, Field +from data_designer.lazy_heavy_imports import httpx + +if TYPE_CHECKING: + import httpx + TELEMETRY_ENABLED = os.getenv("NEMO_TELEMETRY_ENABLED", "true").lower() in ("1", "true", "yes") CLIENT_ID = "184482118588404" NEMO_TELEMETRY_VERSION = "nemo-telemetry/1.0" diff --git a/src/data_designer/engine/processing/ginja/ast.py 
b/src/data_designer/engine/processing/ginja/ast.py index 5cd95f63..42cea3f2 100644 --- a/src/data_designer/engine/processing/ginja/ast.py +++ b/src/data_designer/engine/processing/ginja/ast.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from collections import deque from jinja2 import nodes as j_nodes diff --git a/src/data_designer/engine/processing/ginja/environment.py b/src/data_designer/engine/processing/ginja/environment.py index ea1e2c1f..60987405 100644 --- a/src/data_designer/engine/processing/ginja/environment.py +++ b/src/data_designer/engine/processing/ginja/environment.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import re from collections.abc import Callable from functools import partial, wraps diff --git a/src/data_designer/engine/processing/ginja/exceptions.py b/src/data_designer/engine/processing/ginja/exceptions.py index ec55110d..1c56621e 100644 --- a/src/data_designer/engine/processing/ginja/exceptions.py +++ b/src/data_designer/engine/processing/ginja/exceptions.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import re from jinja2 import TemplateAssertionError diff --git a/src/data_designer/engine/processing/ginja/record.py b/src/data_designer/engine/processing/ginja/record.py index c860a4db..3304bbe6 100644 --- a/src/data_designer/engine/processing/ginja/record.py +++ b/src/data_designer/engine/processing/ginja/record.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import json from data_designer.config.utils.io_helpers import serialize_data diff --git a/src/data_designer/engine/processing/gsonschema/exceptions.py b/src/data_designer/engine/processing/gsonschema/exceptions.py index 719a6ec4..bd534b85 100644 --- a/src/data_designer/engine/processing/gsonschema/exceptions.py +++ b/src/data_designer/engine/processing/gsonschema/exceptions.py @@ -1,8 +1,15 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -from jsonschema import ValidationError +from __future__ import annotations + +from typing import TYPE_CHECKING + +from data_designer.lazy_heavy_imports import jsonschema + +if TYPE_CHECKING: + import jsonschema -class JSONSchemaValidationError(ValidationError): +class JSONSchemaValidationError(jsonschema.ValidationError): """Alias of ValidationError to ease imports.""" diff --git a/src/data_designer/engine/processing/gsonschema/schema_transformers.py b/src/data_designer/engine/processing/gsonschema/schema_transformers.py index bbff4a79..c57b487b 100644 --- a/src/data_designer/engine/processing/gsonschema/schema_transformers.py +++ b/src/data_designer/engine/processing/gsonschema/schema_transformers.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from copy import deepcopy from typing import Any diff --git a/src/data_designer/engine/processing/gsonschema/types.py b/src/data_designer/engine/processing/gsonschema/types.py index 77699a1e..8dacc600 100644 --- a/src/data_designer/engine/processing/gsonschema/types.py +++ b/src/data_designer/engine/processing/gsonschema/types.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from typing import Any, TypeVar T_primitive = TypeVar("T_primitive", str, int, float, bool) diff --git a/src/data_designer/engine/processing/gsonschema/validators.py b/src/data_designer/engine/processing/gsonschema/validators.py index 732cfa49..b3e71681 100644 --- a/src/data_designer/engine/processing/gsonschema/validators.py +++ b/src/data_designer/engine/processing/gsonschema/validators.py @@ -1,19 +1,23 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import logging import re from copy import deepcopy from decimal import ROUND_HALF_UP, Decimal -from typing import Any, overload - -from jsonschema import Draft202012Validator, ValidationError, validators +from typing import TYPE_CHECKING, Any, overload from data_designer.engine.processing.gsonschema.exceptions import JSONSchemaValidationError from data_designer.engine.processing.gsonschema.schema_transformers import forbid_additional_properties from data_designer.engine.processing.gsonschema.types import DataObjectT, JSONSchemaT, T_primitive +from data_designer.lazy_heavy_imports import jsonschema -DEFAULT_JSONSCHEMA_VALIDATOR = Draft202012Validator +if TYPE_CHECKING: + import jsonschema + +DEFAULT_JSONSCHEMA_VALIDATOR = jsonschema.Draft202012Validator logger = logging.getLogger(__name__) @@ -69,7 +73,7 @@ def extend_jsonschema_validator_with_pruning(validator): Type[jsonschema.Validator]: A validator class that will prune extra fields. 
""" - return validators.extend(validator, {"additionalProperties": prune_additional_properties}) + return jsonschema.validators.extend(validator, {"additionalProperties": prune_additional_properties}) def _get_decimal_info_from_anyof(schema: dict) -> tuple[bool, int | None]: @@ -190,7 +194,7 @@ def validate( try: validator(schema).validate(final_object) - except ValidationError as exc: + except jsonschema.ValidationError as exc: raise JSONSchemaValidationError(str(exc)) from exc final_object = normalize_decimal_fields(final_object, schema) diff --git a/src/data_designer/engine/processing/processors/drop_columns.py b/src/data_designer/engine/processing/processors/drop_columns.py index 2668ead6..98369a6b 100644 --- a/src/data_designer/engine/processing/processors/drop_columns.py +++ b/src/data_designer/engine/processing/processors/drop_columns.py @@ -1,13 +1,18 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -import logging +from __future__ import annotations -import pandas as pd +import logging +from typing import TYPE_CHECKING from data_designer.config.processors import DropColumnsProcessorConfig from data_designer.engine.dataset_builders.artifact_storage import BatchStage from data_designer.engine.processing.processors.base import Processor +from data_designer.lazy_heavy_imports import pd + +if TYPE_CHECKING: + import pandas as pd logger = logging.getLogger(__name__) diff --git a/src/data_designer/engine/processing/processors/registry.py b/src/data_designer/engine/processing/processors/registry.py index 6e31cef0..9a9b463e 100644 --- a/src/data_designer/engine/processing/processors/registry.py +++ b/src/data_designer/engine/processing/processors/registry.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.config.base import ConfigBase from data_designer.config.processors import ( DropColumnsProcessorConfig, diff --git a/src/data_designer/engine/processing/processors/schema_transform.py b/src/data_designer/engine/processing/processors/schema_transform.py index 52d44fff..193244ea 100644 --- a/src/data_designer/engine/processing/processors/schema_transform.py +++ b/src/data_designer/engine/processing/processors/schema_transform.py @@ -1,16 +1,21 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import json import logging - -import pandas as pd +from typing import TYPE_CHECKING from data_designer.config.processors import SchemaTransformProcessorConfig from data_designer.engine.dataset_builders.artifact_storage import BatchStage from data_designer.engine.processing.ginja.environment import WithJinja2UserTemplateRendering from data_designer.engine.processing.processors.base import Processor from data_designer.engine.processing.utils import deserialize_json_values +from data_designer.lazy_heavy_imports import pd + +if TYPE_CHECKING: + import pandas as pd logger = logging.getLogger(__name__) diff --git a/src/data_designer/engine/processing/utils.py b/src/data_designer/engine/processing/utils.py index d205e9c6..24ca2e81 100644 --- a/src/data_designer/engine/processing/utils.py +++ b/src/data_designer/engine/processing/utils.py @@ -1,13 +1,18 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import ast import json import logging import re -from typing import Any, TypeVar, overload +from typing import TYPE_CHECKING, Any, TypeVar, overload -import pandas as pd +from data_designer.lazy_heavy_imports import pd + +if TYPE_CHECKING: + import pandas as pd logger = logging.getLogger(__name__) @@ -52,7 +57,6 @@ def deserialize_json_values(data): - Dictionary (potentially with nested JSON strings to deserialize) - Some other object that can't be deserialized. - Returns: Deserialized data in the corresponding format: - Dictionary (when input is a single string) diff --git a/src/data_designer/engine/registry/base.py b/src/data_designer/engine/registry/base.py index a2e0b75b..837851f4 100644 --- a/src/data_designer/engine/registry/base.py +++ b/src/data_designer/engine/registry/base.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import threading from typing import Any, Generic, TypeVar diff --git a/src/data_designer/engine/registry/data_designer_registry.py b/src/data_designer/engine/registry/data_designer_registry.py index 0420a14c..8b361233 100644 --- a/src/data_designer/engine/registry/data_designer_registry.py +++ b/src/data_designer/engine/registry/data_designer_registry.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.engine.analysis.column_profilers.registry import ( ColumnProfilerRegistry, create_default_column_profiler_registry, diff --git a/src/data_designer/engine/registry/errors.py b/src/data_designer/engine/registry/errors.py index 8b9afab5..017afac1 100644 --- a/src/data_designer/engine/registry/errors.py +++ b/src/data_designer/engine/registry/errors.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.engine.errors import DataDesignerError diff --git a/src/data_designer/engine/resources/managed_dataset_generator.py b/src/data_designer/engine/resources/managed_dataset_generator.py index 83157b2d..d050dd82 100644 --- a/src/data_designer/engine/resources/managed_dataset_generator.py +++ b/src/data_designer/engine/resources/managed_dataset_generator.py @@ -1,11 +1,15 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -from typing import Any +from __future__ import annotations -import pandas as pd +from typing import TYPE_CHECKING, Any from data_designer.engine.resources.managed_dataset_repository import ManagedDatasetRepository +from data_designer.lazy_heavy_imports import pd + +if TYPE_CHECKING: + import pandas as pd class ManagedDatasetGenerator: diff --git a/src/data_designer/engine/resources/managed_dataset_repository.py b/src/data_designer/engine/resources/managed_dataset_repository.py index a42692b1..84fe4058 100644 --- a/src/data_designer/engine/resources/managed_dataset_repository.py +++ b/src/data_designer/engine/resources/managed_dataset_repository.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import logging import tempfile import threading @@ -9,13 +11,15 @@ from abc import ABC, abstractmethod from dataclasses import dataclass from functools import cached_property from pathlib import Path -from typing import Any - -import duckdb -import pandas as pd +from typing import TYPE_CHECKING, Any from data_designer.config.utils.constants import LOCALES_WITH_MANAGED_DATASETS from data_designer.engine.resources.managed_storage import LocalBlobStorageProvider, ManagedBlobStorage +from data_designer.lazy_heavy_imports import duckdb, pd + +if TYPE_CHECKING: + import duckdb + import pandas as pd logger = logging.getLogger(__name__) @@ -52,7 +56,6 @@ class Table: DataCatalog = list[Table] - # For now we hardcode the remote data catalog in code. This make it easier # initialize the data catalog. Eventually we can make this work more # dynamically once this data catalog pattern becomes more widely adopted. diff --git a/src/data_designer/engine/resources/managed_storage.py b/src/data_designer/engine/resources/managed_storage.py index e1ea6d6a..b0a1d75f 100644 --- a/src/data_designer/engine/resources/managed_storage.py +++ b/src/data_designer/engine/resources/managed_storage.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import logging from abc import ABC, abstractmethod from collections.abc import Iterator diff --git a/src/data_designer/engine/resources/resource_provider.py b/src/data_designer/engine/resources/resource_provider.py index a578dfb6..b7184597 100644 --- a/src/data_designer/engine/resources/resource_provider.py +++ b/src/data_designer/engine/resources/resource_provider.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.config.base import ConfigBase from data_designer.config.dataset_metadata import DatasetMetadata from data_designer.config.models import ModelConfig @@ -9,7 +11,8 @@ from data_designer.config.seed_source import SeedSource from data_designer.config.utils.type_helpers import StrEnum from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage from data_designer.engine.model_provider import ModelProviderRegistry -from data_designer.engine.models.registry import ModelRegistry, create_model_registry +from data_designer.engine.models.factory import create_model_registry +from data_designer.engine.models.registry import ModelRegistry from data_designer.engine.resources.managed_storage import ManagedBlobStorage, init_managed_blob_storage from data_designer.engine.resources.seed_reader import SeedReader, SeedReaderRegistry from data_designer.engine.secret_resolver import SecretResolver @@ -51,12 +54,16 @@ def create_resource_provider( seed_dataset_source: SeedSource | None = None, run_config: RunConfig | None = None, ) -> ResourceProvider: + """Factory function for creating a ResourceProvider instance. + This function triggers lazy loading of heavy dependencies like litellm. + """ seed_reader = None if seed_dataset_source: seed_reader = seed_reader_registry.get_reader( seed_dataset_source, secret_resolver, ) + return ResourceProvider( artifact_storage=artifact_storage, model_registry=create_model_registry( diff --git a/src/data_designer/engine/resources/seed_reader.py b/src/data_designer/engine/resources/seed_reader.py index 20db10e6..1539ba2e 100644 --- a/src/data_designer/engine/resources/seed_reader.py +++ b/src/data_designer/engine/resources/seed_reader.py @@ -1,11 +1,12 @@ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from abc import ABC, abstractmethod from collections.abc import Sequence -from typing import Generic, TypeVar, get_args, get_origin +from typing import TYPE_CHECKING, Generic, TypeVar, get_args, get_origin -import duckdb from huggingface_hub import HfFileSystem from typing_extensions import Self @@ -17,6 +18,10 @@ from data_designer.config.seed_source import ( ) from data_designer.engine.secret_resolver import SecretResolver from data_designer.errors import DataDesignerError +from data_designer.lazy_heavy_imports import duckdb + +if TYPE_CHECKING: + import duckdb class SeedReaderError(DataDesignerError): ... diff --git a/src/data_designer/engine/sampling_gen/column.py b/src/data_designer/engine/sampling_gen/column.py index 3a7f647e..92aa63f1 100644 --- a/src/data_designer/engine/sampling_gen/column.py +++ b/src/data_designer/engine/sampling_gen/column.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from typing import Any from pydantic import field_serializer, model_validator diff --git a/src/data_designer/engine/sampling_gen/constraints.py b/src/data_designer/engine/sampling_gen/constraints.py index 15b9d629..0a061e1b 100644 --- a/src/data_designer/engine/sampling_gen/constraints.py +++ b/src/data_designer/engine/sampling_gen/constraints.py @@ -1,10 +1,11 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -from abc import ABC, abstractmethod +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING -import numpy as np -import pandas as pd from numpy.typing import NDArray from data_designer.config.base import ConfigBase @@ -15,6 +16,11 @@ from data_designer.config.sampler_constraints import ( InequalityOperator, ScalarInequalityConstraint, ) +from data_designer.lazy_heavy_imports import np, pd + +if TYPE_CHECKING: + import numpy as np + import pandas as pd class ConstraintChecker(ConfigBase, ABC): diff --git a/src/data_designer/engine/sampling_gen/data_sources/base.py b/src/data_designer/engine/sampling_gen/data_sources/base.py index f01a4f13..3da5495c 100644 --- a/src/data_designer/engine/sampling_gen/data_sources/base.py +++ b/src/data_designer/engine/sampling_gen/data_sources/base.py @@ -1,24 +1,27 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -from abc import ABC, abstractmethod -from typing import Any, Generic, TypeVar +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any, Generic, TypeVar -import numpy as np -import pandas as pd from numpy.typing import NDArray -from scipy import stats from data_designer.config.sampler_params import SamplerParamsT from data_designer.engine.sampling_gen.utils import check_random_state +from data_designer.lazy_heavy_imports import np, pd, scipy + +if TYPE_CHECKING: + import numpy as np + import pandas as pd + import scipy NumpyArray1dT = NDArray[Any] RadomStateT = int | np.random.RandomState - GenericParamsT = TypeVar("GenericParamsT", bound=SamplerParamsT) - ########################################################### # Processing Mixins # ----------------- @@ -208,7 +211,7 @@ class Sampler(DataSource[GenericParamsT], ABC): class ScipyStatsSampler(Sampler[GenericParamsT], ABC): @property @abstractmethod - def distribution(self) -> stats.rv_continuous | stats.rv_discrete: ... + def distribution(self) -> scipy.stats.rv_continuous | scipy.stats.rv_discrete: ... def sample(self, num_samples: int) -> NumpyArray1dT: return self.distribution.rvs(size=num_samples, random_state=self.rng) diff --git a/src/data_designer/engine/sampling_gen/data_sources/errors.py b/src/data_designer/engine/sampling_gen/data_sources/errors.py index b2753f3a..0be5b9da 100644 --- a/src/data_designer/engine/sampling_gen/data_sources/errors.py +++ b/src/data_designer/engine/sampling_gen/data_sources/errors.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.engine.sampling_gen.errors import SamplingGenError diff --git a/src/data_designer/engine/sampling_gen/data_sources/sources.py b/src/data_designer/engine/sampling_gen/data_sources/sources.py index 616be249..378a1097 100644 --- a/src/data_designer/engine/sampling_gen/data_sources/sources.py +++ b/src/data_designer/engine/sampling_gen/data_sources/sources.py @@ -1,11 +1,10 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -import uuid +from __future__ import annotations -import numpy as np -import pandas as pd -from scipy import stats +import uuid +from typing import TYPE_CHECKING from data_designer.config.sampler_params import ( BernoulliMixtureSamplerParams, @@ -40,6 +39,12 @@ from data_designer.engine.sampling_gen.data_sources.errors import ( ) from data_designer.engine.sampling_gen.entities.dataset_based_person_fields import PERSONA_FIELDS, PII_FIELDS from data_designer.engine.sampling_gen.people_gen import PeopleGen +from data_designer.lazy_heavy_imports import np, pd, scipy + +if TYPE_CHECKING: + import numpy as np + import pandas as pd + import scipy ONE_BILLION = 10**9 @@ -264,8 +269,8 @@ class ScipySampler(TypeConversionMixin, ScipyStatsSampler[ScipySamplerParams]): """Escape hatch sampler to give users access to any scipy.stats distribution.""" @property - def distribution(self) -> stats.rv_continuous | stats.rv_discrete: - return getattr(stats, self.params.dist_name)(**self.params.dist_params) + def distribution(self) -> scipy.stats.rv_continuous | scipy.stats.rv_discrete: + return getattr(scipy.stats, self.params.dist_name)(**self.params.dist_params) def _validate(self) -> None: _validate_scipy_distribution(self.params.dist_name, self.params.dist_params) @@ -274,16 +279,16 @@ class ScipySampler(TypeConversionMixin, ScipyStatsSampler[ScipySamplerParams]): 
@SamplerRegistry.register(SamplerType.BERNOULLI) class BernoulliSampler(TypeConversionMixin, ScipyStatsSampler[BernoulliSamplerParams]): @property - def distribution(self) -> stats.rv_discrete: - return stats.bernoulli(p=self.params.p) + def distribution(self) -> scipy.stats.rv_discrete: + return scipy.stats.bernoulli(p=self.params.p) @SamplerRegistry.register(SamplerType.BERNOULLI_MIXTURE) class BernoulliMixtureSampler(TypeConversionMixin, Sampler[BernoulliMixtureSamplerParams]): def sample(self, num_samples: int) -> NumpyArray1dT: - return stats.bernoulli(p=self.params.p).rvs(size=num_samples) * getattr(stats, self.params.dist_name)( - **self.params.dist_params - ).rvs(size=num_samples) + return scipy.stats.bernoulli(p=self.params.p).rvs(size=num_samples) * getattr( + scipy.stats, self.params.dist_name + )(**self.params.dist_params).rvs(size=num_samples) def _validate(self) -> None: _validate_scipy_distribution(self.params.dist_name, self.params.dist_params) @@ -292,29 +297,29 @@ class BernoulliMixtureSampler(TypeConversionMixin, Sampler[BernoulliMixtureSampl @SamplerRegistry.register(SamplerType.BINOMIAL) class BinomialSampler(TypeConversionMixin, ScipyStatsSampler[BinomialSamplerParams]): @property - def distribution(self) -> stats.rv_discrete: - return stats.binom(n=self.params.n, p=self.params.p) + def distribution(self) -> scipy.stats.rv_discrete: + return scipy.stats.binom(n=self.params.n, p=self.params.p) @SamplerRegistry.register(SamplerType.GAUSSIAN) class GaussianSampler(TypeConversionMixin, ScipyStatsSampler[GaussianSamplerParams]): @property - def distribution(self) -> stats.rv_continuous: - return stats.norm(loc=self.params.mean, scale=self.params.stddev) + def distribution(self) -> scipy.stats.rv_continuous: + return scipy.stats.norm(loc=self.params.mean, scale=self.params.stddev) @SamplerRegistry.register(SamplerType.POISSON) class PoissonSampler(TypeConversionMixin, ScipyStatsSampler[PoissonSamplerParams]): @property - def distribution(self) -> 
stats.rv_discrete: - return stats.poisson(mu=self.params.mean) + def distribution(self) -> scipy.stats.rv_discrete: + return scipy.stats.poisson(mu=self.params.mean) @SamplerRegistry.register(SamplerType.UNIFORM) class UniformSampler(TypeConversionMixin, ScipyStatsSampler[UniformSamplerParams]): @property - def distribution(self) -> stats.rv_continuous: - return stats.uniform(loc=self.params.low, scale=self.params.high - self.params.low) + def distribution(self) -> scipy.stats.rv_continuous: + return scipy.stats.uniform(loc=self.params.low, scale=self.params.high - self.params.low) ################################################### @@ -328,14 +333,14 @@ def load_sampler(sampler_type: SamplerType, **params) -> DataSource: def _validate_scipy_distribution(dist_name: str, dist_params: dict) -> None: - if not hasattr(stats, dist_name): + if not hasattr(scipy.stats, dist_name): raise InvalidSamplerParamsError(f"Distribution {dist_name} not found in scipy.stats") - if not hasattr(getattr(stats, dist_name), "rvs"): + if not hasattr(getattr(scipy.stats, dist_name), "rvs"): raise InvalidSamplerParamsError( f"Distribution {dist_name} does not have a `rvs` method, which is required for sampling." ) try: - getattr(stats, dist_name)(**dist_params) + getattr(scipy.stats, dist_name)(**dist_params) except Exception: raise InvalidSamplerParamsError( f"Distribution parameters {dist_params} are not a valid for distribution '{dist_name}'" diff --git a/src/data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py b/src/data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py index e6346337..ec0d5683 100644 --- a/src/data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +++ b/src/data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py @@ -10,8 +10,9 @@ This file contains all possible fields that: Do not add any other code or logic in this file. 
""" -REQUIRED_FIELDS = {"first_name", "last_name", "age", "locale"} +from __future__ import annotations +REQUIRED_FIELDS = {"first_name", "last_name", "age", "locale"} PII_FIELDS = [ # Core demographic fields @@ -52,7 +53,6 @@ PII_FIELDS = [ "third_language", ] - PERSONA_FIELDS = [ # Core persona fields "persona", diff --git a/src/data_designer/engine/sampling_gen/entities/email_address_utils.py b/src/data_designer/engine/sampling_gen/entities/email_address_utils.py index 0f716de3..d22c5069 100644 --- a/src/data_designer/engine/sampling_gen/entities/email_address_utils.py +++ b/src/data_designer/engine/sampling_gen/entities/email_address_utils.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import random import re from datetime import date diff --git a/src/data_designer/engine/sampling_gen/entities/errors.py b/src/data_designer/engine/sampling_gen/entities/errors.py index 6c592871..3fe3d609 100644 --- a/src/data_designer/engine/sampling_gen/entities/errors.py +++ b/src/data_designer/engine/sampling_gen/entities/errors.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.errors import DataDesignerError diff --git a/src/data_designer/engine/sampling_gen/entities/national_id_utils.py b/src/data_designer/engine/sampling_gen/entities/national_id_utils.py index ea24cc8a..1cc0a20f 100644 --- a/src/data_designer/engine/sampling_gen/entities/national_id_utils.py +++ b/src/data_designer/engine/sampling_gen/entities/national_id_utils.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import random from datetime import date diff --git a/src/data_designer/engine/sampling_gen/entities/person.py b/src/data_designer/engine/sampling_gen/entities/person.py index c7ea14cf..a51b6cca 100644 --- a/src/data_designer/engine/sampling_gen/entities/person.py +++ b/src/data_designer/engine/sampling_gen/entities/person.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import random from datetime import date, timedelta from typing import Any, Literal, TypeAlias diff --git a/src/data_designer/engine/sampling_gen/entities/phone_number.py b/src/data_designer/engine/sampling_gen/entities/phone_number.py index db847ef9..cd7691ac 100644 --- a/src/data_designer/engine/sampling_gen/entities/phone_number.py +++ b/src/data_designer/engine/sampling_gen/entities/phone_number.py @@ -1,12 +1,19 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import random from pathlib import Path +from typing import TYPE_CHECKING -import pandas as pd from pydantic import BaseModel, Field, field_validator +from data_designer.lazy_heavy_imports import pd + +if TYPE_CHECKING: + import pandas as pd + ZIP_AREA_CODE_DATA = pd.read_parquet(Path(__file__).parent / "assets" / "zip_area_code_map.parquet") ZIPCODE_AREA_CODE_MAP = dict(zip(ZIP_AREA_CODE_DATA["zipcode"], ZIP_AREA_CODE_DATA["area_code"])) ZIPCODE_POPULATION_MAP = dict(zip(ZIP_AREA_CODE_DATA["zipcode"], ZIP_AREA_CODE_DATA["count"])) diff --git a/src/data_designer/engine/sampling_gen/errors.py b/src/data_designer/engine/sampling_gen/errors.py index 6031d0f4..78672b6e 100644 --- a/src/data_designer/engine/sampling_gen/errors.py +++ b/src/data_designer/engine/sampling_gen/errors.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.engine.errors import DataDesignerError diff --git a/src/data_designer/engine/sampling_gen/generator.py b/src/data_designer/engine/sampling_gen/generator.py index 6b2fb0fb..e0e925d1 100644 --- a/src/data_designer/engine/sampling_gen/generator.py +++ b/src/data_designer/engine/sampling_gen/generator.py @@ -6,18 +6,19 @@ from __future__ import annotations from collections.abc import Callable from typing import TYPE_CHECKING -import networkx as nx -import numpy as np -import pandas as pd - from data_designer.engine.sampling_gen.data_sources.base import RadomStateT from data_designer.engine.sampling_gen.errors import RejectionSamplingError from data_designer.engine.sampling_gen.jinja_utils import JinjaDataFrame from data_designer.engine.sampling_gen.people_gen import create_people_gen_resource from data_designer.engine.sampling_gen.schema import DataSchema from data_designer.engine.sampling_gen.utils import 
check_random_state +from data_designer.lazy_heavy_imports import np, nx, pd if TYPE_CHECKING: + import networkx as nx + import numpy as np + import pandas as pd + from data_designer.engine.dataset_builders.multi_column_configs import SamplerMultiColumnConfig from data_designer.engine.resources.managed_dataset_generator import ManagedDatasetGenerator from data_designer.engine.sampling_gen.column import ConditionalDataColumn diff --git a/src/data_designer/engine/sampling_gen/jinja_utils.py b/src/data_designer/engine/sampling_gen/jinja_utils.py index 0c9a1cd3..ac81446f 100644 --- a/src/data_designer/engine/sampling_gen/jinja_utils.py +++ b/src/data_designer/engine/sampling_gen/jinja_utils.py @@ -1,15 +1,19 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -import ast -from typing import Any +from __future__ import annotations -import pandas as pd +import ast +from typing import TYPE_CHECKING, Any from data_designer.engine.processing.ginja.environment import ( UserTemplateSandboxEnvironment, WithJinja2UserTemplateRendering, ) +from data_designer.lazy_heavy_imports import pd + +if TYPE_CHECKING: + import pandas as pd class JinjaDataFrame(WithJinja2UserTemplateRendering): diff --git a/src/data_designer/engine/sampling_gen/people_gen.py b/src/data_designer/engine/sampling_gen/people_gen.py index 4a1fcb0d..885b2562 100644 --- a/src/data_designer/engine/sampling_gen/people_gen.py +++ b/src/data_designer/engine/sampling_gen/people_gen.py @@ -10,9 +10,6 @@ from collections.abc import Callable from copy import deepcopy from typing import TYPE_CHECKING, Any, TypeAlias -import pandas as pd -from faker import Faker - from data_designer.config.utils.constants import DEFAULT_AGE_RANGE from data_designer.engine.resources.managed_dataset_generator import ManagedDatasetGenerator from data_designer.engine.sampling_gen.entities.dataset_based_person_fields import PERSONA_FIELDS, PII_FIELDS @@ 
-22,12 +19,15 @@ from data_designer.engine.sampling_gen.entities.person import ( ) from data_designer.engine.sampling_gen.errors import ManagedDatasetGeneratorError from data_designer.engine.sampling_gen.person_constants import faker_constants +from data_designer.lazy_heavy_imports import faker, pd if TYPE_CHECKING: + import faker + import pandas as pd + from data_designer.engine.sampling_gen.schema import DataSchema - -EngineT: TypeAlias = Faker | ManagedDatasetGenerator +EngineT: TypeAlias = faker.Faker | ManagedDatasetGenerator class PeopleGen(ABC): @@ -46,7 +46,7 @@ class PeopleGen(ABC): class PeopleGenFaker(PeopleGen): @property - def _fake(self) -> Faker: + def _fake(self) -> faker.Faker: return self._engine def try_fake_else_none(self, attr_name: str, none_fill: Any | None = None) -> type: @@ -193,7 +193,7 @@ def create_people_gen_resource( for params in [column.params, *list(column.conditional_params.values())]: if params.people_gen_key not in people_gen_resource: people_gen_resource[params.people_gen_key] = PeopleGenFaker( - engine=Faker(params.locale), locale=params.locale + engine=faker.Faker(params.locale), locale=params.locale ) return people_gen_resource diff --git a/src/data_designer/engine/sampling_gen/person_constants.py b/src/data_designer/engine/sampling_gen/person_constants.py index 73f30f00..86633a0d 100644 --- a/src/data_designer/engine/sampling_gen/person_constants.py +++ b/src/data_designer/engine/sampling_gen/person_constants.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from typing import NamedTuple diff --git a/src/data_designer/engine/sampling_gen/schema.py b/src/data_designer/engine/sampling_gen/schema.py index 4c09aed5..9f254aea 100644 --- a/src/data_designer/engine/sampling_gen/schema.py +++ b/src/data_designer/engine/sampling_gen/schema.py @@ -4,8 +4,8 @@ from __future__ import annotations from functools import cached_property +from typing import TYPE_CHECKING -import networkx as nx from pydantic import BaseModel, Field, field_validator, model_validator from typing_extensions import Self @@ -14,6 +14,10 @@ from data_designer.config.sampler_constraints import ColumnConstraintT from data_designer.config.sampler_params import SamplerType from data_designer.engine.sampling_gen.column import ConditionalDataColumn from data_designer.engine.sampling_gen.constraints import ConstraintChecker, get_constraint_checker +from data_designer.lazy_heavy_imports import nx + +if TYPE_CHECKING: + import networkx as nx class Dag(BaseModel): diff --git a/src/data_designer/engine/sampling_gen/schema_builder.py b/src/data_designer/engine/sampling_gen/schema_builder.py index aaea4d91..65416cd5 100644 --- a/src/data_designer/engine/sampling_gen/schema_builder.py +++ b/src/data_designer/engine/sampling_gen/schema_builder.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from copy import deepcopy from data_designer.config.column_configs import SamplerColumnConfig diff --git a/src/data_designer/engine/sampling_gen/utils.py b/src/data_designer/engine/sampling_gen/utils.py index ec403733..2020d854 100644 --- a/src/data_designer/engine/sampling_gen/utils.py +++ b/src/data_designer/engine/sampling_gen/utils.py @@ -1,9 +1,15 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -import numbers +from __future__ import annotations -import numpy as np +import numbers +from typing import TYPE_CHECKING + +from data_designer.lazy_heavy_imports import np + +if TYPE_CHECKING: + import numpy as np def check_random_state(seed): diff --git a/src/data_designer/engine/secret_resolver.py b/src/data_designer/engine/secret_resolver.py index e38eadaf..5d996f52 100644 --- a/src/data_designer/engine/secret_resolver.py +++ b/src/data_designer/engine/secret_resolver.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import json import logging import os diff --git a/src/data_designer/engine/validators/__init__.py b/src/data_designer/engine/validators/__init__.py index e8bb01b5..cbf4c0d1 100644 --- a/src/data_designer/engine/validators/__init__.py +++ b/src/data_designer/engine/validators/__init__.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.engine.validators.base import BaseValidator, ValidationResult from data_designer.engine.validators.local_callable import LocalCallableValidator from data_designer.engine.validators.python import PythonValidator diff --git a/src/data_designer/engine/validators/base.py b/src/data_designer/engine/validators/base.py index e7ce2747..0cabb066 100644 --- a/src/data_designer/engine/validators/base.py +++ b/src/data_designer/engine/validators/base.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from abc import ABC, abstractmethod from typing import Iterator diff --git a/src/data_designer/engine/validators/local_callable.py b/src/data_designer/engine/validators/local_callable.py index 55f43e13..eddcc52d 100644 --- a/src/data_designer/engine/validators/local_callable.py +++ b/src/data_designer/engine/validators/local_callable.py @@ -1,14 +1,19 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -import logging +from __future__ import annotations -import pandas as pd +import logging +from typing import TYPE_CHECKING from data_designer.config.validator_params import LocalCallableValidatorParams from data_designer.engine.errors import LocalCallableValidationError from data_designer.engine.processing.gsonschema.validators import validate from data_designer.engine.validators.base import BaseValidator, ValidationOutput, ValidationResult +from data_designer.lazy_heavy_imports import pd + +if TYPE_CHECKING: + import pandas as pd logger = logging.getLogger(__name__) diff --git a/src/data_designer/engine/validators/python.py b/src/data_designer/engine/validators/python.py index d199102d..0f92a880 100644 --- a/src/data_designer/engine/validators/python.py +++ b/src/data_designer/engine/validators/python.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import ast import json import logging @@ -8,14 +10,18 @@ import subprocess import tempfile from collections import defaultdict from pathlib import Path +from typing import TYPE_CHECKING from uuid import uuid4 -import pandas as pd from pydantic import BaseModel from ruff.__main__ import find_ruff_bin from data_designer.config.validator_params import CodeValidatorParams from data_designer.engine.validators.base import BaseValidator, ValidationOutput, ValidationResult +from data_designer.lazy_heavy_imports import pd + +if TYPE_CHECKING: + import pandas as pd logger = logging.getLogger(__name__) diff --git a/src/data_designer/engine/validators/remote.py b/src/data_designer/engine/validators/remote.py index bded2552..d03fd94c 100644 --- a/src/data_designer/engine/validators/remote.py +++ b/src/data_designer/engine/validators/remote.py @@ -1,9 +1,11 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -import logging +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING -import httpx from httpx_retries import Retry, RetryTransport from data_designer.config.validator_params import RemoteValidatorParams @@ -11,6 +13,10 @@ from data_designer.engine.errors import RemoteValidationSchemaError from data_designer.engine.processing.gsonschema.exceptions import JSONSchemaValidationError from data_designer.engine.processing.gsonschema.validators import validate from data_designer.engine.validators.base import BaseValidator, ValidationResult +from data_designer.lazy_heavy_imports import httpx + +if TYPE_CHECKING: + import httpx logger = logging.getLogger(__name__) diff --git a/src/data_designer/engine/validators/sql.py b/src/data_designer/engine/validators/sql.py index fc1cd9c0..4e2c2522 100644 --- a/src/data_designer/engine/validators/sql.py +++ b/src/data_designer/engine/validators/sql.py @@ -1,15 +1,20 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import logging import re - -import pandas as pd -import sqlfluff +from typing import TYPE_CHECKING from data_designer.config.utils.code_lang import CodeLang from data_designer.config.validator_params import CodeValidatorParams from data_designer.engine.validators.base import BaseValidator, ValidationOutput, ValidationResult +from data_designer.lazy_heavy_imports import pd, sqlfluff + +if TYPE_CHECKING: + import pandas as pd + import sqlfluff sqlfluff_logger = logging.getLogger("sqlfluff") sqlfluff_logger.setLevel(logging.WARNING) diff --git a/src/data_designer/errors.py b/src/data_designer/errors.py index 4e1ce4a8..f6cc07af 100644 --- a/src/data_designer/errors.py +++ b/src/data_designer/errors.py @@ -1,5 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + class DataDesignerError(Exception): ... diff --git a/src/data_designer/essentials/__init__.py b/src/data_designer/essentials/__init__.py index 057e7827..084966ad 100644 --- a/src/data_designer/essentials/__init__.py +++ b/src/data_designer/essentials/__init__.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.config.default_model_settings import resolve_seed_default_model_settings from data_designer.config.exports import * # noqa: F403 from data_designer.config.run_config import RunConfig diff --git a/src/data_designer/interface/data_designer.py b/src/data_designer/interface/data_designer.py index b466356a..3155336f 100644 --- a/src/data_designer/interface/data_designer.py +++ b/src/data_designer/interface/data_designer.py @@ -1,10 +1,11 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import logging from pathlib import Path - -import pandas as pd +from typing import TYPE_CHECKING from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults from data_designer.config.config_builder import DataDesignerConfigBuilder @@ -29,10 +30,7 @@ from data_designer.config.utils.constants import ( PREDEFINED_PROVIDERS, ) from data_designer.config.utils.info import InfoType, InterfaceInfo -from data_designer.engine.analysis.dataset_profiler import ( - DataDesignerDatasetProfiler, - DatasetProfilerConfig, -) +from data_designer.engine.analysis.dataset_profiler import DataDesignerDatasetProfiler, DatasetProfilerConfig from data_designer.engine.compiler import compile_data_designer_config from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage from data_designer.engine.dataset_builders.column_wise_builder import ColumnWiseDatasetBuilder @@ -58,10 +56,17 @@ from data_designer.interface.errors import ( DataDesignerProfilingError, ) from data_designer.interface.results import DatasetCreationResults +from data_designer.lazy_heavy_imports import pd from data_designer.logging import RandomEmoji from data_designer.plugins.plugin import PluginType from data_designer.plugins.registry import PluginRegistry +if TYPE_CHECKING: + import pandas as pd + +logger = logging.getLogger(__name__) + + DEFAULT_SECRET_RESOLVER = CompositeResolver([EnvironmentResolver(), PlaintextResolver()]) DEFAULT_SEED_READERS = [ @@ -72,8 +77,6 @@ DEFAULT_SEED_READERS = [ for plugin in PluginRegistry().get_plugins(PluginType.SEED_READER): DEFAULT_SEED_READERS.append(plugin.impl_cls()) -logger = logging.getLogger(__name__) - class DataDesigner(DataDesignerInterface[DatasetCreationResults]): """Main interface for creating datasets with Data Designer. 
diff --git a/src/data_designer/interface/errors.py b/src/data_designer/interface/errors.py index d8647e56..1e5f5050 100644 --- a/src/data_designer/interface/errors.py +++ b/src/data_designer/interface/errors.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.errors import DataDesignerError diff --git a/src/data_designer/interface/results.py b/src/data_designer/interface/results.py index 3954ae07..b9467c58 100644 --- a/src/data_designer/interface/results.py +++ b/src/data_designer/interface/results.py @@ -4,8 +4,7 @@ from __future__ import annotations from pathlib import Path - -import pandas as pd +from typing import TYPE_CHECKING from data_designer.config.analysis.dataset_profiler import DatasetProfilerResults from data_designer.config.config_builder import DataDesignerConfigBuilder @@ -13,6 +12,10 @@ from data_designer.config.dataset_metadata import DatasetMetadata from data_designer.config.utils.visualization import WithRecordSamplerMixin from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage from data_designer.engine.dataset_builders.errors import ArtifactStorageError +from data_designer.lazy_heavy_imports import pd + +if TYPE_CHECKING: + import pandas as pd class DatasetCreationResults(WithRecordSamplerMixin): diff --git a/src/data_designer/lazy_heavy_imports.py b/src/data_designer/lazy_heavy_imports.py new file mode 100644 index 00000000..f2755332 --- /dev/null +++ b/src/data_designer/lazy_heavy_imports.py @@ -0,0 +1,54 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Lazy imports facade for heavy third-party dependencies. 
+ +This module provides a centralized facade that lazily imports heavy dependencies +only when accessed, significantly improving import performance. + +Usage: + from data_designer.lazy_heavy_imports import pd, np, faker, litellm + + df = pd.DataFrame(...) + arr = np.array([1, 2, 3]) + fake = faker.Faker() +""" + +from __future__ import annotations + +import importlib + +# Mapping of lazy import names to their actual module paths +_LAZY_IMPORTS = { + "pd": "pandas", + "np": "numpy", + "pq": "pyarrow.parquet", + "pa": "pyarrow", + "faker": "faker", + "litellm": "litellm", + "sqlfluff": "sqlfluff", + "httpx": "httpx", + "duckdb": "duckdb", + "nx": "networkx", + "scipy": "scipy", + "jsonschema": "jsonschema", +} + + +def __getattr__(name: str) -> object: + """Lazily import heavy third-party dependencies when accessed. + + This allows fast imports of data_designer while deferring loading of heavy + libraries until they're actually needed. + """ + if name in _LAZY_IMPORTS: + module_name = _LAZY_IMPORTS[name] + return importlib.import_module(module_name) + + raise AttributeError(f"module 'data_designer.lazy_heavy_imports' has no attribute {name!r}") + + +def __dir__() -> list[str]: + """Return list of available lazy imports.""" + return list(_LAZY_IMPORTS.keys()) diff --git a/src/data_designer/logging.py b/src/data_designer/logging.py index 7a432679..ba203d7a 100644 --- a/src/data_designer/logging.py +++ b/src/data_designer/logging.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import logging import random import sys diff --git a/src/data_designer/plugins/__init__.py b/src/data_designer/plugins/__init__.py index 3b8af700..3b2fae19 100644 --- a/src/data_designer/plugins/__init__.py +++ b/src/data_designer/plugins/__init__.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.plugins.plugin import Plugin, PluginType __all__ = ["Plugin", "PluginType"] diff --git a/src/data_designer/plugins/errors.py b/src/data_designer/plugins/errors.py index 4a428777..646785b5 100644 --- a/src/data_designer/plugins/errors.py +++ b/src/data_designer/plugins/errors.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.errors import DataDesignerError diff --git a/src/data_designer/plugins/registry.py b/src/data_designer/plugins/registry.py index c2580357..b544e146 100644 --- a/src/data_designer/plugins/registry.py +++ b/src/data_designer/plugins/registry.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import logging import os import threading diff --git a/src/data_designer/plugins/testing/__init__.py b/src/data_designer/plugins/testing/__init__.py index 4d3565b3..5f88d165 100644 --- a/src/data_designer/plugins/testing/__init__.py +++ b/src/data_designer/plugins/testing/__init__.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.plugins.testing.utils import assert_valid_plugin __all__ = [ diff --git a/src/data_designer/plugins/testing/stubs.py b/src/data_designer/plugins/testing/stubs.py index 174d0d2c..2b92467f 100644 --- a/src/data_designer/plugins/testing/stubs.py +++ b/src/data_designer/plugins/testing/stubs.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from typing import Literal from data_designer.config.base import ConfigBase diff --git a/src/data_designer/plugins/testing/utils.py b/src/data_designer/plugins/testing/utils.py index 7b70b64e..453e6cc4 100644 --- a/src/data_designer/plugins/testing/utils.py +++ b/src/data_designer/plugins/testing/utils.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + from data_designer.config.base import ConfigBase from data_designer.engine.configurable_task import ConfigurableTask from data_designer.engine.resources.seed_reader import SeedReader diff --git a/tests/engine/models/conftest.py b/tests/engine/models/conftest.py index 0ebeda05..96540e7a 100644 --- a/tests/engine/models/conftest.py +++ b/tests/engine/models/conftest.py @@ -11,7 +11,8 @@ from data_designer.config.models import ( ModelConfig, ) from data_designer.engine.model_provider import ModelProvider, ModelProviderRegistry -from data_designer.engine.models.registry import ModelRegistry, create_model_registry +from data_designer.engine.models.factory import create_model_registry +from data_designer.engine.models.registry import ModelRegistry from data_designer.engine.secret_resolver import SecretsFileResolver diff --git a/tests/engine/models/test_model_registry.py b/tests/engine/models/test_model_registry.py index 01fb9acc..0e249b3d 100644 --- a/tests/engine/models/test_model_registry.py +++ b/tests/engine/models/test_model_registry.py @@ -8,7 +8,8 @@ import pytest from data_designer.config.models import ChatCompletionInferenceParams, ModelConfig from data_designer.engine.models.errors import ModelAuthenticationError from data_designer.engine.models.facade import ModelFacade -from data_designer.engine.models.registry import ModelRegistry, create_model_registry +from data_designer.engine.models.factory 
import create_model_registry +from data_designer.engine.models.registry import ModelRegistry from data_designer.engine.models.usage import ModelUsageStats, RequestUsageStats, TokenUsageStats @@ -39,7 +40,7 @@ def stub_no_usage_config(): ) -@patch("data_designer.engine.models.registry.apply_litellm_patches", autospec=True) +@patch("data_designer.engine.models.litellm_overrides.apply_litellm_patches", autospec=True) def test_create_model_registry( mock_apply_litellm_patches, stub_model_configs, stub_secrets_resolver, stub_model_provider_registry ): diff --git a/tests/engine/resources/test_resource_provider.py b/tests/engine/resources/test_resource_provider.py index 7a4ec8a2..8bb04c05 100644 --- a/tests/engine/resources/test_resource_provider.py +++ b/tests/engine/resources/test_resource_provider.py @@ -5,10 +5,8 @@ from unittest.mock import Mock, patch import pytest -from data_designer.engine.resources.resource_provider import ( - ResourceProvider, - create_resource_provider, -) +from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage +from data_designer.engine.resources.resource_provider import ResourceProvider, create_resource_provider def test_resource_provider_artifact_storage_required(): @@ -22,8 +20,8 @@ def test_resource_provider_artifact_storage_required(): ("model_registry_creation_error", "Model registry creation failed"), ], ) -def test_create_resource_provider_error_cases(test_case, expected_error): - mock_artifact_storage = Mock() +def test_create_resource_provider_error_cases(test_case, expected_error, tmp_path): + artifact_storage = ArtifactStorage(artifact_path=str(tmp_path), dataset_name="test") mock_model_configs = [Mock(), Mock()] mock_secret_resolver = Mock() mock_model_provider_registry = Mock() @@ -34,7 +32,7 @@ def test_create_resource_provider_error_cases(test_case, expected_error): with pytest.raises(Exception, match=expected_error): create_resource_provider( - artifact_storage=mock_artifact_storage, + 
artifact_storage=artifact_storage,
             model_configs=mock_model_configs,
             secret_resolver=mock_secret_resolver,
             model_provider_registry=mock_model_provider_registry,
diff --git a/tests/test_import_perf.py b/tests/test_import_perf.py
new file mode 100644
index 00000000..abb1f202
--- /dev/null
+++ b/tests/test_import_perf.py
@@ -0,0 +1,79 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import re
+import subprocess
+from pathlib import Path
+
+# Maximum allowed average import time in seconds.
+# Average of 1 cold start + 4 warm cache runs.
+# Cold starts vary 4-13s due to OS caching, system load, CPU scaling.
+# Warm cache consistently <3s. Average should be well under 6s.
+MAX_IMPORT_TIME_SECONDS = 6.0
+# Wall-clock limit, in seconds, for a single `make perf-import` invocation.
+PERF_TEST_TIMEOUT_SECONDS = 15.0
+
+# Matches the summary line in the command output, e.g. " Total: 3.456s".
+_TOTAL_TIME_PATTERN = re.compile(r"Total:\s+([\d.]+)s")
+
+
+def _measure_import_time(project_root: Path, run_number: int, clean: bool) -> float:
+    """Run ``make perf-import`` once and return the total import time it reports, in seconds."""
+    cmd = ["make", "perf-import", "NOFILE=1"]
+    if clean:
+        cmd.append("CLEAN=1")
+
+    result = subprocess.run(
+        cmd,
+        cwd=project_root,
+        capture_output=True,
+        text=True,
+        timeout=PERF_TEST_TIMEOUT_SECONDS,
+    )
+
+    match = _TOTAL_TIME_PATTERN.search(result.stdout)
+    # Include the exit code and stderr: if the command itself failed, stdout
+    # alone may not explain why no timing line was printed.
+    assert match, (
+        f"Could not parse import time from run {run_number} "
+        f"(exit code {result.returncode}):\n"
+        f"stdout:\n{result.stdout}\n"
+        f"stderr:\n{result.stderr}"
+    )
+    return float(match.group(1))
+
+
+def test_import_performance():
+    """Test that the average import time stays below MAX_IMPORT_TIME_SECONDS.
+
+    The average is taken over 5 runs: 1 cold start followed by 4 warm cache runs.
+    """
+    # Get the project root (where Makefile is located)
+    project_root = Path(__file__).parent.parent
+
+    num_runs = 5
+    # Clean cache only on first run (cold start), rest use warm cache
+    import_times = [
+        _measure_import_time(project_root, run_number=run + 1, clean=(run == 0))
+        for run in range(num_runs)
+    ]
+
+    avg_import_time = sum(import_times) / len(import_times)
+    min_import_time = min(import_times)
+    max_import_time = max(import_times)
+    warm_times = ", ".join(f"{t:.3f}s" for t in import_times[1:])
+
+    # Print summary for debugging
+    print("\nImport Performance Summary:")
+    print(f" Runs: {num_runs} (1 cold start + {num_runs - 1} warm cache)")
+    print(f" Cold start (run 1): {import_times[0]:.3f}s")
+    print(f" Warm cache (runs 2-{num_runs}): {warm_times}")
+    print(f" Average: {avg_import_time:.3f}s")
+    print(f" Min: {min_import_time:.3f}s")
+    print(f" Max: {max_import_time:.3f}s")
+
+    # Assert average import time is under threshold
+    assert avg_import_time < MAX_IMPORT_TIME_SECONDS, (
+        f"Average import time {avg_import_time:.3f}s exceeds {MAX_IMPORT_TIME_SECONDS}s threshold "
+        f"(times: {import_times})"
+    )