DataDesigner/packages/data-designer-engine/tests/engine/validators/test_sql.py

# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from unittest.mock import patch

import pytest

from data_designer.config.utils.code_lang import CodeLang
from data_designer.config.validator_params import CodeValidatorParams
from data_designer.engine.validators.sql import SQLValidator


def test_valid_ansi_sql_code() -> None:
    """A well-formed ANSI aggregate query validates with an empty error string."""
    params = CodeValidatorParams(code_lang=CodeLang.SQL_ANSI)
    validator = SQLValidator(params)
    query = "SELECT category, COUNT(*) as total_incidents FROM security_incidents_2 GROUP BY category;"

    outcome = validator.run_validation([{"sql": query}])

    first_record = outcome.data[0]
    assert first_record.is_valid
    assert first_record.error_messages == ""


def test_invalid_ansi_sql_code() -> None:
    """Text that is not SQL is flagged invalid with the expected parse-error message."""
    params = CodeValidatorParams(code_lang=CodeLang.SQL_ANSI)
    validator = SQLValidator(params)
    query = "NOT SQL"
    expected_message = "PRS: Line 1, Position 1: Found unparsable section: 'NOT SQL'"

    outcome = validator.run_validation([{"sql": query}])

    first_record = outcome.data[0]
    assert not first_record.is_valid
    assert first_record.error_messages == expected_message


def test_sql_validator_multi_column_input_raises() -> None:
    """A record with more than one column is rejected with a ValueError."""
    params = CodeValidatorParams(code_lang=CodeLang.SQL_ANSI)
    validator = SQLValidator(params)
    two_column_record = {"sql": "SELECT 1", "extra": "ignored"}

    with pytest.raises(ValueError, match="single column input"):
        validator.run_validation([two_column_record])


def test_sql_validator_decimal_without_scale_fails() -> None:
    """A DECIMAL column declared without a scale is reported as invalid."""
    params = CodeValidatorParams(code_lang=CodeLang.SQL_ANSI)
    validator = SQLValidator(params)
    ddl = "CREATE TABLE example (amount DECIMAL(10));"

    outcome = validator.run_validation([{"sql": ddl}])

    first_record = outcome.data[0]
    assert not first_record.is_valid
    assert "DECIMAL definitions without a scale" in first_record.error_messages


def test_sql_validator_handles_lint_exception() -> None:
    """An exception raised by the patched sqlfluff lint call yields an invalid result.

    The validation call is not wrapped in ``pytest.raises``, so the test also
    requires that the RuntimeError does not escape ``run_validation``.
    """
    params = CodeValidatorParams(code_lang=CodeLang.SQL_ANSI)
    validator = SQLValidator(params)
    lint_target = "data_designer.lazy_heavy_imports.sqlfluff.lint"

    # Only the validation call runs under the patch; assertions happen after it is undone.
    with patch(lint_target, side_effect=RuntimeError("boom")):
        outcome = validator.run_validation([{"sql": "SELECT 1"}])

    first_record = outcome.data[0]
    assert not first_record.is_valid
    assert "Exception during SQL parsing" in first_record.error_messages
chore: add publish script and update license headers (#253) 2026-01-28 13:47:34 +00:00			`# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.`
initial port 2025-10-27 18:29:12 +00:00			`# SPDX-License-Identifier: Apache-2.0`

feat(engine): env-var switch for async-first models experiment (#280) 2026-02-13 22:28:35 +00:00			`from unittest.mock import patch`

			`import pytest`

initial port 2025-10-27 18:29:12 +00:00			`from data_designer.config.utils.code_lang import CodeLang`
			`from data_designer.config.validator_params import CodeValidatorParams`
			`from data_designer.engine.validators.sql import SQLValidator`


feat(engine): env-var switch for async-first models experiment (#280) 2026-02-13 22:28:35 +00:00			`def test_valid_ansi_sql_code() -> None:`
initial port 2025-10-27 18:29:12 +00:00			`sql_validator = SQLValidator(CodeValidatorParams(code_lang=CodeLang.SQL_ANSI))`
			`code = "SELECT category, COUNT(*) as total_incidents FROM security_incidents_2 GROUP BY category;"`
			`result = sql_validator.run_validation([{"sql": code}])`
			`assert result.data[0].is_valid`
			`assert result.data[0].error_messages == ""`


feat(engine): env-var switch for async-first models experiment (#280) 2026-02-13 22:28:35 +00:00			`def test_invalid_ansi_sql_code() -> None:`
initial port 2025-10-27 18:29:12 +00:00			`sql_validator = SQLValidator(CodeValidatorParams(code_lang=CodeLang.SQL_ANSI))`
			`code = "NOT SQL"`
			`result = sql_validator.run_validation([{"sql": code}])`
			`assert not result.data[0].is_valid`
			`assert result.data[0].error_messages == "PRS: Line 1, Position 1: Found unparsable section: 'NOT SQL'"`
feat(engine): env-var switch for async-first models experiment (#280) 2026-02-13 22:28:35 +00:00

			`def test_sql_validator_multi_column_input_raises() -> None:`
			`sql_validator = SQLValidator(CodeValidatorParams(code_lang=CodeLang.SQL_ANSI))`
			`with pytest.raises(ValueError, match="single column input"):`
			`sql_validator.run_validation([{"sql": "SELECT 1", "extra": "ignored"}])`


			`def test_sql_validator_decimal_without_scale_fails() -> None:`
			`sql_validator = SQLValidator(CodeValidatorParams(code_lang=CodeLang.SQL_ANSI))`
			`code = "CREATE TABLE example (amount DECIMAL(10));"`
			`result = sql_validator.run_validation([{"sql": code}])`
			`assert not result.data[0].is_valid`
			`assert "DECIMAL definitions without a scale" in result.data[0].error_messages`


			`def test_sql_validator_handles_lint_exception() -> None:`
			`sql_validator = SQLValidator(CodeValidatorParams(code_lang=CodeLang.SQL_ANSI))`
chore: Improve CLI startup with lazy heavy import cleanup (#330) * perf: defer heavy imports to improve CLI startup time Move expensive imports (engine, models, controllers) out of the module-level import path so that data-designer --help and other non-generation commands no longer pay the full startup cost. Key changes: - Defer controller imports to inside command functions - Remove eager re-export chains from CLI package __init__ files - Move default-settings bootstrap into load_config_builder() and DataDesigner.__init__() instead of running at import time - Add lazy __getattr__ exports in interface/__init__.py - Replace module-level tokenizer init with cached lazy getter - Fix ModelProvider import to use config layer instead of engine - Update test mock paths to match new import locations Reduces CLI import-time from ~1.67s to ~0.46s. * perf: defer pandas/numpy in io_helpers and add config_list benchmark - Replace eager `from lazy_heavy_imports import pd, np` in io_helpers with module-level __getattr__ (for backwards-compatible external access / test mocks) and function-level imports in the 3 functions that actually use them (read_parquet_dataset, smart_load_dataframe, _convert_to_serializable). Importing io_helpers no longer triggers pandas/numpy loading. - Defer heavy imports in list and reset CLI commands into function bodies to avoid loading repositories, Rich, and prompt_toolkit at module import time. - Add `config_list` (data-designer config list) measurement to the CLI startup benchmark with isolated cold measurement in a separate venv and a --skip-config-list-check flag. - Update test mock paths to match new import locations. 
* Refine lazy import usage and TYPE_CHECKING cleanup * Run license header updater on PR-touched files * fix: update sqlfluff mock target for lazy imports in test_sql * perf: cache globals() in lazy __getattr__ to avoid repeated lookups Add globals() caching and explanatory comment to all three lazy __getattr__ implementations (lazy_heavy_imports, config/__init__, interface/__init__) so subsequent attribute accesses bypass __getattr__. * perf: lazy CLI command loading and deferred heavy import evaluations - Add LazyTyperGroup to defer command module loading until invocation, allowing module-level imports in all CLI command files - Split DataFrameSeedSource into seed_source_dataframe.py to isolate pandas dependency from other seed source classes - Move TypeVar/TypeAlias definitions (DataT, NumpyArray1dT, RadomStateT, EngineT) to TYPE_CHECKING blocks with runtime fallbacks - Wrap module-level constants in lru_cache (phone_number parquet data, jsonschema validator) to defer I/O and heavy imports to first use - Update test mock targets to patch at usage-site for module-level imports * refactor: use direct pandas import in seed_source_dataframe Drop lazy-loading for pandas in DataFrameSeedSource; use direct import for simplicity. * update lazy import pattern * update tests to use lazy import namespace Switch test modules to import data_designer.lazy_heavy_imports as lazy and reference heavy libraries through that namespace. This keeps heavy imports deferred during module import and aligns tests with the new lazy-import usage pattern. * tighten import perf test thresholds Document recent baseline timings and lower the allowed average import time and timeout so regressions are detected sooner. * document pandas import requirement Clarify that Pydantic needs DataFrame resolved at module load and that keeping the direct import preserves IDE typing support. 
* increase timeout time * use lazy pandas imports in visualization tests - replace direct pandas usage with lazy.pd in visualization tests to avoid eager imports - add TYPE_CHECKING pandas import and keep CLI controller imports sorted * fix lazy pandas runtime usage and preview mocks Switch sample-record handling to lazy pandas types so runtime paths no longer depend on TYPE_CHECKING imports. Align preview controller tests to patch the module-local DataDesigner symbol, preventing real engine invocation in save results scenarios. 2026-02-18 21:24:15 +00:00			`with patch("data_designer.lazy_heavy_imports.sqlfluff.lint", side_effect=RuntimeError("boom")):`
feat(engine): env-var switch for async-first models experiment (#280) 2026-02-13 22:28:35 +00:00			`result = sql_validator.run_validation([{"sql": "SELECT 1"}])`
			`assert not result.data[0].is_valid`
			`assert "Exception during SQL parsing" in result.data[0].error_messages`