mirror of
https://github.com/open-metadata/OpenMetadata
synced 2026-05-24 09:39:11 +00:00
* refactor(sampler): collapse SamplerInterface to a single config object Replace the 9-parameter constructor/create() signature with a typed SamplerConfig hierarchy (SamplerConfig / DatabaseSamplerConfig / StorageSamplerConfig). Config resolution — partition_details, sample_query, include/exclude columns, sample_config, sample_data_count — now happens in callers (entity_adapters, profiler_source, base_test_suite_source) before construction, so the interface only receives already-resolved values. - Add sampler_config.py with SamplerConfig dataclass hierarchy - Remove database-specific imports from SamplerInterface base class - Move SSL connection setup and column include/exclude filtering to database-family subclasses (SQASampler, PandasSampler, NoSQLSampler) - Simplify BigQuery/Postgres/Snowflake samplers to *args/**kwargs init - Remove StorageSampler.create() override; base create() is sufficient - Update profiler_source and base_test_suite_source to build DatabaseSamplerConfig before calling sampler_class.create() Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * docs(sampler): fix build_sampler_kwargs example to use SamplerConfig The non-database adapter example was showing the old flat kwargs pattern (sample_config, sample_data_count) that SamplerInterface now silently ignores via **__. Replace with the correct "config": SamplerConfig(...) pattern that matches the actual ContainerAdapter implementation. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * fix(sampler): guard BigQuerySampler.tableType access with isinstance check ClassifiableEntityType includes Container which has no tableType. The *args/**kwargs init simplified the constructor but lost the explicit Table type annotation, triggering a basedpyright error. Guard with isinstance(self.entity, Table) so the type checker knows tableType is only accessed on Table entities. 
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * Fix tests * Gitar bot feedback --------- Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
139 lines
5.6 KiB
Python
139 lines
5.6 KiB
Python
# Copyright 2025 Collate
|
|
# Licensed under the Collate Community License, Version 1.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
from metadata.generated.schema.configuration.profilerConfiguration import (
|
|
SampleDataIngestionConfig,
|
|
)
|
|
from metadata.generated.schema.entity.data.table import TableData
|
|
from metadata.sampler.sampler_interface import SamplerInterface
|
|
from metadata.utils.constants import SAMPLE_DATA_MAX_CELL_LENGTH
|
|
|
|
|
|
class TestTruncateCell:
    """Checks for ``SamplerInterface._truncate_cell``.

    The cases assert that small values of several types come back equal to
    the input, and that strings longer than ``SAMPLE_DATA_MAX_CELL_LENGTH``
    come back with exactly that many characters.
    """

    @pytest.mark.parametrize(
        "value,expected",
        [
            ("short string", "short string"),
            (12345, 12345),
            (None, None),
            (True, True),
            (3.14, 3.14),
            (b"bytes", b"bytes"),
        ],
    )
    def test_non_oversized_values_pass_through(self, value, expected):
        """Short strings, numbers, None, booleans and bytes are unchanged."""
        outcome = SamplerInterface._truncate_cell(value)

        assert outcome == expected

    def test_string_at_limit_is_not_truncated(self):
        """A string of exactly the maximum length is returned as-is."""
        at_limit = "a" * SAMPLE_DATA_MAX_CELL_LENGTH

        outcome = SamplerInterface._truncate_cell(at_limit)

        assert outcome == at_limit
        assert len(outcome) == SAMPLE_DATA_MAX_CELL_LENGTH

    def test_string_over_limit_is_truncated(self):
        """A string 500 characters past the limit is cut to the limit."""
        too_long = "a" * (SAMPLE_DATA_MAX_CELL_LENGTH + 500)

        outcome = SamplerInterface._truncate_cell(too_long)

        assert len(outcome) == SAMPLE_DATA_MAX_CELL_LENGTH

    def test_truncation_preserves_prefix(self):
        """The leading characters of an oversized string survive the cut."""
        prefix = "important_data_"
        padding = "x" * SAMPLE_DATA_MAX_CELL_LENGTH
        too_long = prefix + padding

        outcome = SamplerInterface._truncate_cell(too_long)

        assert outcome.startswith(prefix)
        assert len(outcome) == SAMPLE_DATA_MAX_CELL_LENGTH

    def test_very_large_string_is_truncated(self):
        """A ten-million-character string is also cut to the limit."""
        huge = "z" * 10_000_000

        outcome = SamplerInterface._truncate_cell(huge)

        assert len(outcome) == SAMPLE_DATA_MAX_CELL_LENGTH
|
|
|
|
|
|
class TestGenerateSampleData:
    """Test SamplerInterface.generate_sample_data with SampleDataIngestionConfig"""

    # Dotted path patched in the upload tests below.
    _UPLOAD_TARGET = "metadata.sampler.sampler_interface.upload_sample_data"

    @pytest.fixture
    def sampler(self):
        """Build a spec'd mock sampler that runs the real generate_sample_data.

        ``fetch_sample_data`` is stubbed to return a two-column, two-row
        ``TableData``; every other attribute the tests touch is set explicitly.
        """
        mocked = MagicMock(spec=SamplerInterface)

        mocked.entity = MagicMock()
        mocked.entity.fullyQualifiedName.root = "test_service.db.schema.table"
        # NOTE(review): MagicMock(name=...) only names the mock for its repr;
        # it does not set a ``.name`` attribute. Harmless as long as the code
        # under test never reads column names -- confirm.
        mocked.columns = [MagicMock(name="col1"), MagicMock(name="col2")]
        mocked.sample_limit = 50
        mocked.upload_sample_storage_config = None

        mocked.fetch_sample_data.return_value = TableData(
            columns=["col1", "col2"],
            rows=[["val1", "val2"], ["val3", "val4"]],
        )

        # Reach the function underneath the decorator and bind it to the mock
        # so the real method body runs against the mocked attributes.
        # (Presumably the decorator exposes it via functools.wraps -- confirm.)
        undecorated = SamplerInterface.generate_sample_data.__wrapped__
        mocked.generate_sample_data = undecorated.__get__(mocked, SamplerInterface)
        mocked._truncate_cell = SamplerInterface._truncate_cell

        return mocked

    def test_both_disabled_returns_empty(self, sampler):
        """With store and read both off, nothing is fetched or returned."""
        ingestion_config = SampleDataIngestionConfig(
            storeSampleData=False,
            readSampleData=False,
        )

        table_data = sampler.generate_sample_data(ingestion_config)

        assert table_data.rows == []
        assert table_data.columns == []
        sampler.fetch_sample_data.assert_not_called()

    def test_read_only_fetches_but_does_not_store(self, sampler):
        """With only read enabled, data is fetched exactly once."""
        ingestion_config = SampleDataIngestionConfig(
            storeSampleData=False,
            readSampleData=True,
        )

        table_data = sampler.generate_sample_data(ingestion_config)

        assert len(table_data.rows) == 2
        sampler.fetch_sample_data.assert_called_once()

    def test_store_enabled_fetches_data(self, sampler):
        """With only store enabled, data is fetched exactly once."""
        ingestion_config = SampleDataIngestionConfig(
            storeSampleData=True,
            readSampleData=False,
        )

        table_data = sampler.generate_sample_data(ingestion_config)

        assert len(table_data.rows) == 2
        sampler.fetch_sample_data.assert_called_once()

    def test_both_enabled_fetches_data(self, sampler):
        """With store and read both on, data is fetched exactly once."""
        ingestion_config = SampleDataIngestionConfig(
            storeSampleData=True,
            readSampleData=True,
        )

        table_data = sampler.generate_sample_data(ingestion_config)

        assert len(table_data.rows) == 2
        sampler.fetch_sample_data.assert_called_once()

    def test_none_config_defaults_to_both_enabled(self, sampler):
        """Passing ``None`` as the config still fetches the data once."""
        table_data = sampler.generate_sample_data(None)

        assert len(table_data.rows) == 2
        sampler.fetch_sample_data.assert_called_once()

    def test_store_enabled_with_storage_config_uploads(self, sampler):
        """Store on plus a storage config triggers exactly one upload."""
        sampler.upload_sample_storage_config = MagicMock()
        ingestion_config = SampleDataIngestionConfig(
            storeSampleData=True,
            readSampleData=True,
        )

        with patch(self._UPLOAD_TARGET) as mock_upload:
            table_data = sampler.generate_sample_data(ingestion_config)

        mock_upload.assert_called_once()
        assert len(table_data.rows) == 2

    def test_store_disabled_with_storage_config_does_not_upload(self, sampler):
        """Store off suppresses the upload even when a storage config is set."""
        sampler.upload_sample_storage_config = MagicMock()
        ingestion_config = SampleDataIngestionConfig(
            storeSampleData=False,
            readSampleData=True,
        )

        with patch(self._UPLOAD_TARGET) as mock_upload:
            table_data = sampler.generate_sample_data(ingestion_config)

        mock_upload.assert_not_called()
        assert len(table_data.rows) == 2
|