OpenMetadata/bootstrap/sql/migrations/native/1.13.0/postgres/postDataMigrationSQLScript.sql

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

90 lines
7.1 KiB
MySQL
Raw Normal View History

UPDATE ingestion_pipeline_entity
SET json = (json::jsonb #- '{sourceConfig,config,computeMetrics}')::json
WHERE json::jsonb -> 'sourceConfig' -> 'config' -> 'computeMetrics' IS NOT NULL
AND pipelineType = 'profiler';
fix(sampler): Respect randomizedSample flag at 100% percentage sampling (#26966) * fix(sampler): respect randomizedSample flag at 100% percentage sampling When profileSample is 100% with PERCENTAGE type, the sampler short-circuits and returns the raw dataset without any randomization, even when randomizedSample is True (the default). Split the combined condition so: - No profileSample set -> return raw dataset (no sampling configured) - 100% PERCENTAGE + randomizedSample=False -> return raw dataset (optimization) - 100% PERCENTAGE + randomizedSample=True -> go through normal sampling path which applies RandomNumFn/df.sample for proper row shuffling Fixes #21304 * Address review: use 'is False' for Optional[bool] and add unit tests - Fix randomizedSample check from 'not' to 'is False' in both SQASampler and DatalakeSampler to correctly handle None (Optional[bool] default=True) - Add unit tests verifying 100%% PERCENTAGE behavior for randomizedSample values True, False, and None * Add ORDER BY on random column in fetch_sample_data for true randomization The get_dataset() fix ensures 100% PERCENTAGE + randomizedSample routes through get_sample_query() which produces a CTE with a random column. Now fetch_sample_data() detects that column and applies ORDER BY before LIMIT, so each call returns a different subset of rows. Also add real-DB integration tests using SQLite for the 100% PERCENTAGE edge case (True, False, None). * Address review: remove stale comment, unused import, add return assertions * Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Address review: move ORDER BY to get_sample_query, clean up fetch_sample_data - Move ORDER BY rnd.c.random into get_sample_query() PERCENTAGE branch, gated on randomizedSample is not False (mirrors ABSOLUTE branch pattern) - Revert fetch_sample_data() to original: remove ds_columns variable, random_column detection, and ORDER BY logic (ordering now handled in CTE) - Remove duplicate assertions in DatalakeSampler100Pct tests * Address review: None defaults to False for randomizedSample Per TeddyCr's feedback, randomization is computationally heavy and should not be the default. Changed from 'is False'/'is not False' to truthiness checks so None (unset) behaves the same as False. Only explicit randomizedSample=True triggers ORDER BY and skips the 100% fast path. This is consistent with the ABSOLUTE branch which already uses truthiness checks. * Fix integration test: None should skip sample_query (matches truthiness semantics) * fix(tests): update BigQuery view sampling expected queries with ORDER BY BigQuery views fall through to SQASampler.get_sample_query() which now adds ORDER BY rnd.random when randomizedSample is enabled. Update the expected SQL strings in test_sampling_for_views and test_sampling_view_with_partition to match. * refactor: use explicit is False for randomizedSample checks Address review comments: SampleConfig.randomizedSample defaults to True, so only an explicit False should disable randomization. Using is False / is not False instead of truthiness ensures None follows the model default (enabled) rather than being incorrectly treated as disabled. * ci: re-trigger checks after SIGSEGV flake * refactor: only explicit True randomizes, add non-determinism tests * test: increase non-determinism iterations to reduce flakiness * chore: added randomize as false * fix: align randomizedSample defaults with schema (false) * fix: remove ORDER BY from BigQuery test expectations BigQuery sampling tests create SampleConfig without setting randomizedSample, which now defaults to False. Since ORDER BY is only added when randomizedSample is True, the expected query strings should not include ORDER BY. Also fix inaccurate docstring in test_sample.py. * test: increase non-determinism test iterations to reduce flakiness Increase fetch_sample_data loop from 10 to 20 iterations to further reduce the theoretical probability of a false failure in the randomized ordering test. --------- Co-authored-by: Teddy <teddy.crepineau@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2026-04-14 17:28:54 +00:00
-- Set randomizedSample to false where it was true (old default behavior)
UPDATE ingestion_pipeline_entity
SET json = jsonb_set(json::jsonb, '{sourceConfig,config,randomizedSample}', 'false'::jsonb)::json
WHERE json::jsonb #>> '{sourceConfig,config,randomizedSample}' = 'true'
AND pipelineType = 'profiler';
UPDATE table_entity
SET json = jsonb_set(json::jsonb, '{tableProfilerConfig,randomizedSample}', 'false'::jsonb)::json
WHERE json::jsonb #>> '{tableProfilerConfig,randomizedSample}' = 'true';
UPDATE database_entity
SET json = jsonb_set(json::jsonb, '{databaseProfilerConfig,randomizedSample}', 'false'::jsonb)::json
WHERE json::jsonb #>> '{databaseProfilerConfig,randomizedSample}' = 'true';
UPDATE database_schema_entity
SET json = jsonb_set(json::jsonb, '{databaseSchemaProfilerConfig,randomizedSample}', 'false'::jsonb)::json
WHERE json::jsonb #>> '{databaseSchemaProfilerConfig,randomizedSample}' = 'true';
-- Hard-delete ingestion pipelines for Iceberg services (must run before service migration)
DELETE FROM ingestion_pipeline_entity ipe
USING dbservice_entity dse
WHERE dse.serviceType = 'Iceberg'
AND ipe.json::jsonb -> 'service' ->> 'type' = 'databaseService'
AND ipe.json::jsonb -> 'service' ->> 'id' = dse.id;
-- Migrate Iceberg database services to CustomDatabase (connector removed)
-- serviceType is a GENERATED column derived from json, so only update json
UPDATE dbservice_entity
SET json = jsonb_set(
jsonb_set(
json::jsonb,
'{serviceType}', '"CustomDatabase"'
),
'{connection,config,type}', '"CustomDatabase"'
)::json
WHERE serviceType = 'Iceberg';
-- Migrate serviceType in child entities (serviceType is in JSON blob only, no generated column)
UPDATE database_entity
SET json = jsonb_set(json::jsonb, '{serviceType}', '"CustomDatabase"')::json
WHERE json->>'serviceType' = 'Iceberg';
UPDATE database_schema_entity
SET json = jsonb_set(json::jsonb, '{serviceType}', '"CustomDatabase"')::json
WHERE json->>'serviceType' = 'Iceberg';
UPDATE table_entity
SET json = jsonb_set(json::jsonb, '{serviceType}', '"CustomDatabase"')::json
WHERE json->>'serviceType' = 'Iceberg';
UPDATE stored_procedure_entity
SET json = jsonb_set(json::jsonb, '{serviceType}', '"CustomDatabase"')::json
Move ontology/glossary relation migration from 1.14.0 back to 1.13.0 (#27431) * Move ontology/glossary relation migration from 1.14.0 back to 1.13.0 Ontology feature will ship in 1.13.0, not 1.14.0. Move the glossary term relation migrations (relationType backfill, settings insert, stale relatedTerms strip, conceptMappings backfill) back to the 1.13.0 postDataMigrationSQLScript for both MySQL and PostgreSQL. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * Restore empty 1.14.0 SQL migration files for Java migration framework The V114 MigrationUtil.java package requires the 1.14.0 migration directory to exist with SQL files for the migration to be picked up. Keep them as empty files (matching convention of other versions with no post-data SQL). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * Add schemaChanges.sql and comment all 1.14.0 SQL migration files Add both schemaChanges.sql and postDataMigrationSQLScript.sql for mysql and postgres with a comment explaining the directory is required for the V114 Java migrations to be picked up by the migration framework. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * Fix missing trailing newline in postgres postDataMigrationSQLScript Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * address feedback --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com> Co-authored-by: Karan Hotchandani <33024356+karanh37@users.noreply.github.com>
2026-04-16 16:45:01 +00:00
WHERE json->>'serviceType' = 'Iceberg';
-- Migrate existing glossary term RELATED_TO relationships to include relationType
-- For backward compatibility, existing relations without a relationType are set to "relatedTo"
UPDATE entity_relationship
SET json = jsonb_set(COALESCE(json::jsonb, '{}'::jsonb), '{relationType}', '"relatedTo"')
WHERE fromentity = 'glossaryTerm'
AND toentity = 'glossaryTerm'
AND relation = 15
AND (json IS NULL OR json::jsonb->>'relationType' IS NULL);
-- Insert default glossary term relation settings if they don't exist
-- This preserves any existing user customizations
INSERT INTO openmetadata_settings (configtype, json)
SELECT 'glossaryTermRelationSettings', '{"relationTypes":[{"name":"relatedTo","displayName":"Related To","description":"General association between terms that are conceptually connected.","rdfPredicate":"https://open-metadata.org/ontology/relatedTo","isSymmetric":true,"isTransitive":false,"isCrossGlossaryAllowed":true,"category":"associative","isSystemDefined":true,"color":"#1570ef"},{"name":"synonym","displayName":"Synonym","description":"Terms that have the same meaning and can be used interchangeably.","rdfPredicate":"http://www.w3.org/2004/02/skos/core#exactMatch","isSymmetric":true,"isTransitive":false,"isCrossGlossaryAllowed":true,"category":"equivalence","isSystemDefined":true,"color":"#b42318"},{"name":"antonym","displayName":"Antonym","description":"Terms that have opposite meanings.","rdfPredicate":"https://open-metadata.org/ontology/antonym","isSymmetric":true,"isTransitive":false,"isCrossGlossaryAllowed":true,"category":"associative","isSystemDefined":true,"color":"#b54708"},{"name":"broader","displayName":"Broader","description":"A more general term (hypernym).","inverseRelation":"narrower","rdfPredicate":"http://www.w3.org/2004/02/skos/core#broader","isSymmetric":false,"isTransitive":true,"isCrossGlossaryAllowed":true,"category":"hierarchical","isSystemDefined":true,"color":"#067647"},{"name":"narrower","displayName":"Narrower","description":"A more specific term (hyponym).","inverseRelation":"broader","rdfPredicate":"http://www.w3.org/2004/02/skos/core#narrower","isSymmetric":false,"isTransitive":true,"isCrossGlossaryAllowed":true,"category":"hierarchical","isSystemDefined":true,"color":"#4e5ba6"},{"name":"partOf","displayName":"Part Of","description":"This term is a part or component of another term.","inverseRelation":"hasPart","rdfPredicate":"https://open-metadata.org/ontology/partOf","isSymmetric":false,"isTransitive":false,"isCrossGlossaryAllowed":true,"category":"hierarchical","isSystemDefined":true,"color":"#026aa2"},{"name":"hasPart","displayName":"Has Part","description":"This term has the other term as a part or component.","inverseRelation":"partOf","rdfPredicate":"https://open-metadata.org/ontology/hasPart","isSymmetric":false,"isTransitive":false,"isCrossGlossaryAllowed":true,"category":"hierarchical","isSystemDefined":true,"color":"#155eef"},{"name":"calculatedFrom","displayName":"Calculated From","description":"This term/metric is calculated or derived from another term.","inverseRelation":"usedToCalculate","rdfPredicate":"https://open-metadata.org/ontology/calculatedFrom","isSymmetric":false,"isTransitive":false,"isCrossGlossaryAllowed":true,"category":"associative","isSystemDefined":true,"color":"#6938ef"},{"name":"usedToCalculate","displayName":"Used To Calculate","description":"This term is used in the calculation of another term.","inverseRelation":"calculatedFrom","rdfPredicate":"https://open-metadata.org/ontology/usedToCalculate","isSymmetric":false,"isTransitive":false,"isCrossGlossaryAllowed":true,"category":"associative","isSystemDefined":true,"color":"#ba24d5"},{"name":"seeAlso","displayName":"See Also","description":"Related term that may provide additional context.","rdfPredicate":"http://www.w3.org/2000/01/rdf-schema#seeAlso","isSymmetric":true,"isTransitive":false,"isCrossGlossaryAllowed":true,"category":"associative","isSystemDefined":true,"color":"#c11574"}]}'::jsonb
WHERE NOT EXISTS (
SELECT 1 FROM openmetadata_settings WHERE configtype = 'glossaryTermRelationSettings'
);
-- Strip stale relatedTerms from glossary term entity JSON.
-- relatedTerms is now loaded from entity_relationship table, not from entity JSON.
-- Old data stored relatedTerms as EntityReference objects which fail to deserialize as TermRelation.
UPDATE glossary_term_entity
SET json = (json::jsonb - 'relatedTerms')::json
WHERE jsonb_exists(json::jsonb, 'relatedTerms');
-- Backfill conceptMappings for existing glossary terms
UPDATE glossary_term_entity
SET json = jsonb_set(COALESCE(json::jsonb, '{}'::jsonb), '{conceptMappings}', '[]'::jsonb)
WHERE json IS NULL OR json::jsonb->'conceptMappings' IS NULL;