fix(rdf): schema default + migration force entities=[all] for safe full reindex

- rdfIndexingAppConfig.json: flip recreateIndex.default from false to true so
  any UI form / config generation path that surfaces the schema default agrees
  with the install JSON files and the new full-rebuild semantics.
- 2.0.1 migration (MySQL + Postgres): in addition to flipping
  recreateIndex=true and setting the weekly Saturday cron, also rewrite
  appConfiguration.entities to ["all"]. Pre-upgrade, an operator could have
  narrowed RDF indexing to a subset of entity types; the new recreateIndex=true
  semantics issues CLEAR ALL before indexing, which would otherwise wipe triples
  for excluded entity types and leave the graph permanently missing them.
  Forcing entities back to ["all"] ensures the post-CLEAR-ALL run repopulates
  the graph fully. Operators can re-narrow after the migration if they need
  partial indexing.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-24 09:39:11 +00:00 · 2026-05-15 09:49:48 -07:00 · 2026-05-15 09:49:48 -07:00 · e2575d51ab
commit e2575d51ab
parent 7a1fae7f89
3 changed files with 36 additions and 13 deletions
--- a/bootstrap/sql/migrations/native/2.0.1/mysql/postDataMigrationSQLScript.sql
+++ b/bootstrap/sql/migrations/native/2.0.1/mysql/postDataMigrationSQLScript.sql
@ -1,15 +1,26 @@
 -- Post data migration script for Task workflow cutover - OpenMetadata 2.0.1

-- RdfIndexApp: switch to weekly Saturday cron and recreate-on-each-run.
+-- RdfIndexApp: switch to weekly Saturday cron and full-rebuild every run.
 -- Previous defaults (daily, incremental) were producing unbounded triple growth
 -- because relationship-removal paths weren't fully reconciled. With per-run
-- CLEAR ALL the dataset always converges to the current MySQL state; weekly
-- cadence keeps the per-run cost from saturating Fuseki.
+-- CLEAR ALL the dataset always converges to MySQL state; weekly cadence keeps
+-- per-run cost from saturating Fuseki.
+--
+-- Also rewrite `entities` to `["all"]`. Pre-upgrade, an operator could have
+-- narrowed RDF indexing to a subset of entity types; the new recreateIndex=true
+-- semantics issues a CLEAR ALL before indexing, which would otherwise wipe
+-- triples for entity types still in MySQL but missing from the subset list.
+-- Forcing the subset list back to `["all"]` ensures the post-CLEAR-ALL run
+-- repopulates the graph fully; operators can re-narrow after the migration if
+-- they need partial indexing.
 UPDATE installed_apps
 SET json = JSON_SET(
-    json,
-    '$.appConfiguration.recreateIndex', CAST('true' AS JSON),
-    '$.appSchedule.cronExpression', '0 0 * * 6'
+    JSON_SET(  -- inner call: enable recreateIndex and set the weekly Saturday cron
+        json,
+        '$.appConfiguration.recreateIndex', CAST('true' AS JSON),
+        '$.appSchedule.cronExpression', '0 0 * * 6'
+    ),
+    '$.appConfiguration.entities', JSON_ARRAY('all')  -- outer call: reset entities to ["all"]
 )
 WHERE name = 'RdfIndexApp';

--- a/bootstrap/sql/migrations/native/2.0.1/postgres/postDataMigrationSQLScript.sql
+++ b/bootstrap/sql/migrations/native/2.0.1/postgres/postDataMigrationSQLScript.sql
@ -1,15 +1,27 @@
 -- Post data migration script for Task workflow cutover - OpenMetadata 2.0.1

-- RdfIndexApp: switch to weekly Saturday cron and recreate-on-each-run.
+-- RdfIndexApp: switch to weekly Saturday cron and full-rebuild every run.
 -- Previous defaults (daily, incremental) were producing unbounded triple growth
 -- because relationship-removal paths weren't fully reconciled. With per-run
-- CLEAR ALL the dataset always converges to the current MySQL state; weekly
-- cadence keeps the per-run cost from saturating Fuseki.
+-- CLEAR ALL the dataset always converges to PostgreSQL state; weekly cadence
+-- keeps per-run cost from saturating Fuseki.
+--
+-- Also rewrite `entities` to `["all"]`. Pre-upgrade, an operator could have
+-- narrowed RDF indexing to a subset of entity types; the new recreateIndex=true
+-- semantics issues a CLEAR ALL before indexing, which would otherwise wipe
+-- triples for entity types still in PostgreSQL but missing from the subset list.
+-- Forcing the subset list back to `["all"]` ensures the post-CLEAR-ALL run
+-- repopulates the graph fully; operators can re-narrow after the migration if
+-- they need partial indexing.
 UPDATE installed_apps
 SET json = jsonb_set(
-    jsonb_set(json::jsonb, '{appConfiguration,recreateIndex}', 'true'),
-    '{appSchedule,cronExpression}',
-    '"0 0 * * 6"'
+    jsonb_set(  -- middle call: set the weekly Saturday cron expression
+        jsonb_set(json::jsonb, '{appConfiguration,recreateIndex}', 'true'),  -- innermost call: enable recreateIndex
+        '{appSchedule,cronExpression}',
+        '"0 0 * * 6"'
+    ),
+    '{appConfiguration,entities}',
+    '["all"]'::jsonb  -- outermost call: reset entities to ["all"]
 )
 WHERE name = 'RdfIndexApp';

--- a/openmetadata-spec/src/main/resources/json/schema/entity/applications/configuration/internal/rdfIndexingAppConfig.json
+++ b/openmetadata-spec/src/main/resources/json/schema/entity/applications/configuration/internal/rdfIndexingAppConfig.json
@ -104,7 +104,7 @@
      "title": "Recreate RDF Store",
      "description": "Recreate the RDF store before indexing.",
      "type": "boolean",
-      "default": false
+      "default": true
    },
    "batchSize": {
      "title": "Batch Size",