Address PR review: post-state checks, FAILED listener, hermetic IT, InOrder

Five additional behavioral fixes from Copilot review on PR #27920.

1. delete-canonical and addAliases: detect failure via post-state, not via
   try/catch. ElasticSearchIndexManager#deleteIndexWithBackoff and
   #addAliasesInternal both swallow transport exceptions and return void
   (same shape on the OS side), so the existing try/catch could never
   observe a failed delete or alias-add. After deleteIndexWithBackoff,
   verify the canonical no longer exists; after addAliases, verify the
   canonical-name alias is actually attached. If either post-condition
   fails, log [ALIAS_PROMOTE_FAILED reason=delete-not-acknowledged] /
   [reason=alias-not-attached] and mark the promotion failed (data loss
   for the alias-not-attached path).

2. SearchIndexExecutor.executeSingleServer: wire onJobFailed for the new
   FAILED status. Previously the listener chain only had callbacks for
   COMPLETED / COMPLETED_WITH_ERRORS / STOPPED, so promotion-driven FAILED
   ended without populating jobData.failure or notifying observers. Pass
   in an IllegalStateException naming the data-loss entities so the app
   run record carries the right failure context.

3. SearchIndexAliasPromotionIT trigger payload: explicitly set
   liveIndexSettings (1s/1/request), liveIndexSettingsByEntity (empty),
   and useDistributedIndexing=true. /v1/apps/trigger/X merges the payload
   into the persisted config rather than replacing it, so without these
   the test could be affected by previous local config or silently exercise
   the single-server path. The hard-coded post-promotion assertions are
   now anchored to values the test itself supplies.

4. testPromoteEntityIndexForceMergesWhenConfigured: replace standalone
   verify() with InOrder(forceMerge → swapAliases) so a refactor that
   swaps aliases first and merges afterward fails the test instead of
   passing.

All 148 unit tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Sriharsha Chintalapani 2026-05-05 17:14:49 -07:00
parent 4654158706
commit 9a7fa49494
4 changed files with 48 additions and 3 deletions

View file

@ -206,11 +206,19 @@ public class SearchIndexAliasPromotionIT {
bulk.put("translogDurability", "async");
bulk.put("translogSyncInterval", "30s");
Map<String, Object> live = new HashMap<>();
live.put("numberOfReplicas", 1);
live.put("refreshInterval", "1s");
live.put("translogDurability", "request");
Map<String, Object> config = new HashMap<>();
config.put("entities", List.of("table"));
config.put("recreateIndex", true);
config.put("batchSize", 100);
config.put("bulkIndexSettings", bulk);
config.put("liveIndexSettings", live);
config.put("liveIndexSettingsByEntity", new HashMap<String, Object>());
config.put("useDistributedIndexing", true);
Awaitility.await("Trigger " + APP_NAME)
.atMost(Duration.ofMinutes(2))

View file

@ -1762,6 +1762,15 @@ public class SearchIndexExecutor implements AutoCloseable {
listeners.onJobCompletedWithErrors(stats.get(), endTime - startTime);
} else if (status == ExecutionResult.Status.STOPPED) {
listeners.onJobStopped(stats.get());
} else if (status == ExecutionResult.Status.FAILED) {
Set<String> dataLoss =
recreateIndexHandler != null ? recreateIndexHandler.getDataLossPromotions() : Set.of();
listeners.onJobFailed(
stats.get(),
new IllegalStateException(
"Promotion data loss for entities: "
+ dataLoss
+ ". Canonical index was deleted but alias not re-attached."));
}
return ExecutionResult.fromStats(stats.get(), status, startTime);

View file

@ -322,12 +322,17 @@ public class DefaultRecreateHandler implements RecreateIndexHandler {
}
}
// deleteIndexWithBackoff and addAliases swallow transport exceptions and return void in
// both ES and OS clients (ElasticSearchIndexManager#deleteIndexWithBackoff,
// ElasticSearchIndexManager#addAliasesInternal same shape on the OS side). A try/catch
// alone cannot detect failure. Verify outcome via post-state checks: the index is gone,
// the alias is attached.
if (needsCanonicalAlias && searchClient.indexExists(canonicalIndex)) {
try {
searchClient.deleteIndexWithBackoff(canonicalIndex);
} catch (Exception ex) {
LOG.error(
"[ALIAS_PROMOTE_FAILED phase=delete-canonical entity={} stagedIndex={} canonicalIndex={}] "
"[ALIAS_PROMOTE_FAILED phase=delete-canonical entity={} stagedIndex={} canonicalIndex={} reason=exception] "
+ "Parent aliases on staged; canonical-name lookups still hit old index until retry.",
entityType,
stagedIndex,
@ -336,6 +341,16 @@ public class DefaultRecreateHandler implements RecreateIndexHandler {
markPromotionFailed(entityType, false);
return false;
}
if (searchClient.indexExists(canonicalIndex)) {
LOG.error(
"[ALIAS_PROMOTE_FAILED phase=delete-canonical entity={} stagedIndex={} canonicalIndex={} reason=delete-not-acknowledged] "
+ "Client did not throw, but canonical index still exists. Old index keeps serving canonical-name lookups.",
entityType,
stagedIndex,
canonicalIndex);
markPromotionFailed(entityType, false);
return false;
}
}
if (needsCanonicalAlias) {
@ -343,7 +358,7 @@ public class DefaultRecreateHandler implements RecreateIndexHandler {
searchClient.addAliases(stagedIndex, Set.of(canonicalIndex));
} catch (Exception ex) {
LOG.error(
"[ALIAS_PROMOTE_FAILED phase=swap2 entity={} stagedIndex={} canonicalIndex={}] "
"[ALIAS_PROMOTE_FAILED phase=swap2 entity={} stagedIndex={} canonicalIndex={} reason=exception] "
+ "DATA UNAVAILABLE: canonical was deleted but canonical-name alias add failed. "
+ "Parent aliases work; canonical-name lookups will 404 until manual repair.",
entityType,
@ -353,6 +368,17 @@ public class DefaultRecreateHandler implements RecreateIndexHandler {
markPromotionFailed(entityType, true);
return false;
}
if (!searchClient.getAliases(stagedIndex).contains(canonicalIndex)) {
LOG.error(
"[ALIAS_PROMOTE_FAILED phase=swap2 entity={} stagedIndex={} canonicalIndex={} reason=alias-not-attached] "
+ "DATA UNAVAILABLE: client did not throw, but canonical-name alias is not on staged index. "
+ "Parent aliases work; canonical-name lookups will 404 until manual repair.",
entityType,
stagedIndex,
canonicalIndex);
markPromotionFailed(entityType, true);
return false;
}
}
LOG.info(

View file

@ -495,7 +495,9 @@ class DefaultRecreateHandlerTest {
new DefaultRecreateHandler().withJobData(jobData).promoteEntityIndex(context, true);
}
verify(client).forceMerge("table_search_index_rebuild_new", 1);
org.mockito.InOrder order = org.mockito.Mockito.inOrder(client);
order.verify(client).forceMerge("table_search_index_rebuild_new", 1);
order.verify(client).swapAliases(anySet(), eq("table_search_index_rebuild_new"), anySet());
}
@Test