mirror of
https://github.com/open-metadata/OpenMetadata
synced 2026-05-24 09:39:11 +00:00
* Update Perf * Add multi asset scale count * Update perf and Usage * Fix recommendation * Add Benchmarking script and doc * Fix Perf * Add --no break to benchmark * add more metrics and validation for indexes miss * Update generated TypeScript types * Bound Doc Virtual Threads * Remove Additional Properties from the UI * Update doc * Fix Job Getting Marked Stopped * Server killed logs fixes * Add Server stat to Quartz Progress * Fix CPU spiking * Make Auto Tune Consider JVm configs * Fix Partition Calculator and Recovery Job Stats * Update Auto Tune to show up in logs and stored in config * Fix Auto Tune Config not store in app run record * Fix OnDemand Job type * Indexing Failures not flushed fixed * Fix Stat counting at job level with process job failures * Add Reindex Job Identifier * Add Thread Identifiers * Wait for sink * Wait for sink * Fix Stopping to let partitions finish the job * CPU Budgeting * More Conservative settings * Address Review Comment * fix Open Search Index Manager * Reapply OpenSearch BulkSink --------- Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
714 lines
28 KiB
Bash
Executable file
714 lines
28 KiB
Bash
Executable file
#!/usr/bin/env bash
#
# benchmark-sizing.sh — progressive benchmark runner.
#
# Loops through scale tiers, runs perf-test.sh (expected next to this script)
# at each tier, captures the JSON outputs, and generates a final comparison
# report. Run with --help for the option list.

set -euo pipefail

# Absolute directory of this script; perf-test.sh is looked up beside it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PERF_TEST="$SCRIPT_DIR/perf-test.sh"

# ─── Defaults ───────────────────────────────────────────────────────────────────
# Every value below can be overridden by the matching command-line option.
SERVER="http://localhost:8585"
WORKERS=30
OUTPUT_DIR="./sizing-results"
START_SCALE="10k"
END_SCALE="xlarge"
MODES="seq,realistic"
ADMIN_PORT=""
TOKEN=""
RAMP=false
SKIP_EXISTING=false
MIXED=false
MIXED_DURATION=60
NO_BREAK=false

# ─── Scale Ladder (ordered) ────────────────────────────────────────────────────
# Order matters: the start/end options select a contiguous slice of this list.
ALL_SCALES=(10k 50k 100k 200k 500k large xlarge)

# ─── Colors ─────────────────────────────────────────────────────────────────────
# Kept as literal backslash sequences; they are interpreted later by
# `echo -e` and by printf format strings.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m'

# ─── Usage ──────────────────────────────────────────────────────────────────────
#######################################
# Print the help text to stdout and terminate the script.
# Arguments: none
# Outputs:   help text on stdout
# Returns:   never returns; exits with status 0
#######################################
usage() {
  cat <<'EOF'
Usage: benchmark-sizing.sh [OPTIONS]

Progressive benchmark runner that loops through scale tiers, runs perf-test.sh
at each tier, captures JSON outputs, and generates a final comparison report.

Scale Ladder: 10k → 50k → 100k → 200k → 500k → large (~2M) → xlarge (~5M)

OPTIONS:
  --server URL            Target server (default: http://localhost:8585)
  --workers NUM           Worker count for all runs (default: 30)
  --output-dir DIR        Directory for JSON reports + final summary (default: ./sizing-results)
  --start-scale PRESET    First scale tier to run (default: 10k)
  --end-scale PRESET      Last scale tier to run (default: xlarge)
  --modes MODES           Comma-separated: seq, realistic, or both (default: seq,realistic)
  --admin-port PORT       Pass through to perf-test.sh for server diagnostics
  --token TOKEN           Auth token pass-through
  --ramp                  Run ramp test at first tier to find optimal workers
  --skip-existing         Skip tiers that already have JSON output in output-dir
  --mixed                 Also run mixed read/write workload at each tier
  --mixed-duration SECS   Duration for mixed workload (default: 60)
  --no-break              Don't stop at break-points, run all tiers regardless
  -h, --help              Show this help message

EXAMPLES:
  # Quick 2-tier test
  benchmark-sizing.sh --start-scale 10k --end-scale 50k

  # Full progressive benchmark with ramp and mixed workloads
  benchmark-sizing.sh --ramp --mixed --output-dir /data/sizing

  # Resume after failure (skips completed tiers)
  benchmark-sizing.sh --skip-existing --output-dir ./sizing-results

  # Single mode only
  benchmark-sizing.sh --modes realistic --end-scale 500k
EOF
  exit 0
}

# ─── Parse Arguments ────────────────────────────────────────────────────────────
#######################################
# Abort with a readable message when a value-taking option is the last word
# on the command line (otherwise `set -u` reports an opaque "unbound variable").
# Arguments: $1 - option name, $2 - count of remaining arguments incl. option
# Outputs:   error message on stderr when the value is missing
# Returns:   0 when a value is present; exits 1 otherwise
#######################################
require_value() {
  if (( $2 < 2 )); then
    echo "Error: option $1 requires a value" >&2
    exit 1
  fi
}

#######################################
# Apply command-line options on top of the default settings.
# Globals:   SERVER, WORKERS, OUTPUT_DIR, START_SCALE, END_SCALE, MODES,
#            ADMIN_PORT, TOKEN, RAMP, SKIP_EXISTING, MIXED, MIXED_DURATION,
#            NO_BREAK (written)
# Arguments: the script's command-line arguments
# Returns:   0 on success; exits 0 for -h/--help, 1 for invalid usage
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --server)         require_value "$1" "$#"; SERVER="$2"; shift 2 ;;
      --workers)        require_value "$1" "$#"; WORKERS="$2"; shift 2 ;;
      --output-dir)     require_value "$1" "$#"; OUTPUT_DIR="$2"; shift 2 ;;
      --start-scale)    require_value "$1" "$#"; START_SCALE="$2"; shift 2 ;;
      --end-scale)      require_value "$1" "$#"; END_SCALE="$2"; shift 2 ;;
      --modes)          require_value "$1" "$#"; MODES="$2"; shift 2 ;;
      --admin-port)     require_value "$1" "$#"; ADMIN_PORT="$2"; shift 2 ;;
      --token)          require_value "$1" "$#"; TOKEN="$2"; shift 2 ;;
      --ramp)           RAMP=true; shift ;;
      --skip-existing)  SKIP_EXISTING=true; shift ;;
      --mixed)          MIXED=true; shift ;;
      --mixed-duration) require_value "$1" "$#"; MIXED_DURATION="$2"; shift 2 ;;
      --no-break)       NO_BREAK=true; shift ;;
      -h|--help)        usage ;;
      *)
        echo "Unknown option: $1" >&2
        # usage terminates with status 0; run it in a subshell so an
        # unknown option still makes the script fail.
        ( usage ) >&2 || true
        exit 1
        ;;
    esac
  done
}

parse_args "$@"

# ─── Validate ───────────────────────────────────────────────────────────────────
# Every benchmark run is delegated to perf-test.sh, so bail out early if it
# is not present next to this script.
if [[ ! -f "$PERF_TEST" ]]; then
  echo -e "${RED}Error: perf-test.sh not found at $PERF_TEST${NC}" >&2
  exit 1
fi

#######################################
# Ensure a scale name is one of the known tiers.
# Globals:   ALL_SCALES, RED, NC (read)
# Arguments: $1 - scale name to check
# Outputs:   error message on stderr when the name is unknown
# Returns:   0 when valid; exits the script with status 1 otherwise
#######################################
validate_scale() {
  local scale="$1"
  local s
  for s in "${ALL_SCALES[@]}"; do
    if [[ "$s" == "$scale" ]]; then
      return 0
    fi
  done
  echo -e "${RED}Error: Invalid scale '$scale'. Valid: ${ALL_SCALES[*]}${NC}" >&2
  exit 1
}

# Reject unknown tier names before any benchmark work starts.
validate_scale "$START_SCALE"
validate_scale "$END_SCALE"

# ─── Resolve Scale Range ───────────────────────────────────────────────────────
#######################################
# Select the contiguous slice of the scale ladder from start to end, inclusive.
# Globals:   ALL_SCALES, RED, NC (read); SCALES (written)
# Arguments: $1 - first tier, $2 - last tier
# Outputs:   error message on stderr when the slice is empty
# Returns:   0 on success; exits the script with status 1 when nothing is
#            selected (e.g. the end tier comes before the start tier)
#######################################
resolve_scales() {
  local start="$1" end="$2"
  local collecting=false
  local s
  SCALES=()
  for s in "${ALL_SCALES[@]}"; do
    if [[ "$s" == "$start" ]]; then
      collecting=true
    fi
    if $collecting; then
      SCALES+=("$s")
    fi
    # Stop at the end tier even if collection never started; the empty
    # result is reported below.
    if [[ "$s" == "$end" ]]; then
      break
    fi
  done
  if [[ ${#SCALES[@]} -eq 0 ]]; then
    echo -e "${RED}Error: No scales resolved between $start and $end${NC}" >&2
    exit 1
  fi
}

resolve_scales "$START_SCALE" "$END_SCALE"

# ─── Parse Modes ────────────────────────────────────────────────────────────────
# Split the comma-separated mode list and reject anything unknown.
IFS=',' read -ra MODE_LIST <<< "$MODES"
# An empty list would make every tier loop a no-op and still produce a
# "no break-point" report, so treat it as a usage error.
if [[ ${#MODE_LIST[@]} -eq 0 ]]; then
  echo -e "${RED}Error: --modes must list at least one of: seq, realistic${NC}" >&2
  exit 1
fi
for m in "${MODE_LIST[@]}"; do
  if [[ "$m" != "seq" && "$m" != "realistic" ]]; then
    echo -e "${RED}Error: Invalid mode '$m'. Valid: seq, realistic${NC}" >&2
    exit 1
  fi
done

# ─── Setup Output Directory ────────────────────────────────────────────────────
# All artifacts (per-run JSON, markdown summary, log) live under OUTPUT_DIR.
mkdir -p "$OUTPUT_DIR"
# NOTE(review): SUMMARY_CSV is assigned here but not referenced anywhere else
# in this script — confirm whether it is still needed.
SUMMARY_CSV="$OUTPUT_DIR/.sizing-progress.csv"
SUMMARY_MD="$OUTPUT_DIR/SIZING-SUMMARY.md"
LOG_FILE="$OUTPUT_DIR/benchmark-sizing.log"

#######################################
# Write a timestamped message to the log file and to the console.
# Globals:   LOG_FILE (read; appended to)
# Arguments: message words, joined with spaces
# Outputs:   the message on stdout with backslash escapes (colors) interpreted;
#            the log file receives the same text without interpretation
#######################################
log() {
  local msg
  msg="[$(date '+%Y-%m-%d %H:%M:%S')] $*"
  echo "$msg" >> "$LOG_FILE"
  echo -e "$msg"
}

# ─── Print Banner ──────────────────────────────────────────────────────────────
# Echo the effective settings so the run is self-describing on the console.
echo ""
echo -e "${BOLD}══════════════════════════════════════════════════════════════════${NC}"
echo -e "${BOLD} OpenMetadata Progressive Cluster Sizing Benchmark${NC}"
echo -e "${BOLD}══════════════════════════════════════════════════════════════════${NC}"
echo ""
echo -e " Server: ${CYAN}$SERVER${NC}"
echo -e " Workers: ${CYAN}$WORKERS${NC}"
echo -e " Scale range: ${CYAN}${SCALES[*]}${NC}"
echo -e " Modes: ${CYAN}${MODE_LIST[*]}${NC}"
echo -e " Output: ${CYAN}$OUTPUT_DIR${NC}"
echo -e " Ramp test: ${CYAN}$RAMP${NC}"
echo -e " Mixed: ${CYAN}$MIXED${NC}"
echo -e " No-break: ${CYAN}$NO_BREAK${NC}"
# The admin port line is shown only when one was supplied.
if [[ -n "$ADMIN_PORT" ]]; then
  echo -e " Admin port: ${CYAN}$ADMIN_PORT${NC}"
fi
echo ""

# ─── Connectivity Check ────────────────────────────────────────────────────────
# Probe the version endpoint once before starting long benchmark runs.
log "Checking server connectivity..."
# When no response is received curl itself writes "000" for %{http_code} and
# exits non-zero. The fallback therefore has to REPLACE the captured value;
# echoing a second "000" inside the substitution would yield "000000" and the
# unreachable-server branch below would never match.
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 \
  "$SERVER/api/v1/system/version" 2>/dev/null) || HTTP_CODE="000"
if [[ "$HTTP_CODE" == "000" || ! "$HTTP_CODE" =~ ^[0-9]{3}$ ]]; then
  echo -e "${RED}Error: Cannot reach $SERVER (connection refused or timeout)${NC}" >&2
  exit 1
elif (( 10#$HTTP_CODE >= 400 )); then
  # Not fatal: the endpoint answered, it just refused this unauthenticated probe.
  echo -e "${YELLOW}Warning: Server returned HTTP $HTTP_CODE (may need auth token)${NC}"
fi
log "Server reachable (HTTP $HTTP_CODE)"

# ─── Build perf-test.sh Base Args ──────────────────────────────────────────────
#######################################
# Assemble the perf-test.sh argument list for one tier/mode run.
# Globals:   SERVER, WORKERS, ADMIN_PORT, TOKEN, MIXED, MIXED_DURATION (read)
# Arguments: $1 - scale tier, $2 - mode (seq|realistic), $3 - JSON output path
# Outputs:   the arguments on stdout as one space-separated line
# NOTE(review): because the result is a flat string, any value containing
#   whitespace (server URL, output path, token) cannot be recovered as a
#   single argument by the caller.
#######################################
build_perf_args() {
  local scale="$1" mode="$2" output_file="$3"
  local args=(
    --server "$SERVER"
    --workers "$WORKERS"
    --scale "$scale"
    --output "$output_file"
  )
  if [[ -n "$ADMIN_PORT" ]]; then
    args+=(--admin-port "$ADMIN_PORT")
  fi
  if [[ -n "$TOKEN" ]]; then
    args+=(--token "$TOKEN")
  fi
  # Sequential mode is perf-test.sh's default here; only realistic adds a flag.
  if [[ "$mode" == "realistic" ]]; then
    args+=(--realistic)
  fi
  if $MIXED; then
    args+=(--mixed --mixed-duration "$MIXED_DURATION")
  fi
  echo "${args[*]}"
}

# ─── Extract Metrics from JSON ──────────────────────────────────────────────────
#######################################
# Summarize one perf-test.sh JSON report as a CSV record.
# Arguments: $1 - path to the JSON report
# Outputs:   "total,rps,p95,p99,error_rate,wall_clock,assessment" on stdout.
#            p95/p99 are the maxima over all entities whose name does not
#            start with read_ or mixed_.
#            Missing file      -> "N/A,0,0,0,0,0,N/A"
#            Unparseable file  -> "N/A,0,0,0,0,0,error"
# The path is handed to Python as an argument rather than spliced into the
# program text, so paths containing quotes are handled.
#######################################
extract_metrics() {
  local json_file="$1"
  if [[ ! -f "$json_file" ]]; then
    echo "N/A,0,0,0,0,0,N/A"
    return
  fi

  python3 - "$json_file" 2>/dev/null <<'PY' || echo "N/A,0,0,0,0,0,error"
import json, sys

with open(sys.argv[1]) as f:
    data = json.load(f)

overall = data.get('overall', {})
sizing = data.get('cluster_sizing', {})

total = overall.get('total_entities_created', 0)
rps = overall.get('overall_throughput_rps', 0)
error_rate = overall.get('overall_error_rate_pct', 0)
wall_clock = overall.get('total_wall_clock_s', 0)
assessment = sizing.get('assessment', 'unknown')

# Find max p95 and p99 across write entities
max_p95 = 0
max_p99 = 0
for name, ent in data.get('entities', {}).items():
    if name.startswith('read_') or name.startswith('mixed_'):
        continue
    lat = ent.get('latency_ms', {})
    p95 = lat.get('p95', 0)
    p99 = lat.get('p99', 0)
    if p95 > max_p95:
        max_p95 = p95
    if p99 > max_p99:
        max_p99 = p99

print(f'{total},{rps:.1f},{max_p95:.0f},{max_p99:.0f},{error_rate:.2f},{wall_clock:.0f},{assessment}')
PY
}

# ─── Extract Config Summary from JSON ───────────────────────────────────────────
#######################################
# Read cluster_sizing.config_summary from a report and join it with "; ".
# Arguments: $1 - path to the JSON report
# Outputs:   the joined entries on stdout, or "N/A" when the file is missing,
#            cannot be parsed, or has no config summary
# The path is passed to Python as an argument so quotes in it are harmless.
#######################################
extract_config() {
  local json_file="$1"
  if [[ ! -f "$json_file" ]]; then
    echo "N/A"
    return
  fi

  python3 - "$json_file" 2>/dev/null <<'PY' || echo "N/A"
import json, sys

with open(sys.argv[1]) as f:
    data = json.load(f)
cs = data.get('cluster_sizing', {}).get('config_summary', [])
print('; '.join(cs) if cs else 'N/A')
PY
}

# ─── Extract Server Info ────────────────────────────────────────────────────────
#######################################
# Print server facts recorded in a report: version, max heap (GB), Jetty max
# threads and DB pool size, one "Label: value" per line.
# Arguments: $1 - path to the JSON report
# Outputs:   four lines on stdout; "Version: unknown" alone when the file
#            cannot be opened or parsed (there is no separate existence check)
# The path is passed to Python as an argument so quotes in it are harmless.
#######################################
extract_server_info() {
  local json_file="$1"
  python3 - "$json_file" 2>/dev/null <<'PY' || echo "Version: unknown"
import json, sys

with open(sys.argv[1]) as f:
    data = json.load(f)
si = data.get('server_info', {})
version = si.get('version', 'unknown')
diag = data.get('diagnostics_before', {})
jvm = diag.get('jvm', {})
heap_max = jvm.get('heap_max_bytes', 0)
heap_gb = heap_max / (1024**3) if heap_max else 0
jetty = diag.get('jetty', {})
max_threads = jetty.get('threads_max', 'unknown')
db = diag.get('database', {})
db_pool = db.get('pool_max', 'unknown')
print(f'Version: {version}')
print(f'Heap: {heap_gb:.1f}GB')
print(f'Threads: {max_threads}')
print(f'DB Pool: {db_pool}')
PY
}

# ─── Break-point Detection ──────────────────────────────────────────────────────
# Throughput divided by entity count from the previous completed run; empty
# until one run has finished.
PREV_RPS_PER_ENTITY=""

#######################################
# Decide whether a metrics record marks a break-point.
# A record is "broken" when any of these hold:
#   - assessment is "undersized"
#   - error rate is above 10 (percent)
#   - p95 latency is above 10000 (ms)
#   - rps/total is less than half of PREV_RPS_PER_ENTITY
# Globals:   PREV_RPS_PER_ENTITY (read)
# Arguments: $1 - CSV record "total,rps,p95,p99,error_rate,wall_clock,assessment"
# Returns:   0 when broken, 1 otherwise. Non-numeric fields make the
#            corresponding numeric check evaluate as "not broken".
# NOTE(review): rps/total shrinks whenever the entity count grows between
#   tiers, even at constant throughput, so the last rule can fire purely
#   because the tier is larger — confirm this normalization is intended.
#######################################
is_broken() {
  local metrics="$1"
  # Declared local so the caller's variables of the same names are untouched.
  local total rps p95 p99 error_rate wall_clock assessment
  IFS=',' read -r total rps p95 p99 error_rate wall_clock assessment <<< "$metrics"

  # Values are passed to Python as arguments, never spliced into the code.
  local gt='import sys; sys.exit(0 if float(sys.argv[1]) > float(sys.argv[2]) else 1)'

  if [[ "$assessment" == "undersized" ]]; then
    return 0
  fi

  if python3 -c "$gt" "$error_rate" 10 2>/dev/null; then
    return 0
  fi

  if python3 -c "$gt" "$p95" 10000 2>/dev/null; then
    return 0
  fi

  if [[ -n "$PREV_RPS_PER_ENTITY" && "$total" != "0" && "$total" != "N/A" ]]; then
    if python3 -c '
import sys
curr_rpe = float(sys.argv[1]) / max(float(sys.argv[2]), 1)
prev_rpe = float(sys.argv[3])
sys.exit(0 if prev_rpe > 0 and curr_rpe / prev_rpe < 0.5 else 1)
' "$rps" "$total" "$PREV_RPS_PER_ENTITY" 2>/dev/null; then
      return 0
    fi
  fi

  return 1
}

# ─── Results Tracking ──────────────────────────────────────────────────────────
# One CSV record per completed (or failed/skipped) run, in execution order.
declare -a RESULT_LINES=()
# Where and why the first break-point occurred; empty when none did.
BREAK_SCALE=""
BREAK_MODE=""
BREAK_REASON=""

#######################################
# Append one run to RESULT_LINES as "scale,mode,<metrics>,<config>".
# Globals:   RESULT_LINES (appended to)
# Arguments: $1 - scale tier, $2 - mode, $3 - metrics CSV record,
#            $4 - config summary string
#######################################
add_result() {
  local scale="$1" mode="$2" metrics="$3" config="$4"
  RESULT_LINES+=("${scale},${mode},${metrics},${config}")
}

# ─── Optional Ramp Test ────────────────────────────────────────────────────────
# With --ramp, run perf-test.sh once in ramp mode at the first tier and report
# the worker count it suggests. The suggestion is only printed; WORKERS is not
# changed, and a failed ramp run does not stop the benchmark.
if $RAMP; then
  FIRST_SCALE="${SCALES[0]}"
  log "${BOLD}Running ramp test at scale $FIRST_SCALE to find optimal workers...${NC}"
  RAMP_OUTPUT="$OUTPUT_DIR/sizing-${FIRST_SCALE}-ramp.json"

  RAMP_ARGS=(--server "$SERVER" --workers "$WORKERS" --scale "$FIRST_SCALE" --ramp --output "$RAMP_OUTPUT")
  if [[ -n "$ADMIN_PORT" ]]; then
    RAMP_ARGS+=(--admin-port "$ADMIN_PORT")
  fi
  if [[ -n "$TOKEN" ]]; then
    RAMP_ARGS+=(--token "$TOKEN")
  fi

  if $SKIP_EXISTING && [[ -f "$RAMP_OUTPUT" ]]; then
    log "Ramp output exists, skipping (--skip-existing)"
  else
    # Keep the auth token out of the console and the log file.
    RAMP_ARGS_SHOWN="${RAMP_ARGS[*]}"
    if [[ -n "$TOKEN" ]]; then
      RAMP_ARGS_SHOWN="${RAMP_ARGS_SHOWN//"--token $TOKEN"/--token [redacted]}"
    fi
    log "Running: perf-test.sh $RAMP_ARGS_SHOWN"
    if "$PERF_TEST" "${RAMP_ARGS[@]}" 2>&1 | tee -a "$LOG_FILE"; then
      # Path and fallback are passed as arguments instead of being spliced
      # into the Python source, so quotes in the path cannot break it.
      OPTIMAL=$(python3 -c '
import json, sys
with open(sys.argv[1]) as f:
    data = json.load(f)
rt = data.get("ramp_test", {})
print(rt.get("optimal_workers", sys.argv[2]))
' "$RAMP_OUTPUT" "$WORKERS" 2>/dev/null || echo "$WORKERS")
      log "${GREEN}Ramp test complete. Optimal workers: $OPTIMAL${NC}"
      echo -e "\n${YELLOW}Ramp test suggests $OPTIMAL workers. Current setting: $WORKERS${NC}"
      echo -e "${YELLOW}To use the suggested value, re-run with --workers $OPTIMAL${NC}\n"
    else
      log "${YELLOW}Ramp test failed, continuing with $WORKERS workers${NC}"
    fi
  fi
fi

# ─── Main Benchmark Loop ──────────────────────────────────────────────────────
# Walk the selected tiers in order and run every requested mode at each one.
# A failed perf-test.sh run always stops the benchmark; a detected break-point
# stops it unless --no-break was given. Results accumulate in RESULT_LINES.
BENCHMARK_START=$(date +%s)
TOTAL_TIERS=${#SCALES[@]}
TIER_NUM=0
BROKEN=false

# Python one-liner: exit 0 when argv[1] > argv[2] as floats. Values are passed
# as arguments so they are never interpreted as code.
FLOAT_GT_PY='import sys; sys.exit(0 if float(sys.argv[1]) > float(sys.argv[2]) else 1)'

for scale in "${SCALES[@]}"; do
  TIER_NUM=$((TIER_NUM + 1))

  echo ""
  echo -e "${BOLD}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
  echo -e "${BOLD} Tier $TIER_NUM/$TOTAL_TIERS: scale=$scale${NC}"
  echo -e "${BOLD}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"

  for mode in "${MODE_LIST[@]}"; do
    OUTPUT_FILE="$OUTPUT_DIR/sizing-${scale}-${mode}.json"
    if [[ "$mode" == "seq" ]]; then
      MODE_LABEL="sequential"
    else
      MODE_LABEL="realistic"
    fi

    log " [$scale / $MODE_LABEL] Starting..."

    # --skip-existing: reuse the stored report. Skipped runs are recorded but
    # take no part in break-point detection.
    if $SKIP_EXISTING && [[ -f "$OUTPUT_FILE" ]]; then
      log " [$scale / $MODE_LABEL] Output exists, skipping (--skip-existing)"
      METRICS=$(extract_metrics "$OUTPUT_FILE")
      CONFIG=$(extract_config "$OUTPUT_FILE")
      add_result "$scale" "$mode" "$METRICS" "$CONFIG"

      IFS=',' read -r _total _rps _p95 _p99 _err _wall _assess <<< "$METRICS"
      echo -e " ${GREEN}[SKIP]${NC} $scale/$MODE_LABEL: ${_total} entities, ${_rps} RPS, p95=${_p95}ms, err=${_err}%, ${_assess}"
      continue
    fi

    PERF_ARGS=$(build_perf_args "$scale" "$mode" "$OUTPUT_FILE")
    # Split the flat string on whitespace into an array. Unlike an unquoted
    # expansion this does not subject the words to pathname expansion.
    read -ra PERF_ARGV <<< "$PERF_ARGS"
    # Keep the auth token out of the console and the log file.
    PERF_ARGS_SHOWN="$PERF_ARGS"
    if [[ -n "$TOKEN" ]]; then
      PERF_ARGS_SHOWN="${PERF_ARGS_SHOWN//"--token $TOKEN"/--token [redacted]}"
    fi
    log " Running: perf-test.sh $PERF_ARGS_SHOWN"

    RUN_START=$(date +%s)
    EXIT_CODE=0
    # With pipefail, a perf-test.sh failure surfaces through the pipeline.
    "$PERF_TEST" "${PERF_ARGV[@]}" 2>&1 | tee -a "$LOG_FILE" || EXIT_CODE=$?
    RUN_END=$(date +%s)
    RUN_DURATION=$((RUN_END - RUN_START))

    if [[ $EXIT_CODE -ne 0 ]]; then
      log " ${RED}[$scale / $MODE_LABEL] FAILED (exit code $EXIT_CODE) after ${RUN_DURATION}s${NC}"
      add_result "$scale" "$mode" "FAILED,0,0,0,100,${RUN_DURATION},failed" "N/A"
      BREAK_SCALE="$scale"
      BREAK_MODE="$mode"
      BREAK_REASON="perf-test.sh exited with code $EXIT_CODE"
      BROKEN=true
      break
    fi

    METRICS=$(extract_metrics "$OUTPUT_FILE")
    CONFIG=$(extract_config "$OUTPUT_FILE")
    add_result "$scale" "$mode" "$METRICS" "$CONFIG"

    IFS=',' read -r _total _rps _p95 _p99 _err _wall _assess <<< "$METRICS"
    log " ${GREEN}[$scale / $MODE_LABEL] Done:${NC} ${_total} entities, ${_rps} RPS, p95=${_p95}ms, err=${_err}%, ${_assess} (${RUN_DURATION}s)"

    if is_broken "$METRICS"; then
      # Re-derive which rule fired, in the same order is_broken checks them.
      if [[ "$_assess" == "undersized" ]]; then
        reason="Cluster assessed as undersized"
      elif python3 -c "$FLOAT_GT_PY" "$_err" 10 2>/dev/null; then
        reason="Error rate ${_err}% exceeds 10% threshold"
      elif python3 -c "$FLOAT_GT_PY" "$_p95" 10000 2>/dev/null; then
        reason="p95 latency ${_p95}ms exceeds 10s threshold"
      else
        reason="Throughput degraded >50% from previous tier"
      fi

      if $NO_BREAK; then
        echo -e "\n ${YELLOW}${BOLD}BREAK-POINT WOULD FIRE at $scale/$MODE_LABEL: $reason (--no-break, continuing)${NC}\n"
        log "BREAK-POINT SUPPRESSED at $scale/$MODE_LABEL: $reason (--no-break)"
        # Only the first suppressed break-point is remembered for the report.
        if [[ -z "$BREAK_SCALE" ]]; then
          BREAK_SCALE="$scale"
          BREAK_MODE="$mode"
          BREAK_REASON="$reason (continued with --no-break)"
        fi
      else
        BREAK_SCALE="$scale"
        BREAK_MODE="$mode"
        BREAK_REASON="$reason"
        echo -e "\n ${RED}${BOLD}BREAK-POINT DETECTED at $scale/$MODE_LABEL: $BREAK_REASON${NC}\n"
        log "BREAK-POINT DETECTED at $scale/$MODE_LABEL: $BREAK_REASON"
        BROKEN=true
        break
      fi
    fi

    # Remember this run's throughput-per-entity as the baseline for the next.
    if [[ "$_total" != "0" && "$_total" != "N/A" ]]; then
      PREV_RPS_PER_ENTITY=$(python3 -c \
        'import sys; print(float(sys.argv[1]) / max(float(sys.argv[2]), 1))' \
        "$_rps" "$_total" 2>/dev/null || echo "")
    fi
  done

  if $BROKEN; then
    break
  fi

  # Tier comparison
  if [[ ${#MODE_LIST[@]} -gt 1 ]]; then
    SEQ_FILE="$OUTPUT_DIR/sizing-${scale}-seq.json"
    REAL_FILE="$OUTPUT_DIR/sizing-${scale}-realistic.json"
    if [[ -f "$SEQ_FILE" && -f "$REAL_FILE" ]]; then
      echo ""
      echo -e " ${CYAN}Tier $scale comparison:${NC}"
      SEQ_M=$(extract_metrics "$SEQ_FILE")
      REAL_M=$(extract_metrics "$REAL_FILE")
      IFS=',' read -r s_total s_rps s_p95 s_p99 s_err s_wall s_assess <<< "$SEQ_M"
      IFS=',' read -r r_total r_rps r_p95 r_p99 r_err r_wall r_assess <<< "$REAL_M"
      printf " %-12s %8s %8s %8s %8s %10s\n" "" "Entities" "RPS" "p95ms" "Errors%" "Assessment"
      printf " %-12s %8s %8s %8s %8s %10s\n" "Sequential" "$s_total" "$s_rps" "$s_p95" "${s_err}%" "$s_assess"
      printf " %-12s %8s %8s %8s %8s %10s\n" "Realistic" "$r_total" "$r_rps" "$r_p95" "${r_err}%" "$r_assess"
      echo ""
    fi
  fi
done

BENCHMARK_END=$(date +%s)
BENCHMARK_DURATION=$((BENCHMARK_END - BENCHMARK_START))

# ─── Generate SIZING-SUMMARY.md ────────────────────────────────────────────────
# Render the markdown report from RESULT_LINES and the per-run JSON files.
# The braces run in the current shell, so LAST_ADEQUATE_SCALE and
# LAST_ADEQUATE_CONFIG set inside remain available afterwards.
log "Generating $SUMMARY_MD..."

# Try to get server info from the first available JSON
SERVER_INFO=""
for scale in "${SCALES[@]}"; do
  for mode in "${MODE_LIST[@]}"; do
    F="$OUTPUT_DIR/sizing-${scale}-${mode}.json"
    if [[ -f "$F" ]]; then
      SERVER_INFO=$(extract_server_info "$F")
      break 2
    fi
  done
done

{
  echo "# OpenMetadata Cluster Sizing Summary"
  echo ""
  echo "Generated: $(date '+%Y-%m-%d %H:%M:%S')"
  echo "Total benchmark time: ${BENCHMARK_DURATION}s ($((BENCHMARK_DURATION / 60))m $((BENCHMARK_DURATION % 60))s)"
  echo ""
  echo "## Server Configuration"
  echo ""
  echo '```'
  echo "Server: $SERVER"
  echo "Workers: $WORKERS"
  if [[ -n "$SERVER_INFO" ]]; then
    echo "$SERVER_INFO"
  fi
  echo '```'
  echo ""
  echo "## Progressive Results"
  echo ""
  echo "| Scale | Mode | Entities | RPS | p95 (ms) | p99 (ms) | Errors % | Assessment | Duration |"
  echo "|-------|------|----------|-----|----------|----------|----------|------------|----------|"

  LAST_ADEQUATE_SCALE=""
  LAST_ADEQUATE_CONFIG=""
  # Becomes true at the run that triggered the break-point. From there on
  # (including that run) results no longer count as "last adequate";
  # otherwise --no-break could report an adequate tier that comes after the
  # break-point, or the breaking run itself.
  PAST_BREAK=false

  for line in "${RESULT_LINES[@]}"; do
    # config is the final field, so it keeps any commas it contains.
    IFS=',' read -r scale mode total rps p95 p99 err wall assess config <<< "$line"
    if [[ "$mode" == "seq" ]]; then
      mode_label="Sequential"
    else
      mode_label="Realistic"
    fi

    # Format duration
    if [[ "$wall" =~ ^[0-9]+$ ]]; then
      duration="${wall}s"
    else
      duration="$wall"
    fi

    # Only the "failed" assessment is rendered differently from its raw value.
    if [[ "$assess" == "failed" ]]; then
      assess_fmt="FAILED"
    else
      assess_fmt="$assess"
    fi

    echo "| $scale | $mode_label | $total | $rps | $p95 | $p99 | $err | $assess_fmt | $duration |"

    if [[ -n "$BREAK_SCALE" && "$scale" == "$BREAK_SCALE" && "$mode" == "$BREAK_MODE" ]]; then
      PAST_BREAK=true
    fi
    if ! $PAST_BREAK && [[ "$assess" == "adequate" || "$assess" == "marginal" ]]; then
      LAST_ADEQUATE_SCALE="$scale"
      LAST_ADEQUATE_CONFIG="$config"
    fi
  done

  echo ""

  # Break-point section
  if [[ -n "$BREAK_SCALE" ]]; then
    echo "## Break-Point Detected"
    echo ""
    # Two trailing spaces force a markdown line break.
    echo "**Scale:** $BREAK_SCALE  "
    if [[ "$BREAK_MODE" == "seq" ]]; then
      echo "**Mode:** Sequential  "
    else
      echo "**Mode:** Realistic  "
    fi
    echo "**Reason:** $BREAK_REASON"
    echo ""
    if [[ -n "$LAST_ADEQUATE_SCALE" ]]; then
      echo "The cluster handled **$LAST_ADEQUATE_SCALE** scale adequately but broke at **$BREAK_SCALE**."
      echo ""
    fi
  else
    echo "## Result"
    echo ""
    echo "No break-point detected. The cluster handled all tested scales up to **${SCALES[${#SCALES[@]}-1]}**."
    echo ""
  fi

  # Recommended configuration
  if [[ -n "$LAST_ADEQUATE_CONFIG" && "$LAST_ADEQUATE_CONFIG" != "N/A" ]]; then
    echo "## Recommended Configuration"
    echo ""
    echo "Based on the last adequate tier ($LAST_ADEQUATE_SCALE):"
    echo ""
    echo '```bash'
    IFS=';' read -ra CONFIGS <<< "$LAST_ADEQUATE_CONFIG"
    for cfg in "${CONFIGS[@]}"; do
      # Trim surrounding whitespace with parameter expansion. Piping through
      # xargs would also strip quotes and aborts on an unbalanced quote,
      # which under `set -e` would kill the whole script.
      cfg_trimmed="${cfg#"${cfg%%[![:space:]]*}"}"
      cfg_trimmed="${cfg_trimmed%"${cfg_trimmed##*[![:space:]]}"}"
      if [[ -n "$cfg_trimmed" ]]; then
        echo "export $cfg_trimmed"
      fi
    done
    echo '```'
    echo ""
  fi

  # Sequential vs Realistic comparison
  if [[ ${#MODE_LIST[@]} -gt 1 ]]; then
    echo "## Sequential vs Realistic Comparison"
    echo ""
    echo "| Scale | Seq RPS | Real RPS | RPS Diff | Seq p95 | Real p95 | p95 Diff |"
    echo "|-------|---------|----------|----------|---------|----------|----------|"

    for scale in "${SCALES[@]}"; do
      SEQ_FILE="$OUTPUT_DIR/sizing-${scale}-seq.json"
      REAL_FILE="$OUTPUT_DIR/sizing-${scale}-realistic.json"
      if [[ -f "$SEQ_FILE" && -f "$REAL_FILE" ]]; then
        # Paths and the tier name are passed as arguments, not spliced into
        # the Python source. A parse failure just omits the row.
        python3 - "$SEQ_FILE" "$REAL_FILE" "$scale" 2>/dev/null <<'PY' || true
import json, sys

seq_path, real_path, scale = sys.argv[1:4]
with open(seq_path) as f:
    seq = json.load(f)
with open(real_path) as f:
    real = json.load(f)
s_rps = seq.get('overall', {}).get('overall_throughput_rps', 0)
r_rps = real.get('overall', {}).get('overall_throughput_rps', 0)
rps_diff = ((r_rps - s_rps) / s_rps * 100) if s_rps > 0 else 0

# Max p95
def max_p95(data):
    mx = 0
    for name, ent in data.get('entities', {}).items():
        if name.startswith('read_') or name.startswith('mixed_'):
            continue
        p = ent.get('latency_ms', {}).get('p95', 0)
        if p > mx:
            mx = p
    return mx

s_p95 = max_p95(seq)
r_p95 = max_p95(real)
p95_diff = ((r_p95 - s_p95) / s_p95 * 100) if s_p95 > 0 else 0

print(f'| {scale} | {s_rps:.1f} | {r_rps:.1f} | {rps_diff:+.1f}% | {s_p95:.0f} | {r_p95:.0f} | {p95_diff:+.1f}% |')
PY
      fi
    done

    echo ""
    echo "> **Sequential** runs entity types one at a time. **Realistic** runs all concurrently through a shared worker pool, exposing cross-entity contention."
    echo ""
  fi

  # Individual tier details
  echo "## Tier Details"
  echo ""
  for scale in "${SCALES[@]}"; do
    for mode in "${MODE_LIST[@]}"; do
      F="$OUTPUT_DIR/sizing-${scale}-${mode}.json"
      if [[ ! -f "$F" ]]; then
        continue
      fi
      if [[ "$mode" == "seq" ]]; then
        mode_label="Sequential"
      else
        mode_label="Realistic"
      fi
      echo "### $scale ($mode_label)"
      echo ""
      python3 - "$F" 2>/dev/null <<'PY' || echo "_Failed to parse results_"
import json, sys

with open(sys.argv[1]) as f:
    data = json.load(f)

entities = data.get('entities', {})
print('| Entity | Count | RPS | p50 | p95 | p99 | Errors % |')
print('|--------|-------|-----|-----|-----|-----|----------|')
for name, ent in sorted(entities.items()):
    count = ent.get('created', ent.get('total_requests', 0))
    rps = ent.get('throughput_rps', 0)
    lat = ent.get('latency_ms', {})
    p50 = lat.get('p50', 0)
    p95 = lat.get('p95', 0)
    p99 = lat.get('p99', 0)
    err = ent.get('error_rate_pct', 0)
    print(f'| {name} | {count} | {rps:.1f} | {p50:.0f} | {p95:.0f} | {p99:.0f} | {err:.2f} |')

# Sizing findings
sizing = data.get('cluster_sizing', {})
findings = sizing.get('findings', [])
if findings:
    print()
    print('**Findings:**')
    for finding in findings:
        print(f'- {finding}')
PY
      echo ""
    done
  done

  echo "---"
  echo ""
  echo "Generated by \`benchmark-sizing.sh\` on $(date '+%Y-%m-%d %H:%M:%S')"

} > "$SUMMARY_MD"

# ─── Final Console Summary ─────────────────────────────────────────────────────
# Print a compact, colorized recap of every run plus the break-point verdict
# and the locations of the generated artifacts.
echo ""
echo -e "${BOLD}══════════════════════════════════════════════════════════════════${NC}"
echo -e "${BOLD} SIZING BENCHMARK COMPLETE${NC}"
echo -e "${BOLD}══════════════════════════════════════════════════════════════════${NC}"
echo ""

printf " %-8s %-12s %8s %8s %8s %8s %10s\n" "Scale" "Mode" "Entities" "RPS" "p95ms" "Errors%" "Assessment"
printf " %-8s %-12s %8s %8s %8s %8s %10s\n" "────────" "────────────" "────────" "────────" "────────" "────────" "──────────"

for line in "${RESULT_LINES[@]}"; do
  IFS=',' read -r scale mode total rps p95 p99 err wall assess config <<< "$line"
  if [[ "$mode" == "seq" ]]; then
    mode_label="Sequential"
  else
    mode_label="Realistic"
  fi

  # Color only the assessment column.
  case "$assess" in
    adequate)          color="$GREEN" ;;
    marginal)          color="$YELLOW" ;;
    undersized|failed) color="$RED" ;;
    *)                 color="$NC" ;;
  esac

  # The color variables sit in the FORMAT string on purpose: printf
  # interprets their backslash escapes there, which it would not do for
  # a %s argument.
  # shellcheck disable=SC2059
  printf " %-8s %-12s %8s %8s %8s %8s ${color}%10s${NC}\n" \
    "$scale" "$mode_label" "$total" "$rps" "$p95" "${err}%" "$assess"
done

echo ""

if [[ -n "$BREAK_SCALE" ]]; then
  echo -e " ${RED}${BOLD}Break-point: $BREAK_SCALE ($BREAK_MODE) — $BREAK_REASON${NC}"
  if [[ -n "$LAST_ADEQUATE_SCALE" ]]; then
    echo -e " ${GREEN}Last adequate scale: $LAST_ADEQUATE_SCALE${NC}"
  fi
else
  echo -e " ${GREEN}No break-point detected — cluster handled all scales through ${SCALES[${#SCALES[@]}-1]}${NC}"
fi

echo ""
echo -e " Total time: ${BENCHMARK_DURATION}s ($((BENCHMARK_DURATION / 60))m $((BENCHMARK_DURATION % 60))s)"
echo -e " Summary: ${CYAN}$SUMMARY_MD${NC}"
echo -e " Log: ${CYAN}$LOG_FILE${NC}"
echo ""