OpenMetadata/scripts/jacoco_diff_coverage.py

#!/usr/bin/env python3

import argparse
import re
import subprocess
import sys
import xml.etree.ElementTree as ET
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path, PurePosixPath


# Matches a unified-diff hunk header such as "@@ -10,2 +12,3 @@".
# Group 1 captures the starting line number on the new-file side; group 2
# captures the new-side line count and is None when the header omits ",count".
HUNK_PATTERN = re.compile(r"^@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@")


@dataclass
class FileCoverage:
    path: str
    executable_lines: list[int]
    covered_lines: list[int]
    missed_lines: list[int]
    non_executable_lines: list[int]

    @property
    def executable_count(self) -> int:
        return len(self.executable_lines)

    @property
    def covered_count(self) -> int:
        return len(self.covered_lines)

    @property
    def missed_count(self) -> int:
        return len(self.missed_lines)

    @property
    def non_executable_count(self) -> int:
        return len(self.non_executable_lines)

    @property
    def coverage_pct(self) -> float | None:
        if not self.executable_count:
            return None
        return (self.covered_count / self.executable_count) * 100.0


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the coverage gate.

    Returns:
        Namespace with ``report``, ``source_root``, ``base_ref``, ``head_ref``,
        ``minimum_coverage`` and ``markdown_output``.
    """
    # Declared as data so every option is visible at a glance; registration
    # order below is the order shown in --help.
    option_specs = (
        ("--report", {"required": True, "help": "Path to jacoco.xml report"}),
        (
            "--source-root",
            {
                "default": "openmetadata-service/src/main/java",
                "help": "Source root to evaluate for changed production code",
            },
        ),
        (
            "--base-ref",
            {
                "required": True,
                "help": "Git base ref/SHA. If --head-ref is omitted, compare working tree against this ref.",
            },
        ),
        (
            "--head-ref",
            {"help": "Git head ref/SHA. If set, diff is computed with base...head."},
        ),
        (
            "--minimum-coverage",
            {
                "type": float,
                "default": 90.0,
                "help": "Minimum required changed-line coverage percentage",
            },
        ),
        (
            "--markdown-output",
            {"required": True, "help": "File to write the Markdown summary to"},
        ),
    )

    cli = argparse.ArgumentParser(
        description="Compute JaCoCo coverage for changed production lines in a PR diff."
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli.parse_args()


def run_git_diff(base_ref: str, head_ref: str | None, source_root: str) -> str:
    """Return the zero-context ``git diff`` output restricted to ``source_root``.

    Args:
        base_ref: Git ref/SHA to diff against.
        head_ref: Optional head ref/SHA. When given, ``base_ref...head_ref`` is
            passed to git; otherwise ``base_ref`` alone is passed.
        source_root: Path used as the pathspec after ``--``.

    Returns:
        The captured standard output of the git command as text.

    Raises:
        subprocess.CalledProcessError: If git exits with a non-zero status.
    """
    revision = f"{base_ref}...{head_ref}" if head_ref else base_ref
    completed = subprocess.run(
        ["git", "diff", "--unified=0", "--no-color", revision, "--", source_root],
        check=True,
        capture_output=True,
        text=True,
    )
    return completed.stdout


# Hunk header of a unified diff, e.g. "@@ -10,2 +12,3 @@". Group 1 is the first
# new-side line number; group 2 is the new-side line count (None when omitted,
# which the unified format defines as a count of 1).
_NEW_SIDE_RANGE = re.compile(r"^@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@")


def parse_changed_lines(diff_text: str) -> dict[str, set[int]]:
    """Map each file in a unified diff to the new-side numbers of its added lines.

    The parser tracks how many new-side lines the current hunk still owes
    (taken from the hunk header). While a hunk body is open, lines are
    classified by their first character only, so an added line whose content
    itself begins with ``++`` (rendered as ``+++...`` in the diff) is counted
    as an addition instead of being skipped or mistaken for a file header.

    Args:
        diff_text: Output of ``git diff`` / ``diff -u`` in unified format.

    Returns:
        A mapping from file path (``b/`` prefix stripped) to the set of line
        numbers added in the new version of that file. Files whose new side is
        ``/dev/null`` and files without added lines do not appear.
    """
    changed_lines: dict[str, set[int]] = defaultdict(set)
    current_file: str | None = None
    current_line = 0
    # New-side lines (added or context) still expected in the open hunk body.
    pending_new_lines = 0

    for line in diff_text.splitlines():
        if line.startswith("diff --git "):
            # A hunk body line always starts with '+', '-', ' ' or '\', so this
            # can only be the start of the next file section: reset all state.
            current_file = None
            pending_new_lines = 0
            continue

        if pending_new_lines > 0:
            if line.startswith("+"):
                if current_file is not None:
                    changed_lines[current_file].add(current_line)
                current_line += 1
                pending_new_lines -= 1
            elif line.startswith(" ") or not line:
                # Context line; an empty line is a blank context line whose
                # leading space was stripped by some tool.
                current_line += 1
                pending_new_lines -= 1
            # Removed lines ('-') and "\ No newline at end of file" markers do
            # not exist on the new side, so they do not advance the counter.
            continue

        if line.startswith("+++ "):
            # Drop anything after a tab: git appends a tab to paths containing
            # spaces and plain `diff -u` appends a tab plus a timestamp.
            file_path = line[4:].split("\t", 1)[0]
            if file_path == "/dev/null":
                current_file = None
            elif file_path.startswith("b/"):
                current_file = file_path[2:]
            else:
                current_file = file_path
        elif line.startswith("@@ "):
            match = _NEW_SIDE_RANGE.match(line)
            if match:
                current_line = int(match.group(1))
                count = match.group(2)
                pending_new_lines = int(count) if count is not None else 1

    return changed_lines


def parse_jacoco_report(report_path: str, source_root: str) -> dict[str, dict[int, bool]]:
    """Read a JaCoCo XML report into a per-file, per-line coverage map.

    ``<package>`` elements are collected at any depth, so reports that nest
    packages inside ``<group>`` elements are handled as well as flat ones.
    If the same source file is listed more than once, its entries are merged
    and a line counts as covered when any entry reports covered instructions.

    Args:
        report_path: Path of the ``jacoco.xml`` file.
        source_root: Source directory that is prefixed to every
            ``<package name>/<sourcefile name>`` to form the file path.

    Returns:
        Mapping of POSIX-style file path to ``{line number: covered}``, where
        ``covered`` is True when the line's ``ci`` attribute is greater than 0.

    Raises:
        OSError: If the report cannot be read.
        xml.etree.ElementTree.ParseError: If the report is not well-formed XML.
        KeyError: If a ``sourcefile`` lacks ``name`` or a ``line`` lacks
            ``nr``/``ci``.
    """
    root = ET.parse(report_path).getroot()
    normalized_root = PurePosixPath(source_root)
    coverage: dict[str, dict[int, bool]] = {}

    # iter() instead of findall(): findall("package") only sees direct children
    # of <report> and would silently return nothing for grouped reports.
    for package in root.iter("package"):
        package_name = package.attrib.get("name", "")
        package_root = normalized_root / package_name if package_name else normalized_root

        for sourcefile in package.findall("sourcefile"):
            file_path = (package_root / sourcefile.attrib["name"]).as_posix()
            line_coverage = coverage.setdefault(file_path, {})
            for line in sourcefile.findall("line"):
                line_number = int(line.attrib["nr"])
                is_covered = int(line.attrib["ci"]) > 0
                line_coverage[line_number] = line_coverage.get(line_number, False) or is_covered

    return coverage


def build_file_coverage(
    changed_lines: dict[str, set[int]], jacoco_coverage: dict[str, dict[int, bool]]
) -> list[FileCoverage]:
    """Combine changed lines with coverage data into one record per changed file.

    Args:
        changed_lines: Changed line numbers keyed by file path.
        jacoco_coverage: Per-file ``{line number: covered}`` maps keyed by the
            same kind of path. A file missing here has no executable lines.

    Returns:
        One ``FileCoverage`` per key of ``changed_lines``, ordered by ascending
        coverage percentage (files without executable lines last), then by
        descending number of missed lines, then by path.
    """

    def ranking(record: FileCoverage) -> tuple[float, int, str]:
        # 101.0 places "no executable lines" after every real percentage.
        pct = record.coverage_pct
        return (101.0 if pct is None else pct, -record.missed_count, record.path)

    records: list[FileCoverage] = []
    for path in sorted(changed_lines):
        per_line = jacoco_coverage.get(path, {})
        hit: list[int] = []
        missed: list[int] = []
        ignored: list[int] = []

        # Walk the changed lines in ascending order so each bucket stays sorted.
        for number in sorted(changed_lines[path]):
            if number not in per_line:
                ignored.append(number)
            elif per_line[number]:
                hit.append(number)
            else:
                missed.append(number)

        records.append(
            FileCoverage(
                path=path,
                executable_lines=sorted(hit + missed),
                covered_lines=hit,
                missed_lines=missed,
                non_executable_lines=ignored,
            )
        )

    records.sort(key=ranking)
    return records


def format_line_list(lines: list[int], limit: int = 12) -> str:
    """Format line numbers as a comma-separated string, truncated after ``limit``.

    Args:
        lines: Line numbers to display.
        limit: Maximum number of entries shown before the rest is summarised.

    Returns:
        ``"-"`` for an empty list; otherwise the first ``limit`` numbers joined
        by ``", "``, followed by ``", +N more"`` when ``N`` entries were cut.
    """
    if not lines:
        return "-"

    shown = ", ".join(map(str, lines[:limit]))
    hidden = len(lines) - limit
    if hidden <= 0:
        return shown
    return f"{shown}, +{hidden} more"


def render_markdown(
    files: list[FileCoverage], minimum_coverage: float, source_root: str
) -> tuple[str, bool]:
    """Render the Markdown coverage summary and decide whether the gate fails.

    Args:
        files: Per-file coverage records, in the order they should be listed.
        minimum_coverage: Required percentage, applied to the overall figure
            and to every file that has executable changed lines.
        source_root: Source directory named in the explanatory text.

    Returns:
        ``(markdown, should_fail)``. ``should_fail`` is True only when there is
        at least one executable changed line and either the overall percentage
        or some file's percentage is below ``minimum_coverage``.
    """
    executable_total = covered_total = missed_total = non_executable_total = 0
    for entry in files:
        executable_total += entry.executable_count
        covered_total += entry.covered_count
        missed_total += entry.missed_count
        non_executable_total += entry.non_executable_count

    if executable_total:
        overall_pct = covered_total / executable_total * 100.0
    else:
        # Nothing executable changed: treated as fully covered.
        overall_pct = 100.0

    below_threshold = [
        entry
        for entry in files
        if entry.coverage_pct is not None and entry.coverage_pct < minimum_coverage
    ]
    should_fail = executable_total > 0 and (
        overall_pct < minimum_coverage or len(below_threshold) > 0
    )
    status, status_icon = ("FAIL", "❌") if should_fail else ("PASS", "✅")

    report: list[str] = ["## OpenMetadata Service New-Code Coverage", ""]

    if not files:
        # No records at all: emit the short "skipped" message and never fail.
        report.append(
            f"{status_icon} No changed production Java files under `{source_root}`. Coverage gate skipped."
        )
        return "\n".join(report) + "\n", False

    report += [
        f"{status_icon} **{status}**. Required changed-line coverage: `{minimum_coverage:.2f}%` overall and per touched production file.",
        "",
        f"- Overall executable changed lines: `{covered_total}/{executable_total}` covered (`{overall_pct:.2f}%`)",
        f"- Missed executable changed lines: `{missed_total}`",
        f"- Non-executable changed lines ignored by JaCoCo: `{non_executable_total}`",
        f"- Changed production files: `{len(files)}`",
        "",
    ]

    if executable_total == 0:
        report += [
            "All changed production lines are non-executable from JaCoCo's perspective. Gate passed.",
            "",
        ]
    elif below_threshold:
        report.append("Files below threshold:")
        report.extend(
            f"- `{entry.path}`: `{entry.covered_count}/{entry.executable_count}` covered (`{entry.coverage_pct:.2f}%`), uncovered lines `{format_line_list(entry.missed_lines)}`"
            for entry in below_threshold
        )
        report.append("")

    # Per-file table, one row per record in the given order.
    report.append("| File | Covered | Missed | Executable | Non-exec | Coverage | Uncovered lines |")
    report.append("| --- | ---: | ---: | ---: | ---: | ---: | --- |")
    for entry in files:
        pct = entry.coverage_pct
        coverage_display = "N/A" if pct is None else f"{pct:.2f}%"
        report.append(
            f"| `{entry.path}` | {entry.covered_count} | {entry.missed_count} | {entry.executable_count} | {entry.non_executable_count} | {coverage_display} | {format_line_list(entry.missed_lines)} |"
        )

    report += [
        "",
        f"Only changed executable lines under `{source_root}` are counted. Test files, comments, imports, and non-executable lines are excluded.",
    ]
    return "\n".join(report) + "\n", should_fail


def main() -> int:
    """Run the coverage gate end to end.

    Computes the git diff, reads the JaCoCo report, writes the Markdown summary
    to the requested file (creating parent directories as needed) and prints
    the same summary to standard output.

    Returns:
        1 when the coverage gate fails, otherwise 0.
    """
    options = parse_args()

    diff_output = run_git_diff(options.base_ref, options.head_ref, options.source_root)
    changed = parse_changed_lines(diff_output)
    report_coverage = parse_jacoco_report(options.report, options.source_root)
    records = build_file_coverage(changed, report_coverage)
    markdown, should_fail = render_markdown(
        records, options.minimum_coverage, options.source_root
    )

    destination = Path(options.markdown_output)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(markdown, encoding="utf-8")

    # The summary is printed whether the gate passes or fails.
    print(markdown)
    return 1 if should_fail else 0


# Script entry point: the process exit status is the value returned by main().
if __name__ == "__main__":
    sys.exit(main())
Add Unit Tests coverage (#26360) * Enable more service unit tests and fix uncovered regressions * Fix remaining broadened unit-suite regressions * Add meaningful Handlebars helper coverage * Add formatter decorator unit coverage * Improve formatter decorator coverage * Improve utility, validator, and formatter coverage * Expand OIDC validator coverage * Tighten shared OIDC validator coverage * Improve user and connection utility coverage * Cover subscription utility workflows * Cover entity field utility workflows * Expand lineage and helper utility coverage * Improve auth code flow handler coverage * Expand auth code flow handler coverage * Cover entity csv parsing flows * Deepen entity csv parser coverage * Fix search builder aggregation null handling * Expand entity utility core coverage * Cover search index utility workflows * Expand search utility coverage * Expand formatter message coverage * Harden notification markdown rendering coverage * Add notification card assembler coverage * Expand EntityCsv coverage and dry-run fixes * Expand K8s pipeline client coverage * Expand saml validator coverage * Expand rdf property mapper coverage * Expand subscription utility coverage * Fix schema field extractor coverage gaps * Expand auth refresh flow coverage * Add service unit test workflow * Enforce new-code coverage on service PRs * Add Unit Test Coverage * Expand k8s pipeline and auth flow coverage * Expand entity csv batch import coverage * Expand entity csv entity creation coverage * Expand entity csv user and flush coverage * Expand entity csv typed import coverage * Cover entity csv dependency validation paths * Expand airflow and csv utility coverage * Replace placeholder authorizer tests with real coverage * Cover PII masking security flows * Tighten async service retry and shutdown coverage * Expand security util claim coverage * Fix checkstyle * Strengthen user bootstrap utility coverage * Expand user activity tracker coverage * Expand ODCS converter 
coverage * Expand S3 log storage coverage * Expand search repository and lineage coverage * Expand search filter and index factory coverage * Expand reindex handler coverage * Expand inherited field search coverage * Expand search cluster metrics coverage * Expand search repository lifecycle coverage * Expand slack client coverage and stabilize tests * Expand search index executor control flow coverage * Cover search index utility helpers * Cover distributed indexing strategy flows * Strengthen distributed search executor coverage * Cover search reindex pipeline flows * Cover search index logging flows * Cover search index stats tracking * Cover quartz search index progress flows * Cover search index app coordination * Cover slack progress listener behavior * Cover polling job notifier behavior * Cover redis job notifier behavior * Expand Slack notifier coverage * Cover partition worker processing flows * Expand distributed participant coverage * Cover orphan job monitor behavior * Expand distributed stats aggregator coverage * Expand distributed partition coverage * Strengthen distributed coordinator coverage * Expand search index and repository coverage * Expand search executor control flow coverage * Expand search repository delegation coverage * Expand search index executor coverage * Expand search repository helper coverage * Expand search utility coverage * Expand search index executor coverage * Expand search repository coverage * Strengthen search index manager coverage * Strengthen distributed recovery and worker coverage * Strengthen distributed executor coverage * Fix index sink batching and stats coverage * Expand elastic bulk sink behavior coverage * Expand open search bulk sink behavior coverage * Fix dropped bulk processor failure accounting * Cover migration workflow discovery paths * fix java checkstyle * Fix permission debug effect normalization * Cover migration FQN repair workflows * Fix glossary workflow migration idempotency * Cover v1100 
migration utility flows * Cover v1104 migration extension flows * Fix and cover v160 migration policy flows * fix java checkstyle * Address PR review comments on vector search and csv docs * fix java checkstyle * Harden service unit test PR workflow * Cover migration utility repair flows * fix java checkstyle * Fix service unit test regressions * Split service new-code coverage check * fix java checkstyle * Fix service diff coverage regressions * fix java checkstyle * Clarify missing JaCoCo artifact failures * fix java checkstyle * Fix bulk sink lifecycle tests * simplify CI * Address PR review feedback after main merge * Fix merged service unit test expectations * Fix search repository bulk update tests * Apply spotless formatting * Use standard exception logging in search repository * Stabilize multi-domain search integration test * Apply spotless formatting * Isolate web analytic event integration timestamps --------- Co-authored-by: Pere Miquel Brull <peremiquelbrull@gmail.com> 2026-03-23 15:17:15 +00:00			`#!/usr/bin/env python3`

			`import argparse`
			`import re`
			`import subprocess`
			`import sys`
			`import xml.etree.ElementTree as ET`
			`from collections import defaultdict`
			`from dataclasses import dataclass`
			`from pathlib import Path, PurePosixPath`


			`HUNK_PATTERN = re.compile(r"^@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@")`


			`@dataclass`
			`class FileCoverage:`
			`path: str`
			`executable_lines: list[int]`
			`covered_lines: list[int]`
			`missed_lines: list[int]`
			`non_executable_lines: list[int]`

			`@property`
			`def executable_count(self) -> int:`
			`return len(self.executable_lines)`

			`@property`
			`def covered_count(self) -> int:`
			`return len(self.covered_lines)`

			`@property`
			`def missed_count(self) -> int:`
			`return len(self.missed_lines)`

			`@property`
			`def non_executable_count(self) -> int:`
			`return len(self.non_executable_lines)`

			`@property`
			`def coverage_pct(self) -> float \| None:`
			`if not self.executable_count:`
			`return None`
			`return (self.covered_count / self.executable_count) * 100.0`


			`def parse_args() -> argparse.Namespace:`
			`parser = argparse.ArgumentParser(`
			`description="Compute JaCoCo coverage for changed production lines in a PR diff."`
			`)`
			`parser.add_argument("--report", required=True, help="Path to jacoco.xml report")`
			`parser.add_argument(`
			`"--source-root",`
			`default="openmetadata-service/src/main/java",`
			`help="Source root to evaluate for changed production code",`
			`)`
			`parser.add_argument(`
			`"--base-ref",`
			`required=True,`
			`help="Git base ref/SHA. If --head-ref is omitted, compare working tree against this ref.",`
			`)`
			`parser.add_argument(`
			`"--head-ref",`
			`help="Git head ref/SHA. If set, diff is computed with base...head.",`
			`)`
			`parser.add_argument(`
			`"--minimum-coverage",`
			`type=float,`
			`default=90.0,`
			`help="Minimum required changed-line coverage percentage",`
			`)`
			`parser.add_argument(`
			`"--markdown-output",`
			`required=True,`
			`help="File to write the Markdown summary to",`
			`)`
			`return parser.parse_args()`


			`def run_git_diff(base_ref: str, head_ref: str \| None, source_root: str) -> str:`
			`cmd = ["git", "diff", "--unified=0", "--no-color"]`
			`if head_ref:`
			`cmd.append(f"{base_ref}...{head_ref}")`
			`else:`
			`cmd.append(base_ref)`
			`cmd.extend(["--", source_root])`
			`result = subprocess.run(cmd, check=True, capture_output=True, text=True)`
			`return result.stdout`


			`def parse_changed_lines(diff_text: str) -> dict[str, set[int]]:`
			`changed_lines: dict[str, set[int]] = defaultdict(set)`
			`current_file: str \| None = None`
			`current_line: int \| None = None`

			`for line in diff_text.splitlines():`
			`if line.startswith("+++ "):`
			`file_path = line[4:]`
			`if file_path == "/dev/null":`
			`current_file = None`
			`elif file_path.startswith("b/"):`
			`current_file = file_path[2:]`
			`else:`
			`current_file = file_path`
			`current_line = None`
			`continue`

			`if line.startswith("@@ "):`
			`match = HUNK_PATTERN.match(line)`
			`current_line = int(match.group(1)) if match else None`
			`continue`

			`if current_file is None or current_line is None:`
			`continue`

			`if line.startswith("+") and not line.startswith("+++"):`
			`changed_lines[current_file].add(current_line)`
			`current_line += 1`
			`elif line.startswith("-") and not line.startswith("---"):`
			`continue`
			`elif line.startswith(" "):`
			`current_line += 1`

			`return changed_lines`


			`def parse_jacoco_report(report_path: str, source_root: str) -> dict[str, dict[int, bool]]:`
			`root = ET.parse(report_path).getroot()`
			`normalized_root = PurePosixPath(source_root)`
			`coverage: dict[str, dict[int, bool]] = {}`

			`for package in root.findall("package"):`
			`package_name = package.attrib.get("name", "")`
			`package_root = normalized_root / package_name if package_name else normalized_root`

			`for sourcefile in package.findall("sourcefile"):`
			`file_path = (package_root / sourcefile.attrib["name"]).as_posix()`
			`line_coverage: dict[int, bool] = {}`
			`for line in sourcefile.findall("line"):`
			`line_number = int(line.attrib["nr"])`
			`line_coverage[line_number] = int(line.attrib["ci"]) > 0`
			`coverage[file_path] = line_coverage`

			`return coverage`


			`def build_file_coverage(`
			`changed_lines: dict[str, set[int]], jacoco_coverage: dict[str, dict[int, bool]]`
			`) -> list[FileCoverage]:`
			`files: list[FileCoverage] = []`

			`for file_path in sorted(changed_lines):`
			`line_map = jacoco_coverage.get(file_path, {})`
			`executable_lines = sorted(line for line in changed_lines[file_path] if line in line_map)`
			`covered_lines = sorted(line for line in executable_lines if line_map.get(line, False))`
			`missed_lines = sorted(line for line in executable_lines if not line_map.get(line, False))`
			`non_executable_lines = sorted(line for line in changed_lines[file_path] if line not in line_map)`
			`files.append(`
			`FileCoverage(`
			`path=file_path,`
			`executable_lines=executable_lines,`
			`covered_lines=covered_lines,`
			`missed_lines=missed_lines,`
			`non_executable_lines=non_executable_lines,`
			`)`
			`)`

			`files.sort(`
			`key=lambda item: (`
			`item.coverage_pct if item.coverage_pct is not None else 101.0,`
			`-item.missed_count,`
			`item.path,`
			`)`
			`)`
			`return files`


			`def format_line_list(lines: list[int], limit: int = 12) -> str:`
			`if not lines:`
			`return "-"`
			`if len(lines) <= limit:`
			`return ", ".join(str(line) for line in lines)`
			`visible = ", ".join(str(line) for line in lines[:limit])`
			`return f"{visible}, +{len(lines) - limit} more"`


			`def render_markdown(`
			`files: list[FileCoverage], minimum_coverage: float, source_root: str`
			`) -> tuple[str, bool]:`
			`changed_files = len(files)`
			`executable_total = sum(item.executable_count for item in files)`
			`covered_total = sum(item.covered_count for item in files)`
			`missed_total = sum(item.missed_count for item in files)`
			`non_executable_total = sum(item.non_executable_count for item in files)`
			`overall_pct = (covered_total / executable_total * 100.0) if executable_total else 100.0`

			`failing_files = [`
			`item for item in files if item.coverage_pct is not None and item.coverage_pct < minimum_coverage`
			`]`
			`should_fail = executable_total > 0 and (`
			`overall_pct < minimum_coverage or bool(failing_files)`
			`)`
			`status = "FAIL" if should_fail else "PASS"`
			`status_icon = "❌" if should_fail else "✅"`

			`lines: list[str] = []`
			`lines.append("## OpenMetadata Service New-Code Coverage")`
			`lines.append("")`

			`if changed_files == 0:`
			`lines.append(`
			f"{status_icon} No changed production Java files under `{source_root}`. Coverage gate skipped."
			`)`
			`return "\n".join(lines) + "\n", False`

			`lines.append(`
			f"{status_icon} {status}. Required changed-line coverage: `{minimum_coverage:.2f}%` overall and per touched production file."
			`)`
			`lines.append("")`
			`lines.append(`
			f"- Overall executable changed lines: `{covered_total}/{executable_total}` covered (`{overall_pct:.2f}%`)"
			`)`
			lines.append(f"- Missed executable changed lines: `{missed_total}`")
			lines.append(f"- Non-executable changed lines ignored by JaCoCo: `{non_executable_total}`")
			lines.append(f"- Changed production files: `{changed_files}`")
			`lines.append("")`

			`if executable_total == 0:`
			`lines.append(`
			`"All changed production lines are non-executable from JaCoCo's perspective. Gate passed."`
			`)`
			`lines.append("")`
			`elif failing_files:`
			`lines.append("Files below threshold:")`
			`for item in failing_files:`
			`lines.append(`
			f"- `{item.path}`: `{item.covered_count}/{item.executable_count}` covered (`{item.coverage_pct:.2f}%`), uncovered lines `{format_line_list(item.missed_lines)}`"
			`)`
			`lines.append("")`

			`lines.append("\| File \| Covered \| Missed \| Executable \| Non-exec \| Coverage \| Uncovered lines \|")`
			`lines.append("\| --- \| ---: \| ---: \| ---: \| ---: \| ---: \| --- \|")`
			`for item in files:`
			`coverage_display = (`
			`f"{item.coverage_pct:.2f}%"`
			`if item.coverage_pct is not None`
			`else "N/A"`
			`)`
			`lines.append(`
			f"\| `{item.path}` \| {item.covered_count} \| {item.missed_count} \| {item.executable_count} \| {item.non_executable_count} \| {coverage_display} \| {format_line_list(item.missed_lines)} \|"
			`)`

			`lines.append("")`
			`lines.append(`
			f"Only changed executable lines under `{source_root}` are counted. Test files, comments, imports, and non-executable lines are excluded."
			`)`
			`return "\n".join(lines) + "\n", should_fail`


			`def main() -> int:`
			`args = parse_args()`

			`diff_text = run_git_diff(args.base_ref, args.head_ref, args.source_root)`
			`changed_lines = parse_changed_lines(diff_text)`
			`jacoco_coverage = parse_jacoco_report(args.report, args.source_root)`
			`files = build_file_coverage(changed_lines, jacoco_coverage)`
			`markdown, should_fail = render_markdown(files, args.minimum_coverage, args.source_root)`

			`Path(args.markdown_output).parent.mkdir(parents=True, exist_ok=True)`
			`Path(args.markdown_output).write_text(markdown, encoding="utf-8")`

			`if should_fail:`
			`print(markdown)`
			`return 1`

			`print(markdown)`
			`return 0`


			`if __name__ == "__main__":`
			`sys.exit(main())`