pbi-cli/tests/test_skill_triggering.py
MinaSaad1 5acb3f33e3 fix: resolve CI failures (formatting, mypy, test import)
- Run ruff format on all 26 unformatted files
- Fix mypy strict errors: add explicit typing for json.loads returns,
  add pywin32/websockets to mypy ignore_missing_imports
- Remove yaml dependency from test_skill_triggering.py (use regex parser)
- Fix skill triggering test to handle both single-line and multi-line
  description formats in YAML frontmatter
2026-04-02 15:59:49 +02:00

117 lines
4.9 KiB
Python

"""Skill triggering evaluation -- verify prompts match expected skills.
This is NOT a pytest test. Run directly:
python tests/test_skill_triggering.py
Uses keyword-based scoring to simulate which skill description best matches
each user prompt, without requiring an LLM call.
"""
from __future__ import annotations
import importlib.resources
import re
# Leading YAML frontmatter block: text between the opening and closing "---".
_FRONTMATTER_RE = re.compile(r"^---\n(.*?)\n---", re.DOTALL)
# YAML block scalar header (">" folded or "|" literal, optionally followed by
# chomping/indentation indicators such as ">-" or "|2").
_BLOCK_SCALAR_RE = re.compile(r"[>|][+\-1-9]{0,2}")


def _parse_description(frontmatter: str) -> str:
    """Extract the ``description`` value from YAML frontmatter text.

    A small line-based parser used instead of pyyaml. It handles a
    single-line value, a block scalar (``>``, ``>-``, ``|`` ...) and a value
    continued on more-indented lines. Continuation lines are joined with
    single spaces, which is sufficient for keyword matching.

    Returns an empty string when no description is found.
    """
    lines = frontmatter.splitlines()
    for idx, line in enumerate(lines):
        stripped = line.lstrip()
        if not stripped.startswith("description:"):
            continue
        rest = stripped[len("description:") :]
        if rest and not rest[0].isspace():
            # "description:foo" is not a key/value pair in YAML.
            continue
        key_indent = len(line) - len(stripped)
        inline = rest.strip()
        # A block scalar header carries no text of its own.
        parts = [] if _BLOCK_SCALAR_RE.fullmatch(inline) else [inline]
        for follow in lines[idx + 1 :]:
            text = follow.strip()
            if not text:
                # Blank lines inside the value do not end it.
                continue
            if len(follow) - len(follow.lstrip()) <= key_indent:
                # Back at (or above) the key's indentation: the next key starts.
                break
            parts.append(text)
        return " ".join(part for part in parts if part)
    return ""


def _load_skills() -> dict[str, str]:
    """Load all skill names and descriptions from bundled skills.

    Scans every directory of the ``pbi_cli.skills`` package that contains a
    ``SKILL.md`` file and reads the ``description`` from its YAML
    frontmatter.

    Returns:
        Mapping of skill directory name to its lowercased description.
        Skills without frontmatter or without a description are omitted.
    """
    skills_pkg = importlib.resources.files("pbi_cli.skills")
    skills: dict[str, str] = {}
    for item in skills_pkg.iterdir():
        if not item.is_dir():
            continue
        skill_file = item / "SKILL.md"
        if not skill_file.is_file():
            continue
        content = skill_file.read_text(encoding="utf-8")
        match = _FRONTMATTER_RE.match(content)
        if not match:
            continue
        desc = _parse_description(match.group(1))
        if desc:
            skills[item.name] = desc.lower()
    return skills
def _score_prompt(prompt: str, description: str) -> int:
    """Score how well a prompt matches a skill description using word overlap.

    Both texts are lowercased and split into runs of ASCII letters. Every
    distinct word that occurs in both contributes to the score:

    * 3 points for words of five or more letters (domain terms matter more),
    * 1 point for words of three or four letters,
    * nothing for shorter words.

    Args:
        prompt: The user prompt to classify.
        description: The skill description to compare against.

    Returns:
        The non-negative overlap score; 0 when no scoring word is shared.
    """
    prompt_words = set(re.findall(r"[a-z]+", prompt.lower()))
    # Lowercase the description too, so the score does not depend on the
    # caller having normalised it beforehand.
    desc_words = set(re.findall(r"[a-z]+", description.lower()))
    return sum(
        3 if len(word) >= 5 else 1
        for word in prompt_words & desc_words
        if len(word) >= 3
    )
def _find_best_skill(prompt: str, skills: dict[str, str]) -> str:
    """Find the skill with the highest keyword overlap score.

    Args:
        prompt: The user prompt to classify.
        skills: Mapping of skill name to skill description.

    Returns:
        The name of the best-scoring skill. When several skills share the top
        score, the alphabetically first name is returned.

    Raises:
        ValueError: If ``skills`` is empty.
    """
    if not skills:
        raise ValueError("no skills loaded; cannot pick a best match")
    # max() returns the first maximal item, so iterating names in sorted order
    # makes ties independent of the order in which skills were discovered.
    return max(sorted(skills), key=lambda name: _score_prompt(prompt, skills[name]))
# Evaluation prompts, grouped by the skill each prompt is expected to trigger.
_PROMPTS_BY_SKILL: dict[str, tuple[str, ...]] = {
    "power-bi-visuals": (
        "Add a bar chart to the overview page showing sales by region",
        "I need to bind Sales[Revenue] to the value field on my KPI visual",
        "What visual types does pbi-cli support? I need a scatter plot",
        "Resize all the card visuals on the dashboard page to 200x120",
    ),
    "power-bi-pages": (
        "Add a new page called Regional Detail to my report",
        "Hide the drillthrough page from the navigation bar",
        "Create a bookmark for the current executive view",
    ),
    "power-bi-themes": (
        "Apply our corporate brand colours to the entire report",
        "I want conditional formatting on the revenue column green for high red for low",
        "Compare this new theme JSON against what is currently applied",
    ),
    "power-bi-filters": (
        "Filter the overview page to show only the top 10 products by revenue",
        "Add a date filter for the last 30 days on the Sales page",
        "What filters are currently on my dashboard page",
    ),
    "power-bi-report": (
        "Create a new PBIR report project for our sales dashboard",
        "Validate the report structure to make sure everything is correct",
        "Start the preview server so I can see the layout",
    ),
    # Should NOT trigger report skills
    "power-bi-modeling": (
        "Create a measure called Total Revenue equals SUM of Sales Amount",
    ),
    "power-bi-deployment": (
        "Export the semantic model to TMDL for version control",
    ),
    "power-bi-security": (
        "Set up row-level security for regional managers",
    ),
}

# Test cases: (prompt, expected_skill), flattened in the order declared above.
TEST_CASES: list[tuple[str, str]] = [
    (prompt, skill)
    for skill, prompts in _PROMPTS_BY_SKILL.items()
    for prompt in prompts
]
def main() -> None:
    """Run every test case and print a PASS/FAIL table with a summary line.

    For each ``(prompt, expected)`` pair in ``TEST_CASES`` the best-matching
    skill is looked up and compared with the expected one. Prompts longer
    than 45 characters are truncated in the table.
    """
    skills = _load_skills()

    print(f"Testing {len(TEST_CASES)} prompts against {len(skills)} skills\n")
    print(f"{'#':<3} {'Result':<6} {'Expected':<22} {'Got':<22} Prompt")
    print("-" * 100)

    passed = 0
    for i, (prompt, expected) in enumerate(TEST_CASES, start=1):
        got = _find_best_skill(prompt, skills)
        if got == expected:
            status = "PASS"
            passed += 1
        else:
            status = "FAIL"

        # Keep the table readable: show at most 45 characters of the prompt.
        if len(prompt) > 45:
            short_prompt = prompt[:45] + "..."
        else:
            short_prompt = prompt
        print(f"{i:<3} {status:<6} {expected:<22} {got:<22} {short_prompt}")

    # Every case is either a pass or a fail.
    failed = len(TEST_CASES) - passed
    print(f"\n{passed}/{len(TEST_CASES)} passed, {failed} failed")


if __name__ == "__main__":
    main()