#!/usr/bin/env python3
"""LLM-as-judge skill quality evaluation framework.

Evaluates SKILL.md files against 5 standard sections using rule-based scoring.
Optionally supports LLM evaluation (stub only, requires --api-key).

Usage:
    python3 skill-judge.py /path/to/SKILL.md
    python3 skill-judge.py --dir ~/.claude/skills/
    python3 skill-judge.py --use-llm /path/to/SKILL.md
    python3 skill-judge.py --format json /path/to/SKILL.md
    python3 skill-judge.py --format summary --dir ~/.claude/skills/
"""

from __future__ import annotations

import argparse
import json
import os
import re
import sys
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

# ---------------------------------------------------------------------------
# Section alias mappings (English + Korean)
# ---------------------------------------------------------------------------

# Maps each canonical section key to every heading spelling (English and
# Korean) that is recognised as that section. Comparison is exact after
# lower-casing and stripping (see _match_section_key), so aliases are
# stored pre-normalised in lower case.
SECTION_ALIASES: dict[str, list[str]] = {
    # What the skill does.
    "description": [
        "description",
        "overview",
        "개요",  # "overview"
        "설명",  # "description"
    ],
    # When the skill should be triggered.
    "when_to_use": [
        "when to use",
        "when_to_use",
        "trigger",
        "트리거",  # "trigger"
        "사용 시점",  # "when to use" (with space)
        "사용시점",  # "when to use" (without space)
    ],
    # How the skill should behave.
    "instructions": [
        "instructions",
        "instruction",
        "rules",
        "rule",
        "workflow",
        "지침",  # "instructions"
        "규칙",  # "rules"
    ],
    # Shape of the skill's output.
    "output_format": [
        "output format",
        "output_format",
        "output",
        "출력 형식",  # "output format" (with space)
        "출력형식",  # "output format" (without space)
        "결과물",  # "deliverable"
        "결과",  # "result"
    ],
    # Worked examples.
    "examples": [
        "examples",
        "example",
        "예시",  # "example"
        "예제",  # "example/exercise"
    ],
}


# ---------------------------------------------------------------------------
# Dataclasses
# ---------------------------------------------------------------------------


@dataclass
class SectionScore:
    """Score for a single SKILL.md section."""

    name: str  # canonical section key (e.g. "description")
    found: bool  # True when the section had non-empty content
    score: int  # 0-20 rubric score (see score_section)
    length: int  # character count of the section content
    # Raw section text; excluded from repr() to keep reports readable.
    content: str = field(default="", repr=False)


@dataclass
class SkillResult:
    """Evaluation result for a single SKILL.md file."""

    skill_path: str  # resolved absolute path of the evaluated file
    total_score: int  # section scores plus penalties/bonuses
    grade: str  # letter grade A-F (see compute_grade)
    sections: dict[str, SectionScore]  # per-section scores, keyed by canonical name
    penalties: int  # negative adjustment (short-file penalty), 0 or -10
    bonuses: int  # positive adjustment (long-file bonus), 0 or 5
    mode: str  # "rule-based" or "llm-stub"
    message: str = ""  # optional explanatory note (used by the LLM stub)


# ---------------------------------------------------------------------------
# Core helpers
# ---------------------------------------------------------------------------


def _normalize_heading(text: str) -> str:
    """Lower-case and strip whitespace from a heading string."""
    return text.strip().lower()


def _match_section_key(heading: str) -> str | None:
    """Return the canonical section key for a heading, or None.

    Matching is case-insensitive and tolerates decorative trailing
    colons or ATX closing hashes, e.g. "## Examples:" or "## Rules ##",
    which are common in hand-written markdown.
    """
    normalized = _normalize_heading(heading)
    # Strip closing-hash style ("Heading ##") and trailing colons before lookup.
    normalized = normalized.rstrip("#").rstrip().rstrip(":").rstrip()
    for key, aliases in SECTION_ALIASES.items():
        if normalized in aliases:
            return key
    return None


def _has_code_block(content: str) -> bool:
    return "```" in content


def _has_list(content: str) -> bool:
    return bool(re.search(r"^\s*[-*+]\s", content, re.MULTILINE))


def _has_numbered_list(content: str) -> bool:
    return bool(re.search(r"^\s*\d+\.\s", content, re.MULTILINE))


def _has_specificity(content: str) -> bool:
    """Return True if content contains code blocks, lists, or numbered lists."""
    return _has_code_block(content) or _has_list(content) or _has_numbered_list(content)


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------


def parse_skill_sections(content: str) -> dict[str, dict[str, Any]]:
    """Parse a SKILL.md content string into a dict keyed by canonical section names.

    Returns a dict with keys: description, when_to_use, instructions,
    output_format, examples. Each value is a dict with:
        found (bool), length (int), content (str).

    H2 ("## ") headings delimit sections. Headings that appear inside a
    fenced code block (```) are ignored, so markdown examples embedded in
    a section cannot corrupt the split.
    """
    # Initialise all sections as not found.
    result: dict[str, dict[str, Any]] = {key: {"found": False, "length": 0, "content": ""} for key in SECTION_ALIASES}

    if not content.strip():
        return result

    # Normalise line endings so splitting and joining behave uniformly.
    normalised = content.replace("\r\n", "\n").replace("\r", "\n")
    lines = normalised.splitlines()

    # Collect (line_index, section_key) for each recognised H2 heading,
    # skipping any line that falls inside a ``` fence.
    section_positions: list[tuple[int, str]] = []
    in_fence = False
    for idx, line in enumerate(lines):
        if line.lstrip().startswith("```"):
            # Fence delimiters toggle the "inside code" state.
            in_fence = not in_fence
            continue
        if in_fence:
            continue
        # Match "## SomeHeading" (must have space after ##).
        m = re.match(r"^##\s+(.+)$", line)
        if m:
            key = _match_section_key(m.group(1).strip())
            if key is not None:
                section_positions.append((idx, key))

    # A section's content runs from the line after its heading up to the
    # next recognised heading (or end of file).
    for pos, (line_idx, key) in enumerate(section_positions):
        start = line_idx + 1
        end = section_positions[pos + 1][0] if pos + 1 < len(section_positions) else len(lines)
        section_content = "\n".join(lines[start:end]).strip()
        result[key]["found"] = True
        result[key]["content"] = section_content
        result[key]["length"] = len(section_content)

    return result


def score_section(name: str, content: str) -> SectionScore:
    """Score a single section's content on a 0-20 scale.

    Rubric (5 points per satisfied criterion):
        - section exists (content non-empty)
        - content >= 50 chars
        - contains code block, list, or numbered list (specificity)
        - content >= 200 chars (completeness)
    """
    if not content:
        return SectionScore(name=name, found=False, score=0, length=0, content="")

    length = len(content)
    criteria = (
        True,                       # exists
        length >= 50,               # minimum substance
        _has_specificity(content),  # concrete structure
        length >= 200,              # completeness
    )
    return SectionScore(
        name=name,
        found=True,
        score=5 * sum(criteria),
        length=length,
        content=content,
    )


def compute_grade(total_score: int) -> str:
    """Convert a numeric score to a letter grade (A >= 90 ... F < 45)."""
    thresholds = ((90, "A"), (75, "B"), (60, "C"), (45, "D"))
    for floor, letter in thresholds:
        if total_score >= floor:
            return letter
    return "F"


def evaluate_skill(skill_path: str) -> SkillResult:
    """Evaluate a single SKILL.md file using rule-based scoring.

    Each of the 5 canonical sections contributes 0-20 points (see
    score_section). A file under 500 characters incurs a -10 penalty;
    a file of 10000+ characters earns a +5 bonus.

    Args:
        skill_path: Absolute or relative path to a SKILL.md file.

    Returns:
        SkillResult with total_score, grade, per-section scores, etc.

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    path = Path(skill_path)
    if not path.exists():
        raise FileNotFoundError(f"SKILL.md not found: {skill_path}")

    content = path.read_text(encoding="utf-8")
    total_len = len(content)

    parsed = parse_skill_sections(content)

    # Score every canonical section and accumulate the raw total.
    sections: dict[str, SectionScore] = {}
    raw_score = 0
    for key in SECTION_ALIASES:
        section_score = score_section(key, parsed[key]["content"])
        sections[key] = section_score
        raw_score += section_score.score

    # Length-based adjustments: penalise stub files, reward thorough ones.
    penalties = -10 if total_len < 500 else 0
    bonuses = 5 if total_len >= 10000 else 0

    # Nominal range is 0-105, but a negative total is deliberately NOT
    # clamped: compute_grade maps anything below 45 to "F" regardless.
    total_score = raw_score + penalties + bonuses

    return SkillResult(
        skill_path=str(path.resolve()),
        total_score=total_score,
        grade=compute_grade(total_score),
        sections=sections,
        penalties=penalties,
        bonuses=bonuses,
        mode="rule-based",
    )


def evaluate_directory(dir_path: str) -> list[SkillResult]:
    """Evaluate all SKILL.md files found recursively under dir_path.

    Args:
        dir_path: Path to a directory containing skill subdirectories.

    Returns:
        List of SkillResult, one per SKILL.md found (in sorted path order).

    Raises:
        FileNotFoundError: If dir_path does not exist.
        NotADirectoryError: If dir_path is not a directory.
    """
    base = Path(dir_path)
    if not base.exists():
        raise FileNotFoundError(f"Directory not found: {dir_path}")
    if not base.is_dir():
        raise NotADirectoryError(f"Not a directory: {dir_path}")

    return [evaluate_skill(str(found)) for found in sorted(base.rglob("SKILL.md"))]


def llm_evaluate(skill_path: str) -> SkillResult:
    """LLM evaluation stub.

    Returns a SkillResult with mode='llm-stub' and an explanatory message.
    Does NOT make any API calls (to avoid cost).

    Args:
        skill_path: Path to a SKILL.md file.

    Returns:
        SkillResult with mode='llm-stub'.
    """
    resolved = str(Path(skill_path).resolve())
    stub_note = "LLM evaluation requires --api-key. Pass --api-key <key> to enable real LLM scoring."
    return SkillResult(
        skill_path=resolved,
        total_score=0,
        grade="F",
        sections={},
        penalties=0,
        bonuses=0,
        mode="llm-stub",
        message=stub_note,
    )


def result_to_dict(result: SkillResult) -> dict[str, Any]:
    """Convert a SkillResult to a JSON-serialisable dict.

    The per-section entries expose only found/score/length (never the raw
    content), and "message" is included only when non-empty.
    """
    payload: dict[str, Any] = {
        "skill_path": result.skill_path,
        "total_score": result.total_score,
        "grade": result.grade,
        "sections": {
            key: {"found": ss.found, "score": ss.score, "length": ss.length}
            for key, ss in result.sections.items()
        },
        "penalties": result.penalties,
        "bonuses": result.bonuses,
        "mode": result.mode,
    }
    if result.message:
        payload["message"] = result.message
    return payload


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------


def _print_summary(results: list[SkillResult]) -> None:
    """Print a human-readable score table for multiple results."""
    rule = "=" * 60
    print(f"\n{rule}")
    print(f"{'Skill':40} {'Score':>6} {'Grade':>6}")
    print("-" * 60)
    for result in results:
        path = Path(result.skill_path)
        # Skills usually live at <skill-name>/SKILL.md, so the parent
        # directory name is the display label; fall back to the file name.
        label = path.parent.name or path.name
        print(f"{label:40} {result.total_score:>6} {result.grade:>6}")
    print(rule)
    print(f"Total: {len(results)} skills evaluated")
    if results:
        average = sum(r.total_score for r in results) / len(results)
        print(f"Average score: {average:.1f}")


def _print_detail(result: SkillResult) -> None:
    """Print a full per-section report for a single result."""
    header_lines = [
        f"\nSkill: {result.skill_path}",
        f"Score: {result.total_score}  Grade: {result.grade}  Mode: {result.mode}",
    ]
    if result.message:
        header_lines.append(f"Message: {result.message}")
    header_lines.append(f"Penalties: {result.penalties}  Bonuses: {result.bonuses}")
    for line in header_lines:
        print(line)
    print("\nSections:")
    for key, section in result.sections.items():
        flag = "FOUND" if section.found else "MISSING"
        print(f"  {key:20} [{flag:7}] score={section.score:2}  length={section.length}")


def main(argv: list[str] | None = None) -> int:
    """CLI entry point.

    Returns 0 on success, 1 when neither a path nor --dir was supplied
    (help text is printed in that case).
    """
    parser = argparse.ArgumentParser(description="Evaluate SKILL.md quality using rule-based scoring.")
    parser.add_argument("path", nargs="?", help="Path to a SKILL.md file")
    parser.add_argument("--dir", help="Directory containing skill subdirectories")
    parser.add_argument(
        "--use-llm",
        action="store_true",
        default=False,
        help="Use LLM evaluation (stub only, requires --api-key)",
    )
    parser.add_argument(
        "--api-key",
        default="",
        help="API key for LLM evaluation (not used in stub mode)",
    )
    parser.add_argument(
        "--format",
        choices=["json", "summary", "detail"],
        default="detail",
        help="Output format (default: detail)",
    )
    opts = parser.parse_args(argv)

    # Collect results: a directory scan, a single file, or nothing (usage error).
    if opts.dir:
        results = evaluate_directory(opts.dir)
    elif opts.path:
        evaluator = llm_evaluate if opts.use_llm else evaluate_skill
        results = [evaluator(opts.path)]
    else:
        parser.print_help()
        return 1

    # Render in the requested format.
    if opts.format == "json":
        payload = [result_to_dict(r) for r in results]
        # A single result is emitted as an object rather than a 1-item array.
        print(json.dumps(payload[0] if len(payload) == 1 else payload, ensure_ascii=False, indent=2))
    elif opts.format == "summary":
        _print_summary(results)
    else:
        for result in results:
            _print_detail(result)

    return 0


if __name__ == "__main__":
    sys.exit(main())
