"""Judge 모듈 - 체크리스트 기반 스킬 결과물 채점"""

import json
import re

import yaml
from autoresearch.claude_runner import call_claude, estimate_tokens

# Prompt sent to the judge model. It is filled in with str.format by
# build_judge_prompt, which supplies two placeholders:
#   {checklist_items} - the rendered checklist lines
#   {skill_output}    - the text being graded
# The doubled braces ({{ and }}) are str.format escapes that render as
# literal braces in the JSON example shown to the model.
# The example contains a "total_score" field, but parse_judge_response
# recomputes the score from the per-item results instead of reading it.
JUDGE_PROMPT_TEMPLATE = """
아래 체크리스트의 각 항목에 대해 결과물을 평가하세요.
각 항목에 PASS 또는 FAIL로 답하고, 짧은 이유를 달아주세요.

[체크리스트]
{checklist_items}

[스킬 결과물]
{skill_output}

JSON 형식으로 응답:
{{
  "items": [
    {{"id": "item_id", "result": "PASS", "reason": "..."}},
    ...
  ],
  "total_score": 0.8,
  "summary": "..."
}}
"""

# Upper bound on the number of checklist items; load_checklist raises
# ValueError when a checklist has more entries than this.
_MAX_CHECKLIST_ITEMS = 6


def load_checklist(checklist_path: str) -> dict:
    """Load a YAML checklist file and return it as a dict.

    Expected fields: name, version, description,
    items (list of {id, question, weight}), scoring.

    Args:
        checklist_path: Path to the YAML checklist file (read as UTF-8).

    Returns:
        The parsed top-level mapping of the YAML document, unmodified.

    Raises:
        ValueError: If the document is empty or its top level is not a
            mapping, if ``items`` is present but not a list, or if
            ``items`` has more than ``_MAX_CHECKLIST_ITEMS`` entries.
    """
    with open(checklist_path, encoding="utf-8") as f:
        data = yaml.safe_load(f)

    # safe_load yields None for an empty document and a list/scalar for
    # non-mapping documents; fail with a clear error rather than an
    # AttributeError on the .get() call below.
    if not isinstance(data, dict):
        raise ValueError(f"체크리스트는 YAML 매핑이어야 합니다. 현재 타입: {type(data).__name__}.")

    # A bare `items:` key parses to None; treat it like a missing key.
    items = data.get("items")
    if items is None:
        items = []
    if not isinstance(items, list):
        raise ValueError(f"체크리스트 items는 리스트여야 합니다. 현재 타입: {type(items).__name__}.")

    if len(items) > _MAX_CHECKLIST_ITEMS:
        raise ValueError(f"체크리스트 항목은 최대 {_MAX_CHECKLIST_ITEMS}개여야 합니다. " f"현재 {len(items)}개.")

    return data


def format_checklist_for_prompt(checklist: dict) -> str:
    """Render the checklist items as prompt-ready text.

    Each entry under ``checklist["items"]`` becomes one line of the form
    ``- [id] question (weight: X)``. Missing ``id`` / ``question`` fall back
    to an empty string and a missing ``weight`` falls back to ``1.0``.

    Args:
        checklist: Checklist dict; its ``items`` value is iterated
            (an absent key yields no lines).

    Returns:
        The rendered lines joined with newlines; an empty string when
        there are no items.
    """
    return "\n".join(
        f"- [{entry.get('id', '')}] {entry.get('question', '')} (weight: {entry.get('weight', 1.0)})"
        for entry in checklist.get("items", [])
    )


def build_judge_prompt(checklist: dict, skill_output: str) -> str:
    """Assemble the judge prompt for one skill output.

    Args:
        checklist: Checklist dict, rendered via format_checklist_for_prompt.
        skill_output: The text to be graded, inserted verbatim.

    Returns:
        JUDGE_PROMPT_TEMPLATE with both placeholders filled in.
    """
    template_fields = {
        "checklist_items": format_checklist_for_prompt(checklist),
        "skill_output": skill_output,
    }
    return JUDGE_PROMPT_TEMPLATE.format(**template_fields)


def parse_judge_response(response_text: str, checklist: dict) -> dict:
    """Parse the judge model's reply and compute the checklist score.

    The first JSON object found in ``response_text`` is decoded, so the
    object may be wrapped in prose or a code fence and may be followed by
    arbitrary text (including text that itself contains braces).

    ``total_score`` is recomputed here as a weighted average instead of
    trusting the value the model reports:

    * each reported item contributes its checklist weight (1.0 when the id
      is not in the checklist) and scores 1 only when its ``result`` is
      ``PASS`` (compared case-insensitively, surrounding whitespace ignored);
    * checklist items the model did not report at all are counted as FAIL,
      so omitting items cannot inflate the score.

    Args:
        response_text: Raw text returned by the judge model.
        checklist: Checklist dict whose ``items`` supply ``id`` and ``weight``.

    Returns:
        {
            "items": [{"id": str, "result": "PASS"|"FAIL", "reason": str}, ...],
            "total_score": float,  # 0.0~1.0
            "summary": str,
        }
        ``items`` is the list exactly as reported by the model.

    Raises:
        ValueError: If no JSON object can be decoded from the response, or
            if its ``items`` value is not a list of objects.
    """
    start = response_text.find("{")
    if start == -1:
        raise ValueError(f"응답에서 JSON을 찾을 수 없습니다. 응답: {response_text[:200]!r}")

    # raw_decode stops at the end of the first complete object, so trailing
    # text after the JSON (even with stray braces) does not break parsing.
    try:
        data, _ = json.JSONDecoder().raw_decode(response_text, start)
    except json.JSONDecodeError as exc:
        raise ValueError(f"JSON 파싱 실패: {exc}. 응답: {response_text[:200]!r}") from exc

    # "items": null is treated like a missing key.
    items = data.get("items") or []
    if not isinstance(items, list) or not all(isinstance(entry, dict) for entry in items):
        raise ValueError(f"응답의 items 형식이 올바르지 않습니다. 응답: {response_text[:200]!r}")
    summary: str = data.get("summary", "")

    # Weight lookup built from the checklist; .get("id", "") mirrors
    # format_checklist_for_prompt so an entry without an id does not crash.
    weight_map: dict[str, float] = {
        entry.get("id", ""): float(entry.get("weight", 1.0)) for entry in checklist.get("items") or []
    }

    total_weight = 0.0
    weighted_score = 0.0
    reported_ids = set()
    for entry in items:
        item_id = entry.get("id", "")
        weight = weight_map.get(item_id, 1.0)
        verdict = str(entry.get("result", "FAIL")).strip().upper()
        if verdict == "PASS":
            weighted_score += weight
        total_weight += weight
        reported_ids.add(item_id)

    # Checklist items missing from the reply still count toward the
    # denominator, i.e. they are scored as FAIL.
    total_weight += sum(weight for item_id, weight in weight_map.items() if item_id not in reported_ids)

    total_score = weighted_score / total_weight if total_weight > 0.0 else 0.0

    return {
        "items": items,
        "total_score": total_score,
        "summary": summary,
    }


def judge_output(
    checklist: dict,
    skill_output: str,
    model: str = "claude-haiku-4-5-20251001",
) -> dict:
    """Grade a skill output against a checklist by calling the judge model.

    Args:
        checklist: Checklist dict used both to build the prompt and to
            weight the parsed results.
        skill_output: The text to be graded.
        model: Model identifier passed through to call_claude.

    Returns:
        The dict produced by parse_judge_response, extended with
        ``input_tokens`` and ``output_tokens`` (values from estimate_tokens
        applied to the prompt and to the raw response respectively).
    """
    judge_prompt = build_judge_prompt(checklist, skill_output)
    raw_reply = call_claude(prompt=judge_prompt, model=model, max_tokens=2048)

    # Parse first so a malformed reply raises before any token estimation.
    scored = parse_judge_response(raw_reply, checklist)
    scored.update(
        input_tokens=estimate_tokens(judge_prompt),
        output_tokens=estimate_tokens(raw_reply),
    )
    return scored