"""AI-powered comparison and benchmarking for output review system."""

from __future__ import annotations

import json
import os
import random
import re
from typing import Any

AI_MODEL = "claude-sonnet-4-6"
MAX_TOKENS_COMPARE = 1024
MAX_TOKENS_IMPROVE = 4096


def _get_anthropic_client():  # -> anthropic.Anthropic
    """Build an Anthropic client from the ANTHROPIC_API_KEY environment variable.

    Raises:
        EnvironmentError: if the API key is not set.
    """
    import anthropic

    key = os.environ.get("ANTHROPIC_API_KEY")
    if key:
        return anthropic.Anthropic(api_key=key)
    raise EnvironmentError("ANTHROPIC_API_KEY 환경변수가 설정되지 않았습니다.")


def _extract_json(text: str) -> dict[str, Any]:
    text = text.strip()
    try:
        return json.loads(text)  # type: ignore[no-any-return]
    except json.JSONDecodeError:
        pass
    m = re.search(r"```json\s*(.*?)```", text, re.DOTALL)
    if m:
        return json.loads(m.group(1).strip())  # type: ignore[no-any-return]
    raise ValueError(f"JSON을 추출할 수 없습니다: {text[:200]}")


def compare_outputs_ai(output_a: str, output_b: str, eval_axes: list[str], skill_name: str) -> dict[str, Any]:
    """AI comparison verdict; randomizes presentation order to avoid position bias.

    Returns a dict with "winner" ("A"/"B"), "reason", and per-axis "scores"
    (always reported in [A, B] order). On any failure, falls back to a
    length-based heuristic with empty scores.
    """
    pair = [("A", output_a), ("B", output_b)]
    random.shuffle(pair)
    first_label, first_text = pair[0]
    second_label, second_text = pair[1]
    if eval_axes:
        axes_str = "\n".join(f"- {ax}" for ax in eval_axes)
    else:
        axes_str = "- 전반적 품질"
    prompt = (
        f"스킬: {skill_name}\n평가 축:\n{axes_str}\n\n아웃풋 1:\n{first_text}\n\n아웃풋 2:\n{second_text}\n\n"
        f'JSON only: {{"winner": "1" or "2", "reason": "...", "scores": {{"axis": [s1,s2]}}}}\n'
        f"각 축 점수는 1~5 정수."
    )
    try:
        client = _get_anthropic_client()
        msg = client.messages.create(
            model=AI_MODEL, max_tokens=MAX_TOKENS_COMPARE, messages=[{"role": "user", "content": prompt}]
        )
        parsed = _extract_json(msg.content[0].text)  # type: ignore[index]
        winner_label = first_label if str(parsed.get("winner", "1")) == "1" else second_label
        scores: dict[str, list[int]] = {}
        for axis, raw in parsed.get("scores", {}).items():
            if not (isinstance(raw, (list, tuple)) and len(raw) == 2):
                continue
            first_score, second_score = int(raw[0]), int(raw[1])
            # Un-shuffle: caller always receives scores as [A, B].
            if first_label == "A":
                scores[axis] = [first_score, second_score]
            else:
                scores[axis] = [second_score, first_score]
        return {"winner": winner_label, "reason": str(parsed.get("reason", "")), "scores": scores}
    except Exception as e:
        # Heuristic fallback: the longer output wins.
        winner = "A" if len(output_a) >= len(output_b) else "B"
        return {"winner": winner, "reason": f"AI 판정 실패 (fallback): {e}", "scores": {}}


def search_expert_output(skill_name: str, topic: str) -> str | None:
    """Generate an expert-level guide for the given skill/topic via AI.

    Returns the generated text, or None if the API call fails for any reason.
    """
    prompt = (
        f"스킬 '{skill_name}'에서 '{topic}' 주제의 전문가 수준 가이드와 핵심 기법을 "
        f"실무에 바로 적용 가능하게 구체적으로 작성해주세요."
    )
    try:
        response = _get_anthropic_client().messages.create(
            model=AI_MODEL, max_tokens=MAX_TOKENS_COMPARE, messages=[{"role": "user", "content": prompt}]
        )
        # NOTE(review): uses MAX_TOKENS_COMPARE (1024) for a full guide — confirm
        # whether MAX_TOKENS_IMPROVE was intended here.
        return response.content[0].text  # type: ignore[index]
    except Exception:
        return None


def cross_model_verify(output: str, skill_name: str) -> dict[str, Any]:
    """Stub: placeholder for Venus/Atlas integration; always passes self-review."""
    result: dict[str, Any] = {"verdict": "pass", "suggestions": []}
    return result


def generate_improved_output(original: str, suggestions: list[str], skill_name: str, eval_axes: list[str]) -> str:
    """Generate an improved output via AI.

    Returns the original unchanged when there are no suggestions or when the
    API call fails.
    """
    if not suggestions:
        return original
    axes_line = ", ".join(eval_axes) if eval_axes else "전반적 품질"
    suggestion_lines = "\n".join(f"- {s}" for s in suggestions)
    prompt = (
        f"스킬: {skill_name}\n평가 축: {axes_line}\n\n"
        f"원본:\n{original}\n\n개선 제안:\n{suggestion_lines}"
        "\n\n제안을 반영한 개선 아웃풋만 출력하세요. 설명/메타 텍스트 없이."
    )
    try:
        client = _get_anthropic_client()
        msg = client.messages.create(
            model=AI_MODEL, max_tokens=MAX_TOKENS_IMPROVE, messages=[{"role": "user", "content": prompt}]
        )
        return msg.content[0].text  # type: ignore[index]
    except Exception:
        return original


def delta_verify(v1: str, v2: str, eval_axes: list[str], skill_name: str) -> dict[str, Any]:
    """Check via AI whether v2 improves on v1.

    v1 is presented as "A" and v2 as "B", so "improved" means v2 won.
    """
    result = compare_outputs_ai(v1, v2, eval_axes, skill_name)
    improved = result.get("winner") == "B"
    return {"improved": improved, "comparison": result, "reason": result.get("reason", "")}


def _apply_improvement(
    base: str,
    suggestions: list[str],
    skill_name: str,
    eval_axes: list[str],
    init_process: dict[str, Any],
    learnings: list[str],
    label: str,
) -> str:
    """Generate an improved candidate and keep it only if delta-verification passes.

    Mutates init_process ("delta_result", "improvement_applied") and appends a
    learning line. Returns the improved text, or base on regression.
    """
    candidate = generate_improved_output(base, suggestions, skill_name, eval_axes)
    delta = delta_verify(base, candidate, eval_axes, skill_name)
    init_process["delta_result"] = delta
    init_process["improvement_applied"] = delta.get("improved", False)
    if not delta.get("improved"):
        learnings.append(f"{label} 개선 후 역행 감지 → AB winner로 fallback")
        return base
    learnings.append(f"{label} 기반 개선 적용 완료")
    return candidate


def run_init_enhancement(
    output_a: str, output_b: str, eval_axes: list[str], skill_name: str, benchmark_method: str
) -> dict[str, Any]:
    """Orchestrate the full initial enhancement process.

    Runs the A/B comparison, then — depending on benchmark_method
    ("online_expert" or "cross_model") — benchmarks the winner and applies
    delta-verified improvements.

    Returns a dict with "champion_output", "init_process", and "learnings".
    """
    init_process: dict[str, Any] = {
        "ab_comparison": {},
        "benchmark_result": None,
        "improvement_applied": False,
        "delta_result": None,
    }
    learnings: list[str] = []

    ab = compare_outputs_ai(output_a, output_b, eval_axes, skill_name)
    init_process["ab_comparison"] = ab
    winner_output = output_a if ab.get("winner") == "A" else output_b
    learnings.append(f"A/B 비교: winner={ab.get('winner')}, {ab.get('reason', '')[:80]}")
    champion_output = winner_output

    if benchmark_method == "online_expert":
        champion_output = _benchmark_online_expert(winner_output, skill_name, eval_axes, init_process, learnings)
    elif benchmark_method == "cross_model":
        champion_output = _benchmark_cross_model(winner_output, skill_name, eval_axes, init_process, learnings)

    return {"champion_output": champion_output, "init_process": init_process, "learnings": learnings}


def _benchmark_online_expert(
    winner_output: str,
    skill_name: str,
    eval_axes: list[str],
    init_process: dict[str, Any],
    learnings: list[str],
) -> str:
    """Benchmark the winner against an AI-generated expert reference; return the champion."""
    # NOTE(review): topic is passed as the skill name itself — confirm whether a
    # real topic should be threaded through from the caller instead.
    expert_text = search_expert_output(skill_name, skill_name)
    if not expert_text:
        learnings.append("전문가 검색 실패 → AB winner를 챔피언으로 사용")
        return winner_output
    init_process["benchmark_result"] = {"expert_reference": expert_text[:200]}
    suggestions = _derive_expert_suggestions(expert_text, winner_output, skill_name, learnings)
    if not suggestions:
        return winner_output
    return _apply_improvement(
        winner_output, suggestions, skill_name, eval_axes, init_process, learnings, "전문가 벤치마킹"
    )


def _derive_expert_suggestions(
    expert_text: str, winner_output: str, skill_name: str, learnings: list[str]
) -> list[str]:
    """Ask the AI for concrete improvement points versus the expert reference.

    Returns an empty list (and records a learning) on failure.
    """
    prompt = (
        f"스킬 '{skill_name}' 전문가 가이드와 현재 아웃풋을 비교해 구체적 개선점을 나열하세요.\n\n"
        f"전문가:\n{expert_text}\n\n아웃풋:\n{winner_output}\n\n"
        f'JSON only: {{"suggestions": ["...", ...]}}'
    )
    try:
        client = _get_anthropic_client()
        msg = client.messages.create(
            model=AI_MODEL, max_tokens=MAX_TOKENS_COMPARE, messages=[{"role": "user", "content": prompt}]
        )
        return _extract_json(msg.content[0].text).get("suggestions", [])  # type: ignore[index]
    except Exception as e:
        learnings.append(f"개선점 도출 실패: {e}")
        return []


def _benchmark_cross_model(
    winner_output: str,
    skill_name: str,
    eval_axes: list[str],
    init_process: dict[str, Any],
    learnings: list[str],
) -> str:
    """Run the cross-model verification branch; return the champion output."""
    try:
        verify_result = cross_model_verify(winner_output, skill_name)
    except Exception as e:
        learnings.append(f"cross_model_verify 실패 → self-review fallback: {e}")
        verify_result = {"verdict": "pass", "suggestions": []}
    init_process["benchmark_result"] = verify_result
    if verify_result.get("verdict") != "improve":
        learnings.append("cross_model verdict=pass → 개선 스킵")
        return winner_output
    return _apply_improvement(
        winner_output,
        verify_result.get("suggestions", []),
        skill_name,
        eval_axes,
        init_process,
        learnings,
        "cross_model",
    )
