"""
주간 배치 학습 분석 스크립트 v1

사용법:
    python3 learning-analyzer.py --week 2026-04-07

옵션:
    --week          ISO 날짜 (월요일 기준, 해당 주 월~일 7일 분석) [필수]
    --audit-trail   audit-trail.jsonl 경로 오버라이드 (테스트용)
    --output-dir    출력 디렉토리 오버라이드 (테스트용)
    --whitelist     화이트리스트 YAML 경로 오버라이드 (테스트용)
    --thresholds    임계값 YAML 경로 오버라이드 (테스트용)

v1 한계:
    - 에러 유사도 분석 미지원 (v2 예정)
    - task_id 없는 레코드 제외 (old schema)
    - bot==anu 세션 제외
"""

import argparse
import json
import sys
from datetime import date, timedelta
from fnmatch import fnmatch
from pathlib import Path

import yaml

# Prefix stripped from absolute file paths to produce workspace-relative paths.
WORKSPACE_ROOT = "/home/jay/workspace/"
# Default input/output/config locations; each can be overridden by a CLI flag (see main()).
DEFAULT_AUDIT_TRAIL = Path("/home/jay/workspace/memory/logs/audit-trail.jsonl")
DEFAULT_OUTPUT_DIR = Path("/home/jay/workspace/memory/learnings")
DEFAULT_WHITELIST = Path("/home/jay/workspace/config/learning-whitelist.yaml")
DEFAULT_THRESHOLDS = Path("/home/jay/workspace/config/learning-thresholds.yaml")


def load_whitelist(path: Path) -> dict:
    """Load the learning whitelist YAML.

    Returns {"files": set[str], "patterns": list[str]}. Any failure — missing
    file, unparsable YAML, unexpected shape — yields an empty whitelist
    (best-effort by design).
    """
    empty = {"files": set(), "patterns": []}
    try:
        raw = yaml.safe_load(path.read_text(encoding="utf-8"))
        return {
            "files": set(raw.get("files") or []),
            "patterns": list(raw.get("patterns") or []),
        }
    except Exception:
        return empty


def load_thresholds(path: Path) -> dict:
    """Load hotspot thresholds from YAML, falling back to built-in defaults.

    Returns {"min_task_count": int, "min_task_ratio": float}; defaults are
    3 and 0.3 when the file is missing, unparsable, or incomplete.
    """
    defaults = {"min_task_count": 3, "min_task_ratio": 0.3}
    try:
        with open(path, encoding="utf-8") as fh:
            cfg = yaml.safe_load(fh)
        return {
            "min_task_count": int(cfg.get("min_task_count", 3)),
            "min_task_ratio": float(cfg.get("min_task_ratio", 0.3)),
        }
    except Exception:
        return defaults


def _normalize_path(file_field: str) -> str:
    """Convert an absolute workspace path to a workspace-relative one.

    Paths outside WORKSPACE_ROOT pass through unchanged; falsy input becomes "".
    """
    return (file_field or "").removeprefix(WORKSPACE_ROOT)


def parse_audit_trail(path: Path, week_start: date, week_end: date) -> list:
    """Read the audit-trail JSONL and return edit records for one week.

    Per-record filters:
      - timestamp date within [week_start, week_end) (half-open interval)
      - /tmp/ files skipped; bot=="anu" sessions skipped
      - records without a usable task_id (missing or "unknown") skipped
    Returns a list of {"file": workspace-relative path, "task_id": str}.
    A missing/unreadable file yields an empty list (best-effort by design).
    """
    records: list = []
    week_start_str = week_start.isoformat()
    week_end_str = week_end.isoformat()
    try:
        with open(path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    rec = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # A JSONL line can hold any JSON value; skip non-objects instead
                # of letting rec.get raise (previously that exception escaped to
                # a blanket except and silently dropped the rest of the file).
                if not isinstance(rec, dict):
                    continue

                ts = rec.get("ts", "")
                if not isinstance(ts, str) or not ts:
                    continue
                # ISO-8601 timestamps sort lexicographically, so comparing the
                # YYYY-MM-DD prefix as strings implements the [start, end) window.
                if not (week_start_str <= ts[:10] < week_end_str):
                    continue

                file_path = rec.get("file", "")
                if not isinstance(file_path, str) or not file_path:
                    continue
                if file_path.startswith("/tmp/"):
                    continue  # scratch files are never learning material

                if rec.get("bot", "") == "anu":
                    continue  # anu sessions excluded in v1 (see module docstring)

                task_id = rec.get("task_id", "")
                if not task_id or task_id == "unknown":
                    continue  # old-schema records lack a usable task_id

                records.append({"file": _normalize_path(file_path), "task_id": task_id})
    except OSError:
        # Missing or unreadable audit trail: return whatever was collected.
        pass
    return records


def _is_whitelisted(rel_path: str, whitelist: dict) -> str:
    if rel_path in whitelist["files"]:
        return f"files: {rel_path}"
    for pattern in whitelist["patterns"]:
        if fnmatch(rel_path, pattern):
            return f"patterns: {pattern}"
    return ""


def _classify_type(rel_path: str) -> str:
    suffix = Path(rel_path).suffix.lower()
    if suffix in (".yaml", ".json", ".toml"):
        return "config_hotspot"
    return "refactor_candidate"


def _classify_priority(task_count: int) -> str:
    if task_count >= 5:
        return "high"
    return "medium"


def compute_hotspots(records: list, whitelist: dict, thresholds: dict) -> tuple:
    """Aggregate weekly edit records per file and select hotspot candidates.

    A file becomes a hotspot when its distinct-task count meets
    thresholds["min_task_count"] AND its share of the week's tasks meets
    thresholds["min_task_ratio"]. Whitelisted files are reported separately.
    Returns (hotspots, whitelisted); hotspots sorted by descending task count,
    then file name.
    """
    tasks_by_file: dict[str, set] = {}
    for entry in records:
        key = _normalize_path(entry["file"])
        tasks_by_file.setdefault(key, set()).add(entry["task_id"])

    total_tasks = len({entry["task_id"] for entry in records})
    min_count = thresholds["min_task_count"]
    min_ratio = thresholds["min_task_ratio"]

    hotspots: list = []
    whitelisted: list = []

    for rel_path, task_ids in tasks_by_file.items():
        rule = _is_whitelisted(rel_path, whitelist)
        if rule:
            whitelisted.append({"file": rel_path, "rule": rule, "status": "skipped_whitelist"})
            continue

        count = len(task_ids)
        ratio = count / total_tasks if total_tasks else 0.0
        # Both thresholds must be met for a file to qualify.
        if count < min_count or ratio < min_ratio:
            continue

        hotspots.append(
            {
                "file": rel_path,
                "task_count": count,
                "task_ratio": ratio,
                "task_ids": sorted(task_ids),
                "type": _classify_type(rel_path),
                "priority": _classify_priority(count),
                "total_tasks": total_tasks,
            }
        )

    hotspots.sort(key=lambda h: (-h["task_count"], h["file"]))
    return hotspots, whitelisted


def generate_report(
    hotspots: list,
    whitelisted: list,
    week_start: date,
    week_end: date,
    total_tasks: int,
    output_dir: Path,
) -> Path:
    """Render the weekly hotspot report as Markdown and write it to disk.

    Creates output_dir if needed and writes weekly-<week_start>.md there.
    Returns the path of the written report file.
    """
    # week_end is exclusive; the report displays the inclusive last day (Sunday).
    last_day = week_end - timedelta(days=1)

    out = [
        f"# 주간 학습 리포트: {week_start} ~ {last_day}",
        "",
        "> 학습 기능 v1 — 에러 유사도 미지원(v2 예정)",
        "",
        f"## 핫스팟 파일 ({len(hotspots)}건)",
        "",
    ]

    for rank, entry in enumerate(hotspots, start=1):
        pct = round(entry["task_ratio"] * 100, 1)
        out.extend(
            [
                f"### {rank}. `{entry['file']}`",
                f"- **수정 task 수**: {entry['task_count']}",
                f"- **수정 비율**: {pct}% ({entry['task_count']}/{entry['total_tasks']} tasks)",
                f"- **수정 task 목록**: {', '.join(entry['task_ids'])}",
                f"- **type**: {entry['type']}",
                "- **action**: 리팩토링 검토 필요 — 같은 파일을 반복 수정",
                f"- **priority**: {entry['priority']}",
                "- **status**: pending",
                "",
            ]
        )

    out.extend([f"## 화이트리스트 제외 ({len(whitelisted)}건)", ""])
    if whitelisted:
        out.extend(f"- `{item['file']}` — {item['rule']}" for item in whitelisted)
    else:
        out.append("(없음)")
    out.append("")

    out.extend(
        [
            "## 요약",
            f"- 분석 기간: {week_start} ~ {last_day}",
            f"- 총 task 수: {total_tasks}",
            f"- 핫스팟 파일 수: {len(hotspots)}",
            f"- 화이트리스트 제외: {len(whitelisted)}",
            "",
        ]
    )

    output_dir.mkdir(parents=True, exist_ok=True)
    report_path = output_dir / f"weekly-{week_start}.md"
    report_path.write_text("\n".join(out), encoding="utf-8")
    return report_path


def main():
    """CLI entry point: parse args, analyze one week of audit trail, write a report."""
    parser = argparse.ArgumentParser(description="주간 배치 학습 분석")
    parser.add_argument("--week", required=True, help="분석 주 시작일 (ISO, 월요일)")
    parser.add_argument("--audit-trail", default=None, help="audit-trail.jsonl 경로 오버라이드")
    parser.add_argument("--output-dir", default=None, help="출력 디렉토리 오버라이드")
    parser.add_argument("--whitelist", default=None, help="화이트리스트 YAML 오버라이드")
    parser.add_argument("--thresholds", default=None, help="임계값 YAML 오버라이드")
    args = parser.parse_args()

    try:
        week_start = date.fromisoformat(args.week)
    except ValueError:
        print(f"ERROR: --week 값이 유효한 ISO 날짜가 아닙니다: {args.week}", file=sys.stderr)
        sys.exit(1)

    # Exclusive upper bound: the analyzed window is [week_start, week_start + 7 days).
    week_end = week_start + timedelta(days=7)

    def pick(override, default):
        # A CLI override (string path) wins over the workspace default.
        return Path(override) if override else default

    audit_path = pick(args.audit_trail, DEFAULT_AUDIT_TRAIL)
    output_dir = pick(args.output_dir, DEFAULT_OUTPUT_DIR)
    whitelist_path = pick(args.whitelist, DEFAULT_WHITELIST)
    thresholds_path = pick(args.thresholds, DEFAULT_THRESHOLDS)

    whitelist = load_whitelist(whitelist_path)
    thresholds = load_thresholds(thresholds_path)

    records = parse_audit_trail(audit_path, week_start, week_end)
    total_tasks = len({r["task_id"] for r in records})

    hotspots, whitelisted = compute_hotspots(records, whitelist, thresholds)

    result_path = generate_report(hotspots, whitelisted, week_start, week_end, total_tasks, output_dir)

    print(f"리포트 생성 완료: {result_path}")
    print(f"총 task 수: {total_tasks}, 핫스팟: {len(hotspots)}건, 화이트리스트 제외: {len(whitelisted)}건")


# Run the CLI only when executed as a script; importing stays side-effect free.
if __name__ == "__main__":
    main()
