"""
spec_compliance.py - 작업 지시서 체크리스트 ↔ 보고서 키워드 매칭 verifier
tasks/task-xxx.md의 체크리스트 항목을 보고서에서 키워드로 검색

LLM 호출 없음. 순수 규칙 기반(rule-based) 구현.
"""

import os
import re

# Directories that verify() falls back to when the caller passes an empty
# tasks_dir / reports_dir argument.
DEFAULT_TASKS_DIR = "/home/jay/workspace/memory/tasks"
DEFAULT_REPORTS_DIR = "/home/jay/workspace/memory/reports"

# Words that carry no topical meaning and are dropped during keyword
# extraction. English entries are lowercase because the lookup lowercases
# each token before testing membership.
STOPWORDS = {
    # Korean particles, conjunctions and generic filler nouns.
    "의", "를", "을", "이", "가", "에", "에서",
    "로", "으로", "와", "과", "및", "또는", "등",
    "위", "아래", "것", "수", "때", "후", "전",
    # English articles, copulas, prepositions and conjunctions.
    "the", "a", "an", "is", "are",
    "in", "on", "at", "to", "for", "of",
    "and", "or", "with", "from", "by", "as", "be",
    "this", "that",
}


def _extract_checklist_items(content: str) -> list[str]:
    """
    마크다운에서 미완료 체크리스트 항목(- [ ]) 텍스트를 추출.

    - [ ] 패턴만 대상 (- [x] 는 이미 완료로 간주하여 제외)

    Returns:
        미완료 체크리스트 항목 텍스트 목록
    """
    pattern = re.compile(r"^- \[ \]\s+(.+)$", re.MULTILINE)
    return [m.group(1).strip() for m in pattern.finditer(content)]


def _extract_keywords(text: str) -> list[str]:
    """
    텍스트에서 핵심 키워드를 추출.

    - 2글자 이상 단어만 포함
    - 불용어(STOPWORDS) 제외
    - 마크다운 특수기호, 코드블록 기호 제거

    Returns:
        핵심 키워드 목록
    """
    # 마크다운 강조 및 특수기호 제거
    cleaned = re.sub(r"[`*_~\[\]()>#]", " ", text)
    # 단어 분리: 한글, 영문, 숫자 기준
    tokens = re.findall(r"[가-힣a-zA-Z0-9]+", cleaned)
    keywords = [t for t in tokens if len(t) >= 2 and t.lower() not in STOPWORDS]
    return keywords


def _item_covered(item_keywords: list[str], report_content: str) -> bool:
    """
    체크리스트 항목의 핵심 키워드 50% 이상이 보고서에 존재하면 "커버됨" 판정.

    Args:
        item_keywords: 체크리스트 항목에서 추출한 키워드 목록
        report_content: 보고서 전체 텍스트

    Returns:
        True if covered (50% 이상 매칭)
    """
    if not item_keywords:
        # 키워드를 추출할 수 없으면 커버됨으로 간주
        return True

    matched = sum(1 for kw in item_keywords if kw in report_content)
    ratio = matched / len(item_keywords)
    return ratio >= 0.5


def verify(
    task_id: str,
    tasks_dir: str = "",
    reports_dir: str = "",
) -> dict:
    """
    Check that the unchecked checklist items of a task appear in its report.

    The task file ``<tasks_dir>/<task_id>.md`` is scanned for unchecked
    checklist items; each item's keywords are then looked up in the report
    file ``<reports_dir>/<task_id>.md``.

    Args:
        task_id: ID of the task to verify (e.g. ``task-792``).
        tasks_dir: Directory holding task files; DEFAULT_TASKS_DIR when empty.
        reports_dir: Directory holding report files; DEFAULT_REPORTS_DIR when
            empty.

    Returns:
        ``{"status": "PASS" | "WARN" | "SKIP", "details": [...]}``

        - PASS: every checklist item is covered, or there are no unchecked
          items.
        - WARN: at least one item is not covered; ``details`` lists a
          COVERED/UNCOVERED line per item after a summary line.
        - SKIP: the task file or the report file is missing or cannot be read
          as UTF-8 text.
    """
    effective_tasks_dir = tasks_dir or DEFAULT_TASKS_DIR
    effective_reports_dir = reports_dir or DEFAULT_REPORTS_DIR

    task_path = os.path.join(effective_tasks_dir, f"{task_id}.md")
    report_path = os.path.join(effective_reports_dir, f"{task_id}.md")

    # Without a task file there is nothing to verify.
    if not os.path.exists(task_path):
        return {
            "status": "SKIP",
            "details": [f"Task file not found: {task_path}"],
        }

    # UnicodeDecodeError is a ValueError, not an OSError, so it has to be
    # caught explicitly; otherwise a non-UTF-8 file crashes the verifier
    # instead of producing a SKIP result.
    try:
        with open(task_path, "r", encoding="utf-8") as f:
            task_content = f.read()
    except (OSError, UnicodeDecodeError) as e:
        return {
            "status": "SKIP",
            "details": [f"Failed to read task file: {type(e).__name__}: {e}"],
        }

    # Only unchecked ("- [ ]") items are verified.
    checklist_items = _extract_checklist_items(task_content)

    # No unchecked items: pass without needing the report at all.
    if not checklist_items:
        return {
            "status": "PASS",
            "details": ["No unchecked checklist items found — PASS"],
        }

    if not os.path.exists(report_path):
        return {
            "status": "SKIP",
            "details": [f"Report file not found: {report_path}"],
        }

    try:
        with open(report_path, "r", encoding="utf-8") as f:
            report_content = f.read()
    except (OSError, UnicodeDecodeError) as e:
        return {
            "status": "SKIP",
            "details": [f"Failed to read report file: {type(e).__name__}: {e}"],
        }

    # Evaluate each checklist item against the report text.
    details: list[str] = []
    uncovered: list[str] = []

    for item in checklist_items:
        keywords = _extract_keywords(item)
        if _item_covered(keywords, report_content):
            details.append(f"COVERED: {item}")
        else:
            details.append(f"UNCOVERED: {item} (keywords: {keywords})")
            uncovered.append(item)

    # The summary line always goes first, ahead of the per-item lines.
    if uncovered:
        details.insert(0, f"WARN — {len(uncovered)}/{len(checklist_items)} 항목 미커버")
        return {"status": "WARN", "details": details}

    details.insert(0, f"PASS — {len(checklist_items)}/{len(checklist_items)} 항목 커버됨")
    return {"status": "PASS", "details": details}