"""dispatch routing classifier — task-2473 false-positive hotfix.

task spec 본문에서 디자인/광고 작업을 분류할 때, 코드 블록/regex pattern/검증 컨텍스트
시그널을 인지해 코딩 작업의 false positive를 차단하는 분류기.
"""
from __future__ import annotations

from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional, List
import hashlib
import json
import re
from datetime import datetime, timezone

# Workspace root: the parent of the directory that contains this file.
WORKSPACE = Path(__file__).resolve().parents[1]
# JSONL file that routing decisions are appended to (see _append_audit).
AUDIT_PATH = WORKSPACE.joinpath(
    "memory",
    "orchestration-audit",
    "dispatch-routing-decision.jsonl",
)

# Design-task keywords. _match_design_keywords_word_boundary matches the
# Korean list by substring and the English list with \b word boundaries.
DESIGN_KEYWORDS_KO = [
    "디자인",
    "배너",
    "이미지",
    "포스터",
    "일러스트",
    "광고",
]
DESIGN_KEYWORDS_EN = [
    "design",
    "banner",
    "image",
    "poster",
    "illustration",
    "ad",
]

# Coding / validation context signals. _detect_context_signals looks for each
# entry as a case-insensitive substring, and classify_task_routing subtracts
# 2 from the design score for every signal reported.
CODING_CONTEXT_SIGNALS = [
    # gate / badge tooling
    "Gemini gate",
    "gemini gate",
    "severity badge",
    # testing and hardening work
    "regression test",
    "회귀 테스트",
    "hardening",
    # detection-rule vocabulary
    "검사 대상",
    "탐지",
    "휴리스틱",
    "룰 정정",
    "false positive",
    "false-positive",
    "qc-gate",
    "qc gate",
    # pattern / parsing vocabulary
    "regex pattern",
    "코드 블록",
    "code block",
    "단어 경계",
    "word boundary",
    "dispatch 차단",
]

@dataclass
class RoutingDecision:
    """Outcome of classifying a single task description.

    Only ``classification`` and ``result`` are required; the list fields
    default to a fresh empty list per instance.
    """

    # Task category label: "design" / "coding" / "research".
    classification: str
    # Dispatch verdict: "allow" / "block".
    result: str
    # Design keywords that were found in the task text.
    matched_keywords: List[str] = field(default_factory=list)
    # Coding-context signals that were found in the task text.
    context_signals: List[str] = field(default_factory=list)
    # Numeric score the verdict was derived from.
    score: int = 0
    # Short hash identifying the raw task text.
    raw_text_hash: str = ""

def _strip_code_blocks(text: str) -> str:
    """Replace backtick-delimited code in *text* with single spaces.

    Fenced blocks go first: everything from a ``` opener through the next
    ``` collapses to one space. A fence that is never closed is removed up
    to the end of the text (a final trailing newline, if any, is kept).
    Afterwards, inline spans wrapped in single backticks on one line are
    collapsed the same way.
    """
    fenced_block = re.compile(r"```[\s\S]*?(?:```|$)")
    inline_span = re.compile(r"`[^`\n]*`")
    # Fences must be removed before inline spans; otherwise the inline
    # pattern would consume pieces of the ``` delimiters.
    return inline_span.sub(" ", fenced_block.sub(" ", text))

def _strip_regex_patterns(text: str) -> str:
    r"""Blank out regex-looking fragments so keywords inside them are ignored.

    Every fragment listed below is replaced by a single space, in this order:

    1. Raw string literals such as ``r"..."`` or ``r'...'`` (an optional
       ``b``/``f`` prefix is accepted). The prefix must not be glued to a
       preceding ASCII letter, digit or underscore, so possessives like
       ``banner's ... poster's`` are not mistaken for a literal. Backslash
       escapes inside the literal, including an escaped quote, stay part
       of the literal.
    2. Escaped image-alt style patterns: ``!\[`` ... ``\]``.
    3. Unescaped ``![ ... ( ... ) ... ]`` patterns with a group inside the
       brackets, e.g. ``![(high|critical)]``.
    4. Stand-alone escaped metacharacters: a backslash followed by one of
       ``[``, ``]``, ``(``, ``)`` or ``|``.

    Args:
        text: Task-spec text, normally already stripped of backtick code.

    Returns:
        The text with the fragments above replaced by spaces.
    """
    raw_literal = (
        r"(?<![A-Za-z0-9_])[bBfF]?r"  # prefix has to start a token
        r"(['\"])"                     # opening quote, remembered as \1
        r"(?:\\.|(?!\1)[^\\\n])*"     # escape pairs first, then plain chars
        r"\\?\1"                       # closing quote; a lone backslash before it is tolerated
    )
    text = re.sub(raw_literal, " ", text)
    # Escaped image-alt pattern: !\[ ... \]
    text = re.sub(r"!\\\[[^\]]*\\\]", " ", text)
    # Unescaped variant carrying a group, e.g. ![(high|critical)]
    text = re.sub(r"!\[[^\]]*\([^)]*\)[^\]]*\]", " ", text)
    # Remaining backslash-escaped brackets / parentheses / pipes
    text = re.sub(r"\\[\[\]()|]", " ", text)
    return text

def _detect_context_signals(text: str) -> List[str]:
    """Return the coding-context signals that occur in *text*.

    Each entry of ``CODING_CONTEXT_SIGNALS`` is searched as a
    case-insensitive substring. Entries that differ only by letter case
    (e.g. "Gemini gate" / "gemini gate") are the same signal under that
    comparison, so they are reported once, using the first-listed
    spelling. This keeps one occurrence from being counted twice.

    Args:
        text: Text to scan.

    Returns:
        Detected signals in the order they appear in
        ``CODING_CONTEXT_SIGNALS``, without case-insensitive duplicates.
    """
    haystack = text.lower()
    # lowercased signal -> first-listed spelling; dicts keep insertion order
    found = {}
    for signal in CODING_CONTEXT_SIGNALS:
        key = signal.lower()
        if key in haystack:
            found.setdefault(key, signal)
    return list(found.values())

def _match_design_keywords_word_boundary(text: str) -> List[str]:
    """Collect the design keywords present in *text*.

    Korean keywords are found by plain substring search. English keywords
    must sit between ``\\b`` word boundaries, so longer words that merely
    contain a keyword (imagery, imagine, ...) do not count. Matching is done
    on the lowercased text.

    Returns:
        Matched keywords, Korean hits first and English hits second, each
        in list order and without duplicates.
    """
    lowered = text.lower()
    korean_hits = [word for word in DESIGN_KEYWORDS_KO if word in lowered]
    english_hits = [
        kw
        for kw in DESIGN_KEYWORDS_EN
        if re.search(rf"\b{re.escape(kw)}\b", lowered)
    ]
    # dict.fromkeys drops repeats while keeping first-seen order.
    return list(dict.fromkeys(korean_hits + english_hits))

def classify_task_routing(
    task_desc: str,
    task_id: Optional[str] = None,
    task_file: Optional[str] = None,
    write_audit: bool = True,
) -> RoutingDecision:
    """Classify a task description and optionally record the decision.

    Pipeline:
      1. Strip backtick code (fenced and inline), then regex-looking
         fragments, from the description.
      2. Match design keywords against that stripped text.
      3. Detect coding-context signals against the ORIGINAL text, so a
         signal that only appears inside a code block still counts.
      4. score = matched keyword count - 2 * context signal count.
      5. score >= 1 gives "design" / "block"; anything else gives
         "coding" / "allow".

    Args:
        task_desc: Task spec text to classify.
        task_id: Optional identifier, passed through to the audit entry.
        task_file: Optional file reference, passed through to the audit entry.
        write_audit: When true, hand the decision to ``_append_audit``.

    Returns:
        The populated ``RoutingDecision``.
    """
    sanitized = _strip_regex_patterns(_strip_code_blocks(task_desc))

    keywords = _match_design_keywords_word_boundary(sanitized)
    signals = _detect_context_signals(task_desc)
    score = len(keywords) - 2 * len(signals)

    classification, result = ("design", "block") if score >= 1 else ("coding", "allow")

    decision = RoutingDecision(
        classification=classification,
        result=result,
        matched_keywords=keywords,
        context_signals=signals,
        score=score,
        # First 16 hex characters of the SHA-256 of the raw description.
        raw_text_hash=hashlib.sha256(task_desc.encode("utf-8")).hexdigest()[:16],
    )

    if write_audit:
        _append_audit(decision, task_id=task_id, task_file=task_file)
    return decision

def _append_audit(decision: RoutingDecision, task_id: Optional[str], task_file: Optional[str]) -> None:
    """Append one routing decision to the audit JSONL file, best effort.

    The entry is a single JSON object on its own line, written to
    ``AUDIT_PATH`` in append mode with a UTC timestamp. A missing
    ``task_id`` / ``task_file`` is stored as an empty string.

    Auditing must never block dispatch, so any ``OSError`` is swallowed.
    That covers both creating the audit directory and opening or writing
    the file.

    Args:
        decision: The decision to record.
        task_id: Optional task identifier.
        task_file: Optional task file reference.
    """
    entry = {
        "ts": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "task_id": task_id or "",
        "task_file": task_file or "",
        "matched_keywords": decision.matched_keywords,
        "context_signals": decision.context_signals,
        "classification": decision.classification,
        "result": decision.result,
        "score": decision.score,
        "raw_text_hash": decision.raw_text_hash,
    }
    try:
        # mkdir lives inside the try: a read-only or otherwise unusable
        # audit location must not propagate and abort the dispatch.
        AUDIT_PATH.parent.mkdir(parents=True, exist_ok=True)
        with AUDIT_PATH.open("a", encoding="utf-8") as f:
            # ensure_ascii=False keeps non-ASCII keywords readable in the log.
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")
    except OSError:
        pass  # an audit failure is not a reason to block dispatch

# Combined keyword list for dispatch.py to fall back on when importing
# classify_task_routing fails.
FALLBACK_DESIGN_KEYWORDS = [*DESIGN_KEYWORDS_KO, *DESIGN_KEYWORDS_EN]