"""magazine-ppt-ko / verify_korean — 한글 100% 검증 모듈.

IDS Phase 2의 G2 검증 모듈. manifest의 모든 한글 문자열이 HTML(또는 PPTX 추출
텍스트)에 string-match로 존재하는지 확인. OCR을 사용하지 않는 코드 패스 검증이며,
font-family fallback chain도 정규식으로 점검한다.

표준 라이브러리만 사용한다 (외부 의존 0).

Public API:
    verify_html(manifest_path, html_dir) -> dict[str, Any]
    verify_pptx_text(manifest_path, pptx_path) -> dict[str, Any]
    verify_font_stack_in_html(html_dir) -> dict[str, Any]
"""

from __future__ import annotations

import json
import re
from pathlib import Path
from typing import Any

# Font family names the slide CSS is expected to use: primary face, then fallback.
PRIMARY_FONT: str = "Pretendard"
FALLBACK_FONT: str = "Noto Sans KR"

# Matches a font-family declaration whose first family is Pretendard (quoted or
# bare), optionally followed by Noto Sans KR as the second family.
_FONT_FAMILY_RE = re.compile(
    r"font-family\s*:\s*"                  # property name and colon
    r"['\"]?Pretendard['\"]?\s*"           # first family, optionally quoted
    r"(?:,\s*['\"]?Noto Sans KR['\"]?)?",  # optional second family
    flags=re.I,
)


def _read_manifest(manifest_path: str | Path) -> dict[str, Any]:
    """Load the manifest JSON file and return its parsed content.

    Args:
        manifest_path: Location of the manifest file; it is read as UTF-8.

    Returns:
        The decoded JSON document.
    """
    raw_json = Path(manifest_path).read_text(encoding="utf-8")
    parsed: dict[str, Any] = json.loads(raw_json)
    return parsed


def verify_html(
    manifest_path: str | Path,
    html_dir: str | Path,
) -> dict[str, Any]:
    """Check that every manifest Korean string occurs verbatim in its slide HTML.

    Each entry of the manifest's "slides" list names an HTML file ("file") and
    the strings expected in it ("korean_strings"). A string counts as found
    when it is a non-empty substring of the file's text; an empty string is
    always reported as missing.

    Args:
        manifest_path: Path to the manifest.json file.
        html_dir: Directory that contains the slide HTML files.

    Returns:
        {"pass": bool, "results": [...]} where each result holds "file",
        "pass", "missing" and "found". A slide whose HTML file is absent (or
        whose "file" entry is empty or does not name a regular file) fails with
        all its strings in "missing" and an extra "error" key.
    """
    manifest = _read_manifest(manifest_path)
    html_dir_p = Path(html_dir)

    results: list[dict[str, Any]] = []
    overall_pass = True

    # "or []" / "or ''" also cover explicit JSON nulls, as already done for
    # korean_strings below.
    for slide in manifest.get("slides") or []:
        filename = slide.get("file") or ""
        korean_strings: list[str] = list(slide.get("korean_strings", []) or [])
        html_path = html_dir_p / filename

        # is_file() instead of exists(): an empty "file" resolves to html_dir
        # itself, which exists but cannot be read as text.
        if not html_path.is_file():
            results.append(
                {
                    "file": filename,
                    "pass": False,
                    "missing": korean_strings,
                    "found": [],
                    "error": "html file not found",
                }
            )
            overall_pass = False
            continue

        html_text = html_path.read_text(encoding="utf-8")
        missing: list[str] = []
        found: list[str] = []
        for s in korean_strings:
            if s and s in html_text:
                found.append(s)
            else:
                missing.append(s)

        slide_pass = not missing
        if not slide_pass:
            overall_pass = False
        results.append(
            {
                "file": filename,
                "pass": slide_pass,
                "missing": missing,
                "found": found,
            }
        )

    return {"pass": overall_pass, "results": results}


def _collect_paragraphs(shapes: Any) -> list[Any]:
    """Return every text paragraph under *shapes*, descending into groups and tables."""
    paragraphs: list[Any] = []
    for shape in shapes:
        # Group shapes expose their children through a nested "shapes" collection.
        children = getattr(shape, "shapes", None)
        if children is not None:
            paragraphs.extend(_collect_paragraphs(children))
            continue
        if getattr(shape, "has_text_frame", False):
            text_frame = getattr(shape, "text_frame", None)
            if text_frame is not None:
                paragraphs.extend(text_frame.paragraphs)
        # Table text lives in per-cell text frames, not on the graphic frame.
        if getattr(shape, "has_table", False):
            for cell in shape.table.iter_cells():
                paragraphs.extend(cell.text_frame.paragraphs)
    return paragraphs


def verify_pptx_text(
    manifest_path: str | Path,
    pptx_path: str | Path,
) -> dict[str, Any]:
    """Re-read a PPTX with python-pptx and string-match the manifest's Korean text.

    Manifest slides are paired with presentation slides by position. Text is
    gathered from text frames of top-level shapes, shapes nested in groups,
    and table cells. Each slide's text is kept in two forms: non-empty runs
    joined by newlines, and runs concatenated per paragraph with paragraphs
    joined by newlines. A string counts as found when it is a non-empty
    substring of either form, so a phrase split across several runs of one
    paragraph (e.g. by mixed formatting) still matches.

    Args:
        manifest_path: Path to the manifest.json file.
        pptx_path: Path to the compiled .pptx file.

    Returns:
        {"pass": bool, "slide_count": int, "results": [...]} where each result
        holds "file", "pass", "missing" and "found". When the number of slides
        differs from the manifest, the result instead has "pass" False, an
        empty "results" list, "expected_count" and
        "error": "slide count mismatch".

    Raises:
        ImportError: If python-pptx is not installed.
    """
    try:
        import pptx  # type: ignore[import-not-found]
    except ImportError as exc:
        raise ImportError(
            "python-pptx required for verify_pptx_text"
        ) from exc

    manifest = _read_manifest(manifest_path)
    presentation = pptx.Presentation(str(pptx_path))

    # Per slide: (runs joined by "\n", paragraphs joined by "\n").
    slide_texts: list[tuple[str, str]] = []
    for slide in presentation.slides:
        run_parts: list[str] = []
        paragraph_parts: list[str] = []
        for paragraph in _collect_paragraphs(slide.shapes):
            run_texts = [run.text for run in paragraph.runs if run.text]
            if not run_texts:
                continue
            run_parts.extend(run_texts)
            paragraph_parts.append("".join(run_texts))
        slide_texts.append(("\n".join(run_parts), "\n".join(paragraph_parts)))

    results: list[dict[str, Any]] = []
    overall_pass = True
    # "or []" also covers an explicit JSON null for "slides".
    manifest_slides = manifest.get("slides") or []

    if len(slide_texts) != len(manifest_slides):
        return {
            "pass": False,
            "slide_count": len(slide_texts),
            "expected_count": len(manifest_slides),
            "error": "slide count mismatch",
            "results": [],
        }

    for slide_meta, (by_run, by_paragraph) in zip(manifest_slides, slide_texts):
        korean_strings: list[str] = list(slide_meta.get("korean_strings", []) or [])
        missing: list[str] = []
        found: list[str] = []
        for s in korean_strings:
            if s and (s in by_run or s in by_paragraph):
                found.append(s)
            else:
                missing.append(s)
        slide_pass = not missing
        if not slide_pass:
            overall_pass = False
        results.append(
            {
                "file": slide_meta.get("file", ""),
                "pass": slide_pass,
                "missing": missing,
                "found": found,
            }
        )

    return {
        "pass": overall_pass,
        "slide_count": len(slide_texts),
        "results": results,
    }


def verify_font_stack_in_html(html_dir: str | Path) -> dict[str, Any]:
    """Check by regex that the font-family declarations in each HTML file start with Pretendard.

    Every ``*.html`` file directly inside *html_dir* is examined in sorted
    order. A file passes when it has at least one declaration matched by
    ``_FONT_FAMILY_RE`` and no ``font-family`` declaration whose value (after
    stripping whitespace and leading quotes, case-insensitively) starts with
    anything other than "pretendard".

    Args:
        html_dir: Directory that contains the slide HTML files.

    Returns:
        {"pass": bool, "results": [...]} where each result holds "file",
        "pass", "matches" (number of Pretendard-first declarations) and
        "bad_declarations" (the offending declaration values).
    """
    reports: list[dict[str, Any]] = []

    for page in sorted(Path(html_dir).glob("*.html")):
        markup = page.read_text(encoding="utf-8")
        primary_hits = len(_FONT_FAMILY_RE.findall(markup))

        # Collect the value of every font-family declaration that does not
        # lead with Pretendard.
        offenders: list[str] = []
        for raw_value in re.findall(
            r"font-family\s*:\s*([^;}]+)", markup, re.IGNORECASE
        ):
            value = raw_value.strip()
            if not value.lstrip("'\"").lower().startswith("pretendard"):
                offenders.append(value)

        reports.append(
            {
                "file": page.name,
                "pass": primary_hits >= 1 and not offenders,
                "matches": primary_hits,
                "bad_declarations": offenders,
            }
        )

    return {"pass": all(r["pass"] for r in reports), "results": reports}


# Names exported by a star-import; these are the three public verification
# entry points of this module.
__all__ = [
    "verify_html",
    "verify_pptx_text",
    "verify_font_stack_in_html",
]