"""OCR 기반 키프레임 검증 모듈.

IDS §0.1 — 첫/중간/끝 프레임 한글 텍스트 존재 확인
- pytesseract 사용 가능 시: OCR로 텍스트 추출
- pytesseract 미설치 시: PIL 픽셀 분석으로 콘텐츠 존재 여부 확인 (fallback)
"""
from __future__ import annotations

import os
import re
import shutil
import subprocess
import tempfile
import warnings
from pathlib import Path
from typing import Optional


def _get_ffmpeg_bin() -> str:
    """Locate the ffmpeg executable and return its path.

    Candidates are tried in this order:

    1. The ``FFMPEG_BIN`` environment variable, if set and naming an
       existing path.
    2. The fixed location ``/home/jay/.local/bin/ffmpeg``.
    3. Whatever ``ffmpeg`` resolves to on ``PATH``.

    Returns:
        The first candidate that was found, as a string.

    Raises:
        FileNotFoundError: If none of the candidates is available.
    """
    override = os.environ.get("FFMPEG_BIN", "")
    fixed_location = "/home/jay/.local/bin/ffmpeg"

    if override and Path(override).exists():
        return str(override)
    if Path(fixed_location).exists():
        return fixed_location

    found_on_path = shutil.which("ffmpeg")
    if not found_on_path:
        raise FileNotFoundError("ffmpeg를 찾을 수 없습니다.")
    return found_on_path


def extract_keyframes(
    video_path: Path,
    output_dir: Path,
) -> tuple[Path, Path, Path]:
    """Extract the first, middle and last frames of a video as PNG files.

    The frames are written to ``output_dir`` as ``keyframe_first.png``,
    ``keyframe_middle.png`` and ``keyframe_last.png``; ffmpeg is invoked with
    ``-y`` so existing files with those names are overwritten.

    Args:
        video_path: Path of the source video file.
        output_dir: Directory that receives the PNG files. It is created
            (including parents) when it does not exist.

    Returns:
        ``(first_frame, middle_frame, last_frame)`` tuple of output paths.

    Raises:
        FileNotFoundError: If no ffmpeg executable can be located.
        RuntimeError: If ffmpeg exits with a non-zero status while
            extracting a frame.
    """
    video_path = Path(video_path)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    ffmpeg_bin = _get_ffmpeg_bin()

    # Determine the video length from the container header only. Invoked with
    # an input but no output, ffmpeg prints the "Duration:" line to stderr and
    # then exits with an error status, which is deliberately ignored. This
    # avoids decoding the entire video just to learn its length.
    # errors="replace" keeps non-UTF-8 metadata in ffmpeg's output from
    # raising UnicodeDecodeError while the text is captured.
    probe = subprocess.run(
        [ffmpeg_bin, "-i", str(video_path)],
        capture_output=True,
        text=True,
        errors="replace",
    )
    duration = _parse_duration(probe.stderr)

    # Seek positions in seconds. Every position is at least 0.1 s, so very
    # short clips collapse to the same timestamp for all three frames.
    timestamps = {
        "first": 0.1,
        "middle": max(0.1, duration / 2.0),
        "last": max(0.1, duration - 0.1),
    }

    frame_paths: list[Path] = []
    for label in ("first", "middle", "last"):
        t = timestamps[label]
        out_path = output_dir / f"keyframe_{label}.png"
        cmd = [
            ffmpeg_bin, "-y",
            "-ss", f"{t:.3f}",  # placed before -i: seek on the input side
            "-i", str(video_path),
            "-frames:v", "1",
            "-q:v", "2",
            str(out_path),
        ]
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            errors="replace",
        )
        if result.returncode != 0:
            raise RuntimeError(
                f"키프레임 추출 실패 ({label}, t={t}s):\n{result.stderr}"
            )
        frame_paths.append(out_path)

    return frame_paths[0], frame_paths[1], frame_paths[2]


def _parse_duration(ffmpeg_stderr: str) -> float:
    """Parse the ``Duration: HH:MM:SS[.fff]`` value from ffmpeg's stderr.

    Args:
        ffmpeg_stderr: Text captured from ffmpeg's standard error stream.

    Returns:
        The duration in seconds, or ``1.0`` when no ``Duration:`` timestamp
        can be found in the text.
    """
    match = re.search(r"Duration:\s*(\d+):(\d+):(\d+\.?\d*)", ffmpeg_stderr)
    if match is None:
        return 1.0

    hours, minutes, seconds = match.group(1, 2, 3)
    # Sum the integral part first, then add the fractional seconds.
    whole_seconds = int(hours) * 3600 + int(minutes) * 60
    return whole_seconds + float(seconds)


def ocr_frame(frame_path: Path) -> str:
    """Extract text from a frame image using Tesseract OCR.

    OCR is run with the ``kor+eng`` language set and page segmentation
    mode 6. This function never raises: when the OCR dependencies cannot be
    imported, or when opening the image or running OCR fails, a
    ``UserWarning`` is emitted and an empty string is returned.

    Args:
        frame_path: Path of the frame image file.

    Returns:
        The recognised text with surrounding whitespace stripped, or an
        empty string if OCR is unavailable or failed.
    """
    try:
        import pytesseract  # type: ignore[import-not-found]
        from PIL import Image

        # Use a context manager so the image's file handle is released
        # deterministically, including when OCR itself raises.
        with Image.open(str(frame_path)) as img:
            text = pytesseract.image_to_string(
                img, lang="kor+eng", config="--psm 6"
            )
        return text.strip()
    except ImportError:
        warnings.warn(
            "pytesseract가 설치되지 않았습니다. OCR을 건너뜁니다. "
            "설치: pip install pytesseract && apt-get install tesseract-ocr tesseract-ocr-kor",
            UserWarning,
            stacklevel=2,
        )
        return ""
    except Exception as e:
        # Best effort: report the failure as a warning rather than raising.
        warnings.warn(f"OCR 오류 ({type(e).__name__}): {e}", UserWarning, stacklevel=2)
        return ""


def _is_frame_non_blank(frame_path: Path) -> bool:
    """Report whether a frame image appears to contain visible content.

    The image is converted to 8-bit grayscale and considered non-blank when
    both the mean pixel value and the population standard deviation of the
    pixel values exceed 5.

    Args:
        frame_path: Path of the image file to inspect.

    Returns:
        True if the frame looks non-blank. False if it looks blank, has no
        pixels, or cannot be analysed for any reason (for example Pillow is
        not importable or the file cannot be read).
    """
    try:
        from PIL import Image, ImageStat

        # Release the file handle as soon as the grayscale copy exists.
        with Image.open(str(frame_path)) as img:
            gray = img.convert("L")

        # ImageStat derives its statistics from the image histogram, which
        # avoids copying every pixel into a Python list and looping over it.
        stats = ImageStat.Stat(gray)
        if not stats.count[0]:
            return False
        return stats.mean[0] > 5 and stats.stddev[0] > 5
    except Exception:
        # Best-effort check: any failure is treated as "no content".
        return False


def validate_korean_frames(
    video_path: Path,
    expected_korean_chars: list[str],
    *,
    output_dir: Optional[Path] = None,
) -> dict:  # type: ignore[type-arg]
    """Check the first, middle and last frames of a video for expected text.

    Each keyframe is run through ``ocr_frame``. When OCR yields text, the
    frame passes if any of the expected strings occurs in it. When OCR yields
    no text (OCR unavailable, OCR failed, or nothing recognised), the check
    falls back to ``_is_frame_non_blank`` and the entry is flagged with
    ``"fallback": True``.

    Args:
        video_path: Path of the video file to validate.
        expected_korean_chars: Characters or strings to look for in the OCR
            text; one match is enough for a frame to pass.
        output_dir: Directory in which the keyframe PNGs are stored. When
            None, a temporary directory is used and removed again before
            this function returns.

    Returns:
        A dictionary with three entries::

            {
                "first": {"frame": "first", "ocr_text": str,
                          "has_expected": bool, "fallback": bool},
                "middle": {...},
                "last": {...},
            }

        If keyframe extraction fails, a ``UserWarning`` is emitted and every
        entry has an empty ``ocr_text``, ``has_expected`` False and
        ``fallback`` True.
    """
    if output_dir is None:
        frames_dir = Path(tempfile.mkdtemp(prefix="motion_ocr_"))
        owns_frames_dir = True
    else:
        frames_dir = Path(output_dir)
        frames_dir.mkdir(parents=True, exist_ok=True)
        owns_frames_dir = False

    try:
        try:
            first, middle, last = extract_keyframes(video_path, frames_dir)
        except Exception as e:
            warnings.warn(f"키프레임 추출 실패: {e}", UserWarning, stacklevel=2)
            # Extraction failed: still return a well-formed, all-negative result.
            empty = {"frame": "", "ocr_text": "", "has_expected": False, "fallback": True}
            return {
                "first": dict(empty, frame="first"),
                "middle": dict(empty, frame="middle"),
                "last": dict(empty, frame="last"),
            }

        results: dict[str, dict] = {}  # type: ignore[type-arg]
        for label, frame_path in [("first", first), ("middle", middle), ("last", last)]:
            ocr_text = ocr_frame(frame_path)
            # An empty OCR result cannot be told apart from OCR being
            # unavailable, so both cases take the pixel-analysis fallback.
            is_fallback = ocr_text == ""

            if ocr_text:
                has_expected = any(ch in ocr_text for ch in expected_korean_chars)
            else:
                has_expected = _is_frame_non_blank(frame_path)

            results[label] = {
                "frame": label,
                "ocr_text": ocr_text,
                "has_expected": bool(has_expected),
                "fallback": is_fallback,
            }

        return results
    finally:
        # The result carries no file paths, so frames in a directory created
        # here are unreachable by the caller; remove them instead of leaking
        # a temporary directory on every call.
        if owns_frames_dir:
            shutil.rmtree(frames_dir, ignore_errors=True)