"""
test_comparison.py - opendataloader-pdf 파싱 품질 검증 테스트

목적: opendataloader-pdf를 통한 PDF 파싱 품질 데이터 수집.
     키워드 존재, 조항 번호 감지, 특수문자 처리 등을 확인한다.
"""

import re
import sys
import time
from typing import Any

import pytest

sys.path.insert(0, "/home/jay/workspace/libs")

from doc_parser import ParseResult, parse_pdf  # type: ignore[import-not-found]

# ---------------------------------------------------------------------------
# 상수
# ---------------------------------------------------------------------------

# Absolute path of the statute PDF that the fixtures in this module read.
PDF_PATH = (
    "/home/jay/.cokacdir/workspace/autoset/"
    + "금융소비자_보호에_관한_법률법률제21065호20260102.pdf"
)

# Core keywords that are expected to appear in the statute text.
REQUIRED_KEYWORDS: list[str] = [
    "금융소비자", "금융상품", "금융상품판매업자", "손해배상", "청약철회",
]

# Special characters whose presence in the extracted text is reported.
SPECIAL_CHARS: list[str] = list("「」ㆍ·①②③")

# Article-number pattern, e.g. 제1조, 제23조, 제100조 (inner whitespace allowed).
ARTICLE_PATTERN = re.compile(r"제\s*\d+\s*조")

# Clause-number pattern: a circled numeral (① through ⑩) or a "1. " style marker.
CLAUSE_PATTERN = re.compile(r"[①②③④⑤⑥⑦⑧⑨⑩]|\b\d+\.\s")


# ---------------------------------------------------------------------------
# Helper
# ---------------------------------------------------------------------------


def _separator(char: str = "-", width: int = 60) -> str:
    """Return a horizontal rule: ``char`` repeated ``width`` times.

    Args:
        char: Text to repeat; defaults to a hyphen.
        width: Number of repetitions; defaults to 60.

    Returns:
        The repeated string (empty when ``width`` is zero or negative).
    """
    return "".join([char] * width)


# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------


@pytest.fixture(scope="module")
def pdf_bytes() -> bytes:
    """Read the test PDF at ``PDF_PATH`` and return its raw bytes.

    The fixture is module-scoped, so the file is read once per test module.
    ``PDF_PATH`` is a machine-specific absolute path; when the file is not
    present the dependent tests are skipped with an explicit reason instead
    of erroring during fixture setup.

    Returns:
        The complete content of the PDF file.
    """
    try:
        with open(PDF_PATH, "rb") as pdf_file:
            return pdf_file.read()
    except FileNotFoundError:
        # Only a missing file is treated as "environment not prepared";
        # any other I/O problem (permissions, etc.) still surfaces as an error.
        pytest.skip(f"test PDF not found: {PDF_PATH}")


@pytest.fixture(scope="module")
def odl_result(pdf_bytes: bytes) -> dict[str, Any]:
    """Parse the PDF once with ``parse_pdf`` and share the timed outcome.

    The fixture is module-scoped, so the parse runs a single time and every
    test in this module reuses the same result.

    Args:
        pdf_bytes: Raw PDF content provided by the ``pdf_bytes`` fixture.

    Returns:
        A dict with two entries:
            "parse_result": the value returned by ``parse_pdf``.
            "elapsed": seconds spent inside the ``parse_pdf`` call, measured
                with ``time.perf_counter``.
    """
    started_at = time.perf_counter()
    parsed: ParseResult = parse_pdf(pdf_bytes)
    finished_at = time.perf_counter()

    return dict(parse_result=parsed, elapsed=finished_at - started_at)


# ---------------------------------------------------------------------------
# 품질 테스트 1 - 텍스트 추출 정확도 (길이 & 키워드)
# ---------------------------------------------------------------------------


class TestTextExtractionAccuracy:
    """Quality item 1: accuracy of the extracted text (length and keywords)."""

    def test_text_not_empty(self, odl_result: dict[str, Any]) -> None:
        """Fail when the parser returned an empty text."""
        extracted: str = odl_result["parse_result"].text
        assert len(extracted) > 0, "opendataloader: 텍스트가 비어 있음"

    def test_keyword_presence(self, odl_result: dict[str, Any]) -> None:
        """Print, for each required keyword, whether the text contains it.

        Report-only: a missing keyword is flagged in the output but does not
        fail the test.
        """
        extracted: str = odl_result["parse_result"].text
        heavy_rule = _separator("=")

        print()
        print(heavy_rule)
        print("[키워드 포함 여부]")
        print(_separator())

        for keyword in REQUIRED_KEYWORDS:
            if keyword in extracted:
                status = "O"
            else:
                status = "X (누락)"
            print(f"  {keyword:<20} {status}")

        print(heavy_rule)

    def test_text_sample(self, odl_result: dict[str, Any]) -> None:
        """Print the first 500 characters so extraction quality can be eyeballed."""
        extracted: str = odl_result["parse_result"].text
        preview = extracted[:500]
        heavy_rule = _separator("=")

        print()
        print(heavy_rule)
        print("[텍스트 샘플 (첫 500자)]")
        print(_separator())
        print("-- opendataloader-pdf --")
        print(preview)
        print(heavy_rule)

    def test_text_length_report(self, odl_result: dict[str, Any]) -> None:
        """Print the length of the extracted text and require it to be non-zero."""
        char_count = len(odl_result["parse_result"].text)
        heavy_rule = _separator("=")

        print()
        print(heavy_rule)
        print("[텍스트 길이]")
        print(_separator())
        print(f"  opendataloader-pdf 텍스트 길이: {char_count:,} 문자")
        print(heavy_rule)

        assert char_count > 0


# ---------------------------------------------------------------------------
# 품질 테스트 2 - 표(table) 구조 보존
# ---------------------------------------------------------------------------


class TestTableStructurePreservation:
    """Quality item 2: whether table structure survives parsing."""

    def test_table_structure_detail(self, odl_result: dict[str, Any]) -> None:
        """Print the row/column shape of every extracted table.

        Report-only. For each table the row count is the length of its
        "rows" entry and the column count is the length of its "headers"
        entry; either entry defaults to an empty list when absent.
        """
        tables: list[dict[str, Any]] = odl_result["parse_result"].tables
        heavy_rule = _separator("=")

        print()
        print(heavy_rule)
        print("[테이블 구조 상세]")
        print(_separator())

        print(f"  감지된 테이블 수: {len(tables)}")
        if not tables:
            print("  (감지된 테이블 없음)")
        else:
            for position, table in enumerate(tables):
                header_cells: list[str] = table.get("headers", [])
                body_rows: list[list[str]] = table.get("rows", [])
                print(
                    f"  Table[{position}]: {len(body_rows)}행 x {len(header_cells)}열"
                )
                print(f"    헤더: {header_cells}")
                # Show one sample row only when the table has any rows.
                if body_rows:
                    print(f"    첫 행: {body_rows[0]}")

        print(heavy_rule)


# ---------------------------------------------------------------------------
# 품질 테스트 3 - 조항 번호 정확성
# ---------------------------------------------------------------------------


class TestArticleNumberAccuracy:
    """Quality item 3: accuracy of article numbers (the 제N조 pattern)."""

    def test_article_count(self, odl_result: dict[str, Any]) -> None:
        """Report detected article references and require at least one.

        Prints the total number of ``ARTICLE_PATTERN`` matches, the number of
        distinct articles, and up to the first 20 distinct articles in
        ascending numeric order. Distinctness is decided by the article
        number, so spacing variants such as "제1조" and "제 1 조" count once.
        """
        dl_text: str = odl_result["parse_result"].text

        dl_articles = ARTICLE_PATTERN.findall(dl_text)

        # ARTICLE_PATTERN tolerates whitespace inside a match, so the raw match
        # strings for one article can differ. Deduplicate on the numeric part
        # (every match contains digits, so findall(...)[0] always exists) and
        # render the result in a canonical "제N조" form.
        article_numbers = sorted(
            {int(re.findall(r"\d+", match)[0]) for match in dl_articles}
        )
        dl_unique = [f"제{number}조" for number in article_numbers]

        print()
        print(_separator("="))
        print("[조항 번호(제N조) 감지]")
        print(_separator())
        print(f"  opendataloader-pdf 감지 조항 수: {len(dl_articles)}")
        print(f"  unique 조항 수: {len(dl_unique)}")
        print(
            f"  조항 목록: {dl_unique[:20]}{'...' if len(dl_unique) > 20 else ''}"
        )
        print(_separator("="))

        assert len(dl_articles) > 0, "opendataloader: 조항 번호가 하나도 감지되지 않음"

    def test_clause_number_count(self, odl_result: dict[str, Any]) -> None:
        """Print how many clause markers ``CLAUSE_PATTERN`` finds (report-only)."""
        dl_text: str = odl_result["parse_result"].text

        dl_clauses = CLAUSE_PATTERN.findall(dl_text)

        print()
        print(_separator("="))
        print("[항 번호(①②③) 감지]")
        print(_separator())
        print(f"  opendataloader-pdf 감지 항 번호 수: {len(dl_clauses)}")
        print(_separator("="))


# ---------------------------------------------------------------------------
# 품질 테스트 4 - 특수문자 처리
# ---------------------------------------------------------------------------


class TestSpecialCharacterHandling:
    """Quality item 4: handling of special characters such as 「」 and ㆍ."""

    def test_special_char_presence(self, odl_result: dict[str, Any]) -> None:
        """Print the occurrence count of each special character (report-only).

        Also prints how many of the characters in ``SPECIAL_CHARS`` occur at
        least once in the extracted text.
        """
        extracted: str = odl_result["parse_result"].text

        print()
        print(_separator("="))
        print("[특수문자 처리]")
        print(_separator())
        print(f"  {'문자':<8} {'opendataloader-pdf 횟수':>24}")
        print(_separator("-", 36))

        covered = 0
        for symbol in SPECIAL_CHARS:
            occurrences = extracted.count(symbol)
            if occurrences > 0:
                # A positive count is the same condition as "symbol in text".
                covered += 1
                cell = f"{occurrences}회"
            else:
                cell = "미발견"
            print(f"  {symbol!r:<8} {cell:>24}")

        print(_separator())
        print(f"  커버 특수문자: {covered}/{len(SPECIAL_CHARS)}")
        print(_separator("="))

    def test_unicode_normalization_check(self, odl_result: dict[str, Any]) -> None:
        """Print whether opening and closing 「」 brackets occur equally often.

        Report-only: only the two counts are compared; nothing is asserted.
        """
        extracted: str = odl_result["parse_result"].text

        opening = extracted.count("「")
        closing = extracted.count("」")
        balanced = opening == closing

        print()
        print(_separator("="))
        print("[법령 괄호「」쌍 일치 여부]")
        print(_separator())
        print(
            f"  opendataloader-pdf - 「:{opening}회, 」:{closing}회, "
            f"쌍 일치: {balanced}"
        )
        print(_separator("="))


# ---------------------------------------------------------------------------
# 품질 테스트 5 - 처리 속도
# ---------------------------------------------------------------------------


class TestProcessingSpeed:
    """Quality item 5: processing speed."""

    def test_parsing_speed(self, odl_result: dict[str, Any]) -> None:
        """Print total and per-page parse time and require a positive total.

        The page count is read from the parse result's metadata under
        "page_count" and defaults to 1 when that key is absent.
        """
        elapsed: float = odl_result["elapsed"]
        page_total: int = odl_result["parse_result"].metadata.get("page_count", 1)

        # Guard against dividing by a zero (or negative) page count.
        if page_total > 0:
            seconds_per_page = elapsed / page_total
        else:
            seconds_per_page = 0.0

        heavy_rule = _separator("=")
        print()
        print(heavy_rule)
        print("[처리 속도]")
        print(_separator())
        print(f"  opendataloader-pdf 처리 시간: {elapsed:.3f}초")
        print(f"  총 페이지: {page_total}, 페이지당: {seconds_per_page:.4f}초")
        print(heavy_rule)

        assert elapsed > 0, "opendataloader-pdf 처리 시간이 0 이하"


# ---------------------------------------------------------------------------
# 종합 요약 테스트
# ---------------------------------------------------------------------------


class TestSummaryReport:
    """Overall summary output."""

    def test_overall_summary(self, odl_result: dict[str, Any]) -> None:
        """Print the key metric of every quality item in one table.

        Reported values: text length, number of required keywords found,
        number of tables, number of distinct articles, number of special
        characters found, and parse time. The only assertion is that the
        extracted text is non-empty.
        """
        dl_text: str = odl_result["parse_result"].text
        dl_tables: list[dict[str, Any]] = odl_result["parse_result"].tables
        dl_elapsed: float = odl_result["elapsed"]

        dl_articles = ARTICLE_PATTERN.findall(dl_text)
        # ARTICLE_PATTERN tolerates whitespace inside a match, so count distinct
        # articles by their number; otherwise "제1조" and "제 1 조" would be
        # counted as two different articles.
        dl_article_numbers = {
            int(re.findall(r"\d+", match)[0]) for match in dl_articles
        }
        dl_special = sum(1 for ch in SPECIAL_CHARS if ch in dl_text)
        dl_kw = sum(1 for kw in REQUIRED_KEYWORDS if kw in dl_text)

        print()
        print(_separator("=", 60))
        print("[ opendataloader-pdf 파싱 품질 종합 요약 ]")
        print(_separator("=", 60))
        print(f"  {'항목':<30} {'opendataloader-pdf':>20}")
        print(_separator("-", 60))
        print(f"  {'텍스트 길이 (문자)':<30} {len(dl_text):>20,}")
        kw_header = f"키워드 검출 ({len(REQUIRED_KEYWORDS)}개 중)"
        print(f"  {kw_header:<30} {dl_kw:>20}")
        print(f"  {'감지 테이블 수':<30} {len(dl_tables):>20}")
        print(f"  {'감지 조항 수(제N조)':<30} {len(dl_article_numbers):>20}")
        print(f"  {'특수문자 커버 수':<30} {dl_special:>20}")
        print(f"  {'처리 시간 (초)':<30} {dl_elapsed:>20.3f}")
        print(_separator("=", 60))
        print()

        assert len(dl_text) > 0