"""
doc_parser.py - 문서 파싱 모듈

PDF: opendataloader-pdf 기반 파싱
기타 포맷 (DOCX, PPTX, HTML, 이미지): docling lazy import로 처리
"""

import hashlib
import json
import logging
import os
import pathlib
import tempfile
from dataclasses import dataclass, field

import opendataloader_pdf

# Module-level logger; configuration is left to the importing application.
logger = logging.getLogger(__name__)

# Parse results are cached as JSON files in a hidden directory next to this module.
CACHE_DIR = pathlib.Path(__file__).parent.joinpath(".parse_cache")

# Java 11 location (needed because opendataloader_pdf.convert() runs a subprocess).
_JAVA_HOME = "/home/jay/.local/jdk/jdk-11.0.25+9-jre"

# Supported file extension -> format string mapping.
# Several extensions share one format, so those are grouped with dict.fromkeys.
_EXT_TO_FORMAT: dict[str, str] = {
    ".pdf": "pdf",
    ".docx": "docx",
    ".pptx": "pptx",
    **dict.fromkeys((".html", ".htm"), "html"),
    **dict.fromkeys(
        (".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".gif", ".webp"),
        "image",
    ),
}


@dataclass
class ParseResult:
    """Outcome of parsing one document.

    Attributes are documented inline; ``metadata`` defaults to a fresh empty
    dict per instance.
    """

    # Full document text.
    text: str
    # Text grouped per page, in ascending page order.
    pages: list[str]
    # Extracted tables, each shaped like {"headers": [...], "rows": [[...], ...]}.
    tables: list[dict]
    # Free-form document metadata (e.g. "page_count", "filename").
    metadata: dict = field(default_factory=dict)


def _content_hash(data: bytes) -> str:
    """Return the SHA-256 digest of ``data`` as a lowercase hex string."""
    hasher = hashlib.sha256()
    hasher.update(data)
    return hasher.hexdigest()


def _cache_path(content_hash: str) -> pathlib.Path:
    """Return the location of the cache file for ``content_hash``.

    The file lives directly under ``CACHE_DIR`` and is named
    ``<content_hash>.json``.
    """
    file_name = f"{content_hash}.json"
    return CACHE_DIR.joinpath(file_name)


def _save_cache(content_hash: str, result: ParseResult) -> None:
    """Serialize ``result`` to JSON and store it in the cache.

    The cache directory is created on demand. The file is written as UTF-8
    with non-ASCII characters kept verbatim (``ensure_ascii=False``).
    """
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    target = _cache_path(content_hash)
    payload = {
        attr: getattr(result, attr)
        for attr in ("text", "pages", "tables", "metadata")
    }
    serialized = json.dumps(payload, ensure_ascii=False)
    target.write_text(serialized, encoding="utf-8")


def _load_cache(content_hash: str) -> ParseResult | None:
    """Load a cached ParseResult for ``content_hash``.

    Returns:
        The cached result, or None when no cache file exists or the file
        cannot be read/decoded. An unreadable cache file is deleted
        (best effort) so the next parse can recreate it.
    """
    cache_file = _cache_path(content_hash)
    if cache_file.exists():
        try:
            payload = json.loads(cache_file.read_text(encoding="utf-8"))
            return ParseResult(
                **{key: payload[key] for key in ("text", "pages", "tables", "metadata")}
            )
        except Exception as exc:
            logger.warning("캐시 로드 실패 (삭제 후 재파싱): %s", exc)
            # Removing the broken entry is best effort; a failure here is ignored.
            try:
                cache_file.unlink()
            except OSError:
                pass
    return None


def clear_cache() -> None:
    """Delete every ``.json`` cache file directly inside the cache directory.

    Non-file entries and files with other suffixes are left alone. A missing
    cache directory is not an error; the completion message is logged either way.
    """
    entries = CACHE_DIR.iterdir() if CACHE_DIR.exists() else ()
    for entry in entries:
        if not (entry.is_file() and entry.suffix == ".json"):
            continue
        entry.unlink()
    logger.info("파싱 캐시 전체 삭제 완료")


def _collect_elements(node: dict, elements: list | None = None) -> list[dict]:
    """opendataloader JSON을 재귀 순회하여 모든 element를 flat list로 수집"""
    if elements is None:
        elements = []
    if "content" in node:
        elements.append(node)
    for kid in node.get("kids", []):
        _collect_elements(kid, elements)
    for item in node.get("list items", []):
        _collect_elements(item, elements)
    return elements


def _extract_tables_from_elements(elements: list[dict]) -> list[dict]:
    """Build ``{"headers", "rows"}`` dicts from elements whose type is ``"table"``.

    Two shapes are handled:

    * Structured: the table has non-empty ``"kids"``. Each kid of type
      ``"table row"`` contributes one row made of the ``"content"`` of its
      ``"table cell"`` kids (missing content becomes ``""``). The first row is
      used as headers. A table whose kids contain no rows yields nothing.
    * Unstructured: the table has no kids but has ``"content"``. The text is
      split into lines and each line into ``|``-separated cells; blank cells
      and blank rows are dropped. At least two lines and a non-empty header
      line are required.
    """

    def split_cells(line: str) -> list[str]:
        # Pipe-separated cells, trimmed, with empty cells discarded.
        return [part.strip() for part in line.split("|") if part.strip()]

    extracted: list[dict] = []
    for element in elements:
        if element.get("type") != "table":
            continue

        child_nodes = element.get("kids", [])
        if child_nodes:
            grid = [
                [
                    cell.get("content", "")
                    for cell in row_node.get("kids", [])
                    if cell.get("type") == "table cell"
                ]
                for row_node in child_nodes
                if row_node.get("type") == "table row"
            ]
            if grid:
                header_row, *body_rows = grid
                extracted.append({"headers": header_row, "rows": body_rows})
            # A table with kids never falls back to the text form.
            continue

        if "content" not in element:
            continue

        text_lines = element["content"].strip().split("\n")
        if len(text_lines) < 2:
            continue

        header_row = split_cells(text_lines[0])
        if not header_row:
            continue
        body_rows = [cells for cells in map(split_cells, text_lines[1:]) if cells]
        extracted.append({"headers": header_row, "rows": body_rows})
    return extracted


def _ensure_java_on_path() -> None:
    """Point JAVA_HOME at ``_JAVA_HOME`` and put its ``bin`` directory first on PATH.

    The process environment is modified in place because the converter is
    started as a subprocess and inherits it. The operation is idempotent:
    repeated calls do not keep growing PATH.
    """
    java_bin = f"{_JAVA_HOME}/bin"
    os.environ["JAVA_HOME"] = _JAVA_HOME

    current_path = os.environ.get("PATH", "")
    # Drop earlier occurrences of java_bin so it ends up exactly once, in front.
    # An empty PATH must not produce a trailing separator (which would mean
    # "current directory" to the shell).
    other_entries = (
        [entry for entry in current_path.split(os.pathsep) if entry != java_bin]
        if current_path
        else []
    )
    os.environ["PATH"] = os.pathsep.join([java_bin, *other_entries])


def _build_pdf_result(raw_data: dict) -> ParseResult:
    """Convert the decoded opendataloader JSON document into a ParseResult."""
    # Flatten the tree under the root's "kids" into content-bearing elements.
    all_elements: list[dict] = []
    for kid in raw_data.get("kids", []):
        _collect_elements(kid, all_elements)

    # Group non-empty content by page number.
    pages_by_number: dict[int, list[str]] = {}
    for elem in all_elements:
        page_no = elem.get("page number")
        content = elem.get("content", "")
        if page_no is not None and content:
            pages_by_number.setdefault(page_no, []).append(content)

    # Pages without any content are absent from the list, so list indices are
    # not guaranteed to line up with page numbers.
    pages: list[str] = ["\n".join(pages_by_number[p]) for p in sorted(pages_by_number)]
    full_text = "\n".join(pages)

    tables = _extract_tables_from_elements(all_elements)

    metadata: dict = {"page_count": raw_data.get("number of pages", len(pages))}
    # Optional fields are copied only when present and non-empty.
    for source_key, target_key in (
        ("author", "author"),
        ("title", "title"),
        ("file name", "filename"),
    ):
        value = raw_data.get(source_key)
        if value:
            metadata[target_key] = value

    return ParseResult(text=full_text, pages=pages, tables=tables, metadata=metadata)


def parse_pdf(file_bytes: bytes, *, use_cache: bool = True) -> ParseResult:
    """
    Parse PDF bytes and return a ParseResult.

    The bytes are written to a temporary file, converted to JSON with
    ``opendataloader_pdf.convert()``, and the JSON is turned into text, pages,
    tables and metadata. All temporary files are removed afterwards.

    Args:
        file_bytes: Raw bytes of the PDF file.
        use_cache: When True (default), look up and store the result in the
            SHA-256 keyed cache. A failure to *write* the cache is logged and
            ignored; the parsed result is still returned.

    Returns:
        ParseResult: Parsing result (text, pages, tables, metadata).

    Raises:
        ValueError: If ``file_bytes`` is empty. ValueError subclasses raised
            during parsing (e.g. a JSON decode error) also propagate unchanged.
        RuntimeError: If the converter produced no JSON file, or any other
            error occurred during conversion/parsing.
    """
    if not file_bytes:
        raise ValueError("file_bytes가 비어 있습니다.")

    # Cache lookup
    ch = _content_hash(file_bytes)
    if use_cache:
        cached = _load_cache(ch)
        if cached is not None:
            logger.info("캐시 히트: hash=%s", ch[:12])
            return cached

    logger.info("PDF 파싱 시작: %d bytes", len(file_bytes))

    # convert() uses a subprocess internally, which needs Java on the environment.
    _ensure_java_on_path()

    try:
        # One scratch directory holds both the input PDF and the converter
        # output, so a single context manager cleans up everything.
        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as work_dir:
            with tempfile.NamedTemporaryFile(suffix=".pdf", dir=work_dir, delete=False) as tmp:
                tmp.write(file_bytes)
                tmp_path = tmp.name

            output_dir = pathlib.Path(work_dir) / "output"
            output_dir.mkdir()

            opendataloader_pdf.convert(
                input_path=tmp_path,
                output_dir=str(output_dir),
                format="json",
                quiet=True,
            )

            # Read the generated .json file (sorted for a deterministic pick).
            json_files = sorted(output_dir.glob("*.json"))
            if not json_files:
                raise RuntimeError("opendataloader_pdf.convert()가 JSON 파일을 생성하지 않았습니다.")

            raw_data = json.loads(json_files[0].read_text(encoding="utf-8"))

        result = _build_pdf_result(raw_data)

        logger.info(
            "PDF 파싱 완료: 페이지=%d, 테이블=%d, 텍스트길이=%d",
            result.metadata["page_count"],
            len(result.tables),
            len(result.text),
        )
    except (ValueError, RuntimeError):
        raise
    except Exception as exc:
        logger.error("PDF 파싱 오류: %s", exc)
        raise RuntimeError(f"PDF 파싱 실패: {exc}") from exc

    # Caching is an optimization: a failed write must not discard a good parse.
    if use_cache:
        try:
            _save_cache(ch, result)
        except (OSError, TypeError, ValueError) as exc:
            logger.warning("캐시 저장 실패 (파싱 결과는 그대로 반환): %s", exc)

    return result


def parse_document(file_bytes: bytes, filename: str) -> ParseResult:
    """
    Parse a document, choosing the parser from the file extension.

    PDFs are delegated to ``parse_pdf`` (with its default caching). Every other
    supported format is written to a temporary file and converted with docling,
    which is imported lazily so it is only required for non-PDF input.

    Args:
        file_bytes: Raw bytes of the document.
        filename: File name including its extension; the extension is matched
            case-insensitively against ``_EXT_TO_FORMAT``.

    Returns:
        ParseResult: Parsing result. For non-PDF input the metadata contains
        ``page_count`` and ``filename``.

    Raises:
        ValueError: If ``file_bytes`` is empty or the extension is unsupported.
        RuntimeError: If an error occurs during conversion (including failure
            to write the temporary input file).
    """
    if not file_bytes:
        raise ValueError("file_bytes가 비어 있습니다.")

    _, ext = os.path.splitext(filename)
    ext_lower = ext.lower()

    fmt = _EXT_TO_FORMAT.get(ext_lower)
    if fmt is None:
        raise ValueError(f"지원하지 않는 파일 확장자입니다: '{ext}'. " f"지원 목록: {sorted(_EXT_TO_FORMAT.keys())}")

    logger.info("parse_document: filename=%s, format=%s", filename, fmt)

    if fmt == "pdf":
        return parse_pdf(file_bytes)

    # Non-PDF formats: docling is imported lazily.
    from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption

    tmp_path: str | None = None
    try:
        # Created inside the try so the file is removed even if the write fails.
        with tempfile.NamedTemporaryFile(suffix=ext_lower, delete=False) as tmp:
            tmp_path = tmp.name
            tmp.write(file_bytes)

        accelerator_options = AcceleratorOptions(device=AcceleratorDevice.CPU)
        pipeline_options = PdfPipelineOptions()
        pipeline_options.accelerator_options = accelerator_options
        pipeline_options.do_ocr = False

        # NOTE(review): these options are registered for InputFormat.PDF only,
        # but PDFs never reach this branch (they return via parse_pdf above).
        # Confirm against the docling docs whether DOCX/PPTX/HTML/image inputs
        # are affected by this configuration at all.
        converter = DocumentConverter(
            format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
        )
        conv_result = converter.convert(tmp_path)

        if conv_result is None:
            raise RuntimeError("Docling 변환 결과가 None입니다.")

        doc = conv_result.document
        # Fall back to markdown when the plain-text export is empty.
        full_text = doc.export_to_text() or doc.export_to_markdown()

        # Group item text by the page numbers found in each item's provenance.
        pages_by_number: dict[int, list[str]] = {}
        for item, _ in doc.iterate_items():
            text_val = getattr(item, "text", None)
            if not text_val:
                continue
            # An item may carry several provenance entries; add its text at
            # most once per page so it is not duplicated within a page.
            seen_pages: set = set()
            for prov in getattr(item, "prov", None) or []:
                page_no = getattr(prov, "page_no", None)
                if page_no is None or page_no in seen_pages:
                    continue
                seen_pages.add(page_no)
                pages_by_number.setdefault(page_no, []).append(text_val)

        if pages_by_number:
            pages = ["\n".join(pages_by_number[p]) for p in sorted(pages_by_number)]
        elif full_text:
            # No page information available: treat the whole text as one page.
            pages = [full_text]
        else:
            pages = []

        # Table extraction; a table that fails to export is skipped, not fatal.
        tables: list[dict] = []
        for table_item in doc.tables:
            try:
                df = table_item.export_to_dataframe()
                if df is not None and not df.empty:
                    headers = list(df.columns.astype(str))
                    rows = [list(row.astype(str)) for _, row in df.iterrows()]
                    tables.append({"headers": headers, "rows": rows})
            except Exception as exc:
                logger.warning("테이블 추출 실패 (건너뜀): %s", exc)

        page_count = len(doc.pages) if doc.pages else len(pages)
        metadata: dict = {
            "page_count": page_count,
            "filename": filename,
        }

        return ParseResult(
            text=full_text,
            pages=pages,
            tables=tables,
            metadata=metadata,
        )
    except (ValueError, RuntimeError):
        raise
    except Exception as exc:
        logger.error("문서 파싱 오류 (%s): %s", filename, exc)
        raise RuntimeError(f"문서 파싱 실패 ({filename}): {exc}") from exc
    finally:
        # Best-effort removal of the temporary input file.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass