"""Lightpanda 크롤링 래퍼 — 텍스트 기반 대량 크롤링 전용

사용법:
    from tools.lightpanda_crawler import LightpandaCrawler

    async with LightpandaCrawler() as crawler:
        result = await crawler.fetch("https://example.com")
        # result.title, result.text, result.html, result.links

        results = await crawler.fetch_many(urls, concurrency=25)

스크린샷이 필요하면 Playwright+Chrome을 사용하세요 (이 모듈은 텍스트 전용).
"""

from __future__ import annotations

import asyncio
import logging
import re
import time
from dataclasses import dataclass, field
from typing import Any
from urllib.parse import urljoin, urlparse

from playwright.async_api import Browser, Page, async_playwright

logger = logging.getLogger("lightpanda_crawler")

# ---------------------------------------------------------------------------
# Internal constants & custom exceptions
# ---------------------------------------------------------------------------

# Selector-result keys that should yield a list of all matches rather than a
# single value (consumed by _is_plural_key / extract_structured).
_PLURAL_KEYS = frozenset(
    "links hrefs urls items results images articles tags categories lists "
    "entries rows cols columns paragraphs sections comments reviews "
    "products prices names emails phones authors".split()
)


class CrawlError(Exception):
    """크롤링 관련 오류 기본 클래스"""

    def __init__(self, message: str, url: str = "", cause: Exception | None = None) -> None:
        super().__init__(message)
        self.url = url
        self.cause = cause


class CrawlTimeoutError(CrawlError):
    """Raised when a page load exceeds the configured timeout."""


class CrawlConnectionError(CrawlError):
    """Raised when the CDP connection (Lightpanda or Chrome fallback) fails."""


class CrawlJSError(CrawlError):
    """Raised when JavaScript evaluation inside a page fails."""


# ---------------------------------------------------------------------------
# 결과 데이터클래스
# ---------------------------------------------------------------------------


@dataclass
class CrawlResult:
    """Result of crawling a single page."""

    url: str  # requested URL
    title: str  # page <title> text
    text: str  # tag-stripped plain text extracted from the HTML
    html: str  # full rendered HTML source
    links: list[str]  # absolute http(s) links found in the page
    meta: dict[str, str]  # <meta name=...> name -> content map (names lowercased)
    status: int  # HTTP status code of the response
    elapsed_ms: float  # wall-clock crawl duration in milliseconds
    engine: str  # "lightpanda" or "chrome"


# ---------------------------------------------------------------------------
# 내부 헬퍼
# ---------------------------------------------------------------------------


def _extract_links(html: str, base_url: str) -> list[str]:
    """Collect every quoted <a href="..."> in *html* as an absolute URL.

    Fragment-only, javascript:, mailto: and tel: targets are skipped, and
    only http/https results are kept (order preserved, duplicates kept).
    """
    skip_prefixes = ("#", "javascript:", "mailto:", "tel:")
    absolute_urls: list[str] = []
    for raw_href in re.findall(r'<a[^>]+href=["\']([^"\']+)["\']', html, re.IGNORECASE):
        candidate = raw_href.strip()
        if not candidate or candidate.startswith(skip_prefixes):
            continue
        resolved = urljoin(base_url, candidate)
        if urlparse(resolved).scheme in ("http", "https"):
            absolute_urls.append(resolved)
    return absolute_urls


def _extract_meta(html: str) -> dict[str, str]:
    """Extract name -> content pairs from <meta name="..."> tags.

    Handles both attribute orders (name-first and content-first) and, unlike
    the previous implementation, tolerates other attributes between them
    (e.g. ``<meta name="viewport" id="v" content="...">`` was silently
    missed before because the patterns required ``\\s+`` directly between
    the two attributes).

    Args:
        html: Raw HTML source.

    Returns:
        Mapping of lowercased meta name to its content attribute; a later
        tag with the same name overwrites an earlier one.
    """
    meta: dict[str, str] = {}
    # Named groups let both attribute orders share one extraction path,
    # replacing the fragile pattern.startswith() branch of the old code.
    # [^>]*? cannot cross a '>' so a match never spans two tags.
    patterns = (
        r'<meta\b[^>]*?\bname=["\'](?P<name>[^"\']+)["\'][^>]*?\bcontent=["\'](?P<content>[^"\']*)["\']',
        r'<meta\b[^>]*?\bcontent=["\'](?P<content>[^"\']*)["\'][^>]*?\bname=["\'](?P<name>[^"\']+)["\']',
    )
    for pattern in patterns:
        for match in re.finditer(pattern, html, re.IGNORECASE):
            meta[match.group("name").lower()] = match.group("content")
    return meta


def _html_to_text(html: str) -> str:
    """Strip tags from *html* and return whitespace-normalized text (naive).

    Removes <script>/<style> blocks and HTML comments before stripping the
    remaining tags so their contents never leak into the extracted text.

    Args:
        html: Raw HTML source.

    Returns:
        Single-line text with runs of whitespace collapsed to one space.
    """
    # Remove script/style blocks. The backreference (\1) pairs each opening
    # tag with its OWN closing tag — the old (script|style) closing group
    # could stop at the wrong tag (e.g. a "</style>" literal inside a
    # <script> block). \b prevents matching unrelated tags like <scripty>.
    text = re.sub(r"<(script|style)\b[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE)
    # Remove HTML comments: they may contain '>' and would otherwise leak
    # fragments past the naive tag stripper below.
    text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
    # Replace remaining tags with a space so adjacent words don't merge.
    text = re.sub(r"<[^>]+>", " ", text)
    # Collapse all whitespace runs.
    return re.sub(r"\s+", " ", text).strip()


def _is_plural_key(key: str) -> bool:
    """Return True when *key* should map to a list of matches.

    A key is plural if (case-insensitively) it is one of the known plural
    words in _PLURAL_KEYS or simply ends with "s".
    """
    normalized = key.lower()
    if normalized in _PLURAL_KEYS:
        return True
    return normalized.endswith("s")


# ---------------------------------------------------------------------------
# 메인 크롤러 클래스
# ---------------------------------------------------------------------------


class LightpandaCrawler:
    """Lightpanda CDP-based asynchronous crawler (text extraction only).

    Connects to a Lightpanda instance over CDP and transparently falls back
    to a regular Chrome CDP endpoint when Lightpanda is unreachable.

    Use as an async context manager:
        async with LightpandaCrawler() as crawler:
            result = await crawler.fetch("https://example.com")
    """

    def __init__(
        self,
        cdp_endpoint: str = "ws://127.0.0.1:9333",
        chrome_endpoint: str = "ws://127.0.0.1:9222",
        timeout_ms: int = 30000,
    ) -> None:
        """
        Args:
            cdp_endpoint: Lightpanda CDP websocket endpoint (tried first).
            chrome_endpoint: Chrome CDP websocket endpoint (fallback).
            timeout_ms: Default per-request timeout in milliseconds.
        """
        self.cdp_endpoint = cdp_endpoint
        self.chrome_endpoint = chrome_endpoint
        self.timeout_ms = timeout_ms

        self._browser: Browser | None = None
        self._engine: str = "lightpanda"  # backend we are actually connected to
        self._pw_context: Any = None  # async_playwright() context manager

    # ------------------------------------------------------------------
    # Context manager
    # ------------------------------------------------------------------

    async def __aenter__(self) -> "LightpandaCrawler":
        self._pw_context = async_playwright()
        self._pw = await self._pw_context.__aenter__()
        try:
            await self._connect()
        except BaseException:
            # Fix: if the initial CDP connection fails, tear down what we
            # already started so the playwright driver process is not leaked.
            await self.__aexit__(None, None, None)
            raise
        return self

    async def __aexit__(self, *args: Any) -> None:
        if self._browser is not None:
            try:
                await self._browser.close()
            except Exception:
                pass  # best-effort shutdown; connection may already be dead
            self._browser = None
        if self._pw_context is not None:
            try:
                await self._pw_context.__aexit__(*args)
            except Exception:
                pass  # best-effort shutdown
            self._pw_context = None

    # ------------------------------------------------------------------
    # Connection management
    # ------------------------------------------------------------------

    async def _connect(self) -> None:
        """Attempt a CDP connection: Lightpanda first, then Chrome fallback.

        Raises:
            CrawlConnectionError: if neither endpoint accepts a connection.
        """
        # 1st choice: Lightpanda
        try:
            self._browser = await self._pw.chromium.connect_over_cdp(self.cdp_endpoint)
            self._engine = "lightpanda"
            logger.info("Connected to Lightpanda at %s", self.cdp_endpoint)
            return
        except Exception as lp_err:
            logger.warning("Lightpanda 연결 실패 (%s), Chrome fallback 시도...", lp_err)

        # 2nd choice: Chrome fallback
        try:
            self._browser = await self._pw.chromium.connect_over_cdp(self.chrome_endpoint)
            self._engine = "chrome"
            logger.info("Connected to Chrome (fallback) at %s", self.chrome_endpoint)
            return
        except Exception as chrome_err:
            logger.error("Chrome fallback 연결 실패: %s", chrome_err)
            raise CrawlConnectionError(
                f"Lightpanda({self.cdp_endpoint})와 Chrome({self.chrome_endpoint}) 모두 연결 실패",
                cause=chrome_err,
            ) from chrome_err

    async def _reconnect(self) -> None:
        """Close the current browser (if any) and connect again (auto-recovery)."""
        if self._browser is not None:
            try:
                await self._browser.close()
            except Exception:
                pass  # the old connection may already be gone
            self._browser = None
        await self._connect()

    async def _ensure_page(self, url: str, timeout_ms: int) -> tuple[Page, int]:
        """Open a new page and navigate it to *url*; retry once after reconnect.

        The caller owns the returned page and must close it.

        Returns:
            (page, http_status) — status defaults to 200 when the navigation
            produced no response object (e.g. about:blank / same-document).

        Raises:
            CrawlConnectionError: no browser connection available.
            CrawlTimeoutError: the (retried) navigation timed out.
            CrawlError: any other navigation failure after the retry.
        """
        if self._browser is None:
            raise CrawlConnectionError("브라우저가 연결되지 않았습니다.", url=url)

        async def _open_page() -> tuple[Page, int]:
            assert self._browser is not None
            page = await self._browser.new_page()
            try:
                response = await page.goto(url, timeout=timeout_ms)
            except Exception:
                await page.close()  # don't leak the page on navigation failure
                raise
            # Fix: report the real HTTP status instead of assuming 200.
            status = response.status if response is not None else 200
            return page, status

        try:
            return await _open_page()
        except Exception as first_err:
            logger.warning("페이지 로드 실패 (%s), 재연결 시도...", first_err)
            try:
                await self._reconnect()
                return await _open_page()
            except Exception as second_err:
                # Heuristic: classify timeouts by message text — the exact
                # exception type depends on the CDP backend in use.
                err_msg = str(second_err).lower()
                if "timeout" in err_msg:
                    raise CrawlTimeoutError(f"타임아웃: {url}", url=url, cause=second_err) from second_err
                raise CrawlError(f"페이지 로드 실패: {url} — {second_err}", url=url, cause=second_err) from second_err

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def fetch(self, url: str, timeout_ms: int | None = None) -> CrawlResult:
        """Crawl a single page.

        Args:
            url: URL to crawl.
            timeout_ms: Request timeout (None uses self.timeout_ms).

        Returns:
            CrawlResult carrying title/text/html/links/meta and the actual
            HTTP status of the navigation response.
        """
        effective_timeout = timeout_ms if timeout_ms is not None else self.timeout_ms
        start = time.monotonic()

        page, status = await self._ensure_page(url, effective_timeout)
        try:
            title = await page.title()
            html = await page.content()
        finally:
            await page.close()

        elapsed = (time.monotonic() - start) * 1000

        return CrawlResult(
            url=url,
            title=title,
            text=_html_to_text(html),
            html=html,
            links=_extract_links(html, url),
            meta=_extract_meta(html),
            status=status,
            elapsed_ms=elapsed,
            engine=self._engine,
        )

    async def fetch_many(
        self,
        urls: list[str],
        concurrency: int = 25,
        timeout_ms: int | None = None,
    ) -> list[CrawlResult]:
        """Crawl many URLs in parallel.

        An asyncio.Semaphore caps in-flight requests at *concurrency*.

        Args:
            urls: URLs to crawl.
            concurrency: Maximum simultaneous requests (default 25).
            timeout_ms: Per-request timeout.

        Returns:
            CrawlResults in the same order as *urls*.

        Raises:
            CrawlError: the first failure; remaining tasks are cancelled
            before the exception propagates.
        """
        if not urls:
            return []

        sem = asyncio.Semaphore(concurrency)

        async def _fetch_one(url: str) -> CrawlResult:
            async with sem:
                try:
                    return await self.fetch(url, timeout_ms=timeout_ms)
                except CrawlError:
                    raise
                except Exception as e:
                    raise CrawlError(f"크롤링 실패: {url}", url=url, cause=e) from e

        tasks = [asyncio.create_task(_fetch_one(u)) for u in urls]
        try:
            results = await asyncio.gather(*tasks)
        except BaseException:
            # Fix: gather() does not cancel sibling tasks when one fails —
            # without this, the remaining crawls keep running detached after
            # we raise. Cancel them and wait for the cancellations to settle.
            for task in tasks:
                task.cancel()
            await asyncio.gather(*tasks, return_exceptions=True)
            raise
        return list(results)

    async def evaluate(self, url: str, js_code: str) -> Any:
        """Load *url* and evaluate JavaScript in the page.

        Args:
            url: URL to load.
            js_code: JS expression or function to run.

        Returns:
            The JSON-serializable evaluation result.

        Raises:
            CrawlJSError: if the evaluation itself fails.
        """
        page, _ = await self._ensure_page(url, self.timeout_ms)
        try:
            try:
                result = await page.evaluate(js_code)
            except Exception as e:
                raise CrawlJSError(f"JS 실행 오류: {e}", url=url, cause=e) from e
        finally:
            await page.close()
        return result

    async def extract_structured(self, url: str, selectors: dict[str, str]) -> dict[str, Any]:
        """Extract structured data with CSS selectors.

        Selector syntax:
          - plain: "h1" → inner text of the first matching element
          - "::attr(name)" suffix: attribute value (e.g. "a::attr(href)")
          - plural result key ("links", "items", ...): list of all matches

        Args:
            url: URL to crawl.
            selectors: {"result_key": "css_selector"} mapping.

        Returns:
            {"result_key": extracted value} dict; singular keys with no
            match map to None, plural keys to a (possibly empty) list.
        """
        page, _ = await self._ensure_page(url, self.timeout_ms)
        try:
            result: dict[str, Any] = {}
            for key, selector in selectors.items():
                # Split off a trailing ::attr(name) suffix, if present.
                attr_match = re.search(r"::attr\(([^)]+)\)$", selector)
                if attr_match:
                    attr_name = attr_match.group(1)
                    pure_selector = selector[: attr_match.start()]
                else:
                    attr_name = None
                    pure_selector = selector

                if _is_plural_key(key):
                    # Plural key → collect every matching element.
                    elements = await page.query_selector_all(pure_selector)
                    values: list[str] = []
                    for el in elements:
                        val = await el.get_attribute(attr_name) if attr_name else await el.inner_text()
                        if val is not None:
                            values.append(val)
                    result[key] = values
                else:
                    # Singular key → first matching element only.
                    el = await page.query_selector(pure_selector)
                    if el is None:
                        result[key] = None
                    elif attr_name:
                        result[key] = await el.get_attribute(attr_name)
                    else:
                        result[key] = await el.inner_text()
        finally:
            await page.close()

        return result

    # ------------------------------------------------------------------
    # Intentionally unimplemented
    # ------------------------------------------------------------------

    async def screenshot(self, url: str, **kwargs: Any) -> bytes:  # noqa: ARG002
        """Not implemented: use Playwright+Chrome when screenshots are needed."""
        raise NotImplementedError(
            "screenshot()은 이 모듈에서 지원하지 않습니다. " "스크린샷이 필요하면 Playwright+Chrome을 사용하세요."
        )

    async def pdf(self, url: str, **kwargs: Any) -> bytes:  # noqa: ARG002
        """Not implemented: use Playwright+Chrome when PDF generation is needed."""
        raise NotImplementedError(
            "pdf()는 이 모듈에서 지원하지 않습니다. " "PDF 생성이 필요하면 Playwright+Chrome을 사용하세요."
        )
