"""insurance_spider.py — 보험사 공개 데이터 Spider 구현.

Scrapling Spider ABC를 상속한 InsuranceSpider 클래스와
Response 이력 추적을 위한 ResponseHistory 유틸리티.

목적:
    보험사 공시 페이지 등 합법적 공개 데이터를 주기적으로 수집하는
    Spider를 TDD로 구현한다. InsuranceCrawler의 추출 로직을 재활용하며,
    Scrapling Spider 라이프사이클(on_start/on_close/on_error/on_scraped_item)에
    따라 출력 디렉토리 관리, 이력 저장, 에러 로깅 등을 처리한다.

주의사항:
    - 합법적 공개 데이터(보험사 공시 페이지 등)만을 대상으로 합니다.
    - 실제 크롤링 전 반드시 대상 사이트의 robots.txt를 확인하고 준수해야 합니다.
    - download_delay=1.0, concurrent_requests_per_domain=2로 서버 부하를 최소화합니다.
"""

import json
import logging
from pathlib import Path
from typing import Any, AsyncGenerator, Dict, Optional, Set

from insurance_crawler import InsuranceCrawler
from scrapling.engines.static import FetcherSession
from scrapling.spiders import Request, SessionManager
from scrapling.spiders import Spider as _Spider
from scrapling.spiders.result import CrawlResult

# Module-level logger named after this module. The classes visible in this
# file log through the spider's own ``self.logger`` and do not reference it.
log = logging.getLogger(__name__)


class ResponseHistory:
    """Track visited responses, preserving redirect chains.

    Each call to :meth:`record` stores the URL, status code and (optionally)
    the redirect path and response headers of one visit. The collected
    entries can be queried in memory or exported as a JSON file.
    """

    def __init__(self) -> None:
        # One dict per recorded visit, in insertion order.
        self._history: list[dict[str, Any]] = []

    def record(
        self,
        url: str,
        status: int,
        redirects: Optional[list[str]] = None,
        headers: Optional[dict[str, str]] = None,
    ) -> None:
        """Append one visit to the history.

        Args:
            url: Final response URL.
            status: HTTP status code.
            redirects: Redirect path; the key is omitted from the entry
                when this is None.
            headers: Response headers; the key is omitted from the entry
                when this is None.
        """
        entry: dict[str, Any] = {"url": url, "status": status}
        # Store copies: if the caller later mutates or reuses the containers
        # it passed in, entries that were already recorded must not change.
        if redirects is not None:
            entry["redirects"] = list(redirects)
        if headers is not None:
            entry["headers"] = dict(headers)
        self._history.append(entry)

    def get_history(self) -> list[dict[str, Any]]:
        """Return all entries as a new list.

        The list itself is a copy, but the entry dicts inside it are the
        same objects held internally.
        """
        return list(self._history)

    def get_chain(self, url: str) -> list[dict[str, Any]]:
        """Return every entry related to *url*.

        An entry matches when *url* is its final URL or appears in its
        recorded redirect path.

        Args:
            url: URL to look up.

        Returns:
            Matching entries, in the order they were recorded.
        """
        return [
            entry
            for entry in self._history
            if entry["url"] == url or url in entry.get("redirects", ())
        ]

    def save(self, path: "str | Path") -> None:
        """Write the history to a JSON file.

        Args:
            path: Destination file path; missing parent directories are
                created.
        """
        file_path = Path(path)
        file_path.parent.mkdir(parents=True, exist_ok=True)
        file_path.write_text(
            json.dumps(self._history, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )

    def clear(self) -> None:
        """Remove all recorded entries."""
        self._history.clear()


class InsuranceSpider(_Spider):
    """Spider that collects public insurance disclosure data.

    Plugs the InsuranceCrawler extraction routines (extract_with_selector /
    extract_similar / extract_table) into the Scrapling Spider lifecycle.

    The ``extraction_config`` dict selects the extraction mode and its
    parameters:
        - mode="css"     : extract_with_selector() (default)
        - mode="table"   : extract_table()
        - mode="similar" : extract_similar()

    Polite crawling (SP-5):
        - concurrent_requests=4
        - concurrent_requests_per_domain=2
        - download_delay=1.0
    """

    name = "insurance_spider"
    start_urls: list[str] = []
    allowed_domains: Set[str] = set()

    # SP-5: polite crawling settings
    concurrent_requests: int = 4
    concurrent_requests_per_domain: int = 2
    download_delay: float = 1.0

    def __init__(
        self,
        start_urls: Optional[list[str]] = None,
        allowed_domains: Optional[Set[str]] = None,
        output_dir: "str | Path" = "crawl_output",
        crawldir: "str | Path | None" = None,
        interval: float = 300.0,
        extraction_config: Optional[dict[str, Any]] = None,
    ) -> None:
        """Initialise the spider.

        Args:
            start_urls: URLs the crawl starts from.
            allowed_domains: Set of domains the crawl may visit.
            output_dir: Directory for result files (default: "crawl_output").
            crawldir: Checkpoint directory (SP-3); None disables checkpoints.
            interval: Checkpoint save interval in seconds (default: 300.0).
            extraction_config: Extraction settings dict.
        """
        # Shadow the class-level defaults with per-instance copies so several
        # instances never share (or mutate) the same list/set.
        if start_urls is not None:
            self.start_urls = list(start_urls)
        if allowed_domains is not None:
            self.allowed_domains = set(allowed_domains)

        self.output_dir: Path = Path(output_dir)
        self.extraction_config: Optional[dict[str, Any]] = extraction_config
        self._crawler = InsuranceCrawler(adaptive=False)
        self.response_history = ResponseHistory()

        # Spider ABC __init__ runs last; per the original note it is also
        # where configure_sessions() gets invoked.
        super().__init__(crawldir=crawldir, interval=interval)

    def configure_sessions(self, manager: SessionManager) -> None:
        """Register a FetcherSession as the default session."""
        manager.add("default", FetcherSession(), default=True)

    @staticmethod
    def _resolve_next_url(base_url: str, href: str) -> Optional[str]:
        """Turn a next-page ``href`` into a fetchable URL, or None to skip it.

        Args:
            base_url: URL of the page the link was found on.
            href: Raw ``href`` attribute value of the next-page link.

        Returns:
            ``href`` unchanged when it is an absolute http(s) URL, the link
            joined onto ``base_url`` when it has no scheme, or None when it
            uses another scheme (e.g. ``javascript:``, ``mailto:``) or
            cannot be parsed.
        """
        from urllib.parse import urljoin, urlsplit

        try:
            # Classify by the parsed scheme, not a string prefix: a relative
            # path such as "http-info/page2.html" starts with "http" but
            # still has to be resolved against the current page.
            scheme = urlsplit(href).scheme
            if scheme in ("http", "https"):
                return href
            if scheme:
                # An HTTP fetcher cannot follow javascript:/mailto:/tel: links.
                return None
            return urljoin(base_url, href)
        except ValueError:
            # Malformed link (urllib raises ValueError, e.g. on a bad IPv6
            # literal); treat it as not followable.
            return None

    async def parse(self, response: Any) -> AsyncGenerator[Dict[str, Any] | Request | None, None]:
        """Extract insurance data and follow the next page.

        The extraction mode is chosen from ``extraction_config``:
            - mode="table"   : extract_table()
            - mode="similar" : extract_similar()
            - mode="css"     : extract_with_selector() (default)

        Every item gets a ``_source_url`` metadata key. When the config has
        a ``next_page_selector`` and it matches a link with a followable
        ``href``, a Request for that page is yielded after the items.

        Args:
            response: Scrapling Response object (the original docstring
                describes it as a Selector subclass).

        Yields:
            Item dicts, then at most one Request for the next page.
        """
        if self.extraction_config is None:
            # NOTE(review): on this path the response is not added to
            # response_history either; confirm that is intended.
            return

        config = self.extraction_config
        mode = config.get("mode", "css")
        source_url: str = getattr(response, "url", "") or ""

        # Record the visit in the response history.
        status: int = getattr(response, "status", 0) or 0
        self.response_history.record(source_url, status)

        items: list[dict[str, Any]] = []

        if mode == "table":
            table_selector: str = config.get("table_selector", "table")
            items = self._crawler.extract_table(response, table_selector=table_selector)

        elif mode == "similar":
            reference_selector: str = config.get("reference_selector", "")
            fields: Optional[dict[str, str]] = config.get("fields")
            threshold: float = float(config.get("threshold", 0.2))
            items = self._crawler.extract_similar(
                response,
                reference_selector=reference_selector,
                fields=fields,
                threshold=threshold,
            )

        else:
            # Any other mode value is handled as "css" (the default).
            css_selector: str = config.get("css_selector", "")
            fields = config.get("fields")
            identifier: str = config.get("identifier", "")
            items = self._crawler.extract_with_selector(
                response,
                css_selector=css_selector,
                identifier=identifier,
                fields=fields,
            )

        # Tag every item with the page it came from, then emit it.
        for item in items:
            item["_source_url"] = source_url
            yield item

        # Follow the next page, if one is configured and present.
        next_page_selector: Optional[str] = config.get("next_page_selector")
        if not next_page_selector:
            return

        next_link = response.css(next_page_selector)
        if not next_link:
            return

        first_match = next_link.first
        href: Optional[str] = first_match.attrib.get("href") if first_match else None
        if not href:
            return

        next_url = self._resolve_next_url(source_url, href)
        if next_url is None:
            self.logger.debug("Skipping unfollowable next-page link %r on %s", href, source_url)
            return

        yield Request(next_url, sid="default", callback=self.parse)

    async def on_start(self, resuming: bool = False) -> None:
        """Create ``output_dir`` and log the start of the crawl.

        Args:
            resuming: True when the crawl resumes from a checkpoint.
        """
        self.output_dir.mkdir(parents=True, exist_ok=True)
        if resuming:
            self.logger.info("Resuming spider from checkpoint (output_dir=%s)", self.output_dir)
        else:
            self.logger.info("Starting InsuranceSpider (output_dir=%s)", self.output_dir)

    async def on_close(self) -> None:
        """Save the response history to a file and log the shutdown."""
        history_path = self.output_dir / "response_history.json"
        try:
            self.response_history.save(history_path)
            self.logger.info("Saved response history to %s", history_path)
        except Exception as exc:
            # Best effort: failing to persist the history is logged as a
            # warning and does not propagate out of this hook.
            self.logger.warning("Failed to save response history: %s", exc)
        self.logger.info("InsuranceSpider closed")

    async def on_error(self, request: Request, error: Exception) -> None:
        """Log a failed request.

        Args:
            request: The request that failed.
            error: The exception that was raised.
        """
        self.logger.error(
            "Error fetching %s: %s: %s",
            request.url,
            type(error).__name__,
            error,
        )

    async def on_scraped_item(self, item: dict[str, Any]) -> Optional[dict[str, Any]]:
        """Filter out empty items.

        An item is dropped (None is returned) when every value other than
        ``_source_url`` is None or blank once converted to a string.

        Args:
            item: The scraped item.

        Returns:
            The item unchanged when it has content, otherwise None (drop).
        """
        for key, value in item.items():
            # The _source_url metadata does not count as content.
            if key == "_source_url":
                continue
            if value is not None and str(value).strip() != "":
                return item
        return None

    def run(self, output_format: str = "jsonl") -> CrawlResult:
        """Run the spider and export the collected items.

        Args:
            output_format: "json" or "jsonl" (default: "jsonl"). Any other
                value falls back to "jsonl" with a warning.

        Returns:
            CrawlResult (stats + items).
        """
        # Normalise before crawling. Unknown values were always written as
        # JSON Lines; falling back explicitly keeps the file extension
        # consistent with the content actually written.
        if output_format not in ("json", "jsonl"):
            self.logger.warning(
                "Unsupported output_format %r; falling back to 'jsonl'",
                output_format,
            )
            output_format = "jsonl"

        result: CrawlResult = self.start()

        # Make sure the output directory exists before writing.
        self.output_dir.mkdir(parents=True, exist_ok=True)

        output_path = self.output_dir / f"items.{output_format}"

        if output_format == "json":
            result.items.to_json(output_path, indent=True)
        else:
            result.items.to_jsonl(output_path)

        self.logger.info(
            "Exported %d items to %s",
            len(result.items),
            output_path,
        )

        return result

    @staticmethod
    def create_cron_config(
        schedule: str,
        start_urls: list[str],
        output_dir: str,
        extraction_config: Optional[dict[str, Any]] = None,
    ) -> dict[str, Any]:
        """Build the settings dict used for ``cokacdir --cron`` integration.

        Args:
            schedule: Cron expression (e.g. "0 6 * * *").
            start_urls: URLs the crawl starts from.
            output_dir: Directory for result files.
            extraction_config: Extraction settings dict.

        Returns:
            Dict with the keys "spider", "schedule", "start_urls",
            "output_dir" and "extraction_config".
        """
        return {
            "spider": "insurance_spider",
            "schedule": schedule,
            "start_urls": start_urls,
            "output_dir": output_dir,
            "extraction_config": extraction_config,
        }