"""
Tests for ingest.py - TDD implementation for task-510

모든 외부 의존성 (chunker, embedding_service, supabase)을 mock으로 처리.
"""

import hashlib
import os
import sys
import uuid
from typing import Any
from unittest import mock
from unittest.mock import MagicMock, call, patch

# Put this file's parent directory at the front of sys.path. Presumably this is
# what lets the tests below resolve `import ingest` -- confirm against the repo layout.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

import pytest

# ---------------------------------------------------------------------------
# Fixtures & helpers
# ---------------------------------------------------------------------------

# Shared fixture data returned by the mocked collaborators in the tests below.
# The document id is random per test session; chunks and embeddings are fixed.
FAKE_DOCUMENT_ID = str(uuid.uuid4())
FAKE_CHUNKS = [
    {"content": f"chunk {index} text", "chunk_index": index, "token_count": tokens}
    for index, tokens in enumerate((10, 8))
]
# One 1536-element vector per fake chunk.
FAKE_EMBEDDINGS = [[component] * 1536 for component in (0.1, 0.2)]


def _make_supabase_mock(existing_doc: dict[str, Any] | None = None) -> MagicMock:
    """Build a MagicMock that stands in for the Supabase client.

    Every ``table(name)`` call returns a fresh table mock wired for three
    call chains:

    * ``select(...).eq(...).execute()`` -> ``data`` is ``[existing_doc]`` or ``[]``
    * ``insert(...).execute()`` -> ``data`` is ``[{"id": FAKE_DOCUMENT_ID}]`` for
      ``knowledge_documents`` and ``[{}]`` for any other table
    * ``delete().eq(...).execute()`` -> ``data`` is ``[{"id": FAKE_DOCUMENT_ID}]``

    Args:
        existing_doc: Row the duplicate-check SELECT should return. A falsy
            value (``None`` or an empty dict) produces an empty result, i.e.
            "no such document".

    Returns:
        The configured client mock.
    """
    # The response objects are shared by every table mock handed out below.
    select_response = MagicMock(data=[existing_doc] if existing_doc else [])
    insert_doc_response = MagicMock(data=[{"id": FAKE_DOCUMENT_ID}])
    insert_chunk_response = MagicMock(data=[{}])
    delete_response = MagicMock(data=[{"id": FAKE_DOCUMENT_ID}])

    def _table_side_effect(table_name: str) -> MagicMock:
        tbl = MagicMock()

        # SELECT (duplicate check / re-reading the document on reindex)
        tbl.select.return_value.eq.return_value.execute.return_value = select_response

        # INSERT: only the documents table reports a generated id.
        tbl.insert.return_value.execute.return_value = (
            insert_doc_response if table_name == "knowledge_documents" else insert_chunk_response
        )

        # DELETE
        tbl.delete.return_value.eq.return_value.execute.return_value = delete_response

        return tbl

    sb = MagicMock()
    # Behaviour differs per table, so table() is driven by a side_effect.
    # Nothing is configured on sb.table.return_value: it would never be used,
    # because the side_effect's return value always takes precedence.
    sb.table.side_effect = _table_side_effect
    return sb


# ---------------------------------------------------------------------------
# Test 1: ingest_document() called normally → returns document_id (UUID str)
# ---------------------------------------------------------------------------


def test_ingest_document_returns_document_id() -> None:
    """ingest_document() must return the document id as a canonical UUID string."""
    sb_mock = _make_supabase_mock(existing_doc=None)

    with (
        patch("ingest._get_supabase_client", return_value=sb_mock),
        patch("ingest.chunk_text", return_value=FAKE_CHUNKS),
        patch("ingest.get_embeddings_batch", return_value=FAKE_EMBEDDINGS),
    ):
        from ingest import ingest_document

        result = ingest_document(
            title="Test Doc",
            content="Some meaningful content for testing.",
            source="test",
        )

    assert isinstance(result, str)
    # Round-tripping through uuid.UUID rejects strings that are not UUIDs
    # (ValueError) as well as non-canonical spellings such as upper case.
    parsed = uuid.UUID(result)
    assert str(parsed) == result


# ---------------------------------------------------------------------------
# Test 2: ingest_document() → verifies the chunker is called, then get_embeddings_batch
# ---------------------------------------------------------------------------


def test_ingest_document_calls_chunker_then_embeddings() -> None:
    """chunk_text() must get the content; get_embeddings_batch() must get the chunk texts.

    Both collaborators have to be called exactly once. The embedding call is
    tied to the chunker by checking that it receives the ``content`` values of
    the chunks that the mocked chunk_text() returned.
    """
    sb_mock = _make_supabase_mock(existing_doc=None)
    content = "Content to be chunked and embedded."

    with (
        patch("ingest._get_supabase_client", return_value=sb_mock),
        patch("ingest.chunk_text", return_value=FAKE_CHUNKS) as mock_chunk,
        patch("ingest.get_embeddings_batch", return_value=FAKE_EMBEDDINGS) as mock_embed,
    ):
        from ingest import ingest_document

        ingest_document(title="Doc", content=content, source="src")

    # chunk_text: the content may arrive as the first positional argument or as
    # text=...; guard the positional lookup so a keyword-only call does not
    # raise IndexError before the keyword form is even considered.
    mock_chunk.assert_called_once()
    chunk_call = mock_chunk.call_args
    passed_positionally = bool(chunk_call.args) and chunk_call.args[0] == content
    passed_by_keyword = chunk_call.kwargs.get("text") == content
    assert (
        passed_positionally or passed_by_keyword
    ), f"chunk_text was not called with the document content: {chunk_call}"

    # get_embeddings_batch: must receive the list of chunk contents, either
    # positionally or as texts=...; a missing keyword fails the assertion
    # instead of raising KeyError.
    mock_embed.assert_called_once()
    embed_call = mock_embed.call_args
    texts_arg: list[str] | None = embed_call.args[0] if embed_call.args else embed_call.kwargs.get("texts")
    expected_texts = [str(c["content"]) for c in FAKE_CHUNKS]
    assert texts_arg == expected_texts


# ---------------------------------------------------------------------------
# Test 3: ingest_document() → verifies Supabase INSERT calls (documents + chunks)
# ---------------------------------------------------------------------------


def test_ingest_document_inserts_documents_and_chunks() -> None:
    """Ingesting must INSERT into knowledge_documents and into a chunks table."""
    inserted_tables: list[str] = []

    def _tracking_table_side_effect(table_name: str) -> MagicMock:
        """Return a table mock that logs its table name on every insert()."""

        def _insert_side_effect(data: Any) -> MagicMock:
            inserted_tables.append(table_name)
            # Only the documents table reports a generated id.
            row = {"id": FAKE_DOCUMENT_ID} if table_name == "knowledge_documents" else {}
            pending = MagicMock()
            pending.execute.return_value = MagicMock(data=[row])
            return pending

        tbl = MagicMock()
        # Duplicate check: select(...).eq(...).execute() yields no rows.
        tbl.select.return_value.eq.return_value.execute.return_value = MagicMock(data=[])
        tbl.insert.side_effect = _insert_side_effect
        return tbl

    sb_mock = MagicMock()
    sb_mock.table.side_effect = _tracking_table_side_effect

    client_patch = patch("ingest._get_supabase_client", return_value=sb_mock)
    chunk_patch = patch("ingest.chunk_text", return_value=FAKE_CHUNKS)
    embed_patch = patch("ingest.get_embeddings_batch", return_value=FAKE_EMBEDDINGS)
    with client_patch, chunk_patch, embed_patch:
        from ingest import ingest_document

        ingest_document(title="Doc", content="Content here.", source="src")

    # The documents table must have been written to.
    assert "knowledge_documents" in inserted_tables, f"knowledge_documents INSERT 호출 없음. 실제: {inserted_tables}"
    # So must a chunks table; any table name containing "chunk" is accepted.
    assert any(
        "chunk" in t.lower() for t in inserted_tables
    ), f"chunks 테이블 INSERT 호출 없음. 실제: {inserted_tables}"


# ---------------------------------------------------------------------------
# Test 4: duplicate document (same content_hash) → returns the existing document_id, skips INSERT
# ---------------------------------------------------------------------------


def test_ingest_document_duplicate_returns_existing_id() -> None:
    """A duplicate document must yield the stored id without chunking, embedding or INSERT."""
    existing_id = str(uuid.uuid4())
    existing_doc = {"id": existing_id, "content_hash": "some_hash"}

    # Payloads passed to insert() on any table; must stay empty.
    insert_payloads: list[Any] = []

    def _table_side_effect(table_name: str) -> MagicMock:
        def _insert_side_effect(data: Any) -> MagicMock:
            insert_payloads.append(data)
            return MagicMock()

        tbl = MagicMock()
        # Every select(...).eq(...).execute() reports the already-stored row.
        tbl.select.return_value.eq.return_value.execute.return_value = MagicMock(data=[existing_doc])
        tbl.insert.side_effect = _insert_side_effect
        return tbl

    sb_mock = MagicMock()
    sb_mock.table.side_effect = _table_side_effect

    client_patch = patch("ingest._get_supabase_client", return_value=sb_mock)
    chunk_patch = patch("ingest.chunk_text", return_value=FAKE_CHUNKS)
    embed_patch = patch("ingest.get_embeddings_batch", return_value=FAKE_EMBEDDINGS)
    with client_patch, chunk_patch as mock_chunk, embed_patch as mock_embed:
        from ingest import ingest_document

        result = ingest_document(
            title="Dup Doc",
            content="Duplicate content.",
            source="src",
        )

    assert result == existing_id, f"기존 id {existing_id} 를 반환해야 하지만 {result} 반환"
    assert not insert_payloads, "중복 문서에 대해 INSERT가 호출되면 안 됨"
    mock_chunk.assert_not_called()
    mock_embed.assert_not_called()


# ---------------------------------------------------------------------------
# Test 5: delete_document() → verifies the Supabase DELETE call
# ---------------------------------------------------------------------------


def test_delete_document_calls_supabase_delete() -> None:
    """delete_document() must issue a DELETE on knowledge_documents for the given id."""
    doc_id = str(uuid.uuid4())
    deleted_ids: list[str] = []

    def _table_side_effect(table_name: str) -> MagicMock:
        targets_documents = table_name == "knowledge_documents"

        def _eq_side_effect(col: str, val: Any) -> MagicMock:
            # Record the filter value only for deletes on the documents table;
            # the column name is not inspected.
            if targets_documents:
                deleted_ids.append(str(val))
            filtered = MagicMock()
            filtered.execute.return_value = MagicMock(data=[{"id": val}])
            return filtered

        tbl = MagicMock()
        tbl.delete.return_value.eq.side_effect = _eq_side_effect
        return tbl

    sb_mock = MagicMock()
    sb_mock.table.side_effect = _table_side_effect

    client_patch = patch("ingest._get_supabase_client", return_value=sb_mock)
    with client_patch:
        from ingest import delete_document

        result = delete_document(doc_id)

    assert result is True
    assert doc_id in deleted_ids, f"document_id {doc_id}로 DELETE 호출 안 됨. 실제: {deleted_ids}"


# ---------------------------------------------------------------------------
# Test 6: reindex_document() → verifies old chunks are deleted and the document is re-ingested
# ---------------------------------------------------------------------------


def test_reindex_document_deletes_chunks_and_reinserts() -> None:
    """reindex_document() must delete the old chunks, then re-chunk, re-embed and INSERT chunks."""
    doc_id = str(uuid.uuid4())
    existing_doc = {
        "id": doc_id,
        "title": "Original Title",
        "content": "Original content for reindexing.",
        "source": "src",
        "source_url": None,
        "metadata": {},
        "content_hash": "oldhash",
    }

    deleted_chunk_doc_ids: list[str] = []
    inserted_tables: list[str] = []

    def _table_side_effect(table_name: str) -> MagicMock:
        is_chunk_table = "chunk" in table_name.lower()
        # Only the documents table knows the stored document.
        select_rows = [existing_doc] if table_name == "knowledge_documents" else []

        def _delete_eq_side_effect(col: str, val: Any) -> MagicMock:
            # Track deletes that target a chunks table.
            if is_chunk_table:
                deleted_chunk_doc_ids.append(str(val))
            filtered = MagicMock()
            filtered.execute.return_value = MagicMock(data=[])
            return filtered

        def _insert_side_effect(data: Any) -> MagicMock:
            inserted_tables.append(table_name)
            pending = MagicMock()
            pending.execute.return_value = MagicMock(data=[{"id": FAKE_DOCUMENT_ID}])
            return pending

        tbl = MagicMock()
        tbl.select.return_value.eq.return_value.execute.return_value = MagicMock(data=select_rows)
        tbl.delete.return_value.eq.side_effect = _delete_eq_side_effect
        tbl.insert.side_effect = _insert_side_effect
        return tbl

    sb_mock = MagicMock()
    sb_mock.table.side_effect = _table_side_effect

    client_patch = patch("ingest._get_supabase_client", return_value=sb_mock)
    chunk_patch = patch("ingest.chunk_text", return_value=FAKE_CHUNKS)
    embed_patch = patch("ingest.get_embeddings_batch", return_value=FAKE_EMBEDDINGS)
    with client_patch, chunk_patch as mock_chunk, embed_patch as mock_embed:
        from ingest import reindex_document

        reindex_document(doc_id)

    # The old chunks must have been deleted.
    assert deleted_chunk_doc_ids, "기존 청크 DELETE가 호출되지 않았음"

    # Chunking and embedding must each have run once more.
    mock_chunk.assert_called_once()
    mock_embed.assert_called_once()

    # New chunks must have been inserted.
    assert any(
        "chunk" in t.lower() for t in inserted_tables
    ), "재인제스션 시 chunks INSERT가 호출되지 않았음"


# ---------------------------------------------------------------------------
# Test 7: content_hash computation check (hashlib sha256)
# ---------------------------------------------------------------------------


def test_content_hash_is_sha256_of_content() -> None:
    """ingest_document() must use the SHA-256 hex digest of content as content_hash.

    The SELECT on knowledge_documents has to filter on ``content_hash`` with
    that digest. Rows inserted into knowledge_documents are captured too: any
    row that exposes a ``content_hash`` key must carry the same digest.
    """
    content = "Specific content for hash verification."
    expected_hash = hashlib.sha256(content.encode("utf-8")).hexdigest()

    queried_hashes: list[str] = []
    inserted_doc_rows: list[dict[str, Any]] = []

    def _table_side_effect(table_name: str) -> MagicMock:
        tbl = MagicMock()
        is_documents_table = table_name == "knowledge_documents"

        # SELECT: always "no duplicate"; on the documents table also record
        # the value used in the content_hash filter.
        def _eq_side_effect(col: str, val: Any) -> MagicMock:
            if col == "content_hash":
                queried_hashes.append(str(val))
            eq_mock = MagicMock()
            eq_mock.execute.return_value = MagicMock(data=[])
            return eq_mock

        if is_documents_table:
            tbl.select.return_value.eq.side_effect = _eq_side_effect
        else:
            tbl.select.return_value.eq.return_value.execute.return_value = MagicMock(data=[])

        # INSERT: capture the document rows (a single dict or a list of dicts).
        def _insert_side_effect(data: Any) -> MagicMock:
            if is_documents_table:
                rows = data if isinstance(data, list) else [data]
                inserted_doc_rows.extend(row for row in rows if isinstance(row, dict))
            ins = MagicMock()
            ins.execute.return_value = MagicMock(data=[{"id": FAKE_DOCUMENT_ID}])
            return ins

        tbl.insert.side_effect = _insert_side_effect
        return tbl

    sb_mock = MagicMock()
    sb_mock.table.side_effect = _table_side_effect

    with (
        patch("ingest._get_supabase_client", return_value=sb_mock),
        patch("ingest.chunk_text", return_value=FAKE_CHUNKS),
        patch("ingest.get_embeddings_batch", return_value=FAKE_EMBEDDINGS),
    ):
        from ingest import ingest_document

        ingest_document(title="Hash Test", content=content, source="src")

    # The lookup must have used the SHA-256 digest.
    assert (
        expected_hash in queried_hashes
    ), f"SHA-256 해시 {expected_hash}로 조회하지 않았음. 실제 조회값: {queried_hashes}"

    # Whatever content_hash was written with the document must be that digest.
    # Rows without the key are ignored, so this does not pin the row schema.
    stored_hashes = [row["content_hash"] for row in inserted_doc_rows if "content_hash" in row]
    assert all(
        stored == expected_hash for stored in stored_hashes
    ), f"knowledge_documents INSERT carried a content_hash other than {expected_hash}: {stored_hashes}"


# ---------------------------------------------------------------------------
# Test 8: empty or whitespace-only content → raises ValueError
# ---------------------------------------------------------------------------


def test_ingest_document_empty_content_raises_value_error() -> None:
    """Passing an empty content string must raise ValueError."""
    client_patch = patch("ingest._get_supabase_client", return_value=MagicMock())
    chunk_patch = patch("ingest.chunk_text", return_value=[])
    embed_patch = patch("ingest.get_embeddings_batch", return_value=[])

    with client_patch, chunk_patch, embed_patch:
        from ingest import ingest_document

        # The error message has to match the pattern "content".
        with pytest.raises(ValueError, match="content"):
            ingest_document(title="Empty", content="", source="src")


def test_ingest_document_whitespace_only_content_raises_value_error() -> None:
    """Passing content that consists only of whitespace must raise ValueError."""
    client_patch = patch("ingest._get_supabase_client", return_value=MagicMock())
    chunk_patch = patch("ingest.chunk_text", return_value=[])
    embed_patch = patch("ingest.get_embeddings_batch", return_value=[])

    with client_patch, chunk_patch, embed_patch:
        from ingest import ingest_document

        # The error message has to match the pattern "content".
        with pytest.raises(ValueError, match="content"):
            ingest_document(title="Whitespace", content="   \n\t  ", source="src")
