"""
Tests for chunker.py - TDD implementation for task-510
"""

import os
import sys

sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

import pytest
import tiktoken
from chunker import chunk_text


@pytest.fixture
def encoder() -> tiktoken.Encoding:
    """Provide the ``cl100k_base`` tiktoken encoding to tests that count tokens."""
    cl100k = tiktoken.get_encoding("cl100k_base")
    return cl100k


def count_tokens(text: str, encoder: tiktoken.Encoding) -> int:
    """Return the number of token ids ``encoder.encode`` produces for ``text``."""
    token_ids = encoder.encode(text)
    return len(token_ids)


# 1. Short text (within max_tokens) -> exactly one chunk is returned
def test_short_text_returns_single_chunk() -> None:
    """A text well under ``max_tokens`` comes back as one chunk with index 0."""
    short_text = "This is a short text."
    chunks = chunk_text(short_text, max_tokens=500, overlap=0)
    assert len(chunks) == 1

    only_chunk = chunks[0]
    assert only_chunk["content"] == short_text
    assert only_chunk["chunk_index"] == 0


# 2. Long text (exceeds max_tokens) -> split into several chunks
def test_long_text_returns_multiple_chunks() -> None:
    """Ten paragraphs chunked with ``max_tokens=50`` must yield more than one chunk."""
    # Ten short paragraphs whose combined length is meant to exceed max_tokens=50.
    text = "\n\n".join(
        f"This is paragraph number {i} with some content." for i in range(10)
    )
    chunks = chunk_text(text, max_tokens=50, overlap=0)
    assert len(chunks) > 1


# 3. Splitting happens on paragraph boundaries (double newline)
def test_split_on_paragraph_boundary() -> None:
    """Verify that paragraphs are kept whole when the text has to be split.

    Three paragraphs are joined with a blank line and chunked with a
    ``max_tokens`` that is too small for the whole text. The test then checks
    that the text really was split and that every paragraph appears intact
    inside at least one chunk, i.e. no cut landed in the middle of a paragraph.
    """
    paragraphs = [
        "First paragraph with enough content to be meaningful.",
        "Second paragraph with enough content to be meaningful.",
        "Third paragraph with enough content to be meaningful.",
    ]
    text = "\n\n".join(paragraphs)

    # max_tokens is deliberately too small to hold all three paragraphs at once.
    result = chunk_text(text, max_tokens=20, overlap=0)

    # Without an actual split there is no boundary to verify.
    assert len(result) > 1

    for chunk in result:
        content: str = chunk["content"]
        assert isinstance(content, str)
        assert len(content) > 0

    # Every paragraph must be found whole in some chunk. A cut inside a
    # paragraph would spread its text over two chunks and fail this check.
    for paragraph in paragraphs:
        assert any(paragraph in chunk["content"] for chunk in result), (
            f"paragraph was split across chunks: {paragraph!r}"
        )


# 4. Splitting happens on sentence boundaries (period-terminated sentences)
def test_split_on_sentence_boundary() -> None:
    """Verify that a single long paragraph is split between sentences.

    The text is one paragraph of four sentences and ``max_tokens`` is too
    small for all of them, so the paragraph has to be divided. Every sentence
    must still appear intact inside at least one chunk, i.e. no cut landed in
    the middle of a sentence.
    """
    sentences = [
        "The first sentence is here.",
        "The second sentence follows.",
        "The third sentence continues.",
        "The fourth sentence ends here.",
    ]
    # One paragraph: sentences are separated by single spaces only.
    text = " ".join(sentences)
    result = chunk_text(text, max_tokens=15, overlap=0)

    assert len(result) > 1

    for chunk in result:
        content: str = chunk["content"]
        assert isinstance(content, str)
        assert len(content) > 0

    # A cut in the middle of a sentence would leave that sentence incomplete
    # in every chunk and fail this check.
    for sentence in sentences:
        assert any(sentence in chunk["content"] for chunk in result), (
            f"sentence was split across chunks: {sentence!r}"
        )


# 5. Overlap parameter: the tail of the previous chunk is carried into the start of the next chunk
def test_overlap_includes_previous_chunk_end(encoder: tiktoken.Encoding) -> None:
    """Verify that a non-zero ``overlap`` carries the previous chunk's tail forward.

    Two things are checked on the second chunk: its ``token_count`` is larger
    than in the ``overlap=0`` run, and text decoded from the last ``overlap``
    tokens of the first chunk shows up near its beginning.
    """
    paragraphs = [f"This is paragraph number {i} with some content here." for i in range(8)]
    text = "\n\n".join(paragraphs)
    overlap = 10

    result_with_overlap = chunk_text(text, max_tokens=50, overlap=overlap)
    result_no_overlap = chunk_text(text, max_tokens=50, overlap=0)

    # Both runs must split the text. Asserting this, rather than guarding the
    # checks below with an ``if``, keeps the test from passing vacuously.
    assert len(result_with_overlap) >= 2
    assert len(result_no_overlap) >= 2

    # The second chunk of the overlapping run should be larger than the second
    # chunk of the non-overlapping run, because it also carries overlap tokens.
    no_overlap_chunk1_tokens: int = int(result_no_overlap[1]["token_count"])
    with_overlap_chunk1_tokens: int = int(result_with_overlap[1]["token_count"])
    assert with_overlap_chunk1_tokens > no_overlap_chunk1_tokens

    # Decode the last ``overlap`` tokens of the first chunk so they can be
    # looked up at the start of the second chunk.
    prev_content: str = str(result_with_overlap[0]["content"])
    prev_tokens = encoder.encode(prev_content)
    actual_overlap = min(overlap, len(prev_tokens))
    overlap_token_ids = prev_tokens[-actual_overlap:]
    overlap_text = encoder.decode(overlap_token_ids)

    next_chunk_content: str = str(result_with_overlap[1]["content"])
    overlap_stripped = overlap_text.strip()
    # Only the beginning of the next chunk is searched, with 30 characters of slack.
    search_window = next_chunk_content[: len(overlap_stripped) + 30]
    # Exact containment is preferred. The word-level fallback is deliberately
    # lenient because the decoded tail may begin in the middle of a word.
    assert overlap_stripped in search_window or any(
        word in search_window for word in overlap_stripped.split() if len(word) > 3
    )


# 6. chunk_index starts at 0 and increases by one per chunk
def test_chunk_index_sequential() -> None:
    """The ``chunk_index`` values of a multi-chunk result must be 0, 1, 2, ... in order."""
    text = "\n\n".join(
        f"This is paragraph number {i} with some content." for i in range(10)
    )
    chunks = chunk_text(text, max_tokens=50, overlap=0)

    assert len(chunks) > 1
    observed_indices = [chunk["chunk_index"] for chunk in chunks]
    assert observed_indices == list(range(len(chunks)))


# 7. The token_count field equals the real token count of the content
def test_token_count_matches_actual(encoder: tiktoken.Encoding) -> None:
    """Each chunk's ``token_count`` must match a fresh count of its ``content``."""
    text = "\n\n".join(
        f"This is paragraph number {i} with some content." for i in range(6)
    )

    for chunk in chunk_text(text, max_tokens=50, overlap=0):
        # Re-encode the content with the test's own encoder to get the reference count.
        expected = count_tokens(chunk["content"], encoder)
        reported = chunk["token_count"]
        assert reported == expected, (
            f"chunk_index={chunk['chunk_index']}: expected {expected}, got {reported}"
        )


# 8. Empty text -> an empty list is returned
def test_empty_text_returns_empty_list() -> None:
    """Chunking the empty string must produce no chunks at all."""
    chunks = chunk_text("", max_tokens=500, overlap=50)
    assert chunks == []


def test_whitespace_only_text_returns_empty_list() -> None:
    """Text made only of spaces, tabs and newlines must produce no chunks."""
    blank_text = "   \n\n  \t  "
    chunks = chunk_text(blank_text, max_tokens=500, overlap=50)
    assert chunks == []


# 9. Small-scale scenario with max_tokens=50 and overlap=10
def test_small_scale_max_tokens_50_overlap_10(encoder: tiktoken.Encoding) -> None:
    """Check size limit, index order and token_count accuracy on a small input.

    Eight short sentences are joined with single spaces and chunked with
    ``max_tokens=50`` and ``overlap=10``. Every chunk must stay within
    ``max_tokens``, carry its position as ``chunk_index`` and report a
    ``token_count`` equal to the real token count of its content.
    """
    labels = ("Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta", "Eta", "Theta")
    text = " ".join(f"{label} sentence with words." for label in labels)
    max_tokens = 50
    overlap = 10

    result = chunk_text(text, max_tokens=max_tokens, overlap=overlap)

    assert len(result) >= 1

    for position, chunk in enumerate(result):
        # No chunk may exceed the token budget.
        assert (
            chunk["token_count"] <= max_tokens
        ), f"chunk_index={chunk['chunk_index']} has token_count={chunk['token_count']} > max_tokens={max_tokens}"

        # Indices follow the order of the returned list.
        assert chunk["chunk_index"] == position

        # The reported count matches a fresh count of the content.
        assert chunk["token_count"] == count_tokens(chunk["content"], encoder)


# 10. Chunking of Korean text
def test_korean_text_chunking(encoder: tiktoken.Encoding) -> None:
    """Verify that Korean text is chunked without losing any paragraph.

    Five Korean paragraphs are chunked with ``max_tokens=30`` and
    ``overlap=5``. The opening words of every paragraph must be present in the
    result, and each chunk must have an accurate ``token_count``, a sequential
    ``chunk_index`` and a size within ``max_tokens``.
    """
    korean_paragraphs = [
        "첫 번째 문단입니다. 한글 텍스트를 처리하는 기능을 테스트합니다.",
        "두 번째 문단입니다. 한글은 영어와 다른 토큰 구조를 가집니다.",
        "세 번째 문단입니다. tiktoken은 한글도 정상적으로 처리할 수 있습니다.",
        "네 번째 문단입니다. 청킹 기능이 한글에서도 올바르게 동작해야 합니다.",
        "다섯 번째 문단입니다. 이 테스트로 한글 지원을 검증합니다.",
    ]
    # Opening words of each paragraph, used to confirm that none was dropped.
    paragraph_markers = ["첫 번째", "두 번째", "세 번째", "네 번째", "다섯 번째"]
    max_tokens = 30

    text = "\n\n".join(korean_paragraphs)
    result = chunk_text(text, max_tokens=max_tokens, overlap=5)

    # There must be some output.
    assert len(result) >= 1

    # Every paragraph, not only the first few, must show up in some chunk.
    all_content = " ".join(chunk["content"] for chunk in result)
    for marker in paragraph_markers:
        assert marker in all_content, f"missing paragraph starting with {marker!r}"

    for i, chunk in enumerate(result):
        content: str = chunk["content"]

        # token_count accuracy
        assert chunk["token_count"] == count_tokens(content, encoder)

        # chunk_index increases sequentially
        assert chunk["chunk_index"] == i

        # each chunk stays within max_tokens
        assert chunk["token_count"] <= max_tokens