"""keyword_cluster.py 유닛 테스트 (TDD - RED→GREEN).

샘플 데이터 기반 테스트: 보험 도메인 검색어 클러스터링 기능 검증.
"""

import json
import os
import sys
import tempfile
import csv
import pytest

# 프로젝트 루트를 sys.path에 추가
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import keyword_cluster as kc


# ---------------------------------------------------------------------------
# 샘플 데이터 (보험 도메인)
# ---------------------------------------------------------------------------

# Sample search queries, five per group, listed in the order
# COST, LEARNING, PROCESS, TRUST, INVESTMENT.
SAMPLE_KEYWORDS = [
    # Sample data for the COST cluster
    "보험료 계산",
    "보험료 할인",
    "월납 보험료",
    "보험 비용",
    "납입 방법",
    # Sample data for the LEARNING cluster
    "보험 종류",
    "보험이란",
    "보험 뜻",
    "보험 기초",
    "보험 설명",
    # Sample data for the PROCESS cluster
    "보험 가입",
    "가입 절차",
    "청약 서류",
    "보험 심사",
    "가입 방법",
    # Sample data for the TRUST cluster
    "보험 추천",
    "보험 비교",
    "보험 순위",
    "보험 후기",
    "보험 평판",
    # Sample data for the INVESTMENT cluster
    "연금 보험",
    "변액 보험",
    "저축 보험",
    "보험 수익률",
    "적금 보험",
]

# Six-keyword subset of SAMPLE_KEYWORDS, used by the small-dataset test.
SAMPLE_KEYWORDS_SMALL = [
    "보험료 계산",
    "보험 종류",
    "보험 가입",
    "연금 보험",
    "보험 추천",
    "보험 비용",
]


# ---------------------------------------------------------------------------
# 1. tokenize_korean - 한글 토크나이저
# ---------------------------------------------------------------------------


class TestTokenizeKorean:
    """Tests for ``kc.tokenize_korean``."""

    def test_basic_tokenization(self):
        """A space-separated phrase must come back as its individual tokens."""
        assert kc.tokenize_korean("보험료 계산 방법") == ["보험료", "계산", "방법"]

    def test_single_word(self):
        """A lone word must still be returned wrapped in a list."""
        assert kc.tokenize_korean("보험") == ["보험"]

    def test_empty_string(self):
        """An empty string must produce an empty list."""
        assert kc.tokenize_korean("") == []

    def test_extra_spaces(self):
        """Runs of several spaces must not lose any of the words."""
        produced = kc.tokenize_korean("보험  종류  비교")
        # Only membership is checked here, not the exact list contents.
        for word in ("보험", "종류", "비교"):
            assert word in produced


# ---------------------------------------------------------------------------
# 2. load_keywords_from_csv - CSV 파일 로드
# ---------------------------------------------------------------------------


class TestLoadKeywordsFromCsv:
    """Tests for ``kc.load_keywords_from_csv`` against temporary CSV files."""

    @staticmethod
    def _write_table(path, fieldnames, rows):
        """Write ``rows`` (a list of dicts) to ``path`` as CSV with a header."""
        with path.open("w", newline="", encoding="utf-8") as handle:
            table = csv.DictWriter(handle, fieldnames=fieldnames)
            table.writeheader()
            table.writerows(rows)

    def test_load_single_column_csv(self, tmp_path):
        """A header-less, keyword-only CSV must be read (sample data)."""
        source = tmp_path / "sample_queries.csv"
        source.write_text("보험료 계산\n보험 종류\n보험 가입\n", encoding="utf-8")
        loaded = kc.load_keywords_from_csv(str(source))
        assert loaded == ["보험료 계산", "보험 종류", "보험 가입"]

    def test_load_csv_with_header_query_column(self, tmp_path):
        """A CSV with a 'query' header column must be read (sample data)."""
        source = tmp_path / "sample_ga4_queries.csv"
        self._write_table(
            source,
            ["query", "sessions"],
            [
                {"query": "보험료 계산", "sessions": "100"},
                {"query": "보험 종류", "sessions": "80"},
            ],
        )
        loaded = kc.load_keywords_from_csv(str(source))
        for expected in ("보험료 계산", "보험 종류"):
            assert expected in loaded

    def test_load_csv_with_searchterm_column(self, tmp_path):
        """A CSV with a 'searchTerm' header column must be read (sample data)."""
        source = tmp_path / "sample_searchterm.csv"
        self._write_table(
            source,
            ["searchTerm", "count"],
            [{"searchTerm": "보험 비교", "count": "50"}],
        )
        assert "보험 비교" in kc.load_keywords_from_csv(str(source))

    def test_load_nonexistent_file_raises(self):
        """A path that does not exist must raise FileNotFoundError."""
        missing = "/nonexistent/path/queries.csv"
        with pytest.raises(FileNotFoundError):
            kc.load_keywords_from_csv(missing)

    def test_empty_csv_returns_empty_list(self, tmp_path):
        """An empty CSV file must yield an empty list."""
        source = tmp_path / "empty.csv"
        source.write_text("", encoding="utf-8")
        assert kc.load_keywords_from_csv(str(source)) == []

    def test_deduplication(self, tmp_path):
        """Duplicate keywords must be collapsed to one entry (sample data)."""
        source = tmp_path / "sample_dup.csv"
        source.write_text("보험료 계산\n보험료 계산\n보험 종류\n", encoding="utf-8")
        loaded = kc.load_keywords_from_csv(str(source))
        assert loaded.count("보험료 계산") == 1


# ---------------------------------------------------------------------------
# 3. assign_cluster_label - 클러스터 라벨 할당
# ---------------------------------------------------------------------------


class TestAssignClusterLabel:
    """Tests for ``kc.assign_cluster_label``."""

    def test_cost_cluster_label(self):
        """Premium/cost keywords must be labelled COST (sample data)."""
        members = ["보험료 계산", "월납 보험료", "보험 비용"]
        assert kc.assign_cluster_label(members) == "COST"

    def test_learning_cluster_label(self):
        """Insurance type/meaning keywords must be labelled LEARNING (sample data)."""
        members = ["보험 종류", "보험이란", "보험 기초"]
        assert kc.assign_cluster_label(members) == "LEARNING"

    def test_process_cluster_label(self):
        """Enrollment/procedure keywords must be labelled PROCESS (sample data)."""
        members = ["보험 가입", "가입 절차", "청약 서류"]
        assert kc.assign_cluster_label(members) == "PROCESS"

    def test_trust_cluster_label(self):
        """Recommendation/comparison keywords must be labelled TRUST (sample data)."""
        members = ["보험 추천", "보험 비교", "보험 후기"]
        assert kc.assign_cluster_label(members) == "TRUST"

    def test_investment_cluster_label(self):
        """Pension/investment keywords must be labelled INVESTMENT (sample data)."""
        members = ["연금 보험", "변액 보험", "저축 보험"]
        assert kc.assign_cluster_label(members) == "INVESTMENT"

    def test_unknown_cluster_label(self):
        """Keywords unrelated to any preset must be labelled UNKNOWN."""
        members = ["xyz abc", "foo bar", "test 123"]
        assert kc.assign_cluster_label(members) == "UNKNOWN"

    def test_empty_keywords_returns_unknown(self):
        """An empty keyword list must be labelled UNKNOWN."""
        assert kc.assign_cluster_label([]) == "UNKNOWN"


# ---------------------------------------------------------------------------
# 4. cluster_keywords - 핵심 클러스터링 함수
# ---------------------------------------------------------------------------


class TestClusterKeywords:
    """Tests for the core ``kc.cluster_keywords`` function."""

    @staticmethod
    def _clustered_sample():
        """Cluster the full sample keyword list into five clusters."""
        return kc.cluster_keywords(SAMPLE_KEYWORDS, n_clusters=5)

    def test_returns_dict_with_required_keys(self):
        """The result mapping must expose the required top-level keys (sample data)."""
        outcome = self._clustered_sample()
        for key in ("clusters", "total_keywords", "silhouette_score"):
            assert key in outcome

    def test_cluster_count(self):
        """Exactly as many clusters as requested must be produced (sample data)."""
        outcome = self._clustered_sample()
        assert len(outcome["clusters"]) == 5

    def test_total_keywords_count(self):
        """total_keywords must equal the number of input keywords (sample data)."""
        outcome = self._clustered_sample()
        assert outcome["total_keywords"] == len(SAMPLE_KEYWORDS)

    def test_each_cluster_has_required_fields(self):
        """Every cluster must carry all of the required fields (sample data)."""
        expected = {
            "id",
            "label",
            "representative_keyword",
            "keywords",
            "size",
            "pillar_document_suggestion",
        }
        outcome = self._clustered_sample()
        for entry in outcome["clusters"]:
            # The failure message names the cluster id and the missing fields.
            assert expected.issubset(entry.keys()), (
                f"클러스터 {entry.get('id')}에 필드 누락: "
                f"{expected - entry.keys()}"
            )

    def test_cluster_keywords_sum_equals_total(self):
        """Cluster sizes must add up to total_keywords (sample data)."""
        outcome = self._clustered_sample()
        running = 0
        for entry in outcome["clusters"]:
            running += entry["size"]
        assert running == outcome["total_keywords"]

    def test_silhouette_score_range(self):
        """silhouette_score must lie between -1 and 1 inclusive (sample data)."""
        score = self._clustered_sample()["silhouette_score"]
        assert score >= -1.0 and score <= 1.0

    def test_label_is_valid_preset_or_unknown(self):
        """Each cluster label must be a preset name or UNKNOWN (sample data)."""
        from config import INSURANCE_CLUSTER_PRESETS

        allowed = {"UNKNOWN"}
        allowed.update(INSURANCE_CLUSTER_PRESETS.keys())
        outcome = self._clustered_sample()
        for entry in outcome["clusters"]:
            assert entry["label"] in allowed

    def test_representative_keyword_in_cluster(self):
        """representative_keyword must be one of the cluster's own keywords (sample data)."""
        outcome = self._clustered_sample()
        for entry in outcome["clusters"]:
            assert entry["representative_keyword"] in entry["keywords"]

    def test_small_dataset(self):
        """The small sample set must also be clusterable."""
        outcome = kc.cluster_keywords(SAMPLE_KEYWORDS_SMALL, n_clusters=3)
        assert len(outcome["clusters"]) == 3
        assert outcome["total_keywords"] == len(SAMPLE_KEYWORDS_SMALL)

    def test_cluster_count_capped_at_keyword_count(self):
        """Requesting more clusters than keywords must be adjusted downwards."""
        pair = ["보험료", "보험 종류"]
        outcome = kc.cluster_keywords(pair, n_clusters=5)
        assert len(outcome["clusters"]) <= len(pair)

    def test_json_serializable(self):
        """The result must survive a JSON dump/load round trip (sample data)."""
        encoded = json.dumps(self._clustered_sample(), ensure_ascii=False)
        assert "clusters" in json.loads(encoded)


# ---------------------------------------------------------------------------
# 5. 엣지 케이스
# ---------------------------------------------------------------------------


class TestEdgeCases:
    """Edge-case tests for ``kc.cluster_keywords``."""

    def test_single_keyword_raises_or_returns(self):
        """One keyword must either raise ValueError or yield a single cluster."""
        try:
            clusters = kc.cluster_keywords(["보험료"], n_clusters=1)["clusters"]
        except ValueError:
            return  # raising ValueError is an accepted outcome as well
        assert len(clusters) == 1

    def test_empty_keywords_raises_value_error(self):
        """An empty keyword list must raise ValueError."""
        nothing = []
        with pytest.raises(ValueError):
            kc.cluster_keywords(nothing, n_clusters=3)

    def test_duplicate_keywords_handled(self):
        """Input made of repeated keywords must still be processed (sample data)."""
        # Ten copies of the first keyword followed by ten copies of the second.
        repeated = [
            phrase
            for phrase in ("보험료 계산", "보험 종류")
            for _ in range(10)
        ]
        outcome = kc.cluster_keywords(repeated, n_clusters=2)
        assert "clusters" in outcome

    def test_pillar_document_suggestion_is_string(self):
        """pillar_document_suggestion must be a string in every cluster (sample data)."""
        outcome = kc.cluster_keywords(SAMPLE_KEYWORDS, n_clusters=5)
        for entry in outcome["clusters"]:
            suggestion = entry["pillar_document_suggestion"]
            assert isinstance(suggestion, str)


# ---------------------------------------------------------------------------
# 6. build_report - JSON 보고서 생성
# ---------------------------------------------------------------------------


class TestBuildReport:
    """Tests for JSON report generation via ``kc.build_report``."""

    def test_build_report_writes_json_file(self, tmp_path):
        """build_report must create the requested output file (sample data)."""
        report_file = tmp_path / "sample_report.json"
        clustered = kc.cluster_keywords(SAMPLE_KEYWORDS, n_clusters=5)
        kc.build_report(clustered, str(report_file))
        assert report_file.exists()

    def test_build_report_content_valid_json(self, tmp_path):
        """The generated file must parse as JSON with the expected keys (sample data)."""
        report_file = tmp_path / "sample_report.json"
        clustered = kc.cluster_keywords(SAMPLE_KEYWORDS, n_clusters=5)
        kc.build_report(clustered, str(report_file))
        document = json.loads(report_file.read_text(encoding="utf-8"))
        for key in ("clusters", "total_keywords", "silhouette_score"):
            assert key in document

    def test_build_report_returns_none_when_no_path(self):
        """With output_path=None the call must return either None or a dict (sample data)."""
        clustered = kc.cluster_keywords(SAMPLE_KEYWORDS, n_clusters=5)
        returned = kc.build_report(clustered, output_path=None)
        # Both outcomes are accepted by this test: no return value, or a dict.
        assert isinstance(returned, (dict, type(None)))


# ---------------------------------------------------------------------------
# 7. GA4 연동 (옵셔널 - 환경변수 미설정 시 스킵)
# ---------------------------------------------------------------------------


class TestGa4Optional:
    """Optional GA4 integration tests; skipped when GA4 is configured."""

    @staticmethod
    def _require_unconfigured_ga4():
        """Skip the calling test if GA4 is configured; return the checker."""
        from config import is_ga4_configured

        if is_ga4_configured():
            pytest.skip("GA4가 설정된 환경에서는 이 테스트를 스킵합니다.")
        return is_ga4_configured

    def test_ga4_not_configured_returns_false(self):
        """With GA4 unconfigured, is_ga4_configured() must return False."""
        # Original author's note: CI runs without GA4 configured, so False
        # is expected there; configured environments skip this test.
        # NOTE(review): the assertion repeats the call that just guarded the
        # skip, so it can only fail if two consecutive calls disagree.
        checker = self._require_unconfigured_ga4()
        assert not checker()

    def test_fetch_ga4_keywords_raises_when_not_configured(self):
        """With GA4 unconfigured, fetch_ga4_keywords must raise RuntimeError."""
        self._require_unconfigured_ga4()
        with pytest.raises(RuntimeError):
            kc.fetch_ga4_keywords(property_id="123456", days=30)