"""GA4 검색어 TF-IDF + K-Means 자동 클러스터링.

사용:
  python3 keyword_cluster.py --input queries.csv --clusters 5 --output report.json
  python3 keyword_cluster.py --keywords "보험료 계산,보험 종류" --clusters 3
  python3 keyword_cluster.py --ga4 --property-id 123456 --date-range 30d --clusters 5
"""

import argparse
import csv
import json
import os
import sys
from typing import Dict, List, Optional

import numpy as np
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score

from config import INSURANCE_CLUSTER_PRESETS, is_ga4_configured  # type: ignore[import-not-found]

# google-analytics-data is an optional dependency: the CLI works without it
# for --input/--keywords modes, so a missing package only disables --ga4.
try:
    from google.analytics.data_v1beta import BetaAnalyticsDataClient  # type: ignore[import-not-found]
    from google.analytics.data_v1beta.types import (  # type: ignore[import-not-found]
        DateRange,
        Dimension,
        Metric,
        RunReportRequest,
    )

    _GA4_AVAILABLE = True  # checked by fetch_ga4_keywords() before any API call
except ImportError:
    _GA4_AVAILABLE = False

# Maps a cluster label (from config.INSURANCE_CLUSTER_PRESETS, plus the
# "UNKNOWN" fallback) to the suggested pillar-document title for the report.
_PILLAR: Dict[str, str] = {
    "COST": "InsuWiki 보험료 완전 가이드",
    "LEARNING": "InsuWiki 보험 가이드",
    "PROCESS": "InsuWiki 보험 가입 절차 가이드",
    "TRUST": "InsuWiki 보험사 비교 추천 가이드",
    "INVESTMENT": "InsuWiki 연금·투자형 보험 가이드",
    "UNKNOWN": "InsuWiki 보험 종합 가이드",
}
# "보험" (insurance) appears in every category, so it carries no
# discriminative power for label scoring and is excluded from matching.
_STOPWORDS = {"보험"}


def tokenize_korean(text: str) -> List[str]:
    """Whitespace-based tokenizer (fallback when konlpy is unavailable).

    Returns an empty list for None, empty, or blank input.
    """
    if not text:
        return []
    # str.split() with no argument drops all runs of whitespace,
    # so it never yields empty tokens.
    return text.split()


def load_keywords_from_csv(file_path: str) -> List[str]:
    """Load search keywords from a CSV file, de-duplicated in first-seen order.

    The first line is treated as a header when it contains one of the known
    header names (query / searchterm / keyword / 검색어); otherwise each row's
    first column is taken as a keyword.

    Args:
        file_path: Path to the CSV file.

    Returns:
        Unique, non-empty keywords in first-seen order. Empty list for an
        empty file, or when a header is detected but no known column exists.

    Raises:
        FileNotFoundError: If *file_path* does not exist.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"파일을 찾을 수 없습니다: {file_path}")

    with open(file_path, newline="", encoding="utf-8") as f:
        content = f.read().strip()
    if not content:
        return []

    header_names = ["query", "searchterm", "keyword", "검색어"]
    has_header = any(h in content.splitlines()[0].lower() for h in header_names)

    keywords: List[str] = []
    with open(file_path, newline="", encoding="utf-8") as f:
        if has_header:
            reader = csv.DictReader(f)
            fl = {k.lower(): k for k in (reader.fieldnames or [])}
            col = next((fl[h] for h in header_names if h in fl), None)
            if col:
                # DictReader fills missing columns of short rows with None
                # (default restval), so guard with `or ""` before strip()
                # instead of row.get(col, "") — the key exists with a None
                # value, and None.strip() would raise AttributeError.
                cells = ((row.get(col) or "").strip() for row in reader)
                keywords = [c for c in cells if c]
        else:
            keywords = [row[0].strip() for row in csv.reader(f) if row and row[0].strip()]

    # dict preserves insertion order (3.7+) -> order-stable de-duplication.
    return list(dict.fromkeys(keywords))


def assign_cluster_label(keywords: List[str]) -> str:
    """Assign a preset cluster label by token overlap; "UNKNOWN" if none match.

    Each preset keyword token (stopwords excluded) that also occurs in the
    input keywords counts one point; the first label with the highest
    positive score wins.
    """
    if not keywords:
        return "UNKNOWN"

    input_words = set()
    for kw in keywords:
        for tok in tokenize_korean(kw):
            if tok not in _STOPWORDS:
                input_words.add(tok)

    best_label, best_score = "UNKNOWN", 0
    for label, preset_kws in INSURANCE_CLUSTER_PRESETS.items():
        score = 0
        for pkw in preset_kws:
            for tok in tokenize_korean(pkw):
                if tok not in _STOPWORDS and tok in input_words:
                    score += 1
        # strictly greater keeps the first-max tie-breaking of max() over
        # dict insertion order, and a score of 0 never displaces UNKNOWN.
        if score > best_score:
            best_label, best_score = label, score
    return best_label


def cluster_keywords(keywords: List[str], n_clusters: int = 5) -> Dict:
    """Cluster keywords with TF-IDF + K-Means.

    Args:
        keywords: Non-empty list of search keywords.
        n_clusters: Requested cluster count (capped at len(keywords)).

    Returns:
        {"clusters": [...], "total_keywords": int, "silhouette_score": float}
        where each cluster entry carries id / label / representative_keyword /
        keywords / size / pillar_document_suggestion.

    Raises:
        ValueError: If *keywords* is empty.
    """
    if not keywords:
        raise ValueError("키워드 리스트가 비어 있습니다.")

    n = len(keywords)
    k = min(n_clusters, n)  # KMeans requires n_clusters <= n_samples

    # token_pattern must be None (not "") when a callable tokenizer is given,
    # otherwise sklearn emits a "token_pattern will not be used" UserWarning.
    vectorizer = TfidfVectorizer(analyzer="word", tokenizer=tokenize_korean, token_pattern=None, min_df=1)
    X = vectorizer.fit_transform(keywords)

    km = KMeans(n_clusters=k, random_state=42, n_init="auto")
    labels = km.fit_predict(X)
    # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1.
    sil = float(silhouette_score(X, labels)) if 1 < k < n else 0.0

    cluster_map: Dict[int, List[str]] = {i: [] for i in range(k)}
    for kw, lbl in zip(keywords, labels):
        cluster_map[int(lbl)].append(kw)

    X_dense = X.toarray()  # type: ignore[union-attr]
    centers = km.cluster_centers_
    clusters_out = []
    for cid in range(k):
        members = cluster_map[cid]
        if members:
            # Representative keyword = member closest (euclidean) to centroid.
            idxs = [i for i, lbl in enumerate(labels) if lbl == cid]
            dists = np.linalg.norm(X_dense[idxs] - centers[cid], axis=1)
            rep_kw = keywords[idxs[int(np.argmin(dists))]]
        else:
            rep_kw = ""

        label = assign_cluster_label(members)
        clusters_out.append(
            {
                "id": cid,
                "label": label,
                "representative_keyword": rep_kw,
                "keywords": members,
                "size": len(members),
                "pillar_document_suggestion": _PILLAR.get(label, _PILLAR["UNKNOWN"]),
            }
        )

    return {"clusters": clusters_out, "total_keywords": n, "silhouette_score": round(sil, 4)}


def build_report(cluster_result: Dict, output_path: Optional[str] = None) -> Optional[Dict]:
    """Persist the cluster result as pretty-printed JSON, or hand it back.

    When *output_path* is given the result is written there (UTF-8,
    non-ASCII preserved) and None is returned; otherwise the dict itself
    is returned unchanged.
    """
    if output_path is not None:
        with open(output_path, "w", encoding="utf-8") as fp:
            fp.write(json.dumps(cluster_result, ensure_ascii=False, indent=2))
        return None
    return cluster_result


def fetch_ga4_keywords(property_id: str, days: int = 30) -> List[str]:
    """Fetch site-search terms from the GA4 Data API for the last *days* days.

    Raises:
        RuntimeError: When GA4 is not configured or the client library is
            not installed.
    """
    if not is_ga4_configured():
        raise RuntimeError("GA4 미설정: GA4_PROPERTY_ID 환경변수와 인증 파일을 확인하세요.")
    if not _GA4_AVAILABLE:
        raise RuntimeError("google-analytics-data 미설치: pip install google-analytics-data")

    client = BetaAnalyticsDataClient()  # type: ignore[possibly-undefined]
    request = RunReportRequest(  # type: ignore[possibly-undefined]
        property=f"properties/{property_id}",
        dimensions=[Dimension(name="searchTerm")],  # type: ignore[possibly-undefined]
        metrics=[Metric(name="sessions")],  # type: ignore[possibly-undefined]
        date_ranges=[DateRange(start_date=f"{days}daysAgo", end_date="today")],  # type: ignore[possibly-undefined]
    )
    response = client.run_report(request)

    # Order-preserving de-duplication, dropping GA4's "(not set)" placeholder
    # and empty values.
    seen: set = set()
    terms: List[str] = []
    for row in response.rows:
        term = row.dimension_values[0].value
        if term in ("(not set)", "") or term in seen:
            continue
        seen.add(term)
        terms.append(term)
    return terms


def main() -> None:
    """CLI entry point: load keywords (CSV / inline / GA4), cluster, report."""
    p = argparse.ArgumentParser(description="GA4 검색어 TF-IDF + K-Means 클러스터링")
    src = p.add_mutually_exclusive_group(required=True)
    src.add_argument("--input", metavar="FILE")
    src.add_argument("--keywords", metavar="KW")
    src.add_argument("--ga4", action="store_true")
    p.add_argument("--clusters", type=int, default=5)
    p.add_argument("--output", metavar="FILE")
    p.add_argument("--property-id", metavar="ID")
    p.add_argument("--date-range", default="30d")
    args = p.parse_args()

    # Compare against None, not truthiness: `--keywords ""` must hit the
    # keyword branch (and fail the empty-input check below), not silently
    # fall through to the GA4 branch.
    if args.input is not None:
        kws = load_keywords_from_csv(args.input)
    elif args.keywords is not None:
        kws = [k.strip() for k in args.keywords.split(",") if k.strip()]
    else:
        pid = args.property_id or os.environ.get("GA4_PROPERTY_ID", "")
        if not pid:
            print("오류: --property-id 또는 GA4_PROPERTY_ID 환경변수 필요", file=sys.stderr)
            sys.exit(1)
        # "30d" -> 30; reject malformed values with a CLI error instead of
        # an uncaught ValueError traceback.
        try:
            days = int(args.date_range.rstrip("d"))
        except ValueError:
            print(f"오류: 잘못된 --date-range 값: {args.date_range}", file=sys.stderr)
            sys.exit(1)
        kws = fetch_ga4_keywords(pid, days)

    if not kws:
        print("오류: 입력 키워드가 없습니다.", file=sys.stderr)
        sys.exit(1)

    result = cluster_keywords(kws, n_clusters=args.clusters)
    build_report(result, args.output)
    if args.output:
        print(f"보고서 저장: {args.output}")
    else:
        print(json.dumps(result, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()
