
    i                         U d Z ddlZddlZddlZddlZddlZddlmZmZm	Z	 ddl
ZddlmZ ddlmZ ddlmZ ddlmZmZ 	 ddlmZ dd	lmZmZmZmZ d
ZdddddddZeeef   e d<   dhZ!dedee   fdZ"dedee   fdZ#dee   defdZ$d&dee   de%defdZ&d'dede	e   de	e   fd Z'd(d!ed"e%dee   fd#Z(d)d$Z)e*d%k(  r e)        yy# e$ r dZY w xY w)*u?  GA4 검색어 TF-IDF + K-Means 자동 클러스터링.

사용:
  python3 keyword_cluster.py --input queries.csv --clusters 5 --output report.json
  python3 keyword_cluster.py --keywords "보험료 계산,보험 종류" --clusters 3
  python3 keyword_cluster.py --ga4 --property-id 123456 --date-range 30d --clusters 5
    N)DictListOptional)KMeans)TfidfVectorizer)silhouette_score)INSURANCE_CLUSTER_PRESETSis_ga4_configured)BetaAnalyticsDataClient)	DateRange	DimensionMetricRunReportRequestTFu#   InsuWiki 보험료 완전 가이드u   InsuWiki 보험 가이드u'   InsuWiki 보험 가입 절차 가이드u*   InsuWiki 보험사 비교 추천 가이드u+   InsuWiki 연금·투자형 보험 가이드u    InsuWiki 보험 종합 가이드)COSTLEARNINGPROCESSTRUST
INVESTMENTUNKNOWN_PILLARu   보험textreturnc                 v    | r| j                         sg S | j                         D cg c]  }|s|	 c}S c c}w )u.   공백 기반 토크나이저 (konlpy 폴백).)stripsplit)r   ts     T/home/jay/workspace/.worktrees/task-2116-dev1/tools/geo-analytics/keyword_cluster.pytokenize_koreanr   /   s.    tzz|	zz|)!qA)))s   66	file_pathc                    t         j                  j                  |       st        d|        t	        | dd      5 }|j                         j                         ddd       sg S h d}t        fd|D              }g }t	        | dd      5 }|rt        j                  |      }|j                  xs g D ci c]  }|j                         | c}t        fdd	D        d      }|r|D cg c]C  }|j                  |d      j                         s$|j                  |d      j                         E }}nJt        j                  |      D cg c],  }|s|d
   j                         s|d
   j                         . }}ddd       t               }	g }
|D ])  }||	vs|	j!                  |       |
j#                  |       + |
S # 1 sw Y   oxY wc c}w c c}w c c}w # 1 sw Y   axY w)uX   CSV에서 검색어를 읽어 중복 제거 후 반환. 미존재 시 FileNotFoundError.u#   파일을 찾을 수 없습니다:  utf-8)newlineencodingN>   querykeyword
searchterm	   검색어c              3   d   K   | ]'  }|j                         d    j                         v  ) yw)r   N)
splitlineslower).0hcontents     r   	<genexpr>z)load_keywords_from_csv.<locals>.<genexpr>A   s-     QaQ',,.q17799Qs   -0c              3   2   K   | ]  }|v s|     y wN )r,   r-   fls     r   r/   z)load_keywords_from_csv.<locals>.<genexpr>H   s     b!Z[_aZa1bs   	
)r%   r'   r&   r(   r   )ospathexistsFileNotFoundErroropenreadr   anycsv
DictReader
fieldnamesr+   nextgetreadersetaddappend)r   fknown_headers
has_headerkeywordsr@   kcolrowseenuniquekwr.   r3   s               @@r   load_keywords_from_csvrN   6   s   77>>)$"Ei[ QRR	ig	6 #!&&(.."#	CMQ=QQJH	ig	6 [!^^A&F)/):):)@bB1!'')Q,BBb'VbdhiC@Fc#''RUWYJZJ`J`JbCGGC,224cc25**Q-Z333q6<<>AZHZ[ DF T>HHRLMM" M3# # C dZ[ [sS   G
*G&;GG&1%G"G9G&G!G!/G!G&
GG&&G/rG   c           	      &   | sy| D ch c]  }t        |      D ]  }|t        vs|  c}}t        j                         D ci c]  \  }}|t	        fd|D               c}}t        fd      }|   dkD  r|S dS c c}}w c c}}w )uZ   프리셋 키워드 토큰 매칭으로 클러스터 라벨 할당. 미매칭 시 UNKNOWN.r   c              3   ^   K   | ]$  }t        |      D ]  }|t        vs|v sd   & yw)   N)r   
_STOPWORDS)r,   pkwr   input_wordss      r   r/   z'assign_cluster_label.<locals>.<genexpr>^   s3     uOC4HuqAU_L_deitdt1u1us   --	-c                     |    S r1   r2   )lscoress    r   <lambda>z&assign_cluster_label.<locals>.<lambda>a   s    VAY     )keyr   )r   rR   r	   itemssummax)rG   rM   r   label
preset_kwsbestrT   rW   s         @@r   assign_cluster_labelra   W   s    '\1D\AQ[H[1\1\K ";!@!@!BE: 	su
uuuF v./D$<!#422 ]s   BBB
n_clustersc                    | st        d      t        |       }t        ||      }t        dt        dd      }|j                  |       }t        |dd      }|j                  |      }d|cxk  r|k  rn nt        t        ||            nd	}t        |      D 	ci c]  }	|	g  }
}	t        | |      D ]"  \  }}|
t        |         j                  |       $ |j                         }|j                  }g }t        |      D ]  }|
|   }|rrt!        |      D 	cg c]  \  }	}||k(  s|	 }}	}t"        j$                  j'                  ||   ||   z
  d
      }| |t        t#        j(                  |                  }nd}t+        |      }|j                  ||||t        |      t,        j/                  |t,        d         d        ||t1        |d      dS c c}	w c c}}	w )u~   TF-IDF + K-Means 클러스터링. 빈 리스트는 ValueError.

    Returns: {clusters, total_keywords, silhouette_score}
    u+   키워드 리스트가 비어 있습니다.wordr!   rQ   )analyzer	tokenizertoken_patternmin_df*   auto)rb   random_staten_initg        )axisr   )idr^   representative_keywordrG   sizepillar_document_suggestion   )clusterstotal_keywordsr   )
ValueErrorlenminr   r   fit_transformr   fit_predictfloatr   rangezipintrC   toarraycluster_centers_	enumeratenplinalgnormargminra   r   r?   round)rG   rb   nrH   
vectorizerXkmlabelssilicluster_maprM   lblX_densecentersclusters_outcidmembersidxsdistsrep_kwr^   s                         r   cluster_keywordsr   e   s   
 FGGHAJA &O[]fghJ  *A	12f	=B^^AF01A		% F+
,sC8=a(A1B(AK(Ax( )CCH$$R() iikG!!GLQx 
c"$-f$5D&!SADDDIINN74=73<#?aNHEd3ryy'7#89:FF$W-*0#G.5kk%AS.T		

* %uUXZ[}]]9 )B Es   
GG&Gcluster_resultoutput_pathc                     || S t        |dd      5 }t        j                  | |dd       ddd       y# 1 sw Y   yxY w)uG   JSON 파일 저장. output_path=None 이면 결과 딕셔너리 반환.Nwr"   )r$   F   ensure_asciiindent)r8   jsondump)r   r   rD   s      r   build_reportr      sG    	k3	1 CQ		.!%BCCs   6?property_iddaysc           	      P   t               st        d      t        st        d      t               }t	        d|  t        d      gt        d      gt        | dd	      g
      }|j                  |      }t        t        j                  d |j                  D                    S )uE   GA4 API에서 검색어 수집. 미설정/미설치 시 RuntimeError.uP   GA4 미설정: GA4_PROPERTY_ID 환경변수와 인증 파일을 확인하세요.uB   google-analytics-data 미설치: pip install google-analytics-datazproperties/
searchTerm)namesessionsdaysAgotoday)
start_dateend_date)property
dimensionsmetricsdate_rangesc              3      K   | ]9  }|j                   d    j                  dvs|j                   d    j                   ; yw)r   )z	(not set)r!   N)dimension_valuesvalue)r,   rJ   s     r   r/   z%fetch_ga4_keywords.<locals>.<genexpr>   s@      
.1#BVBVWXBYB_B_gxBxC  #))
s
    AA)r
   RuntimeError_GA4_AVAILABLEr   r   r   r   r   
run_reportlistdictfromkeysrows)r   r   clientreqresps        r   fetch_ga4_keywordsr      s    mnn_``$&F
{m,<01Z()TF'*:WMN	C S!D 
59YY
 	
 rY   c                     t        j                  d      } | j                  d      }|j                  dd       |j                  dd	       |j                  d
d       | j                  dt        d       | j                  dd       | j                  dd       | j                  dd       | j                         }|j                  rt        |j                        }n|j                  rH|j                  j                  d      D cg c]#  }|j                         s|j                         % }}n|j                  xs  t        j                  j                  dd      }|s0t        dt         j"                         t!        j$                  d       t'        |t	        |j(                  j+                  d                  }|s0t        dt         j"                         t!        j$                  d       t-        ||j.                        }t1        ||j2                         |j2                  rt        d|j2                          y t        t5        j6                  |d d!"             y c c}w )#Nu.   GA4 검색어 TF-IDF + K-Means 클러스터링)descriptionT)requiredz--inputFILE)metavarz
--keywordsKWz--ga4
store_true)actionz
--clusters   )typedefaultz--outputz--property-idIDz--date-range30d)r   ,GA4_PROPERTY_IDr!   u@   오류: --property-id 또는 GA4_PROPERTY_ID 환경변수 필요)filerQ   du)   오류: 입력 키워드가 없습니다.)rb   u   보고서 저장: Fr   r   )argparseArgumentParseradd_mutually_exclusive_groupadd_argumentr}   
parse_argsinputrN   rG   r   r   r   r4   environr?   printsysstderrexitr   
date_rangerstripr   rs   r   outputr   dumps)psrcargskwsrH   pidresults          r   mainr      s   ,\]A

(
($
(
7CY/\40W\2NN<c1N5NN:vN.NN?DN1NN>5N1<<>Dzz$TZZ0	"&--"5"5c":HQaggiqwwyHHG"**..1BB"GT[^[e[efHHQK c$//*@*@*E&FG9

Kcdmm<F%{{"4;;-01djjeA>?# Is   I-$I-__main__)r   r1   )   )r   N)+__doc__r   r;   r   r4   r   typingr   r   r   numpyr   sklearn.clusterr   sklearn.feature_extraction.textr   sklearn.metricsr   configr	   r
   google.analytics.data_v1betar   "google.analytics.data_v1beta.typesr   r   r   r   r   ImportErrorr   str__annotations__rR   r   rN   ra   r}   r   r   r   r   __name__r2   rY   r   <module>r      sX    
  	 
 ' '  " ; , ?D  N
 2+89?1c3h  Z
*# *$s) *c d3i B349 3 3.^tCy .^c .^$ .^b HSM XVZ^ C s DI , @F zF o  Ns   C CC