
    iv                         d Z ddlZddlZddlZddlmZ ddlmZ dede	e
   fdZde	e
   d	edeeef   fd
Zdde	e
   dede
fdZ	 	 	 ddedee   dededededede
fdZde	e
   de
fdZdefdZedk(  r ej*                   e              yy)u   
analyze_ab.py - haiku vs sonnet A/B 실험 분석 스크립트

Usage:
    python scripts/analyze_ab.py --input logs/ab_results.jsonl
    N)defaultdict)Optionalpathreturnc           	      n   g }t        | dd      5 }t        |d      D ]=  \  }}|j                         }|s	 |j                  t	        j
                  |             ? 	 ddd       |S # t        j                  $ r+}t        d| d| t        j                         Y d}~d}~ww xY w# 1 sw Y   |S xY w)	u!   JSONL 파일에서 결과 로드.rzutf-8)encoding   z[WARN] Line u    JSON 파싱 실패: fileN)
open	enumeratestripappendjsonloadsJSONDecodeErrorprintsysstderr)r   resultsfline_numlinees         C/home/jay/workspace/.worktrees/task-2116-dev1/scripts/analyze_ab.pyload_resultsr      s    G	dC'	* Za'1o 	ZNHd::<DZtzz$/0	ZZ N '' ZXJ.CA3GcjjYYZZ Ns4   &B*$A)B*)B'<!B"B*"B''B**B4r   modelc                    | D cg c],  }|j                  d      |k(  s|j                  dd      r+|. }}|sy|D cg c]  }|j                  dd       }}t        |      t        |      z  }|t        |      fS c c}w c c}w )u`   특정 모델의 FNR(False Negative Rate) 계산.

    Returns:
        (fnr, count) 튜플
    assigned_model
is_recheckF)        r   fnrr"   )getsumlen)r   r   r   model_results
fnr_valuesavg_fnrs         r   compute_fnrr*      s     !(o11551A+Be+KTUTYTYZfhmTnQoMo-:;!%%s#;J;*oJ/GC&&& p <s   A>A>A>B	thresholdc                    	 ddl m} | D cg c],  }|j                  d      dk(  s|j                  d	d
      r+|. }}| D cg c],  }|j                  d      dk(  s|j                  d	d
      r+|. }}t	        fd|D              }t        |      |z
  }t	        fd|D              }t        |      |z
  }	||g|	|gg}
	  ||
d      \  }}t        |      }t        |      }|
|t        d      k7  rt        |d      ndt        |d      t        |      t        |      dS # t        $ r	 ddddcY S w xY wc c}w c c}w # t        $ r}t        |      dddcY d}~S d}~ww xY w)u   Fisher's exact test 수행.

    실험군(haiku)과 대조군(sonnet)의 FNR을 비교.
    FNR >= threshold이면 'fail', 그렇지 않으면 'pass'로 이진 분류.

    Returns:
        dict with p_value, odds_ratio, table, counts
    r   )fisher_exactu)   scipy 미설치. pip install scipy 필요NERROR)errorp_valueverdictr    haikur!   Fsonnetc              3   N   K   | ]  }|j                  d d      k\  sd  ywr#   r"   r
   Nr$   .0r   r+   s     r   	<genexpr>z%fishers_exact_test.<locals>.<genexpr>B   s#     P1uc1Bi1OQP   %%c              3   N   K   | ]  }|j                  d d      k\  sd  ywr5   r6   r7   s     r   r9   z%fishers_exact_test.<locals>.<genexpr>D   s#     RA1553D	3QaRr:   z	two-sided)alternativeinf      )table
odds_ratior0   haiku_nsonnet_n)
scipy.statsr-   ImportErrorr$   r%   r&   	Exceptionstrfloatround)r   r+   r-   r   haiku_resultssonnet_results
haiku_fail
haiku_passsonnet_failsonnet_passr@   odds_ratio_rawp_value_rawr   or_valpv_vals    `              r   fishers_exact_testrT   -   s   
, !(q11551A+Bg+MVWV[V[\hjoVpQqMq!(sAAEE2B,Cx,OXYX]X]^jlqXrasNsPPPJ]#j0JRRRKn%3K*%['ABEF&25k&R# .)F+&F*0E%L*@eFA&e#}%' 1  
@
 	

 rs  FQDWEEFsP   D% D:D:D:D?D?1D?E %D76D7	E&E!E&!E&	haiku_fnrr0   rB   rC   alphafnr_threshold
min_samplec           
          ||k  s||k  rdd| d| d| ddS |dddS | |k  r||k  rd	d
| dd| d|dd| d	dS dd
| dd| d|dd| d	dS )u$   판정 로직: 채택/기각/연장.EXTENDu   표본 부족: haiku=z	, sonnet=u	    (최소 u    필요))r1   reasonr.   u   p-value 계산 실패ADOPTzFNR=.3fz (<z) AND p=z.6f)REJECTz (threshold=z), p=z (alpha= rU   r0   rB   rC   rV   rW   rX   s          r   determine_verdictrb   Y   s     x*4-gYizS]R^^fg
 	

 ".EFF= Wu_YsO3}oXgc]RUV[U\\]^
 	
 3|M?%PS}T\]b\ccde     c           
         t        d       }| D ]N  }|j                  dd      r|j                  dd      }|j                  dd      }|dv s?||   |xx   dz  cc<   P i }t        |j                               D ]a  \  }}|d	   |d
   z   }|dk(  r|d	   |z  dz  }|d
   |z  dz  }	|d	   |d
   |t	        |d      t	        |	d      t        |dz
        dk  d||<   c |S )u#   층화 추출 균등 분배 검증.c                      dddS )Nr   r2   r3   r`   r`   rc   r   <lambda>z&stratification_check.<locals>.<lambda>z   s    AYZB[ rc   r!   F
task_levelunknownr    rf   r
   r2   r3   r   d   g      I@g       @)r2   r3   total	haiku_pct
sonnet_pctbalanced)r   r$   sorteditemsrI   abs)
r   level_countsr   levelr   reportcountsrk   rl   rm   s
             r   stratification_checkrv   x   s"   .9:[.\L ,55u%lI.&	2''&!+&, F 2 2 45 
vw&"22A:7Oe+c1	H%-3
G_X&y!,
A.I,-4
u
 Mrc   c            	         t        j                  d      } | j                  ddd       | j                  dt        dd	
       | j                  dt        dd
       | j                  dt        dd
       | j                         }t        |j                        }|s%t        t        j                  ddddd             yt        dt        |       dt        j                         t        |d      \  }}t        |d      \  }}t        d|dd| d t        j                         t        d!|dd| d t        j                         t        ||j                   "      }t#        ||j%                  d#      |||j&                  |j                   |j(                  $      }t+        |      }	t        |      t-        |d%      |d&t-        |d%      |d&|||	d'}
t        t        j                  |
dd             y())Nu!   haiku vs sonnet A/B 실험 분석)descriptionz--inputTu   JSONL 입력 파일 경로)requiredhelpz--threshold333333?u   FNR 임계값 (기본: 0.15))typedefaultrz   z--alpha皙?u   유의 수준 (기본: 0.05)z--min-sample   u"   최소 표본 크기 (기본: 150)u   결과 데이터 없음r.   )r/   r1   F   )ensure_asciiindentr
   u	   [AB] 총 u
   건 로드r   r2   r3   z[AB] haiku FNR=r]   z (n=r^   z[AB] sonnet FNR=)r+   r0   ra   r>   )r#   n)total_recordsr2   r3   r-   r1   stratificationr   )argparseArgumentParseradd_argumentrH   int
parse_argsr   inputr   r   dumpsr&   r   r   r*   rT   r+   rb   r$   rV   rX   rv   rI   )parserargsr   rU   rB   
sonnet_fnrrC   fisherr1   stratoutputs              r   mainr      s   $$1TUF
	D7ST
E4Fde
	tB`a
S#DhiD4::&Gdjj#<Q`enopq	Ic'l^:
.SZZ@$Wg6Iw&w9J	OIc?$wiq
9

K	Z,D
!
<3::N4>>BF

9%jjnn??G !)E Wy!,7;
A.X>F 
$**V%
:;rc   __main__)r{   )r~   r{   r   )__doc__r   r   r   collectionsr   typingr   rG   listdictr   tuplerH   r   r*   rT   rb   rv   r   __name__exitr`   rc   r   <module>r      s     
 # s tDz 'd 'C 'E%*4E ')T
 )u ) )b e_  	
    
>$t*  :-c -` zCHHTV rc   