
    Ri|7              	       ^   d Z ddlZddlZddlmZmZ dZddddd	d
dddddddddgddidZ G d dej                        Z	 G d dej                        Z
 G d dej                        Z G d dej                        Z G d dej                        Zedk(  r ej                          yy) u&   test_judge.py - judge 모듈 테스트    N)	MagicMockpatchuk  name: skill-quality
version: 1
description: "스킬 품질 체크리스트"
items:
  - id: clarity
    question: "지시문이 명확한가?"
    weight: 2.0
  - id: examples
    question: "예시가 포함되어 있는가?"
    weight: 1.0
  - id: no_ambiguity
    question: "모호한 표현이 없는가?"
    weight: 1.5
scoring:
  method: "weighted_average"
skill-quality   u   스킬 품질 체크리스트clarity   지시문이 명확한가?       @)idquestionweightexamples!   예시가 포함되어 있는가?      ?no_ambiguityu   모호한 표현이 없는가?g      ?methodweighted_averagenameversiondescriptionitemsscoringc                   0    e Zd ZdZd ZddZd Zd Zd Zy)	TestLoadChecklistu   load_checklist 함수 테스트c                     ddl m} || _        y )Nr   )load_checklist)autoresearch.judger   )selfr   s     V/home/jay/workspace/.worktrees/task-2117-dev1/scripts/autoresearch/tests/test_judge.pysetUpzTestLoadChecklist.setUp)   s    5,    Nc                    ddl }ddl}|j                  ddd      5 }|j                  t               |j
                  }ddd       	 | j                        }| j                  |d   d       | j                  |d	   d
       | j                  t        |d         d       | j                  |d   d   d   d       | j                  |d   d   d   d       |j                  |       y# 1 sw Y   xY w# |j                         w xY w)u   정상 YAML 파일 로드r   Nw.yamlFmodesuffixdeleter   r   r   r   r      r
   r   r   r	   )
ostempfileNamedTemporaryFilewriteSAMPLE_CHECKLIST_YAMLr   r   assertEquallenunlink)r   tmp_pathr*   r+   fpathresults          r   test_load_normal_yamlz'TestLoadChecklist.test_load_normal_yaml.   s    ((c'%(P 	TUGG)*66D		((.FVF^_=VI.2S115VG_Q/5yAVG_Q/93?IIdO	 	 IIdOs   "C)BC5 )C25Dc                 j   ddl }ddl}d}|j                  ddd      5 }|j                  |       |j                  }ddd       	 | j                  t              5  | j                         ddd       |j                         y# 1 sw Y   KxY w# 1 sw Y   'xY w# |j                         w xY w)u"   items 7개 이상이면 ValueErrorr   Na}  name: test
version: 1
description: "test"
items:
  - {id: i1, question: "q1", weight: 1.0}
  - {id: i2, question: "q2", weight: 1.0}
  - {id: i3, question: "q3", weight: 1.0}
  - {id: i4, question: "q4", weight: 1.0}
  - {id: i5, question: "q5", weight: 1.0}
  - {id: i6, question: "q6", weight: 1.0}
  - {id: i7, question: "q7", weight: 1.0}
scoring:
  method: "weighted_average"
r#   r$   Fr%   )	r*   r+   r,   r-   r   assertRaises
ValueErrorr   r1   )r   r*   r+   yaml_contentr3   r4   s         r   &test_too_many_items_raises_value_errorz8TestLoadChecklist.test_too_many_items_raises_value_error@   s     ((c'%(P 	TUGGL!66D		"":. *##D)* IIdO	 	* * IIdOs/   BB B-B BBB B2c                 R   ddl }ddl}d}|j                  ddd      5 }|j                  |       |j                  }ddd       	 | j                        }| j                  t        |d         d	       |j                  |       y# 1 sw Y   KxY w# |j                         w xY w)
u   items 정확히 6개는 통과r   NaS  name: test
version: 1
description: "test"
items:
  - {id: i1, question: "q1", weight: 1.0}
  - {id: i2, question: "q2", weight: 1.0}
  - {id: i3, question: "q3", weight: 1.0}
  - {id: i4, question: "q4", weight: 1.0}
  - {id: i5, question: "q5", weight: 1.0}
  - {id: i6, question: "q6", weight: 1.0}
scoring:
  method: "weighted_average"
r#   r$   Fr%   r      )	r*   r+   r,   r-   r   r   r/   r0   r1   )r   r*   r+   r:   r3   r4   r5   s          r   test_exactly_six_items_okz+TestLoadChecklist.test_exactly_six_items_ok]   s     ((c'%(P 	TUGGL!66D		((.FS115IIdO	 	 IIdOs   B/B BB&c                 N   ddl }ddl}|j                  ddd      5 }|j                  t               |j
                  }ddd       	 | j                        }dD ]  }| j                  ||        	 |j                  |       y# 1 sw Y   GxY w# |j                         w xY w)u*   반환 dict에 필수 필드 포함 확인r   Nr#   r$   Fr%   r   )	r*   r+   r,   r-   r.   r   r   assertInr1   )r   r*   r+   r3   r4   r5   fields          r   &test_returns_dict_with_required_fieldsz8TestLoadChecklist.test_returns_dict_with_required_fieldsy   s    ((c'%(P 	TUGG)*66D		((.FO -eV,- IIdO	 	 IIdOs   "B*B BB$)N)	__name__
__module____qualname____doc__r    r6   r;   r>   rB    r!   r   r   r   &   s    )-
$:8r!   r   c                   :    e Zd ZdZd Zd Zd Zd Zd Zd Z	d Z
y	)
TestFormatChecklistForPromptu,   format_checklist_for_prompt 함수 테스트c                     ddl m} || _        y )Nr   )format_checklist_for_prompt)r   rK   format_checklist)r   rK   s     r   r    z"TestFormatChecklistForPrompt.setUp   s    B ;r!   c                     | j                  t              }| j                  d|       | j                  d|       | j                  d|       y)u   출력에 item id 포함r   r   r   NrL   SAMPLE_CHECKLIST_DICTr@   r   r5   s     r   test_format_contains_idz4TestFormatChecklistForPrompt.test_format_contains_id   s=    &&'<=i(j&)nf-r!   c                 v    | j                  t              }| j                  d|       | j                  d|       y)u   출력에 질문 포함r   r   NrN   rP   s     r   test_format_contains_questionz:TestFormatChecklistForPrompt.test_format_contains_question   s1    &&'<=2F;96Br!   c                     | j                  t              }| j                  d|       | j                  d|       | j                  d|       y)u   출력에 weight 포함z2.0z1.0z1.5NrN   rP   s     r   test_format_contains_weightz8TestFormatChecklistForPrompt.test_format_contains_weight   s=    &&'<=eV$eV$eV$r!   c                     | j                  t              }|j                  d      D cg c]  }|j                         s| }}| j	                  t        |      d       yc c}w )u,   각 항목이 별도 줄에 있는지 확인
r)   N)rL   rO   splitstripassertGreaterEqualr0   )r   r5   lliness       r   &test_format_each_item_on_separate_linezCTestFormatChecklistForPrompt.test_format_each_item_on_separate_line   sP    &&'<="LL.<q!'')<<E
A. =s
   A!A!c                    | j                  t              }|j                  d      D cg c]#  }|j                         s|j                         % }}|D ]&  }| j	                  |j                  d      d|       ( yc c}w )u,   각 항목이 '- '로 시작하는지 확인rW   z- zLine does not start with '- ': N)rL   rO   rX   rY   
assertTrue
startswith)r   r5   r[   r\   lines        r   test_format_dash_prefixz4TestFormatChecklistForPrompt.test_format_dash_prefix   su    &&'<=$*LL$6Dq!'')DD 	_DOODOOD15TUYT\3]^	_ Es
   A?A?c                 R    | j                  t              }| j                  d|       y)u!   'weight:' 키워드 포함 확인zweight:NrN   rP   s     r   test_format_weight_keywordz7TestFormatChecklistForPrompt.test_format_weight_keyword   s!    &&'<=i(r!   N)rC   rD   rE   rF   r    rQ   rS   rU   r]   rb   rd   rG   r!   r   rI   rI      s*    6<
.C%/_)r!   rI   c                   (    e Zd ZdZd Zd Zd Zd Zy)TestBuildJudgePromptu#   build_judge_prompt 함수 테스트c                     ddl m} || _        y )Nr   )build_judge_prompt)r   rh   )r   rh   s     r   r    zTestBuildJudgePrompt.setUp   s    9"4r!   c                 T    | j                  t        d      }| j                  d|       y)u#   프롬프트에 skill_output 포함u   결과물 내용Nrh   rO   r@   rP   s     r   test_contains_skill_outputz/TestBuildJudgePrompt.test_contains_skill_output   s%    (()>@RS(&1r!   c                 T    | j                  t        d      }| j                  d|       y)u-   프롬프트에 체크리스트 항목 포함	   결과물r   Nrj   rP   s     r   test_contains_checklist_itemsz2TestBuildJudgePrompt.test_contains_checklist_items   s#    (()>Li(r!   c                 \    | j                  t        d      }| j                  |t               y)u   문자열 반환 확인rm   N)rh   rO   assertIsInstancestrrP   s     r   test_returns_stringz(TestBuildJudgePrompt.test_returns_string   s%    (()>Lfc*r!   N)rC   rD   rE   rF   r    rk   rn   rr   rG   r!   r   rf   rf      s    -5
2
)
+r!   rf   c                   H    e Zd ZdZd ZddZd Zd Zd Zd Z	d Z
d	 Zd
 Zy)TestParseJudgeResponseu%   parse_judge_response 함수 테스트c                     ddl m} || _        y )Nr   )parse_judge_response)r   rv   parse)r   rv   s     r   r    zTestParseJudgeResponse.setUp   s    ;)
r!   c                 <    dd l }|||d}|j                  |d      S Nr   r   total_scoresummaryFensure_asciijsondumps)r   
items_datar{   r|   r   datas         r   _make_json_responsez*TestParseJudgeResponse._make_json_response   s,      &

 zz$Uz33r!   c                    ddddddddddd	dg}| j                  |      }| j                  |t              }| j                  t	        |d
         d       | j                  |d
   d   d   d       | j                  |d
   d   d   d       | j                  |d   t               | j                  |d   t               y)u   정상 JSON 파싱r   PASS	   명확함r
   r5   reasonr   FAIL   예시 없음r      모호함 없음r   r)   r   r5   r   r{   r|   N)r   rw   rO   r/   r0   rp   floatrq   r   r   responser5   s       r   test_parse_normal_responsez1TestParseJudgeResponse.test_parse_normal_response   s     +F?K!V?QR


 ++J7H&;<VG_-q1+H5v>+H5v>f]3U;fY/5r!   c                     ddddddddddd	dg}| j                  |d
      }| j                  |t              }d}| j                  |d   |d       y)u   total_score 가중평균 직접 재계산 검증

        checklist weights: clarity=2.0, examples=1.0, no_ambiguity=1.5
        PASS: clarity(2.0), no_ambiguity(1.5) -> 3.5 / 4.5 = 0.7777...
        FAIL: examples(1.0)
        r   r   r   r   r   r   r   r   r   g      ?r{   g98?r{      placesNr   rw   rO   assertAlmostEqual)r   r   r   r5   expecteds        r   #test_weighted_average_recalculationz:TestParseJudgeResponse.test_weighted_average_recalculation   sq     +F?K!V?QR

 ++JC+HH&;<8vm4hqIr!   c                     ddddddddddddg}| j                  |d      }| j                  |t              }| j                  |d	   d
d       y)u   모두 PASS이면 score=1.0r   r   okr   r   r           r   r{   r   r   r   Nr   r   s       r   test_all_pass_score_is_onez1TestParseJudgeResponse.test_all_pass_score_is_one  k     $?4@!VtD


 ++JC+HH&;<vm4c!Dr!   c                     ddddddddddddg}| j                  |d      }| j                  |t              }| j                  |d	   d
d       y)u   모두 FAIL이면 score=0.0r   r   nor   r   r   r   r   r{   r   r   r   Nr   r   s       r   test_all_fail_score_is_zeroz2TestParseJudgeResponse.test_all_fail_score_is_zero  r   r!   c                     ddddddddddddg}| j                  |      }| j                  |t              }dD ]  }| j                  ||        y)	u   반환 dict 키 확인r   r   r   r   r   r   rz   N)r   rw   rO   r@   )r   r   r   r5   keys        r   test_result_keysz'TestParseJudgeResponse.test_result_keys  si     $?4@!VtD


 ++J7H&;<6 	'CMM#v&	'r!   c                     | j                  t        t        f      5  | j                  dt               ddd       y# 1 sw Y   yxY w)u!   JSON 파싱 실패 시 ValueErroru   이건 JSON이 아닙니다N)r8   r9   	Exceptionrw   rO   )r   s    r   test_invalid_json_raisesz/TestParseJudgeResponse.test_invalid_json_raises'  s>    
I67 	MJJ46KL	M 	M 	Ms	   <Ac                     ddl }ddddddddddddg}|d	d
d}|j                  |d      }d| d}| j                  |t              }| j	                  |       | j                  d|       y)u/   텍스트에 JSON이 포함된 경우도 파싱r   Nr   r   r   r   r   r   ?   좋음rz   Fr}   u"   다음은 채점 결과입니다:
u   

이상입니다.r   )r   r   rw   rO   assertIsNotNoner@   )r   r   r   r   json_strresponse_with_textr5   s          r   test_json_embedded_in_textz1TestParseJudgeResponse.test_json_embedded_in_text,  s     $?4@!VtD


 $CHM::d:7B8*L`a.0EFV$gv&r!   N)r   r   )rC   rD   rE   rF   r    r   r   r   r   r   r   r   r   rG   r!   r   rt   rt      s6    /*
46 J&	E	E
'M
'r!   rt   c                   l    e Zd ZdZddZ ed      d        Z ed      d        Z ed      d        Zy)	TestJudgeOutputu2   judge_output 함수 테스트 (call_claude 모킹)c                 8    dd l }|j                  |||dd      S ry   r   )r   r   r{   r|   r   s        r   _make_llm_jsonzTestJudgeOutput._make_llm_json@  s.    zz#*"
   
 	
r!   zautoresearch.judge.call_claudec                 <   ddddddddddddg}| j                  |      }||_        ddlm}  |t        d	      }| j                  d
|       | j                  d|       | j                  d|       | j                  |d   d       | j                  |d   d       y)u4   judge_output이 파싱 결과 + 토큰 정보 반환r   r   r   r   r   r   r   judge_outputrm   r   r{   r|   input_tokensoutput_tokensN)r   return_valuer   r   rO   r@   assertGreater)r   mock_call_clauder   	json_textr   r5   s         r   ,test_judge_output_returns_result_with_tokensz<TestJudgeOutput.test_judge_output_returns_result_with_tokensL  s     $?4@!VtD


 ''
3	(1%33[Agv&mV,i(6.1156/2A6r!   c                     ddddddddddddg}| j                  |      |_        ddlm}  |t        d	       |j
                  }| j                  d
|j                  d          y)u#   기본 모델이 haiku인지 확인r   r   r   r   r   r   r   r   rm   haikumodelN)r   r   r   r   rO   	call_argsr@   kwargs)r   r   r   r   call_kwargss        r   %test_judge_output_default_model_haikuz5TestJudgeOutput.test_judge_output_default_model_haikua  sq     $?4@!VtD


 )-(;(;J(G%3*K8&00g{11':;r!   c                     ddddddddddddg}| j                  |      |_        ddlm}  |t        d	       |j                          y
)u%   call_claude가 호출되는지 확인r   r   r   r   r   r   r   r   rm   N)r   r   r   r   rO   assert_called_once)r   r   r   r   s       r   #test_judge_output_calls_call_claudez3TestJudgeOutput.test_judge_output_calls_call_clauder  sY     $?4@!VtD


 )-(;(;J(G%3*K8++-r!   N)r   LGTM)	rC   rD   rE   rF   r   r   r   r   r   rG   r!   r   r   r   =  sX    <

 +,7 -7( +,< -<  +,. -.r!   r   __main__)rF   iounittestunittest.mockr   r   r.   rO   TestCaser   rI   rf   rt   r   rC   mainrG   r!   r   <module>r      s    , 	  * & 2&BcR'JVYZ+KWZ[
 ,-
 `)) `F,)8#4#4 ,)^+8,, +0j'X.. j'ZC.h'' C.L zHMMO r!   