
    j(/                    x   d Z ddlmZ ddlZddlmc mZ ddl	Z	ddl
Z
ddlZej                  j                  dd       ddlZddlmZ ddlmZmZ ddlmZ e
j                  j-                  e
j                  j/                  e      d      Zdd	Zdd
Z G d d      Z G d d      Z G d d      Z G d d      Zy)u  Regression replay tests for task-2703/2704/2705+3 using fixtures.

chair_authorization_id=CHAIR-AUTH-TASK-2706-V36-FINISH-TASK-PROFILE-LAYER-P1B-260529

Tests §13 regression scenarios:
- task-2703: task_mode=system_hook, G3=PASS, overall=PASS
- task-2704: task_mode=code, G3=PASS, G4=WARN (INHERITED_DIRTY), overall=WARN
- task-2705+3: task_mode=code, G3=ESCALATE (forbidden+misfire), G4=FAIL (EXTERNAL_DIRTY), overall=ESCALATE
    )annotationsNz/home/jay/workspace)classify_task_mode)evaluate_gatescompute_overall)validate_profilefixturesc                    t         j                  j                  t        |       }t	        |dd      5 }t        j                  |      cd d d        S # 1 sw Y   y xY w)Nrzutf-8)encoding)ospathjoinFIXTURES_DIRopenjsonload)namer   fs      L/home/jay/workspace/tests/harness/test_v36_finish_task_profile_regression.py_load_fixturer      sD    77<<d+D	dC'	* ayy|  s   AAc                    | j                  dd      }| j                  di       }| j                  dd      }t        |      }|d   }t        ||      }t        |      \  }}||||||dS )z>Run classifier + judge on a replay fixture and return summary.task_md_text_fragment gate_inputstask_idUNKNOWN	task_mode)r   r   classificationgatesoverall_resultoverall_rationale)getr   r   r   )	fixturetask_md_textr   r   r   r   r   r    r!   s	            r   _run_replayr%       s    ;;6;L++mR0Kkk)Y/G'5N{+I9k2E(7(>%N% ((.     c                  (    e Zd ZdZd Zd Zd Zd Zy)TestTask2703Regressionu+   §13.1 task-2703 replay fixture validation.c                z   t        d      }t        |      }|d   }d}||k(  }|st        j                  d|fd||f      t        j                  |      t        j                  |      dz  }t        j
                  d|d   d|d	   d
          dz   d|iz  }t        t        j                  |            d x}x}}y )Nv36_task_2703_replay.jsonr   system_hook==z%(py1)s == %(py4)spy1py4z/task-2703: expected task_mode=system_hook, got 	; signal=r   signal
>assert %(py6)spy6r   r%   
@pytest_ar_call_reprcompare	_saferepr_format_assertmsgAssertionError_format_explanationselfr#   replay@py_assert0@py_assert3@py_assert2@py_format5@py_format7s           r   $test_task_2703_task_mode_system_hookz;TestTask2703Regression.test_task_2703_task_mode_system_hook=   s     ;<W%k" 	
m 	
"m3 	
 	
"m 	
 	
 
	 # 	
 	
 
	 '4 	
 	
  >f[>Q=T U-.x89;	
 	
 	
 	
 	
 	
r&   c                   t        d      }t        |      }|d   d   d   }d}||k(  }|st        j                  d|fd||f      dt	        j
                         v st        j                  |      rt        j                  |      ndt        j                  |      d	z  }t        j                  d
|d|d   d   d          dz   d|iz  }t        t        j                  |            d x}}y )Nr*   r   G3_scope_guardresultPASSr,   z%(py0)s == %(py3)s	g3_resultpy0py3z!task-2703: expected G3=PASS, got ; rationale=	rationale
>assert %(py5)spy5r   r%   r7   r8   @py_builtinslocals_should_repr_global_namer9   r:   r;   r<   r>   r#   r?   rK   rB   @py_assert1@py_format4@py_format6s           r   test_task_2703_g3_passz-TestTask2703Regression.test_task_2703_g3_passE        ;<W%7O$45h?	" 	
yF" 	
 	
yF 	
 	
 
6	
 	
   	
 	
 
	  	
 	
 
	 # 	
 	
  0	} =)9:;GHJ	
 	
 	
 	
 	
r&   c                r   t        d      }t        |      }|d   }d}||v }|st        j                  d|fd||f      t        j                  |      t        j                  |      dz  }t        j
                  d|d   d|d	          d
z   d|iz  }t        t        j                  |            d x}x}}y )Nr*   r    )rI   WARNin)z%(py1)s in %(py4)sr/   z+task-2703: expected overall=PASS/WARN, got rO   r!   r4   r5   r6   r=   s           r   test_task_2703_overall_passz2TestTask2703Regression.test_task_2703_overall_passN   s     ;<W% &' 	
+; 	
'+;; 	
 	
'+; 	
 	
 
	 ( 	
 	
 
	 ,< 	
 	
  :&AQ:R9U V 3457	
 	
 	
 	
 	
 	
r&   c           	        t        d      }t        |      }ddlm} ||d   |d   |d   d   |d   d   |d   d	   d
|d   |d   |d   dd}t	        |      \  }}|s~t        j                  d|       dz   ddt        j                         v st        j                  |      rt        j                  |      ndiz  }t        t        j                  |            y )Nr*   r   SCHEMA_VERSIONr   r   r   r3   priority_applied
confidencer3   re   rf   r   r    r!   2026-05-29T00:00:00+00:00schema_versionr   r   !task_mode_classification_evidencer   r    r!   tsz(task-2703 replay profile schema errors: 
>assert %(py0)srM   valid)r   r%   .scripts.harness.v36.finish_task_profile_schemard   r   r7   r:   rT   rU   rV   r9   r;   r<   )r>   r#   r?   rd   profilern   errors@py_format1s           r   #test_task_2703_fixture_schema_validz:TestTask2703Regression.test_task_2703_fixture_schema_validX   s     ;<W%Q,i(, !128<$*+;$<=O$P$%56|D2
 G_$%56!'(;!<-
 )1vII@IIIIIIIuIIIuIIIIIur&   N)__name__
__module____qualname____doc__rE   r[   ra   rs    r&   r   r(   r(   :   s    5


Jr&   r(   c                  .    e Zd ZdZd Zd Zd Zd Zd Zy)TestTask2704Regressionu+   §13.2 task-2704 replay fixture validation.c                z   t        d      }t        |      }|d   }d}||k(  }|st        j                  d|fd||f      t        j                  |      t        j                  |      dz  }t        j
                  d|d   d|d	   d
          dz   d|iz  }t        t        j                  |            d x}x}}y )Nv36_task_2704_replay.jsonr   coder,   r.   r/   z(task-2704: expected task_mode=code, got r2   r   r3   r4   r5   r6   r=   s           r   test_task_2704_task_mode_codez4TestTask2704Regression.test_task_2704_task_mode_codeu   s     ;<W%k" 	
f 	
"f, 	
 	
"f 	
 	
 
	 # 	
 	
 
	 '- 	
 	
  7vk7J6M N-.x89;	
 	
 	
 	
 	
 	
r&   c                   t        d      }t        |      }|d   d   d   }d}||k(  }|st        j                  d|fd||f      dt	        j
                         v st        j                  |      rt        j                  |      ndt        j                  |      d	z  }t        j                  d
|d|d   d   d          dz   d|iz  }t        t        j                  |            d x}}y )Nr|   r   rG   rH   rI   r,   rJ   rK   rL   z!task-2704: expected G3=PASS, got rO   rP   rQ   rR   rS   rW   s           r   test_task_2704_g3_passz-TestTask2704Regression.test_task_2704_g3_pass}   r\   r&   c                   t        d      }t        |      }|d   d   d   }d}||k(  }|st        j                  d|fd||f      dt	        j
                         v st        j                  |      rt        j                  |      ndt        j                  |      d	z  }t        j                  d
|d|d   d   d          dz   d|iz  }t        t        j                  |            d x}}|d   d   }d}	|j                  }
d}d} |
||      }|j                  }d}d} |||      }||z   }|	|v }|st        j                  d|fd|	|f      t        j                  |	      dt	        j
                         v st        j                  |      rt        j                  |      ndt        j                  |
      t        j                  |      t        j                  |      t        j                  |      dt	        j
                         v st        j                  |      rt        j                  |      ndt        j                  |      t        j                  |      t        j                  |      t        j                  |      dz  }t        j                  d      dz   d|iz  }t        t        j                  |            d x}	x}x}
x}x}x}x}x}x}x}}y )Nr|   r   G4_dirty_workspacerH   r^   r,   rJ   	g4_resultrL   z!task-2704: expected G4=WARN, got rO   rP   rQ   rR   INHERITED_DIRTYevidencer   r_   )z%(py1)s in (%(py11)s
{%(py11)s = %(py5)s
{%(py5)s = %(py3)s.get
}(%(py7)s, %(py9)s)
} + %(py20)s
{%(py20)s = %(py14)s
{%(py14)s = %(py12)s.get
}(%(py16)s, %(py18)s)
})g4_info)r0   rN   rR   py7py9py11py12py14py16py18py20z,task-2704: G4 should mention INHERITED_DIRTYz
>assert %(py23)spy23)r   r%   r7   r8   rT   rU   rV   r9   r:   r;   r<   r"   )r>   r#   r?   r   rB   rX   rY   rZ   r   r@   @py_assert4@py_assert6@py_assert8@py_assert10@py_assert13@py_assert15@py_assert17@py_assert19@py_assert21@py_format22@py_format24s                        r   &test_task_2704_g4_warn_inherited_dirtyz=TestTask2704Regression.test_task_2704_g4_warn_inherited_dirty   s    ;<W%7O$89(C	" 	
yF" 	
 	
yF 	
 	
 
6	
 	
   	
 	
 
	  	
 	
 
	 # 	
 	
  0	} =)=>{KLN	
 	
 	
 	
 	

 /"67  	
W[[ 	
 	
R 	
[R%@ 	
7;; 	
{ 	
\^ 	
;{\^C_ 	
%@C_%_ 	
 %_` 	
 	
 	
 %_ 	
 	
 		 ! 	
 	
	6	
 	
  &- 	
 	
 		 &- 	
 	
 		 &1 	
 	
 		 2< 	
 	
 		 >@ 	
 	
 		 &A 	
 	
	6	
 	
  DK 	
 	
 		 DK 	
 	
 		 DO 	
 	
 		 P[ 	
 	
 		 ]_ 	
 	
 		 D` 	
 	
  ;	
 	
 	
 	
 	
 	
 	
 	
r&   c                t   t        d      }t        |      }|d   }d}||k(  }|st        j                  d|fd||f      t        j                  |      t        j                  |      dz  }t        j
                  d|d   d|d	          d
z   d|iz  }t        t        j                  |            d x}x}}y )Nr|   r    r^   r,   r.   r/   z&task-2704: expected overall=WARN, got rO   r!   r4   r5   r6   r=   s           r   test_task_2704_overall_warnz2TestTask2704Regression.test_task_2704_overall_warn   s     ;<W%&' 	
6 	
'61 	
 	
'6 	
 	
 
	 ( 	
 	
 
	 ,2 	
 	
  5V<L5M4P Q 3457	
 	
 	
 	
 	
 	
r&   c                \   t        d      }t        |      }|d   }d}||k7  }|st        j                  d|fd||f      t        j                  |      t        j                  |      dz  }t        j
                  d      dz   d	|iz  }t        t        j                  |            d x}x}}y )
Nr|   r    ESCALATE)!=)z%(py1)s != %(py4)sr/   z:task-2704: should not ESCALATE (ACCEPT_WITH_KNOWN_CAVEATS)r4   r5   r6   r=   s           r   test_task_2704_no_escalatez1TestTask2704Regression.test_task_2704_no_escalate   s     ;<W%&' 	
: 	
':5 	
 	
': 	
 	
 		 ( 	
 	
 		 ,6 	
 	
  I	
 	
 	
 	
 	
 	
r&   N)	rt   ru   rv   rw   r~   r   r   r   r   rx   r&   r   rz   rz   r   s    5




r&   rz   c                  4    e Zd ZdZd Zd Zd Zd Zd Zd Z	y)	TestTask2705Plus3Regressionu<   §13.3 task-2705+3 replay — core scope-guard misfire case.c                z   t        d      }t        |      }|d   }d}||k(  }|st        j                  d|fd||f      t        j                  |      t        j                  |      dz  }t        j
                  d|d   d|d	   d
          dz   d|iz  }t        t        j                  |            d x}x}}y )Nv36_task_2705plus3_replay.jsonr   r}   r,   r.   r/   z*task-2705+3: expected task_mode=code, got r2   r   r3   r4   r5   r6   r=   s           r   "test_task_2705plus3_task_mode_codez>TestTask2705Plus3Regression.test_task_2705plus3_task_mode_code   s     @AW%k" 	
f 	
"f, 	
 	
"f 	
 	
 
	 # 	
 	
 
	 '- 	
 	
  99L8O P-.x89;	
 	
 	
 	
 	
 	
r&   c                   t        d      }t        |      }|d   d   d   }d}||k(  }|st        j                  d|fd||f      dt	        j
                         v st        j                  |      rt        j                  |      ndt        j                  |      d	z  }t        j                  d
|d|d   d   d          dz   d|iz  }t        t        j                  |            dx}}y)zHG3 must ESCALATE: session-watchdog.sh forbidden + large scope violation.r   r   rG   rH   r   r,   rJ   rK   rL   z;task-2705+3: expected G3=ESCALATE (forbidden+misfire), got rO   rP   rQ   rR   NrS   rW   s           r   test_task_2705plus3_g3_escalatez;TestTask2705Plus3Regression.test_task_2705plus3_g3_escalate   s     @AW%7O$45h?	& 	
yJ& 	
 	
yJ 	
 	
 
6	
 	
   	
 	
 
	  	
 	
 
	 ' 	
 	
  J) W)9:;GHJ	
 	
 	
 	
 	
r&   c                   t        d      }t        |      }|d   d   d   }d}||k(  }|st        j                  d|fd||f      dt	        j
                         v st        j                  |      rt        j                  |      ndt        j                  |      d	z  }t        j                  d
|d|d   d   d          dz   d|iz  }t        t        j                  |            dx}}y)z)G4 must FAIL: EXTERNAL_DIRTY 1041+ files.r   r   r   rH   FAILr,   rJ   r   rL   z:task-2705+3: expected G4=FAIL (EXTERNAL_DIRTY 1041+), got rO   rP   rQ   rR   NrS   )r>   r#   r?   r   rB   rX   rY   rZ   s           r   test_task_2705plus3_g4_failz7TestTask2705Plus3Regression.test_task_2705plus3_g4_fail   s     @AW%7O$89(C	" 	
yF" 	
 	
yF 	
 	
 
6	
 	
   	
 	
 
	  	
 	
 
	 # 	
 	
  I V)=>{KLN	
 	
 	
 	
 	
r&   c                t   t        d      }t        |      }|d   }d}||k(  }|st        j                  d|fd||f      t        j                  |      t        j                  |      dz  }t        j
                  d|d   d|d	          d
z   d|iz  }t        t        j                  |            dx}x}}y)z4Overall must be ESCALATE: G3=ESCALATE is worst case.r   r    r   r,   r.   r/   z,task-2705+3: expected overall=ESCALATE, got rO   r!   r4   r5   Nr6   r=   s           r   $test_task_2705plus3_overall_escalatez@TestTask2705Plus3Regression.test_task_2705plus3_overall_escalate   s     @AW%&' 	
: 	
':5 	
 	
': 	
 	
 
	 ( 	
 	
 
	 ,6 	
 	
  ;6BR;S:V W 3457	
 	
 	
 	
 	
 	
r&   c                0   t        d      }t        |      }|d   d   }|j                  dd      |j                  dd      z   }g }d}||v }|}|sd}	|j                  }
 |
       }|	|v }|}|st	        j
                  d	|fd
||f      t	        j                  |      dt        j                         v st	        j                  |      rt	        j                  |      nddz  }dd|iz  }|j                  |       |st	        j
                  d	fd	f      t	        j                  |	      dt        j                         v st	        j                  |      rt	        j                  |      ndt	        j                  
      t	        j                  |      dz  }dd|iz  }|j                  |       t	        j                  |d      i z  }t	        j                  d|      dz   d|iz  }t        t	        j                  |            dx}x}x}x}x}	x}x}
}y)z<G3 rationale or evidence should mention session-watchdog.sh.r   r   rG   r   r   rP   zsession-watchdog	forbiddenr_   )z%(py3)s in %(py5)sg3_text)rN   rR   z%(py7)sr   )zJ%(py10)s in %(py16)s
{%(py16)s = %(py14)s
{%(py14)s = %(py12)s.lower
}()
})py10r   r   r   z%(py18)sr      z8G3 should mention forbidden path in evidence/rationale: z
>assert %(py21)spy21N)r   r%   r"   lowerr7   r8   r9   rT   rU   rV   append_format_boolopr:   r;   r<   )r>   r#   r?   g3_infor   rX   rB   r   r@   @py_assert9r   r   @py_assert11rZ   @py_format8@py_format17@py_format19@py_format20r   s                      r   )test_task_2705plus3_g3_mentions_forbiddenzETestTask2705Plus3Regression.test_task_2705plus3_g3_mentions_forbidden   s    @AW%/"23++j"-K0LL	
! 	
!W, 	
 	
w}} 	
} 	
0N 	
 	
 	
!W 	
 	
 		 " 	
 	
	6	
 	
  &- 	
 	
 		 &- 	
 	
 	
	6	
		
 	
 	
 	
 		 1< 	
 	
	6	
 	
  @G 	
 	
 		 @G 	
 	
 		 @M 	
 	
 		 @O 	
 	
 	
	6	
		
 	
 	
  GwkR	
 	
 	
 	
 	
 	
 	
r&   c                   t        d      }t        |      }|d   d   }|j                  dd      |j                  dd      z   }d}||v }|st        j                  d|fd	||f      t        j
                  |      d
t        j                         v st        j                  |      rt        j
                  |      nd
dz  }t        j                  d|      dz   d|iz  }t        t        j                  |            dx}}y)z+G4 rationale should mention EXTERNAL_DIRTY.r   r   r   r   r   rP   EXTERNAL_DIRTYr_   )z%(py1)s in %(py3)sg4_text)r0   rN   z"G4 should mention EXTERNAL_DIRTY: rQ   rR   N)r   r%   r"   r7   r8   r9   rT   rU   rV   r:   r;   r<   )	r>   r#   r?   r   r   r@   rB   rY   rZ   s	            r   .test_task_2705plus3_g4_mentions_external_dirtyzJTestTask2705Plus3Regression.test_task_2705plus3_g4_mentions_external_dirty   s     @AW%/"67++j"-K0LL 	
7* 	
 	
7 	
 	
 		   	
 	
	6	
 	
  $+ 	
 	
 		 $+ 	
 	
  1<	
 	
 	
 	
 	
r&   N)
rt   ru   rv   rw   r   r   r   r   r   r   rx   r&   r   r   r      s#    F





r&   r   c                      e Zd ZdZej
                  j                  dg d      d        Zej
                  j                  dg d      d        Zy)TestRegressionSchemaValidz<All regression replays should produce schema-valid profiles.fixture_name)r*   r|   r   c           	        ddl m} t        |      }t        |      }||d   |d   |d   d   |d   d   |d   d   d	|d
   |d   |d   dd}t	        |      \  }}|st        j                  d|d|       dz   ddt        j                         v st        j                  |      rt        j                  |      ndiz  }t        t        j                  |            y )Nr   rc   r   r   r   r3   re   rf   rg   r   r    r!   rh   ri   zfixture z" produced schema-invalid profile: rm   rM   rn   )ro   rd   r   r%   r   r7   r:   rT   rU   rV   r9   r;   r<   )	r>   r   rd   r#   r?   rp   rn   rq   rr   s	            r   !test_replay_produces_valid_schemaz;TestRegressionSchemaValid.test_replay_produces_valid_schema   s    	R-W% -i(, !128<$*+;$<=O$P$%56|D2
 G_$%56!'(;!<-
 )1v 	
  |&&HQ	
 	
	6	
 	
   	
 	
 		  	
 	
 	
 	
 	
ur&   c                   t        |      }	 t        |      }t        |t              }|sddt	        j
                         v st        j                  t              rt        j                  t              nddt	        j
                         v st        j                  |      rt        j                  |      nddt	        j
                         v st        j                  t              rt        j                  t              ndt        j                  |      dz  }t        t        j                  |            d }y # t        $ r%}t        j                  d|d|        Y d }~y d }~ww xY w)Nz5assert %(py4)s
{%(py4)s = %(py0)s(%(py1)s, %(py2)s)
}
isinstancer?   dict)rM   r0   py2r1   z
replay of z	 raised: )r   r%   r   r   rT   rU   r7   rV   r9   r;   r<   	Exceptionpytestfail)r>   r   r#   r?   rA   rC   excs          r   test_replay_never_raisesz2TestRegressionSchemaValid.test_replay_never_raises
  s      -	E )Ffd++++++++:+++:++++++f+++f++++++d+++d++++++++++ 	EKK*\$4IcUCDD	Es   D+D9 9	E'E""E'N)	rt   ru   rv   rw   r   markparametrizer   r   rx   r&   r   r   r      sY    F[[^ . 



0 [[^ . 
E
Er&   r   )r   strreturnr   )r#   r   r   r   ) rw   
__future__r   builtinsrT   _pytest.assertion.rewrite	assertionrewriter7   r   r   sysr   insertr   2scripts.harness.v36.finish_task_profile_classifierr   -scripts.harness.v36.finish_task_profile_judger   r   ro   r   r   dirname__file__r   r   r%   r(   rz   r   r   rx   r&   r   <module>r      s    #    	 
 ( )  Q Y Kww||BGGOOH5zB41J 1Jp0
 0
n:
 :
B+E +Er&   