
    iA2                    V    d Z ddlmZ ddlmZ ddlmZ ddd	 	 	 	 	 	 	 	 	 d
dZdd	Zy)u   렌더된 PNG가 silent corruption인지 검증.

tesseract 없어도 픽셀 기반으로 강제 검증.
OCR 미가용 시 mode="PIXEL-ONLY" 명시 — silent skip 절대 금지.

IDS Phase 1 task-2401 — silent corruption 검증.
    )annotations)Path)AnyNF)html_sourcerequire_ocrc                  ! ddl m} t        |       } g }ddddddd ddddddd}| j                         s|j	                  d	|         d |d
|dS | j                         j                  }|dz  }t        |d      |d<   |dk\  rd d<   nd d<   |j	                  d|dd       	 |j                  |       j                  d      }	|	j                  \  }}|	j                  d      }|d}nt        |      }||d<   |dk\  rd d<   nd d<   |j	                  d| d       d}|2t        |      dkD  r$||z  }t        d |D              }|dkD  r||z  nd}n|d}t        |d      |d <   |d!k  rd d"<   nd d"<   |j	                  d#|d$d%       t        |	      }t        |d      |d&<   |d'k  rd d(<   nd d(<   |j	                  d)|d*d+       `t!        fd,|D              }t        |      }|dkD  r||k(  nd}| d-<   |s.|D cg c]	  }|vs| }}|j	                  d.| d/| d0|        d
}d}	 ddl}d}|rd1}	 j'                  |	d23      !!j)                         |d4<   |r#t!        !fd5|D              }|t        |      z  }nd}t        |d      |d6<   |d7k\  }| d8<   |s|j	                  d9|d$d:!dd;        n.|r|j	                  d=       d d8<   d
}nd
}|j	                  d>       g d?}|j	                  d-       |s|r|j	                  d8       t+         fd@|D              }|t-               t-        |      ||dS # t        $ r&}
|j	                  d|
        d |d
|dcY d}
~
S d}
~
ww xY wc c}w # t$        $ r Y Tw xY w# t        $ r#}
d d8<   |j	                  d<|
        Y d}
~
d}
~
ww xY w)Au  PNG가 silent corruption인지 검증.

    Args:
        png_path: 검증할 PNG 파일 경로.
        expected_korean: 기대 한글 문자열 목록 (OCR/HTML string match용).
        html_source: HTML 소스 문자열 (제공 시 string match 체크).
        require_ocr: True면 OCR 미가용 시 pass=False 처리.

    Returns:
        {
            "pass": bool,
            "checks": {
                "file_size_ok": bool,         # >= 10KB
                "color_diversity_ok": bool,   # unique 색상 >= 10
                "blank_ratio_ok": bool,       # 단일 색상 <= 97%
                "tofu_glyph_clear": bool,     # tofu width 분포 검출
                "html_string_match": bool | None,  # html_source 제공 시
                "ocr_pass": bool | None,      # pytesseract 가용 시
            },
            "metrics": {
                "file_size_kb": float,
                "unique_colors": int,
                "dominant_color_ratio": float,
                "ocr_text": str | None,
                "ocr_match_ratio": float | None,
                "tofu_score": float,          # 0~1 (높을수록 tofu 의심)
            },
            "mode": "PIXEL+OCR" | "PIXEL-ONLY",
            "errors": list[str],
        }
    r   )ImageFN)file_size_okcolor_diversity_okblank_ratio_oktofu_glyph_clearhtml_string_matchocr_pass              ?)file_size_kbunique_colorsdominant_color_ratioocr_textocr_match_ratio
tofu_scorezPNG file not found: z
PIXEL-ONLY)passchecksmetricsmodeerrorsg      @   r   g      $@Tr
   zfile_size_ok FAIL: z.1fu-   KB < 10KB — likely placeholder or blank PNGRGBzPIL cannot open image: i   )	maxcolorsr   
   r   z'color_diversity_ok FAIL: unique_colors=u,    < 10 — image likely blank or single-colorc              3  &   K   | ]	  \  }}|  y w)N ).0count_s      C/home/jay/workspace/skills/satori-cardnews/scripts/verify_korean.py	<genexpr>zverify_png.<locals>.<genexpr>   s     9(%9s      r   g
ףp=
?r   z*blank_ratio_ok FAIL: dominant_color_ratio=z.2%u5    > 97% — image is nearly blank (solid single color)r   ffffff?r   z"tofu_glyph_clear FAIL: tofu_score=z.3fu>    >= 0.7 — possible tofu (□□□) glyph rendering detectedc              3  ,   K   | ]  }|v sd   yw   Nr"   )r#   kwr   s     r&   r'   zverify_png.<locals>.<genexpr>   s     M"2;LAM   	r   zhtml_string_match FAIL: /z matched. Missing: z	PIXEL+OCRkor)langr   c              3  ,   K   | ]  }|v sd   ywr+   r"   )r#   r-   r   s     r&   r'   zverify_png.<locals>.<genexpr>   s     !Px!!Pr.   r         ?r   zocr_pass FAIL: ocr_match_ratio=z < 50%. OCR text snippet:    u'   ocr_pass ERROR: pytesseract failed — ue   ocr_pass FAIL: require_ocr=True but pytesseract not available — install pytesseract + tesseract-ocruv   INFO: pytesseract not available — running PIXEL-ONLY mode. Install pytesseract for OCR-based Korean text validation.)r
   r   r   r   c              3  D   K   | ]  }j                  |      d u   yw)TN)get)r#   kr   s     r&   r'   zverify_png.<locals>.<genexpr>   s#      "#

1s    )PILr	   r   existsappendstatst_sizeroundopenconvert	Exceptionsize	getcolorslenmax_compute_tofu_scoresumpytesseractImportErrorimage_to_stringstripalldict)"png_pathexpected_koreanr   r   r	   r   r   file_size_bytesr   imgexc	img_width
img_height
color_listr   r   total_pixels	max_countr   matched_counttotal	all_matchr-   missingr   ocr_availablerG   matched_ocrr   r   mandatory_keysoverall_passr   r   s"     `                             @@r&   
verify_pngr_      s   L H~HF#!!&F  #G ??,XJ78 
 	
 mmo--O"V+L#L!4GNt!%~!&~!,s!3 4. .	


jj"**51  HHIz /JJ,GO'+#$',#$5m_ E1 1	
 #j/A"5 :-9j99	;G!;Ky<7QT		"&+,@!&DG"#t##' #( 89Mc8R S9 9	
 %S)J!*a0GLC%)!"%*!"0C0@ AA A	
 MMMO$05	]e+t	&/"#$3Mbr7LrMGMMM*=/5' B#9& DM 	K'77%7HH"*.."2GJ!!Po!PP"-O0D"D"%).)BG%&&#-H!)F:5oc5J K))1$3(:< 
2	
 #z H	
 bN12mj) '5 L
 v,= m  
/u56 
 	

\ N  0  	K!&F:MMCC5IJJ	KsU   * M 	NN;N	 BN 	NM<6N<N		NN	O"O  Oc                   ddl }| j                  d      }|j                  \  }}|j                         }|yt	        d|dz        }g }t        d||      D ]b  }d}	d}
t        |      D ]4  }|||f   }|dk  }|rd	}	|
dz  }
|	r|
dkD  r|j                  |
       d}	d}
6 |	sL|
dkD  sR|j                  |
       d t        |      d
k  ry	 |j                  |      }|j                  |      }|dk  ry||z  }|dk  rd}n"|dk  rd}n|dk  rd}n|dk  rd}n
|dk  rd}nd}t        dt        |      dz        }||z  S # |j                  $ r Y yw xY w)u  이미지에서 tofu glyph 점수(0~1)를 계산.

    알고리즘:
    1. 그레이스케일 변환 + 이진화 (어두운 픽셀 = 텍스트 후보)
    2. 각 행(row)에서 연속된 dark pixel 클러스터 width 수집
    3. 클러스터 width의 표준편차가 매우 낮으면 tofu 의심
       - tofu는 동일한 width의 □ 박스가 반복되는 특성
       - 정상 한글은 자/모 조합으로 width 변동이 큼

    Returns:
        0.0 (정상) ~ 1.0 (tofu 강의심)
    r   NLr   r,   r4   F   T   g?gffffff?g?g333333?g333333?r)   g?r3   g333333?g      ?r   g      I@)
statisticsr?   rA   loadrD   ranger:   rC   stdevmeanStatisticsErrormin)rP   rd   graywidthheightpixelsrow_stepall_cluster_widthsy
in_clustercluster_widthxpx_valis_darkrg   mean_wcvr   
confidences                      r&   rE   rE     s    ;;sDIIME6 YY[F~ 1fm$H$&1fh' 5
u 	"AAqD\F|G!
"-!"3&--m<"
 !	" -!+%%m4'5* "   !34!34 { 
B
 
Dy
	d
	d
	d
	d

 S#01D89J
""7 %% s   "D? ?EE)
rM   r   rN   z	list[str]r   z
str | Noner   boolreturnzdict[str, Any])rP   r   r{   float)	__doc__
__future__r   pathlibr   typingr   r_   rE   r"       r&   <module>r      s_    #   #ss s 	s
 s slT#r   