
    i                        d Z ddlZddlZddlmZ ddlmZ ddlZddlm	Z	  ej                  e      Ze G d d             Zd	ed
efdZded
ee   fdZd	ed
ee   fdZd	ed
ee   fdZdeded
efdZd	ededed
efdZy)uH   3-tier 전사: YouTube 자막 → Whisper STT → 제목+설명 fallback    N)	dataclass)Optional   )configc                   J    e Zd ZU eed<   eed<   dZee   ed<   dZee   ed<   y)TranscriptResulttextsourceNlanguagecaption_type)__name__
__module____qualname__str__annotations__r   r   r        T/home/jay/projects/insuwiki/scripts/youtube-pipeline/youtube_pipeline/transcriber.pyr   r      s(    
IK"Hhsm""&L(3-&r   r   video_idreturnc                     | j                  d      }dt        t        |      g      z   |z   }dt        t        |      g      z   |z   }t        j                  |      j                  d      S )u  InnerTube get_transcript API용 params 생성 (protobuf-style 인코딩)

    field 1 (params): embedded message
      field 1 (video_id): string  →  
 <len> <video_id bytes>
    outer: 
 <inner_len> <inner>
    최종적으로 base64url 인코딩하여 반환.
    zutf-8   
)encodebyteslenbase64	b64encodedecode)r   video_id_bytesinnerouters       r   _build_transcript_paramsr"      sd     __W-NeS0122^CEeSZL))E1EE"))'22r   datac                 L   	 | j                  dg       }|D ]  }|j                  di       }|j                  di       }|j                  di       }|j                  di       }|j                  di       j                  dg       }|spg }|D ]q  }	|	j                  di       j                  d	g       }
|
D ]H  }|j                  d
i       j                  di       j                  dd      }|s8|j                  |       J s |sdj                  |      c S  	 y# t        $ r Y yw xY w)u.   InnerTube 응답에서 자막 텍스트 추출actionsupdateEngagementPanelActioncontenttranscriptRendererbodytranscriptBodyRenderer	cueGroupstranscriptCueGroupRenderercuestranscriptCueRenderercue
simpleText 
N)getappendjoin	Exception)r#   r%   actionupdater'   transcript_rendererr)   segment_listtextsgroupr-   r/   cue_texts                r   _parse_innertube_transcriptr>   ,   s;   ((9b) 	,FZZ =rBFjjB/G")++.BB"G&**626D88$<bAEEkSUVL#%) 	3E 99%A2FJJ6SUVD# 3GG$;R@ S^ Sr2 !
 $!LL23	3 99U++)	,.   s+   BD A"D )D D D 	D#"D#c                    	 t        |       }t        j                  dddddi|dddid	
      }|j                  dk(  r)t	        |j                               }|rt        |dd      S t        j                  dddddi|dddid	
      }|j                  dk(  r)t	        |j                               }|rt        |dd      S y# t        $ r }t        j                  d|       Y d}~yd}~ww xY w)uH   YouTube InnerTube API로 자막 추출 시도 (WEB → TVHTML5 fallback)z2https://www.youtube.com/youtubei/v1/get_transcriptclientWEBz
2.20240101)
clientNameclientVersion)contextparamszContent-Typezapplication/json   )jsonheaderstimeout   youtube_captionko)r	   r
   r   TVHTML5z
7.20240101u    YouTube 자막 추출 실패: %sN)
r"   requestspoststatus_coder>   rG   r   r6   loggerwarning)r   rE   respr	   resp2text2es          r   fetch_youtube_captionsrW   O   s-   />)(3 }}@ &+)5 ! $%78
 s".tyy{;D'&7$ 
 @ &/)5 ! $%78
 #/

=E''84 
   >91==>s   A%C (AC 	C-C((C-c                 0   	 t        j                  t        j                  | ddd      }|j                  dk(  r1|j                         }|j                  dd      }|rt        |d	      S y# t        $ r }t        j                  d
|       Y d}~yd}~ww xY w)uF   Whisper GPU 서비스로 전사 (localhost:8200/v1/youtube-transcribe)rL   )r   r   iX  )rG   rI   rJ   r	   r1   whisper_sttr	   r
   u   Whisper 전사 실패: %sN)rN   rO   r   WHISPER_TRANSCRIBE_ENDPOINTrP   rG   r3   r   r6   rQ   rR   )r   rS   r#   r	   rV   s        r   fetch_whisper_transcriptionr\      s    7}}..&D9

 s"99;D88FB'D'T-HH   72A667s   A(A, ,	B5BBtitledescriptionc                 ,    d|  d| }t        |d      S )u-   제목+설명 fallback (품질 주의 대상)u   영상 제목: u   

영상 설명:
title_descriptionrZ   )r   )r]   r^   r	   s      r   make_title_description_fallbackra      s#    UG#7}ED.ABBr   c                 $   t        |       }|r+t        j                  dt        |j                               |S t        |       }|r+t        j                  dt        |j                               |S t        j                  d       t        ||      S )u   3-tier 전사 오케스트레이터

    1순위: YouTube 자막 (InnerTube API)
    2순위: Whisper STT (yt-dlp → GPU 전사)
    3순위: 제목+설명 fallback
    u-   자막 추출 성공 (YouTube caption): %d자u   Whisper 전사 성공: %d자u3   자막/STT 모두 실패 — 제목+설명 fallback)rW   rQ   infor   r	   r\   rR   ra   )r   r]   r^   results       r   
transcribere      sv     $H-FCSEUV )2F2C4DE NNHI*5+>>r   )__doc__r   loggingdataclassesr   typingr   rN   r1   r   	getLoggerr   rQ   r   r   r"   dictr>   rW   r\   ra   re   r   r   r   <module>rl      s    N   !   			8	$ ' ' '3s 3s 3 d x} F2S 2X6F-G 2j# (;K2L $C3 CS CEU C? ?S ?s ??O ?r   