
    i$                         d Z ddlZddlZej                  j	                  dej                  j                  ej                  j                  e      d             ddlZddl	Z	ddl
mZ ej                  de	j                  fd       Zdede	j                  defd	Zdd
ZddZddZddZde	j                  ddfdZddZde	j                  ddfdZddZddZde	j                  ddfdZde	j                  ddfdZy)z8
Tests for chunker.py - TDD implementation for task-510
    Nz..
chunk_textreturnc                  ,    t        j                  d      S )Ncl100k_base)tiktokenget_encoding     H/home/jay/workspace/.worktrees/task-2116-dev1/libs/tests/test_chunker.pyencoderr      s      //r   textr   c                 6    t        |j                  |             S )N)lenencode)r   r   s     r   count_tokensr      s    w~~d#$$r   c                  x    d} t        | dd      }t        |      dk(  sJ |d   d   | k(  sJ |d   d   dk(  sJ y )NzThis is a short text.  r   
max_tokensoverlap   contentchunk_index)r   r   )r   results     r   $test_short_text_returns_single_chunkr      sU    "Da8Fv;!!9Y4'''!9]#q(((r   c                      t        d      D  cg c]  } d|  d
 }} dj                  |      }t        |dd      }t        |      dkD  sJ y c c} w )	N
   This is paragraph number  with some content.

2   r   r   r   )rangejoinr   r   )i
paragraphsr   r   s       r   &test_long_text_returns_multiple_chunksr'   "   sW    NSTViX-aS0CDXJX;;z"DQ7Fv;?? Ys   Ac                      d} d}d}|  d| d| }t        |dd      }|D ])  }|d   }t        |t              sJ t        |      dkD  r)J  d	j	                  d
 |D              }d|v sJ d|v sJ d|v sJ y )Nz5First paragraph with enough content to be meaningful.z6Second paragraph with enough content to be meaningful.z5Third paragraph with enough content to be meaningful.r!      r   r   r    c              3   &   K   | ]	  }|d      ywr   Nr
   .0chunks     r   	<genexpr>z3test_split_on_paragraph_boundary.<locals>.<genexpr>=   s     AE),A   zFirst paragraphzSecond paragraphzThird paragraph)r   
isinstancestrr   r$   )para1para2para3r   r   r/   r   all_contentss           r    test_split_on_paragraph_boundaryr8   +   s    CEDECEWDtE7+D Q7F   Y' '3'''7|a  88A&AAL,,,---,,,r   c                      d} t        | dd      }t        |      dkD  sJ |D ])  }|d   }t        |t              sJ t        |      dkD  r)J  y )NzuThe first sentence is here. The second sentence follows. The third sentence continues. The fourth sentence ends here.   r   r   r   r   )r   r   r2   r3   )r   r   r/   r   s       r   test_split_on_sentence_boundaryr;   D   sg    	) 	 Q7Fv;??  Y''3'''7|a r   c                    t        d      D cg c]  }d| d
 }}dj                  |      }d}t        |d|      }t        |dd      }t        |      d	k\  sJ t        |      d	k\  rt	        |d
   d         }t	        |d
   d         }||kD  sJ t        |d   d         }	| j                  |	      }
t        |t        |
            }|
| d  }| j                  |      }t        |d
   d         }|j                         }|d t        |      dz    |v s%t        fd|j                         D              sJ y y y c c}w )N   r   z with some content here.r!   r   r"   r   r      r   token_countr      c              3   D   K   | ]  }t        |      d kD  s|v   yw)   N)r   )r.   wordsearch_windows     r   r0   z;test_overlap_includes_previous_chunk_end.<locals>.<genexpr>y   s&      8
&*#d)VW-DM!8
s    	 )r#   r$   r   r   intr3   r   mindecodestripanysplit)r   r%   r&   r   r   result_with_overlapresult_no_overlapno_overlap_chunk1_tokenswith_overlap_chunk1_tokensprev_contentprev_tokensactual_overlapoverlap_token_idsoverlap_textnext_chunk_contentoverlap_strippedrD   s                   @r   (test_overlap_includes_previous_chunk_endrV   W   s   SXYZS[\a-aS0HI\J\;;z"DG$Tb'J"4BB "#q((( "(+,=a,@,O(P *-.A!.D].S*T" *,DDDD   3A 6y ABnn\2Wc+&67'(89~~&78"%&9!&<Y&G"H'--/*+GS1A-BR-GH=0C 8
.>.D.D.F8
 5
 	
 
 5
0+ # ]s   D?c                      t        d      D  cg c]  } d|  d
 }} dj                  |      }t        |dd      }t        |      dkD  sJ t	        |      D ]  \  } }|d	   | k(  rJ  y c c} w )
Nr   r   r    r!   r"   r   r   r   r   )r#   r$   r   r   	enumerate)r%   r&   r   r   r/   s        r   test_chunk_index_sequentialrY      s    NSTViX-aS0CDXJX;;z"DQ7Fv;??f% )5]#q((() Ys   A*c           	          t        d      D cg c]  }d| d
 }}dj                  |      }t        |dd      }|D ]1  }|d   }t        ||       }|d	   |k(  rJ d
|d    d| d|d	            y c c}w )N   r   r    r!   r"   r   r   r   r?   chunk_index=r   z: expected z, got )r#   r$   r   r   )r   r%   r&   r   r   r/   r   actual_token_counts           r   test_token_count_matches_actualr^      s    NSTUhW-aS0CDWJW;;z"DQ7F 
Y')'7;]#'99 	
5/0@R?SSYZ_`mZnYop	
9
	 Xs   A3c                  .    t        ddd      } | g k(  sJ y )N r   r"   r   r   r   s    r   "test_empty_text_returns_empty_listrb      s    sB7FR<<r   c                  .    t        ddd      } | g k(  sJ y )Nz
   

  	  r   r"   r   r   ra   s    r   ,test_whitespace_only_text_returns_empty_listrd      s    CDFR<<r   c                 :   g d}dj                  |      }d}d}t        |||      }t        |      dk\  sJ |D ]   }|d   |k  rJ d|d	    d
|d    d|         t        |      D ]  \  }}|d	   |k(  rJ  |D ]  }|d   }t	        ||       }	|d   |	k(  rJ  y )N)zAlpha sentence with words.zBeta sentence with words.zGamma sentence with words.zDelta sentence with words.zEpsilon sentence with words.zZeta sentence with words.zEta sentence with words.zTheta sentence with words.r*   r"   r   r   r   r?   r\   r   z has token_count=z > max_tokens=r   )r$   r   r   rX   r   )
r   	sentencesr   r   r   r   r/   r%   r   actuals
             r   )test_small_scale_max_tokens_50_overlap_10rh      s    	I 88IDJGWEFv;!  r- J.	r%.//@}AU@VVdeodpq	r.r f% )5]#q((()  .Y'gw/]#v---.r   c                 ^   g d}dj                  |      }t        |dd      }t        |      dk\  sJ dj                  d |D              }d	|v sJ d
|v sJ d|v sJ |D ]  }|d   }t        ||       }|d   |k(  rJ  t	        |      D ]  \  }}|d   |k(  rJ  |D ]  }|d   dk  rJ  y )N)uZ   첫 번째 문단입니다. 한글 텍스트를 처리하는 기능을 테스트합니다.uU   두 번째 문단입니다. 한글은 영어와 다른 토큰 구조를 가집니다.u]   세 번째 문단입니다. tiktoken은 한글도 정상적으로 처리할 수 있습니다.ua   네 번째 문단입니다. 청킹 기능이 한글에서도 올바르게 동작해야 합니다.uQ   다섯 번째 문단입니다. 이 테스트로 한글 지원을 검증합니다.r!   r@      r   r   r*   c              3   &   K   | ]	  }|d      ywr,   r
   r-   s     r   r0   z,test_korean_text_chunking.<locals>.<genexpr>   s     @5+@r1   u
   첫 번째u
   두 번째u
   세 번째r   r?   r   )r$   r   r   r   rX   )	r   korean_paragraphsr   r   all_contentr/   r   rg   r%   s	            r   test_korean_text_chunkingrn      s    ;;()DQ7F v;! ((@@@K;&&&;&&&;&&&  .Y'gw/]#v---. f% )5]#q((()  *]#r)))*r   )r   N)__doc__ossyspathinsertr$   dirname__file__pytestr   chunkerr   fixtureEncodingr   r3   rE   r   r   r'   r8   r;   rV   rY   r^   rb   rd   rh   rn   r
   r   r   <module>rz      s   
 
 277<< 94@ A    0"" 0 0%s %X%6%6 %3 %
)-2 &$
h6G6G $
D $
P)

X->-> 

4 


!.x7H7H !.T !.J *x'8'8  *T  *r   