
    %<is                     $   d Z ddlZddlZ	 	 ddedededeeeeez  f      fdZdedej                  defd	Z	d
ededej                  dee   fdZ
dededej                  dee   fdZdee   dededej                  deeeeez  f      f
dZy)a  
chunker.py - Text chunking utility using tiktoken for token-aware splitting.

Chunking strategy:
1. Split on paragraph boundaries (\n\n)
2. If a paragraph exceeds max_tokens, split on sentence boundaries (. ! ?)
3. If a sentence exceeds max_tokens, force-split by token limit
4. Apply overlap: prepend the last `overlap` tokens of the previous chunk
   to the start of the next chunk.
    Ntext
max_tokensoverlapreturnc                 P   | r| j                         sg S t        j                  d      }t        j                  d|       }|D cg c]#  }|j                         s|j                         % }}g }|D ]  }|j                  t        |||               t        ||||      }	|	S c c}w )a  Split text into token-aware chunks.

    Args:
        text: The input text to be chunked.
        max_tokens: Maximum number of tokens allowed per chunk.
        overlap: Number of tokens from the end of the previous chunk
                 to prepend to the start of the next chunk.

    Returns:
        A list of dicts with keys:
            - "content"     (str): The chunk text.
            - "chunk_index" (int): Zero-based index of the chunk.
            - "token_count" (int): Actual token count of the chunk content.
    cl100k_basez\n\n+)striptiktokenget_encodingresplitextend_split_paragraph_merge_segments)
r   r   r   encoderraw_paragraphsp
paragraphssegmentsparachunkss
             =/home/jay/workspace/.worktrees/task-2057-dev2/libs/chunker.py
chunk_textr      s    & tzz|	!)!6!6}!EG !#4 8N0>L1!'')QWWYLJL H E(z7CDE *9:wX_)`FM Ms   B#B#r   c                 6    t        |j                  |             S )z&Return the number of tokens in *text*.)lenencode)r   r   s     r   _token_countr   =   s    w~~d#$$    	paragraphc                    t        | |      |k  r| gS t        j                  d|       }|D cg c]#  }|j                         s|j                         % }}g }g }d}|D ]  }	t        |	|      }
|
|kD  rC|r$|j	                  dj                  |             g }d}|j                  t        |	||             W|rdnd}||z   |
z   |kD  r&|r$|j	                  dj                  |             g }d}|j	                  |	       t        dj                  |      |      } |r |j	                  dj                  |             |r|S | gS c c}w )zReturn a list of segments from a single paragraph.

    If the paragraph fits within max_tokens it is returned as-is.
    Otherwise it is split on sentence boundaries, and each sentence group
    is further force-split if needed.
    z(?<=[.!?])\s+r       )r   r   r   r	   appendjoinr   _force_split)r   r   r   raw_sentencess	sentencesr   current_partscurrent_tokenssentencesentence_tokens
sep_tokenss               r   r   r   B   sO    Iw':5{  "xx(8)DM/<J!	AGGIJIJH!MN H&x9Z' 78 "!"OOL:wGH (QQ
J&8:E 78 "!"X&%chh}&=wG/H2 /080i[0E Ks   EEc                     |j                  |       }g }t        dt        |      |      D ]*  }||||z    }|j                  |j	                  |             , |S )z)Force-split *text* purely by token count.r   )r   ranger   r#   decode)r   r   r   tokensresultstartchunk_tokenss          r   r%   r%   w   s`      t,FFq#f+z2 4eej&89gnn\234 Mr   r   c           
      V   | sg S g }g }d}| D ]q  }t        ||      }|rdnd}	||	z   |z   |kD  r&|r$|j                  dj                  |             g }d}|j                  |       t        dj                  |      |      }s |r |j                  dj                  |             |dk  st        |      dk  r,t	        |      D 
cg c]  \  }
}||
t        ||      d c}}
S g }t	        |      D ]  \  }
}|
dk(  r|}n|j                  ||
dz
           }|j                  |      }t        |t        |            }|t        |      z
  }t        dt        ||            }|dkD  r|| d }|j                  |      }||z   }n|}|j                  ||
t        ||      d        |S c c}}
w )z?Merge small segments together and apply overlap between chunks.r   r"   z

)contentchunk_indextoken_countN)	r   r#   r$   r   	enumerater   minmaxr0   )r   r   r   r   
raw_chunksr)   r*   segment
seg_tokensr-   idxchunkr2   chunk_text_valr6   prev_tokenscurr_tokensdesired_overlap	availableactual_overlapoverlap_tokensoverlap_texts                         r   r   r      s    	 J!MN K!'73
'QQ
J&3j@!!&++m"<= "!"W%%fkk-&@'JK &++m45 !|s:!+ (
3
 U	 !"+E7;
 	
 *,F(4 
^!8$G%,^^JsQw4G%HK%,^^N%CK "'3{+;<O"S%55I C$CDN!!,n_-=!>$+NN>$B&7(""+GW=	
)
8 MM
s   F%)i  2   )__doc__r   r
   strintlistdictr   Encodingr   r   r%   r    r   r   <module>rQ      s2  	 
 
 $
$$ $ 
$sC#I~
	$X%s %X%6%6 %3 %
212121 21 
#Y	21j
  
#Y	G3iGG G 	G
 
$sC#I~
Gr   