class TokenTextSplitter(MetadataAwareTextSplitter):
    """Implementation of splitting text that looks at word tokens."""

    chunk_size: int = Field(
        default=DEFAULT_CHUNK_SIZE,
        description="The token chunk size for each chunk.",
        gt=0,
    )
    chunk_overlap: int = Field(
        default=DEFAULT_CHUNK_OVERLAP,
        description="The token overlap of each chunk when splitting.",
        ge=0,
    )
    separator: str = Field(
        default=" ", description="Default separator for splitting into words"
    )
    # Typed as List[str]: every entry is passed to split_by_sep() alongside
    # the primary separator string.
    backup_separators: List[str] = Field(
        default_factory=list, description="Additional separators for splitting."
    )
    keep_whitespaces: bool = Field(
        default=False,
        description="Whether to keep leading/trailing whitespaces in the chunk.",
    )

    # Callable mapping text -> token sequence; only its result length is used.
    _tokenizer: Callable = PrivateAttr()
    # Ordered splitting strategies: separator, backup separators, then by char.
    _split_fns: List[Callable] = PrivateAttr()

    def __init__(
        self,
        chunk_size: int = DEFAULT_CHUNK_SIZE,
        chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
        tokenizer: Optional[Callable] = None,
        callback_manager: Optional[CallbackManager] = None,
        separator: str = " ",
        backup_separators: Optional[List[str]] = None,
        keep_whitespaces: bool = False,
        include_metadata: bool = True,
        include_prev_next_rel: bool = True,
        id_func: Optional[Callable[[int, Document], str]] = None,
    ):
        """Initialize with parameters.

        Raises:
            ValueError: If ``chunk_overlap`` is larger than ``chunk_size``.
        """
        if chunk_overlap > chunk_size:
            raise ValueError(
                f"Got a larger chunk overlap ({chunk_overlap}) than chunk size "
                f"({chunk_size}), should be smaller."
            )
        # Avoid a mutable default argument: None selects the previous
        # default of ["\n"].
        if backup_separators is None:
            backup_separators = ["\n"]
        callback_manager = callback_manager or CallbackManager([])
        id_func = id_func or default_id_func
        super().__init__(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separator=separator,
            backup_separators=backup_separators,
            keep_whitespaces=keep_whitespaces,
            callback_manager=callback_manager,
            include_metadata=include_metadata,
            include_prev_next_rel=include_prev_next_rel,
            id_func=id_func,
        )
        self._tokenizer = tokenizer or get_tokenizer()
        # Try the primary separator first, then backups, then single chars.
        all_seps = [separator] + backup_separators
        self._split_fns = [split_by_sep(sep) for sep in all_seps] + [split_by_char()]

    @classmethod
    def from_defaults(
        cls,
        chunk_size: int = DEFAULT_CHUNK_SIZE,
        chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
        separator: str = " ",
        backup_separators: Optional[List[str]] = None,
        callback_manager: Optional[CallbackManager] = None,
        keep_whitespaces: bool = False,
        include_metadata: bool = True,
        include_prev_next_rel: bool = True,
        id_func: Optional[Callable[[int, Document], str]] = None,
    ) -> "TokenTextSplitter":
        """Initialize with default parameters."""
        # Avoid a mutable default argument: None selects the previous
        # default of ["\n"].
        if backup_separators is None:
            backup_separators = ["\n"]
        callback_manager = callback_manager or CallbackManager([])
        return cls(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separator=separator,
            backup_separators=backup_separators,
            keep_whitespaces=keep_whitespaces,
            callback_manager=callback_manager,
            include_metadata=include_metadata,
            include_prev_next_rel=include_prev_next_rel,
            id_func=id_func,
        )

    @classmethod
    def class_name(cls) -> str:
        return "TokenTextSplitter"

    def split_text_metadata_aware(self, text: str, metadata_str: str) -> List[str]:
        """Split text into chunks, reserving space required for metadata str.

        Raises:
            ValueError: If the metadata alone is as large as the chunk size.
        """
        metadata_len = len(self._tokenizer(metadata_str)) + DEFAULT_METADATA_FORMAT_LEN
        effective_chunk_size = self.chunk_size - metadata_len
        if effective_chunk_size <= 0:
            raise ValueError(
                f"Metadata length ({metadata_len}) is longer than chunk size "
                f"({self.chunk_size}). Consider increasing the chunk size or "
                "decreasing the size of your metadata to avoid this."
            )
        elif effective_chunk_size < 50:
            # Use the module logger (consistent with _merge) instead of print.
            _logger.warning(
                f"Metadata length ({metadata_len}) is close to chunk size "
                f"({self.chunk_size}). Resulting chunks are less than 50 tokens. "
                "Consider increasing the chunk size or decreasing the size of "
                "your metadata to avoid this."
            )
        return self._split_text(text, chunk_size=effective_chunk_size)

    def split_text(self, text: str) -> List[str]:
        """Split text into chunks."""
        return self._split_text(text, chunk_size=self.chunk_size)

    def _split_text(self, text: str, chunk_size: int) -> List[str]:
        """Split text into chunks up to chunk_size."""
        if text == "":
            # Preserve the empty string as a single (empty) chunk.
            return [text]

        with self.callback_manager.event(
            CBEventType.CHUNKING, payload={EventPayload.CHUNKS: [text]}
        ) as event:
            splits = self._split(text, chunk_size)
            chunks = self._merge(splits, chunk_size)

            event.on_end(
                payload={EventPayload.CHUNKS: chunks},
            )

        return chunks

    def _split(self, text: str, chunk_size: int) -> List[str]:
        """
        Break text into splits that are smaller than chunk size.

        The order of splitting is:
        1. split by separator
        2. split by backup separators (if any)
        3. split by characters

        NOTE: the splits contain the separators.
        """
        if len(self._tokenizer(text)) <= chunk_size:
            return [text]

        # Use the first strategy that actually produces more than one split.
        for split_fn in self._split_fns:
            splits = split_fn(text)
            if len(splits) > 1:
                break

        new_splits = []
        for split in splits:
            split_len = len(self._tokenizer(split))
            if split_len <= chunk_size:
                new_splits.append(split)
            else:
                # recursively split
                new_splits.extend(self._split(split, chunk_size=chunk_size))
        return new_splits

    def _merge(self, splits: List[str], chunk_size: int) -> List[str]:
        """
        Merge splits into chunks.

        The high-level idea is to keep adding splits to a chunk until we
        exceed the chunk size, then we start a new chunk with overlap.

        When we start a new chunk, we pop off the first element of the
        previous chunk until the total length is less than the chunk size.
        """
        chunks: List[str] = []
        cur_chunk: List[str] = []
        cur_len = 0
        for split in splits:
            split_len = len(self._tokenizer(split))
            if split_len > chunk_size:
                # Single message with lazy %-formatting; the previous code
                # passed a second f-string as a stray format argument.
                _logger.warning(
                    "Got a split of size %d, larger than chunk size %d.",
                    split_len,
                    chunk_size,
                )

            # if we exceed the chunk size after adding the new split, then
            # we need to end the current chunk and start a new one
            if cur_len + split_len > chunk_size:
                # end the previous chunk
                chunk = (
                    "".join(cur_chunk)
                    if self.keep_whitespaces
                    else "".join(cur_chunk).strip()
                )
                if chunk:
                    chunks.append(chunk)

                # start a new chunk with overlap
                # keep popping off the first element of the previous chunk until:
                #   1. the current chunk length is less than chunk overlap
                #   2. the total length is less than chunk size
                while cur_len > self.chunk_overlap or cur_len + split_len > chunk_size:
                    # pop off the first element
                    first_chunk = cur_chunk.pop(0)
                    cur_len -= len(self._tokenizer(first_chunk))

            cur_chunk.append(split)
            cur_len += split_len

        # handle the last chunk
        chunk = (
            "".join(cur_chunk)
            if self.keep_whitespaces
            else "".join(cur_chunk).strip()
        )
        if chunk:
            chunks.append(chunk)

        return chunks
@classmethod
def from_defaults(
    cls,
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
    separator: str = " ",
    backup_separators: Optional[List[str]] = None,
    callback_manager: Optional[CallbackManager] = None,
    keep_whitespaces: bool = False,
    include_metadata: bool = True,
    include_prev_next_rel: bool = True,
    id_func: Optional[Callable[[int, Document], str]] = None,
) -> "TokenTextSplitter":
    """Initialize with default parameters."""
    # Avoid a mutable default argument: None selects the previous
    # default of ["\n"].
    if backup_separators is None:
        backup_separators = ["\n"]
    callback_manager = callback_manager or CallbackManager([])
    return cls(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separator=separator,
        backup_separators=backup_separators,
        keep_whitespaces=keep_whitespaces,
        callback_manager=callback_manager,
        include_metadata=include_metadata,
        include_prev_next_rel=include_prev_next_rel,
        id_func=id_func,
    )
def split_text_metadata_aware(self, text: str, metadata_str: str) -> List[str]:
    """Split text into chunks, reserving space required for metadata str.

    Raises:
        ValueError: If the metadata alone is as large as the chunk size.
    """
    metadata_len = len(self._tokenizer(metadata_str)) + DEFAULT_METADATA_FORMAT_LEN
    effective_chunk_size = self.chunk_size - metadata_len
    if effective_chunk_size <= 0:
        raise ValueError(
            f"Metadata length ({metadata_len}) is longer than chunk size "
            f"({self.chunk_size}). Consider increasing the chunk size or "
            "decreasing the size of your metadata to avoid this."
        )
    elif effective_chunk_size < 50:
        # Warn through the module logger instead of print() so the message
        # goes through the standard logging pipeline like the other warnings
        # in this file.
        _logger.warning(
            f"Metadata length ({metadata_len}) is close to chunk size "
            f"({self.chunk_size}). Resulting chunks are less than 50 tokens. "
            "Consider increasing the chunk size or decreasing the size of "
            "your metadata to avoid this."
        )

    return self._split_text(text, chunk_size=effective_chunk_size)