class MarkdownNodeParser(NodeParser):
    """
    Markdown node parser.

    Splits a document into Nodes using Markdown header-based splitting logic.
    Each node contains its text content and the path of headers leading to it.

    Args:
        include_metadata (bool): whether to include metadata in nodes
        include_prev_next_rel (bool): whether to include prev/next relationships
        header_path_separator (str): separator char used for section header path metadata

    """

    header_path_separator: str = Field(
        default="/",
        description="Separator char used for section header path metadata.",
    )

    @classmethod
    def from_defaults(
        cls,
        include_metadata: bool = True,
        include_prev_next_rel: bool = True,
        header_path_separator: str = "/",
        callback_manager: Optional[CallbackManager] = None,
    ) -> "MarkdownNodeParser":
        """Build a parser, substituting an empty CallbackManager when none is given."""
        callback_manager = callback_manager or CallbackManager([])
        return cls(
            include_metadata=include_metadata,
            include_prev_next_rel=include_prev_next_rel,
            header_path_separator=header_path_separator,
            callback_manager=callback_manager,
        )

    def get_nodes_from_node(self, node: BaseNode) -> List[TextNode]:
        """
        Get nodes from document by splitting on headers.

        The node's content is scanned line by line. Every line of the form
        ``#... <title>`` found outside a fenced code block starts a new
        section; the text accumulated so far is emitted as a node. Each node
        is built with the titles of the headers *above* its own header,
        joined with ``header_path_separator``.

        Titles are stripped of surrounding whitespace before being recorded,
        so CRLF line endings or extra spaces after the ``#`` run do not leak
        into the header path. The section text itself is left untouched.

        Args:
            node: the node whose content is split.

        Returns:
            One node per non-empty section, in document order.

        """
        text = node.get_content(metadata_mode=MetadataMode.NONE)
        markdown_nodes: List[TextNode] = []
        # (markdown level, title) of the headers enclosing the current
        # section, outermost first; the last entry is the section's own header.
        header_stack: List[tuple[int, str]] = []

        def flush(parts: List[str]) -> None:
            """Emit a node for the accumulated section if it has any content."""
            section_text = "".join(parts).strip()
            if not section_text:
                return
            # [:-1] drops the section's own header: the path only lists ancestors.
            header_path = self.header_path_separator.join(
                title for _, title in header_stack[:-1]
            )
            markdown_nodes.append(
                self._build_node_from_split(section_text, node, header_path)
            )

        section_parts: List[str] = []
        in_code_block = False

        for line in text.split("\n"):
            # A fence line toggles code-block state and always belongs to the
            # current section; headers are not parsed inside code blocks.
            if line.lstrip().startswith("```"):
                in_code_block = not in_code_block
                section_parts.append(line + "\n")
                continue

            header_match = None if in_code_block else re.match(r"^(#+)\s(.*)", line)
            if header_match is None:
                section_parts.append(line + "\n")
                continue

            # Save the previous section before starting a new one. This must
            # happen before the stack is updated so the old path is used.
            flush(section_parts)

            header_level = len(header_match.group(1))
            header_text = header_match.group(2)

            # Compare against the markdown level on top of the stack, not the
            # stack depth: hierarchy deepens one step at a time, but markdown
            # levels may jump (e.g. H1 straight to H3).
            while header_stack and header_stack[-1][0] >= header_level:
                header_stack.pop()
            # Only the path entry is stripped; "\r" from CRLF input or padding
            # spaces would otherwise end up inside the header_path metadata.
            header_stack.append((header_level, header_text.strip()))

            section_parts = ["#" * header_level + f" {header_text}\n"]

        # Add the final section.
        flush(section_parts)

        return markdown_nodes

    def _build_node_from_split(
        self,
        text_split: str,
        node: BaseNode,
        header_path: str,
    ) -> TextNode:
        """
        Build node from single text split.

        Args:
            text_split: text of the section.
            node: source node passed on to ``build_nodes_from_splits``.
            header_path: ancestor header titles already joined with the
                separator; empty when the section has no ancestors.

        Returns:
            The built node. When ``include_metadata`` is set, its
            ``header_path`` metadata is the path wrapped in separators,
            or a single separator when the path is empty.

        """
        split_node = build_nodes_from_splits(
            [text_split], node, id_func=self.id_func
        )[0]

        if self.include_metadata:
            separator = self.header_path_separator
            # ex: "/header1/header2/" || "/"
            split_node.metadata["header_path"] = (
                separator + header_path + separator if header_path else separator
            )

        return split_node

    def _parse_nodes(
        self,
        nodes: Sequence[BaseNode],
        show_progress: bool = False,
        **kwargs: Any,
    ) -> List[BaseNode]:
        """
        Parse nodes.

        Runs ``get_nodes_from_node`` on every input node and concatenates the
        results in input order.

        Args:
            nodes: nodes to split.
            show_progress: forwarded to ``get_tqdm_iterable``.
            **kwargs: accepted but not used here.

        Returns:
            All section nodes produced from the inputs.

        """
        all_nodes: List[BaseNode] = []
        nodes_with_progress = get_tqdm_iterable(nodes, show_progress, "Parsing nodes")

        for node in nodes_with_progress:
            # Extend directly rather than rebinding the `nodes` parameter.
            all_nodes.extend(self.get_nodes_from_node(node))

        return all_nodes
def get_nodes_from_node(self, node: BaseNode) -> List[TextNode]:
    """
    Get nodes from document by splitting on headers.

    The node's content is scanned line by line. Every line of the form
    ``#... <title>`` found outside a fenced code block starts a new section;
    the text accumulated so far is handed to ``self._build_node_from_split``
    together with the titles of the headers *above* that section's own
    header, joined with ``self.header_path_separator``.

    Titles are stripped of surrounding whitespace before being recorded, so
    CRLF line endings or extra spaces after the ``#`` run do not leak into
    the header path. The section text itself is left untouched.

    Args:
        node: the node whose content is split.

    Returns:
        One node per non-empty section, in document order.

    """
    text = node.get_content(metadata_mode=MetadataMode.NONE)
    markdown_nodes: List[TextNode] = []
    # (markdown level, title) of the headers enclosing the current section,
    # outermost first; the last entry is the section's own header.
    header_stack: List[tuple[int, str]] = []

    def flush(parts: List[str]) -> None:
        """Emit a node for the accumulated section if it has any content."""
        section_text = "".join(parts).strip()
        if not section_text:
            return
        # [:-1] drops the section's own header: the path only lists ancestors.
        header_path = self.header_path_separator.join(
            title for _, title in header_stack[:-1]
        )
        markdown_nodes.append(
            self._build_node_from_split(section_text, node, header_path)
        )

    section_parts: List[str] = []
    in_code_block = False

    for line in text.split("\n"):
        # A fence line toggles code-block state and always belongs to the
        # current section; headers are not parsed inside code blocks.
        if line.lstrip().startswith("```"):
            in_code_block = not in_code_block
            section_parts.append(line + "\n")
            continue

        header_match = None if in_code_block else re.match(r"^(#+)\s(.*)", line)
        if header_match is None:
            section_parts.append(line + "\n")
            continue

        # Save the previous section before starting a new one. This must
        # happen before the stack is updated so the old path is used.
        flush(section_parts)

        header_level = len(header_match.group(1))
        header_text = header_match.group(2)

        # Compare against the markdown level on top of the stack, not the
        # stack depth: hierarchy deepens one step at a time, but markdown
        # levels may jump (e.g. H1 straight to H3).
        while header_stack and header_stack[-1][0] >= header_level:
            header_stack.pop()
        # Only the path entry is stripped; "\r" from CRLF input or padding
        # spaces would otherwise end up inside the header path.
        header_stack.append((header_level, header_text.strip()))

        section_parts = ["#" * header_level + f" {header_text}\n"]

    # Add the final section.
    flush(section_parts)

    return markdown_nodes