[docs]classVsdxParser(BaseBlobParser,ABC):"""Parser for vsdx files."""
def parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[override]
    """Parse a vsdx file by delegating to ``lazy_parse``.

    Args:
        blob: Blob holding the .vsdx file to parse.

    Returns:
        The iterator of documents produced by ``lazy_parse`` for ``blob``.
    """
    documents = self.lazy_parse(blob)
    return documents
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
    """Yield one ``Document`` per page of a .vsdx file.

    The blob is opened as a zip archive and handed, together with
    ``blob.source``, to ``get_pages_content``. Each
    ``(page_number, page_name, page_content)`` triple it returns becomes
    one document.

    Args:
        blob: Blob holding the .vsdx file.

    Yields:
        Documents whose ``page_content`` is the page content and whose
        metadata carries the keys ``source``, ``page`` and ``page_name``.
    """
    with blob.as_bytes_io() as stream, zipfile.ZipFile(stream, "r") as archive:
        pages = self.get_pages_content(archive, blob.source)  # type: ignore[arg-type]

    # All documents are built up front, before the first one is yielded.
    documents = [
        Document(
            page_content=page_content,
            metadata={
                "source": blob.source,
                "page": page_number,
                "page_name": page_name,
            },
        )
        for page_number, page_name, page_content in pages
    ]
    yield from documents
def get_relationships(
    self,
    page: str,
    zfile: zipfile.ZipFile,
    filelist: List[str],
    pagexml_rels: List[dict],
) -> Set[str]:
    """Collect every file a page depends on, directly or transitively.

    Pages are based on other pages (ex: background page), so all the
    relationships of a page, the relationships of those relationships,
    and so on, are needed to get the whole content of a single page.

    The relationship graph is walked iteratively and each part is expanded
    at most once, so cyclic or self-referencing relationships terminate
    instead of recursing forever.

    Args:
        page: Archive path of the page whose relationships are wanted.
        zfile: The opened .vsdx archive; only its member names are read.
        filelist: Archive paths that are allowed to appear in the result.
        pagexml_rels: Parsed ``.rels`` parts, each a dict with a ``"path"``
            key (the part the relationships belong to) and a ``"content"``
            key (mapping shaped like
            ``{"Relationships": {"Relationship": <dict or list of dicts>}}``
            where every relationship has an ``"@Target"`` entry).

    Returns:
        The set of paths from ``filelist`` reachable from ``page`` through
        relationships. Empty when ``page`` has no ``.rels`` part, when that
        part has no parsed content in ``pagexml_rels``, or when it declares
        no relationships.

    Raises:
        KeyError: If a ``pagexml_rels`` entry lacks ``"path"``/``"content"``
            or a relationship lacks ``"@Target"``.
    """
    # Zip member names always use "/" as separator, whatever the host OS is;
    # the platform-dependent ``Path`` would produce "\\" on Windows and never
    # match any archive name. Imported locally to keep the block self-contained.
    from pathlib import PurePosixPath

    # Hoisted once: both collections are only used for membership tests.
    archive_names = set(zfile.namelist())
    known_files = set(filelist)

    relationships: Set[str] = set()
    expanded: Set[str] = set()  # parts whose .rels were already followed
    pending = [page]

    while pending:
        current = pending.pop()
        if current in expanded:
            continue
        expanded.add(current)

        current_path = PurePosixPath(current)
        parent_path = current_path.parent
        rels_path = parent_path / f"_rels/{current_path.name}.rels"
        if str(rels_path) not in archive_names:
            continue

        # First matching entry wins; a missing entry means nothing to follow.
        rels_content = next(
            (entry["content"] for entry in pagexml_rels if entry["path"] == current),
            None,
        )
        if not rels_content:
            continue

        # A .rels part may legitimately declare zero relationships, in which
        # case the "Relationship" key is absent (or "Relationships" is None).
        entries = (rels_content.get("Relationships") or {}).get("Relationship") or []
        if not isinstance(entries, list):
            # A single relationship is parsed as a dict rather than a list.
            entries = [entries]

        # Targets are relative to the directory of the part that owns them.
        targets = {str(parent_path / entry["@Target"]) for entry in entries}
        found = targets & known_files

        relationships |= found
        pending.extend(found - expanded)

    return relationships