PDF Table

PDFTableReader #

Bases: BaseReader

PDF表格阅读器。从PDF中读取表格。

Parameters:

Name	Type	Description	Default
`row_separator`	`str`	用于连接DataFrame行的行分隔符。	`'\n'`
`col_separator`	`str`	用于连接DataFrame列的列分隔符。	`', '`

Source code in llama_index/readers/pdf_table/base.py

class PDFTableReader(BaseReader):
    """Reader that extracts tables from a PDF file.

    Every table returned by camelot is flattened into one text document:
    the cells of a row are joined with ``col_separator`` and the resulting
    row strings are joined with ``row_separator``.

    Args:
        row_separator (str): String placed between the rows of a table.
        col_separator (str): String placed between the cells of a row.
    """

    def __init__(
        self,
        *args: Any,
        row_separator: str = "\n",
        col_separator: str = ", ",
        **kwargs: Any
    ) -> None:
        """Forward extra arguments to the base reader and store the separators."""
        super().__init__(*args, **kwargs)
        self._col_separator = col_separator
        self._row_separator = row_separator

    def load_data(
        self, file: Path, pages: str = "1", extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Extract the tables found on the requested pages of a PDF.

        Args:
            file (Path): Path to the PDF file.
            pages (str): Page selection, passed unchanged to
                ``camelot.read_pdf``.
            extra_info (Optional[Dict]): Extra information attached to every
                produced document.

        Returns:
            List[Document]: One document per table, in the order camelot
            yields them.
        """
        # Function-level import: camelot is loaded when this method is called,
        # not when the module is imported.
        import camelot

        extracted = camelot.read_pdf(filepath=str(file), pages=pages)
        return [
            self._dataframe_to_document(df=tbl.df, extra_info=extra_info)
            for tbl in extracted
        ]

    def _dataframe_to_document(
        self, df: pd.DataFrame, extra_info: Optional[Dict] = None
    ) -> Document:
        """Flatten a DataFrame into a single text document.

        Args:
            df (pd.DataFrame): Table to serialise.
            extra_info (Optional[Dict]): Extra information for the document;
                an empty dict is used when this is ``None`` or empty.

        Returns:
            Document: Document whose text is the separator-joined table.
        """

        def render_row(row) -> str:
            # Cast every cell to str so that join accepts non-string values.
            return self._col_separator.join(row.astype(str).tolist())

        row_strings = df.apply(render_row, axis=1).tolist()
        body = self._row_separator.join(row_strings)
        metadata = extra_info or {}
        return Document(text=body, extra_info=metadata)

load_data #

load_data(
    file: Path,
    pages: str = "1",
    extra_info: Optional[Dict] = None,
) -> List[Document]

加载数据并从PDF文件中提取表格。

Parameters:

Name	Type	Description	Default
`file`	`Path`	PDF文件的路径。	required
`pages`	`str`	要读取表格的页码。	`'1'`
`extra_info`	`Optional[Dict]`	额外信息。	`None`

Returns:

Type	Description
`List[Document]`	文档列表。

Source code in llama_index/readers/pdf_table/base.py

    def load_data(
        self, file: Path, pages: str = "1", extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Extract the tables found on the requested pages of a PDF.

        Args:
            file (Path): Path to the PDF file.
            pages (str): Page selection, passed unchanged to
                ``camelot.read_pdf``.
            extra_info (Optional[Dict]): Extra information forwarded to
                ``_dataframe_to_document`` for every table.

        Returns:
            List[Document]: One document per table, in the order camelot
            yields them.
        """
        # Function-level import: camelot is loaded when this method is called,
        # not when the module is imported.
        import camelot

        extracted = camelot.read_pdf(filepath=str(file), pages=pages)
        return [
            self._dataframe_to_document(df=tbl.df, extra_info=extra_info)
            for tbl in extracted
        ]