Docstring walker

初始化文件。

DocstringWalker #

Bases: BaseReader

一个用于提取文档字符串并从中构建结构化文档的加载器。递归地遍历目录，并从每个Python模块中提取文档字符串 - 从模块本身开始，然后是类，然后是函数。构建提取的文档字符串之间的依赖关系图。

Source code in llama_index/readers/docstring_walker/base.py

class DocstringWalker(BaseReader):
    """Loader that builds documents from the docstrings of Python modules.

    A directory tree is walked recursively. Every ``.py`` file is parsed with
    :mod:`ast` and turned into a single ``Document`` whose text lists the
    module docstring followed by the docstrings of the classes and functions
    found inside it (nested definitions included).
    """

    def load_data(
        self,
        code_dir: str,
        skip_initpy: bool = True,
        fail_on_malformed_files: bool = False,
    ) -> List[Document]:
        """Load one document per Python module found under ``code_dir``.

        This is a thin wrapper that forwards its arguments unchanged to
        :meth:`process_directory`.

        Parameters
        ----------
        code_dir : str
            Path of the directory that contains the code files.
        skip_initpy : bool
            Whether ``__init__.py`` files are skipped. Defaults to True.
        fail_on_malformed_files : bool
            Whether a file that cannot be processed aborts the load. Defaults
            to False, in which case the file is skipped and a warning is
            logged.

        Returns
        -------
        List[Document]
            The loaded documents.
        """
        return self.process_directory(code_dir, skip_initpy, fail_on_malformed_files)

    def process_directory(
        self,
        code_dir: str,
        skip_initpy: bool = True,
        fail_on_malformed_files: bool = False,
    ) -> List[Document]:
        """Walk ``code_dir`` and extract a document from every Python file.

        Parameters
        ----------
        code_dir : str
            Path of the directory that contains the code files.
        skip_initpy : bool
            Whether ``__init__.py`` files are skipped. Defaults to True.
        fail_on_malformed_files : bool
            Whether a file that cannot be processed aborts the walk. Defaults
            to False, in which case the file is skipped and a warning is
            logged.

        Returns
        -------
        List[Document]
            One document per successfully parsed module.

        Raises
        ------
        Exception
            Whatever :meth:`parse_module` raised, but only when
            ``fail_on_malformed_files`` is True.
        """
        llama_docs = []
        for root, _, files in os.walk(code_dir):
            for file in files:
                if not file.endswith(".py"):
                    continue
                if skip_initpy and file == "__init__.py":
                    continue
                # Drop only the trailing extension. ``str.replace`` would also
                # remove ".py" appearing elsewhere in the name
                # (e.g. "my.pytools.py" must become "my.pytools").
                module_name = file[: -len(".py")]
                module_path = os.path.join(root, file)
                try:
                    llama_docs.append(self.parse_module(module_name, module_path))
                except Exception as e:
                    if fail_on_malformed_files:
                        # Bare raise keeps the original traceback untouched.
                        raise
                    log.warning(
                        "Failed to parse file %s. Skipping. Error: %s",
                        module_path,
                        e,
                    )
        return llama_docs

    def read_module_text(self, path: str) -> str:
        """Read the source text of a Python module.

        Kept as a separate method so that tests can replace it.

        The file is opened with :func:`tokenize.open`, which detects the
        encoding the same way the interpreter does: a UTF-8 byte order mark is
        stripped and a PEP 263 coding cookie is honoured, with UTF-8 as the
        default.

        Parameters
        ----------
        path : str
            Path of the module.

        Returns
        -------
        str
            The text of the module.
        """
        import tokenize  # local import: only this method needs it

        with tokenize.open(path) as f:
            return f.read()

    def parse_module(self, module_name: str, path: str) -> Document:
        """Parse a single Python module into a document.

        Parameters
        ----------
        module_name : str
            Name of the module, used in the generated text.
        path : str
            Path of the module file.

        Returns
        -------
        Document
            A document whose text starts with the module name and docstring,
            followed by the text produced for each top-level element whose
            type is listed in ``TYPES_TO_PROCESS``.
        """
        module = ast.parse(self.read_module_text(path))
        module_docstring = ast.get_docstring(module)
        module_text = f"Module name: {module_name} \n Docstring: {module_docstring} \n"
        sub_texts = [
            self.process_elem(elem, module_name)
            for elem in module.body
            # Exact type match on purpose: subclasses are not considered.
            if type(elem) in TYPES_TO_PROCESS
        ]
        module_text += "\n".join(sub_texts)
        return Document(text=module_text)

    def process_class(self, class_node: ast.ClassDef, parent_node: str) -> str:
        """Build the text for a class node and everything defined in its body.

        Parameters
        ----------
        class_node : ast.ClassDef
            The class definition to process.
        parent_node : str
            Name of the enclosing module, class or function.

        Returns
        -------
        str
            A header with the class name, parent and docstring, followed by
            the text of every body element joined with newlines. Body elements
            that are neither classes nor functions contribute an empty string.
        """
        cls_name = class_node.name
        cls_docstring = ast.get_docstring(class_node)

        text = f"\n Class name: {cls_name}, In: {parent_node} \n Docstring: {cls_docstring}"
        sub_texts = [self.process_elem(elem, cls_name) for elem in class_node.body]
        return text + "\n".join(sub_texts)

    def process_function(self, func_node: ast.FunctionDef, parent_node: str) -> str:
        """Build the text for a function node and everything defined in its body.

        Parameters
        ----------
        func_node : ast.FunctionDef
            The function definition to process.
        parent_node : str
            Name of the enclosing module, class or function.

        Returns
        -------
        str
            A header with the function name, parent and docstring, followed by
            the text of every body element joined with newlines. Body elements
            that are neither classes nor functions contribute an empty string.
        """
        func_name = func_node.name
        func_docstring = ast.get_docstring(func_node)

        text = f"\n Function name: {func_name}, In: {parent_node} \n Docstring: {func_docstring}"
        sub_texts = [self.process_elem(elem, func_name) for elem in func_node.body]
        return text + "\n".join(sub_texts)

    def process_elem(self, elem, parent_node: str) -> str:
        """Dispatch an AST element to the handler that matches its type.

        Parameters
        ----------
        elem : ast.AST
            The element to process.
        parent_node : str
            Name of the enclosing module, class or function.

        Returns
        -------
        str
            The text produced by :meth:`process_function` or
            :meth:`process_class`, or an empty string for any other node type
            (this includes ``ast.AsyncFunctionDef``).
        """
        if isinstance(elem, ast.FunctionDef):
            return self.process_function(elem, parent_node)
        elif isinstance(elem, ast.ClassDef):
            return self.process_class(elem, parent_node)
        return ""

load_data #

load_data(
    code_dir: str,
    skip_initpy: bool = True,
    fail_on_malformed_files: bool = False,
) -> List[Document]

从指定的代码目录加载数据。此外，在加载数据后，构建加载文档之间的依赖图。该图被存储为类的属性。

参数#

code_dir : str 代码文件的目录路径。 skip_initpy : bool 是否跳过__init__.py文件。默认为True。 fail_on_malformed_files : bool 是否在文件格式错误时失败。默认为False - 在这种情况下，会跳过格式错误的文件并记录警告。

返回:#

List[Document] 加载的文档列表。

Source code in llama_index/readers/docstring_walker/base.py

    def load_data(
        self,
        code_dir: str,
        skip_initpy: bool = True,
        fail_on_malformed_files: bool = False,
    ) -> List[Document]:
        """Load one document per Python module found under ``code_dir``.

        Thin wrapper: the three arguments are forwarded unchanged to
        ``process_directory`` and its result is returned as is.

        Parameters
        ----------
        code_dir : str
            Path of the directory that contains the code files.
        skip_initpy : bool
            Whether ``__init__.py`` files are skipped. Defaults to True.
        fail_on_malformed_files : bool
            Whether a file that cannot be processed aborts the load. Defaults
            to False, in which case the file is skipped and a warning is
            logged.

        Returns
        -------
        List[Document]
            The loaded documents.
        """
        documents = self.process_directory(
            code_dir,
            skip_initpy,
            fail_on_malformed_files,
        )
        return documents

process_directory #

process_directory(
    code_dir: str,
    skip_initpy: bool = True,
    fail_on_malformed_files: bool = False,
) -> List[Document]

处理一个目录并从Python文件中提取信息。

参数#

code_dir : str 代码文件的目录路径。 skip_initpy : bool 是否跳过__init__.py文件。默认为True。 fail_on_malformed_files : bool 是否在文件格式错误时失败。默认为False - 在这种情况下，会跳过格式错误的文件并记录警告。

返回：#

List[Document] Document对象的列表。

Source code in llama_index/readers/docstring_walker/base.py

    def process_directory(
        self,
        code_dir: str,
        skip_initpy: bool = True,
        fail_on_malformed_files: bool = False,
    ) -> List[Document]:
        """Walk ``code_dir`` and extract a document from every Python file.

        Parameters
        ----------
        code_dir : str
            Path of the directory that contains the code files.
        skip_initpy : bool
            Whether ``__init__.py`` files are skipped. Defaults to True.
        fail_on_malformed_files : bool
            Whether a file that cannot be processed aborts the walk. Defaults
            to False, in which case the file is skipped and a warning is
            logged.

        Returns
        -------
        List[Document]
            One document per successfully parsed module.

        Raises
        ------
        Exception
            Whatever ``parse_module`` raised, but only when
            ``fail_on_malformed_files`` is True.
        """
        llama_docs = []
        for root, _, files in os.walk(code_dir):
            for file in files:
                if not file.endswith(".py"):
                    continue
                if skip_initpy and file == "__init__.py":
                    continue
                # Drop only the trailing extension. ``str.replace`` would also
                # remove ".py" appearing elsewhere in the name
                # (e.g. "my.pytools.py" must become "my.pytools").
                module_name = file[: -len(".py")]
                module_path = os.path.join(root, file)
                try:
                    llama_docs.append(self.parse_module(module_name, module_path))
                except Exception as e:
                    if fail_on_malformed_files:
                        # Bare raise keeps the original traceback untouched.
                        raise
                    log.warning(
                        "Failed to parse file %s. Skipping. Error: %s",
                        module_path,
                        e,
                    )
        return llama_docs

read_module_text #

read_module_text(path: str) -> str

读取Python模块的文本。对于测试，这个函数可以被模拟。

参数#

path : str 模块的路径。

返回:#

str 模块的文本。

Source code in llama_index/readers/docstring_walker/base.py

    def read_module_text(self, path: str) -> str:
        """Read the source text of a Python module.

        Kept as a separate method so that tests can replace it.

        The file is opened with :func:`tokenize.open`, which detects the
        encoding the same way the interpreter does: a UTF-8 byte order mark is
        stripped and a PEP 263 coding cookie is honoured, with UTF-8 as the
        default. A plain ``open(..., encoding="utf-8")`` would leave the BOM in
        the returned text and fail on files declaring another encoding.

        Parameters
        ----------
        path : str
            Path of the module.

        Returns
        -------
        str
            The text of the module.
        """
        import tokenize  # local import: only this method needs it

        with tokenize.open(path) as f:
            return f.read()

parse_module #

parse_module(module_name: str, path: str) -> Document

解析单个Python模块的函数。

参数#

module_name : str 模块名称。 path : str 模块的路径。

返回:#

Document 一个LlamaIndex的Document对象，其中包含从模块中提取的信息。

Source code in llama_index/readers/docstring_walker/base.py

    def parse_module(self, module_name: str, path: str) -> Document:
        """Parse a single Python module into a document.

        Parameters
        ----------
        module_name : str
            Name of the module, used in the generated text.
        path : str
            Path of the module file.

        Returns
        -------
        Document
            A document whose text starts with the module name and docstring,
            followed by the text produced for each top-level element whose
            type is listed in ``TYPES_TO_PROCESS``.
        """
        tree = ast.parse(self.read_module_text(path))
        header = f"Module name: {module_name} \n Docstring: {ast.get_docstring(tree)} \n"
        # Exact type match on purpose: subclasses of the listed node types are
        # not considered.
        pieces = [
            self.process_elem(node, module_name)
            for node in tree.body
            if type(node) in TYPES_TO_PROCESS
        ]
        return Document(text=header + "\n".join(pieces))

process_class #

process_class(class_node: ClassDef, parent_node: str)

处理AST中的类节点，并向图中添加相关信息。

Args:#

class_node : ast.ClassDef 要处理的类节点。它代表抽象语法树（AST）中的类定义。 parent_node : str 父节点的名称。它指定图中父节点的名称。

Returns:#

str 处理过的类节点及其子元素的字符串表示形式。它提供了处理过的类节点及其子元素的文本表示形式。

Source code in llama_index/readers/docstring_walker/base.py

    def process_class(self, class_node: ast.ClassDef, parent_node: str):
        """Build the text for a class node and everything defined in its body.

        Parameters
        ----------
        class_node : ast.ClassDef
            The class definition to process.
        parent_node : str
            Name of the enclosing module, class or function.

        Returns
        -------
        str
            A header with the class name, parent and docstring, followed by
            the text of every body element joined with newlines.
        """
        name = class_node.name
        header = (
            f"\n Class name: {name}, In: {parent_node} "
            f"\n Docstring: {ast.get_docstring(class_node)}"
        )
        # Every body element is visited; process_elem decides what it yields.
        members = [self.process_elem(child, name) for child in class_node.body]
        return header + "\n".join(members)

process_function #

process_function(
    func_node: FunctionDef, parent_node: str
) -> str

处理AST中的函数节点，并将其添加到图中。构建节点文本。

参数#

func_node : ast.FunctionDef 要处理的函数节点。 parent_node : str 父节点的名称。

返回:#

str 处理后的函数节点及其子元素的字符串表示形式。

Source code in llama_index/readers/docstring_walker/base.py

    def process_function(self, func_node: ast.FunctionDef, parent_node: str) -> str:
        """Build the text for a function node and everything defined in its body.

        Parameters
        ----------
        func_node : ast.FunctionDef
            The function definition to process.
        parent_node : str
            Name of the enclosing module, class or function.

        Returns
        -------
        str
            A header with the function name, parent and docstring, followed by
            the text of every body element joined with newlines.
        """
        name = func_node.name
        header = (
            f"\n Function name: {name}, In: {parent_node} "
            f"\n Docstring: {ast.get_docstring(func_node)}"
        )
        # Every body statement is visited; process_elem decides what it yields.
        nested = [self.process_elem(stmt, name) for stmt in func_node.body]
        return header + "\n".join(nested)

process_elem #

process_elem(elem, parent_node: str) -> str

处理抽象语法树（AST）中的元素。

这是一个通用函数，根据元素的类型将执行委托给更具体的函数。

Returns:

Type	Description
`str`	处理元素的结果。

Source code in llama_index/readers/docstring_walker/base.py

    def process_elem(self, elem, parent_node: str) -> str:
        """Dispatch an AST element to the handler that matches its type.

        Parameters
        ----------
        elem : ast.AST
            The element to process.
        parent_node : str
            Name of the enclosing module, class or function.

        Returns
        -------
        str
            The text produced by ``process_class`` or ``process_function``, or
            an empty string for any other node type (this includes
            ``ast.AsyncFunctionDef``).
        """
        # The two node types are unrelated classes, so check order is free.
        if isinstance(elem, ast.ClassDef):
            return self.process_class(elem, parent_node)
        if isinstance(elem, ast.FunctionDef):
            return self.process_function(elem, parent_node)
        return ""