Source code for langchain_community.document_loaders.parsers.language.language_parser

from __future__ import annotations

from typing import Any, Dict, Iterator, Literal, Optional

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.language.c import CSegmenter
from langchain_community.document_loaders.parsers.language.cobol import CobolSegmenter
from langchain_community.document_loaders.parsers.language.cpp import CPPSegmenter
from langchain_community.document_loaders.parsers.language.csharp import CSharpSegmenter
from langchain_community.document_loaders.parsers.language.go import GoSegmenter
from langchain_community.document_loaders.parsers.language.java import JavaSegmenter
from langchain_community.document_loaders.parsers.language.javascript import (
    JavaScriptSegmenter,
)
from langchain_community.document_loaders.parsers.language.kotlin import KotlinSegmenter
from langchain_community.document_loaders.parsers.language.lua import LuaSegmenter
from langchain_community.document_loaders.parsers.language.perl import PerlSegmenter
from langchain_community.document_loaders.parsers.language.php import PHPSegmenter
from langchain_community.document_loaders.parsers.language.python import PythonSegmenter
from langchain_community.document_loaders.parsers.language.ruby import RubySegmenter
from langchain_community.document_loaders.parsers.language.rust import RustSegmenter
from langchain_community.document_loaders.parsers.language.scala import ScalaSegmenter
from langchain_community.document_loaders.parsers.language.typescript import (
    TypeScriptSegmenter,
)

LANGUAGE_EXTENSIONS: Dict[str, str] = {
    "py": "python",
    "js": "js",
    "cobol": "cobol",
    "c": "c",
    "cpp": "cpp",
    "cs": "csharp",
    "rb": "ruby",
    "scala": "scala",
    "rs": "rust",
    "go": "go",
    "kt": "kotlin",
    "lua": "lua",
    "pl": "perl",
    "ts": "ts",
    "java": "java",
    "php": "php",
}

LANGUAGE_SEGMENTERS: Dict[str, Any] = {
    "python": PythonSegmenter,
    "js": JavaScriptSegmenter,
    "cobol": CobolSegmenter,
    "c": CSegmenter,
    "cpp": CPPSegmenter,
    "csharp": CSharpSegmenter,
    "ruby": RubySegmenter,
    "rust": RustSegmenter,
    "scala": ScalaSegmenter,
    "go": GoSegmenter,
    "kotlin": KotlinSegmenter,
    "lua": LuaSegmenter,
    "perl": PerlSegmenter,
    "ts": TypeScriptSegmenter,
    "java": JavaSegmenter,
    "php": PHPSegmenter,
}
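
# For example (illustration only, not part of the original module): a source file
# "main.py" has extension "py", so LANGUAGE_EXTENSIONS["py"] gives "python", and
# LANGUAGE_SEGMENTERS["python"] gives PythonSegmenter, the class used to split it.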

Language = Literal[
    "cpp",
    "go",
    "java",
    "kotlin",
    "js",
    "ts",
    "php",
    "proto",
    "python",
    "rst",
    "ruby",
    "rust",
    "scala",
    "swift",
    "markdown",
    "latex",
    "html",
    "sol",
    "csharp",
    "cobol",
    "c",
    "lua",
    "perl",
]
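
# Note: `Language` is broader than the set of segmenters above; values such as
# "markdown", "html", or "proto" have no entry in LANGUAGE_SEGMENTERS and are
# rejected by `LanguageParser.__init__`.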


class LanguageParser(BaseBlobParser):
    """Parse source code using the respective programming language syntax.

    Each top-level function and class in the code is loaded into a separate
    document. Furthermore, an extra document is generated, containing the
    remaining top-level code that excludes the already segmented functions and
    classes.

    This approach can potentially improve the accuracy of QA models over
    source code.

    The supported languages for code parsing are:

    - C: "c" (*)
    - C++: "cpp" (*)
    - C#: "csharp" (*)
    - COBOL: "cobol"
    - Go: "go" (*)
    - Java: "java" (*)
    - JavaScript: "js" (requires the `esprima` package)
    - Kotlin: "kotlin" (*)
    - Lua: "lua" (*)
    - Perl: "perl" (*)
    - Python: "python"
    - Ruby: "ruby" (*)
    - Rust: "rust" (*)
    - Scala: "scala" (*)
    - TypeScript: "ts" (*)

    Items marked with (*) require the `tree_sitter` and `tree_sitter_languages`
    packages. It is straightforward to add support for additional languages
    using `tree_sitter`, although this currently requires modifying LangChain.

    The language used for parsing can be configured, along with the minimum
    number of lines required to activate the splitting based on syntax.

    If a language is not explicitly specified, `LanguageParser` will infer one
    from the file extension.

    Examples:

        .. code-block:: python

            from langchain_community.document_loaders.generic import GenericLoader
            from langchain_community.document_loaders.parsers import LanguageParser

            loader = GenericLoader.from_filesystem(
                "./code",
                glob="**/*",
                suffixes=[".py", ".js"],
                parser=LanguageParser()
            )
            docs = loader.load()

        Example instantiation to manually select the language:

        .. code-block:: python

            loader = GenericLoader.from_filesystem(
                "./code",
                glob="**/*",
                suffixes=[".py"],
                parser=LanguageParser(language="python")
            )

        Example instantiation to set the number-of-lines threshold:

        .. code-block:: python

            loader = GenericLoader.from_filesystem(
                "./code",
                glob="**/*",
                suffixes=[".py"],
                parser=LanguageParser(parser_threshold=200)
            )
    """

    def __init__(
        self, language: Optional[Language] = None, parser_threshold: int = 0
    ):
        """Language parser that splits code using the respective language syntax.

        Args:
            language: If None (default), the language will be inferred from the
                source file extension.
            parser_threshold: Minimum number of lines needed to activate parsing
                (0 by default).
        """
        if language and language not in LANGUAGE_SEGMENTERS:
            raise Exception(f"No parser available for {language}")
        self.language = language
        self.parser_threshold = parser_threshold

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        code = blob.as_string()

        # Infer the language from the file extension if it was not set explicitly.
        language = self.language or (
            LANGUAGE_EXTENSIONS.get(blob.source.rsplit(".", 1)[-1])
            if isinstance(blob.source, str)
            else None
        )

        # Unknown language: return the whole file as a single document.
        if language is None:
            yield Document(
                page_content=code,
                metadata={
                    "source": blob.source,
                },
            )
            return

        # File is shorter than the threshold: skip segmentation.
        if self.parser_threshold >= len(code.splitlines()):
            yield Document(
                page_content=code,
                metadata={
                    "source": blob.source,
                    "language": language,
                },
            )
            return

        self.Segmenter = LANGUAGE_SEGMENTERS[language]
        segmenter = self.Segmenter(blob.as_string())
        # Fall back to a single document if the code cannot be parsed.
        if not segmenter.is_valid():
            yield Document(
                page_content=code,
                metadata={
                    "source": blob.source,
                },
            )
            return

        # One document per extracted top-level function or class ...
        for functions_classes in segmenter.extract_functions_classes():
            yield Document(
                page_content=functions_classes,
                metadata={
                    "source": blob.source,
                    "content_type": "functions_classes",
                    "language": language,
                },
            )
        # ... plus one document with the remaining, simplified top-level code.
        yield Document(
            page_content=segmenter.simplify_code(),
            metadata={
                "source": blob.source,
                "content_type": "simplified_code",
                "language": language,
            },
        )
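

# Illustrative usage sketch (not part of the original module): parse an in-memory
# Python snippet directly with `lazy_parse`. The path "example.py" is hypothetical
# and only serves to let the parser infer the language from the extension; the
# sketch assumes `Blob.from_data` from langchain_core, which accepts an in-memory
# string plus an optional `path` keyword.
if __name__ == "__main__":
    sample_code = (
        "def greet(name):\n"
        "    return 'Hello, ' + name\n"
        "\n"
        "class Greeter:\n"
        "    def __call__(self, name):\n"
        "        return greet(name)\n"
    )
    blob = Blob.from_data(sample_code, path="example.py")
    for doc in LanguageParser().lazy_parse(blob):
        # Expect one "functions_classes" document per top-level function/class,
        # plus a final "simplified_code" document with the remaining code.
        print(doc.metadata.get("content_type"), "->", repr(doc.page_content))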