from __future__ import annotations
from typing import Any, Dict, Iterator, Literal, Optional
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.language.c import CSegmenter
from langchain_community.document_loaders.parsers.language.cobol import CobolSegmenter
from langchain_community.document_loaders.parsers.language.cpp import CPPSegmenter
from langchain_community.document_loaders.parsers.language.csharp import CSharpSegmenter
from langchain_community.document_loaders.parsers.language.go import GoSegmenter
from langchain_community.document_loaders.parsers.language.java import JavaSegmenter
from langchain_community.document_loaders.parsers.language.javascript import (
JavaScriptSegmenter,
)
from langchain_community.document_loaders.parsers.language.kotlin import KotlinSegmenter
from langchain_community.document_loaders.parsers.language.lua import LuaSegmenter
from langchain_community.document_loaders.parsers.language.perl import PerlSegmenter
from langchain_community.document_loaders.parsers.language.php import PHPSegmenter
from langchain_community.document_loaders.parsers.language.python import PythonSegmenter
from langchain_community.document_loaders.parsers.language.ruby import RubySegmenter
from langchain_community.document_loaders.parsers.language.rust import RustSegmenter
from langchain_community.document_loaders.parsers.language.scala import ScalaSegmenter
from langchain_community.document_loaders.parsers.language.typescript import (
TypeScriptSegmenter,
)
# Maps a file extension (without the leading dot) to the language key used
# in ``LANGUAGE_SEGMENTERS``; extensions not listed here are left unparsed.
LANGUAGE_EXTENSIONS: Dict[str, str] = dict(
    py="python",
    js="js",
    cobol="cobol",
    c="c",
    cpp="cpp",
    cs="csharp",
    rb="ruby",
    scala="scala",
    rs="rust",
    go="go",
    kt="kotlin",
    lua="lua",
    pl="perl",
    ts="ts",
    java="java",
    php="php",
)
# Maps a language key to the segmenter class that splits source code in that
# language into its top-level functions and classes.
LANGUAGE_SEGMENTERS: Dict[str, Any] = dict(
    python=PythonSegmenter,
    js=JavaScriptSegmenter,
    cobol=CobolSegmenter,
    c=CSegmenter,
    cpp=CPPSegmenter,
    csharp=CSharpSegmenter,
    ruby=RubySegmenter,
    rust=RustSegmenter,
    scala=ScalaSegmenter,
    go=GoSegmenter,
    kotlin=KotlinSegmenter,
    lua=LuaSegmenter,
    perl=PerlSegmenter,
    ts=TypeScriptSegmenter,
    java=JavaSegmenter,
    php=PHPSegmenter,
)
# Accepted values for ``LanguageParser(language=...)``.
# NOTE(review): this Literal appears to mirror the text-splitter language
# list — it includes entries ("proto", "rst", "swift", "markdown", "latex",
# "html", "sol") that have no entry in LANGUAGE_SEGMENTERS, so passing one
# of those raises in ``LanguageParser.__init__``.
Language = Literal[
    "cpp",
    "go",
    "java",
    "kotlin",
    "js",
    "ts",
    "php",
    "proto",
    "python",
    "rst",
    "ruby",
    "rust",
    "scala",
    "swift",
    "markdown",
    "latex",
    "html",
    "sol",
    "csharp",
    "cobol",
    "c",
    "lua",
    "perl",
]
class LanguageParser(BaseBlobParser):
    """Parse source code using the grammar of the relevant programming language.

    Each top-level function and class in the code is loaded into a separate
    document.  An additional document is produced containing the remaining
    top-level code, with the already-segmented functions and classes removed.
    This approach can improve the accuracy of QA models over source code.

    Languages with code-parsing support:

    - C: "c" (*)
    - C++: "cpp" (*)
    - C#: "csharp" (*)
    - COBOL: "cobol"
    - Go: "go" (*)
    - Java: "java" (*)
    - JavaScript: "js" (requires the `esprima` package)
    - Kotlin: "kotlin" (*)
    - Lua: "lua" (*)
    - Perl: "perl" (*)
    - Python: "python"
    - Ruby: "ruby" (*)
    - Rust: "rust" (*)
    - Scala: "scala" (*)
    - TypeScript: "ts" (*)

    Items marked with (*) require the `tree_sitter` and
    `tree_sitter_languages` packages.  Adding support for other languages via
    `tree_sitter` is straightforward, although it currently requires
    modifying LangChain.

    Both the language used for parsing and the minimum number of lines needed
    to activate grammar-based splitting are configurable.  If no language is
    explicitly specified, ``LanguageParser`` infers one from the file
    extension.

    Examples:

        .. code-block:: python

            from langchain_community.document_loaders.generic import GenericLoader
            from langchain_community.document_loaders.parsers import LanguageParser

            loader = GenericLoader.from_filesystem(
                "./code",
                glob="**/*",
                suffixes=[".py", ".js"],
                parser=LanguageParser()
            )
            docs = loader.load()

        Example of manually selecting the language:

        .. code-block:: python

            loader = GenericLoader.from_filesystem(
                "./code",
                glob="**/*",
                suffixes=[".py"],
                parser=LanguageParser(language="python")
            )

        Example of setting a line-count threshold:

        .. code-block:: python

            loader = GenericLoader.from_filesystem(
                "./code",
                glob="**/*",
                suffixes=[".py"],
                parser=LanguageParser(parser_threshold=200)
            )
    """

    def __init__(
        self, language: Optional[Language] = None, parser_threshold: int = 0
    ):
        """Language parser that splits code using the respective language syntax.

        Args:
            language: If None (default), the language is inferred from the
                source file extension at parse time.
            parser_threshold: Minimum number of lines required to activate
                parsing (0 by default); shorter files are yielded whole.

        Raises:
            Exception: If ``language`` is given but has no registered segmenter.
        """
        if language and language not in LANGUAGE_SEGMENTERS:
            raise Exception(f"No parser available for {language}")
        self.language = language
        self.parser_threshold = parser_threshold

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily split ``blob`` into one Document per top-level function/class.

        Falls back to yielding the whole file as a single Document when the
        language cannot be determined, the file has no more lines than
        ``parser_threshold``, or the segmenter rejects the code.
        """
        code = blob.as_string()

        # Prefer the explicitly configured language; otherwise infer it from
        # the file extension of the blob's source path (when it is a string).
        language = self.language or (
            LANGUAGE_EXTENSIONS.get(blob.source.rsplit(".", 1)[-1])
            if isinstance(blob.source, str)
            else None
        )

        if language is None:
            # Unknown language: emit the raw file unsegmented.  No "language"
            # metadata is attached, since none could be determined.
            yield Document(
                page_content=code,
                metadata={
                    "source": blob.source,
                },
            )
            return

        if self.parser_threshold >= len(code.splitlines()):
            # File is below the configured size threshold; skip segmentation.
            yield Document(
                page_content=code,
                metadata={
                    "source": blob.source,
                    "language": language,
                },
            )
            return

        # BUGFIX: the segmenter class was previously stored on the instance
        # (``self.Segmenter``), mutating parser state on every parse call,
        # and the blob was re-read via a second ``blob.as_string()`` call.
        # Use a local variable and the already-read ``code`` instead.
        segmenter = LANGUAGE_SEGMENTERS[language](code)
        if not segmenter.is_valid():
            # Segmenter could not handle this code; fall back to the raw file
            # (metadata intentionally matches the unknown-language branch).
            yield Document(
                page_content=code,
                metadata={
                    "source": blob.source,
                },
            )
            return

        for functions_classes in segmenter.extract_functions_classes():
            yield Document(
                page_content=functions_classes,
                metadata={
                    "source": blob.source,
                    "content_type": "functions_classes",
                    "language": language,
                },
            )
        # Finally, the remaining top-level code with segmented parts removed.
        yield Document(
            page_content=segmenter.simplify_code(),
            metadata={
                "source": blob.source,
                "content_type": "simplified_code",
                "language": language,
            },
        )