Source code for langchain_community.llms.exllamav2

from typing import Any, Dict, Iterator, List, Optional

from langchain_core.callbacks import CallbackManagerForLLMRun
from langchain_core.language_models import LLM
from langchain_core.outputs import GenerationChunk
from langchain_core.pydantic_v1 import Field, root_validator


class ExLlamaV2(LLM):
    """ExllamaV2 API.

    - It is currently only compatible with GPTQ models.
    - Lora models are not supported yet.

    To use, you should have the exllamav2 library installed, and provide the
    path to the Llama model as a named argument to the constructor.
    Check out:

    Example:
        .. code-block:: python

            from langchain_community.llms import Exllamav2

            llm = Exllamav2(model_path="/path/to/llama/model")

    # TODO:
    - Add support for loras
    - Add support for custom settings
    - Add support for custom stop sequences
    """

    client: Any
    model_path: str
    exllama_cache: Any = None
    config: Any = None
    generator: Any = None
    tokenizer: Any = None
    # If settings is None, the default settings will be used for the model.
    # All other parameters won't be used.
    settings: Any = None

    # Langchain parameters
    logfunc = print

    stop_sequences: List[str] = Field("")
    """Sequences that immediately stop the generator."""

    max_new_tokens: int = Field(150)
    """Maximum number of tokens to generate."""

    streaming: bool = Field(True)
    """Whether to stream the results, token by token."""

    verbose: bool = Field(True)
    """Whether to print debug information."""

    # Generator parameters
    disallowed_tokens: List[int] = Field(None)
    """List of tokens that are disallowed during generation."""

    @root_validator()
    def validate_environment(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        try:
            import torch
        except ImportError as e:
            raise ImportError(
                "Unable to import torch, please install with `pip install torch`."
            ) from e

        # Check that CUDA is available
        if not torch.cuda.is_available():
            raise EnvironmentError("CUDA is not available. ExllamaV2 requires CUDA.")
        try:
            from exllamav2 import (
                ExLlamaV2,
                ExLlamaV2Cache,
                ExLlamaV2Config,
                ExLlamaV2Tokenizer,
            )
            from exllamav2.generator import (
                ExLlamaV2BaseGenerator,
                ExLlamaV2StreamingGenerator,
            )
        except ImportError:
            raise ImportError(
                "Could not import exllamav2 library. "
                "Please install the exllamav2 library (CUDA 12.1 is required), "
                "for example: "
                "!python -m pip install https://github.com/turboderp/exllamav2/releases/download/v0.0.12/exllamav2-0.0.12+cu121-cp311-cp311-linux_x86_64.whl"
            )

        # Replace the logging function with a no-op unless verbose is set
        verbose = values["verbose"]
        if not verbose:
            values["logfunc"] = lambda *args, **kwargs: None
        logfunc = values["logfunc"]

        if values["settings"]:
            settings = values["settings"]
            logfunc(settings.__dict__)
        else:
            raise NotImplementedError(
                "settings is required. Custom settings are not supported yet."
            )

        config = ExLlamaV2Config()
        config.model_dir = values["model_path"]
        config.prepare()

        model = ExLlamaV2(config)
        exllama_cache = ExLlamaV2Cache(model, lazy=True)
        model.load_autosplit(exllama_cache)
        tokenizer = ExLlamaV2Tokenizer(config)
        if values["streaming"]:
            generator = ExLlamaV2StreamingGenerator(model, exllama_cache, tokenizer)
        else:
            generator = ExLlamaV2BaseGenerator(model, exllama_cache, tokenizer)

        # Configure the model and generator
        values["stop_sequences"] = [x.strip().lower() for x in values["stop_sequences"]]
        setattr(settings, "stop_sequences", values["stop_sequences"])
        logfunc(f"stop_sequences {values['stop_sequences']}")

        disallowed = values.get("disallowed_tokens")
        if disallowed:
            settings.disallow_tokens(tokenizer, disallowed)

        values["client"] = model
        values["generator"] = generator
        values["config"] = config
        values["tokenizer"] = tokenizer
        values["exllama_cache"] = exllama_cache

        return values

    @property
    def _llm_type(self) -> str:
        """Return type of llm."""
        return "ExLlamaV2"
    def get_num_tokens(self, text: str) -> int:
        """Get the number of tokens present in the text."""
        return self.generator.tokenizer.num_tokens(text)
    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        generator = self.generator

        if self.streaming:
            combined_text_output = ""
            for chunk in self._stream(
                prompt=prompt, stop=stop, run_manager=run_manager, **kwargs
            ):
                combined_text_output += chunk.text
            return combined_text_output
        else:
            output = generator.generate_simple(
                prompt=prompt,
                gen_settings=self.settings,
                num_tokens=self.max_new_tokens,
            )
            # Strip the prompt from the returned text
            output = output[len(prompt):]
            return output

    def _stream(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> Iterator[GenerationChunk]:
        input_ids = self.tokenizer.encode(prompt)
        self.generator.warmup()
        self.generator.set_stop_conditions([])
        self.generator.begin_stream(input_ids, self.settings)

        generated_tokens = 0
        while True:
            chunk, eos, _ = self.generator.stream()
            generated_tokens += 1
            generation_chunk = GenerationChunk(text=chunk)
            if run_manager:
                run_manager.on_llm_new_token(
                    token=generation_chunk.text,
                    verbose=self.verbose,
                )
            yield generation_chunk
            if eos or generated_tokens == self.max_new_tokens:
                break
        return
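Because the validator above rejects a missing `settings` object, the class cannot be constructed with `model_path` alone. Below is a minimal usage sketch, assuming exllamav2's `ExLlamaV2Sampler.Settings` is used as the sampler-settings object; the model path and sampling values are placeholders, not part of this module.

    # Usage sketch (not part of the module source).
    from exllamav2.generator import ExLlamaV2Sampler

    from langchain_community.llms.exllamav2 import ExLlamaV2

    # Sampler settings are required; values here are illustrative only.
    settings = ExLlamaV2Sampler.Settings()
    settings.temperature = 0.85
    settings.top_p = 0.8

    llm = ExLlamaV2(
        model_path="/path/to/gptq/model",  # hypothetical local GPTQ model directory
        settings=settings,                 # required; custom settings beyond this are not supported yet
        streaming=False,                   # use generate_simple instead of the streaming generator
        max_new_tokens=128,
        verbose=False,
    )

    print(llm.invoke("What is the capital of France?"))

With `streaming=True` (the default), `_call` instead consumes `_stream` and concatenates the yielded chunk text, emitting each new token to the callback manager as it arrives.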