Nvidia triton

NvidiaTriton #

Bases: LLM

Nvidia Triton LLM.

Nvidia的Triton是一个推理服务器，提供对托管的LLM模型的API访问。该连接器允许llama_index通过GRPC远程与Triton推理服务器交互，加速推理操作。

示例

pip install llama-index-llms-nvidia-triton

from llama_index.llms.nvidia_triton import NvidiaTriton

# 确保Triton服务器实例正在运行，并为Triton服务器实例提供正确的URL
triton_url = "localhost:8001"

# 实例化NvidiaTriton类
triton_client = NvidiaTriton()

# 使用提示调用complete方法
resp = triton_client.complete("北美洲最高的山是 ")
print(resp)

Source code in llama_index/llms/nvidia_triton/base.py

class NvidiaTriton(LLM):
    """Nvidia Triton LLM.

    Nvidia的Triton是一个推理服务器，提供对托管的LLM模型的API访问。该连接器允许llama_index通过GRPC远程与Triton推理服务器交互，加速推理操作。

    [Triton推理服务器Github](https://github.com/triton-inference-server/server)

    示例:
        `pip install llama-index-llms-nvidia-triton`

        ```python
        from llama_index.llms.nvidia_triton import NvidiaTriton

        # 确保Triton服务器实例正在运行，并为Triton服务器实例提供正确的URL
        triton_url = "localhost:8001"

        # 实例化NvidiaTriton类
        triton_client = NvidiaTriton()

        # 使用提示调用complete方法
        resp = triton_client.complete("北美洲最高的山是 ")
        print(resp)
        ```"""

    server_url: str = Field(
        default=DEFAULT_SERVER_URL,
        description="The URL of the Triton inference server to use.",
    )
    model_name: str = Field(
        default=DEFAULT_MODEL,
        description="The name of the Triton hosted model this client should use",
    )
    temperature: Optional[float] = Field(
        default=DEFAULT_TEMPERATURE, description="Temperature to use for sampling"
    )
    top_p: Optional[float] = Field(
        default=DEFAULT_TOP_P, description="The top-p value to use for sampling"
    )
    top_k: Optional[float] = Field(
        default=DEFAULT_TOP_K, description="The top k value to use for sampling"
    )
    tokens: Optional[int] = Field(
        default=DEFAULT_MAX_TOKENS,
        description="The maximum number of tokens to generate.",
    )
    beam_width: Optional[int] = Field(
        default=DEFAULT_BEAM_WIDTH, description="Last n number of tokens to penalize"
    )
    repetition_penalty: Optional[float] = Field(
        default=DEFAULT_REPTITION_PENALTY,
        description="Last n number of tokens to penalize",
    )
    length_penalty: Optional[float] = Field(
        default=DEFAULT_LENGTH_PENALTY,
        description="The penalty to apply repeated tokens",
    )
    max_retries: Optional[int] = Field(
        default=DEFAULT_MAX_RETRIES,
        description="Maximum number of attempts to retry Triton client invocation before erroring",
    )
    timeout: Optional[float] = Field(
        default=DEFAULT_TIMEOUT,
        description="Maximum time (seconds) allowed for a Triton client call before erroring",
    )
    reuse_client: Optional[bool] = Field(
        default=DEFAULT_REUSE_CLIENT,
        description="True for reusing the same client instance between invocations",
    )
    triton_load_model_call: Optional[bool] = Field(
        default=DEFAULT_TRITON_LOAD_MODEL,
        description="True if a Triton load model API call should be made before using the client",
    )

    _client: Optional[GrpcTritonClient] = PrivateAttr()

    def __init__(
        self,
        server_url: str = DEFAULT_SERVER_URL,
        model: str = DEFAULT_MODEL,
        temperature: float = DEFAULT_TEMPERATURE,
        top_p: float = DEFAULT_TOP_P,
        top_k: float = DEFAULT_TOP_K,
        tokens: Optional[int] = DEFAULT_MAX_TOKENS,
        beam_width: int = DEFAULT_BEAM_WIDTH,
        repetition_penalty: float = DEFAULT_REPTITION_PENALTY,
        length_penalty: float = DEFAULT_LENGTH_PENALTY,
        max_retries: int = DEFAULT_MAX_RETRIES,
        timeout: float = DEFAULT_TIMEOUT,
        reuse_client: bool = DEFAULT_REUSE_CLIENT,
        triton_load_model_call: bool = DEFAULT_TRITON_LOAD_MODEL,
        callback_manager: Optional[CallbackManager] = None,
        additional_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> None:
        additional_kwargs = additional_kwargs or {}

        super().__init__(
            server_url=server_url,
            model=model,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            tokens=tokens,
            beam_width=beam_width,
            repetition_penalty=repetition_penalty,
            length_penalty=length_penalty,
            max_retries=max_retries,
            timeout=timeout,
            reuse_client=reuse_client,
            triton_load_model_call=triton_load_model_call,
            callback_manager=callback_manager,
            additional_kwargs=additional_kwargs,
            **kwargs,
        )

        try:
            self._client = GrpcTritonClient(server_url)
        except ImportError as err:
            raise ImportError(
                "Could not import triton client python package. "
                "Please install it with `pip install tritonclient`."
            ) from err

    @property
    def _get_model_default_parameters(self) -> Dict[str, Any]:
        return {
            "tokens": self.tokens,
            "top_k": self.top_k,
            "top_p": self.top_p,
            "temperature": self.temperature,
            "repetition_penalty": self.repetition_penalty,
            "length_penalty": self.length_penalty,
            "beam_width": self.beam_width,
        }

    @property
    def _invocation_params(self, **kwargs: Any) -> Dict[str, Any]:
        return {**self._get_model_default_parameters, **kwargs}

    @property
    def _identifying_params(self) -> Dict[str, Any]:
        """获取所有的识别参数。"""
        return {
            "server_url": self.server_url,
            "model_name": self.model_name,
        }

    def _get_client(self) -> Any:
        """创建或重用 Triton 客户端连接。"""
        if not self.reuse_client:
            return GrpcTritonClient(self.server_url)

        if self._client is None:
            self._client = GrpcTritonClient(self.server_url)
        return self._client

    @property
    def metadata(self) -> LLMMetadata:
        """收集并返回有关用户Triton配置的LLM模型的元数据。"""
        return LLMMetadata(
            model_name=self.model_name,
        )

    @llm_chat_callback()
    def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
        chat_fn = completion_to_chat_decorator(self.complete)
        return chat_fn(messages, **kwargs)

    @llm_chat_callback()
    def stream_chat(
        self, messages: Sequence[ChatMessage], **kwargs: Any
    ) -> ChatResponseGen:
        chat_stream_fn = stream_completion_to_chat_decorator(self.stream_complete)
        return chat_stream_fn(messages, **kwargs)

    def complete(
        self, prompt: str, formatted: bool = False, **kwargs: Any
    ) -> CompletionResponse:
        from tritonclient.utils import InferenceServerException

        client = self._get_client()

        invocation_params = self._get_model_default_parameters
        invocation_params.update(kwargs)
        invocation_params["prompt"] = [[prompt]]
        model_params = self._identifying_params
        model_params.update(kwargs)
        request_id = str(random.randint(1, 9999999))  # nosec

        if self.triton_load_model_call:
            client.load_model(model_params["model_name"])

        result_queue = client.request_streaming(
            model_params["model_name"], request_id, **invocation_params
        )
        response = ""
        for token in result_queue:
            if isinstance(token, InferenceServerException):
                client.stop_stream(model_params["model_name"], request_id)
                raise token
            response += token

        return CompletionResponse(
            text=response,
        )

    def stream_complete(
        self, prompt: str, formatted: bool = False, **kwargs: Any
    ) -> CompletionResponseGen:
        from tritonclient.utils import InferenceServerException

        client = self._get_client()

        invocation_params = self._get_model_default_parameters
        invocation_params.update(kwargs)
        invocation_params["prompt"] = [[prompt]]
        model_params = self._identifying_params
        model_params.update(kwargs)
        request_id = str(random.randint(1, 9999999))  # nosec

        if self.triton_load_model_call:
            client.load_model(model_params["model_name"])

        result_queue = client.request_streaming(
            model_params["model_name"], request_id, **invocation_params
        )

        def gen() -> CompletionResponseGen:
            text = ""
            for token in result_queue:
                if isinstance(token, InferenceServerException):
                    client.stop_stream(model_params["model_name"], request_id)
                    raise token
                text += token
                yield CompletionResponse(text=text, delta=token)

        return gen()

    async def achat(
        self, messages: Sequence[ChatMessage], **kwargs: Any
    ) -> ChatResponse:
        raise NotImplementedError

    async def acomplete(
        self, prompt: str, formatted: bool = False, **kwargs: Any
    ) -> CompletionResponse:
        raise NotImplementedError

    async def astream_chat(
        self, messages: Sequence[ChatMessage], **kwargs: Any
    ) -> ChatResponseAsyncGen:
        raise NotImplementedError

    async def astream_complete(
        self, prompt: str, formatted: bool = False, **kwargs: Any
    ) -> CompletionResponseAsyncGen:
        raise NotImplementedError

metadata `property` #

metadata: LLMMetadata

收集并返回有关用户Triton配置的LLM模型的元数据。

Nvidia triton

NvidiaTriton #

metadata property #

metadata `property` #