Source code for langchain_community.chat_loaders.telegram

import json
import logging
import os
import tempfile
import zipfile
from pathlib import Path
from typing import Iterator, List, Union

from langchain_core.chat_loaders import BaseChatLoader
from langchain_core.chat_sessions import ChatSession
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage

logger = logging.getLogger(__name__)


class TelegramChatLoader(BaseChatLoader):
    """Load `telegram` conversations into LangChain chat messages.

    To export, use the Telegram Desktop app from
    https://desktop.telegram.org/, select a conversation, click the three
    dots in the top right corner, and select "Export chat history". Then
    select "Machine-readable JSON" (preferred) to export. Note: the "lite"
    versions of the desktop app (like "Telegram for MacOS") do not support
    exporting chat history.
    """

    def __init__(
        self,
        path: Union[str, Path],
    ):
        """Initialize the TelegramChatLoader.

        Args:
            path (Union[str, Path]): Path to the exported Telegram chat zip
                file, directory, json file, or HTML file.
        """
        self.path = path if isinstance(path, str) else str(path)

    @staticmethod
    def _flatten_text(raw_text: object) -> str:
        """Collapse a JSON ``text`` field into a single plain string.

        Args:
            raw_text: The value stored under a message's ``"text"`` key.

        Returns:
            str: ``raw_text`` itself when it is already a string; the
                concatenation of all fragments when it is a list (dict
                fragments contribute their ``"text"`` value); ``""`` for
                ``None``; ``str(raw_text)`` otherwise.
        """
        if isinstance(raw_text, str):
            return raw_text
        if isinstance(raw_text, list):
            # NOTE(review): Telegram Desktop exports appear to store formatted
            # messages (links, bold, mentions, ...) as a list mixing plain
            # strings and dicts with a "text" key -- confirm against a real
            # export. Joining keeps the message content a plain string.
            fragments: List[str] = []
            for fragment in raw_text:
                if isinstance(fragment, dict):
                    fragment = fragment.get("text") or ""
                fragments.append(str(fragment))
            return "".join(fragments)
        return "" if raw_text is None else str(raw_text)

    def _load_single_chat_session_html(self, file_path: str) -> ChatSession:
        """Load a single chat session from an HTML file.

        Every element matching ``.message.default`` becomes one
        ``HumanMessage``. A message without a ``.from_name`` element inherits
        the sender of the previous message; if there is no previous sender
        either, the message is skipped. A missing date or text element yields
        an empty string for that field instead of raising.

        Args:
            file_path (str): Path to the HTML file.

        Returns:
            ChatSession: The loaded chat session.

        Raises:
            ImportError: If ``beautifulsoup4`` is not installed.
        """
        try:
            from bs4 import BeautifulSoup
        except ImportError as e:
            raise ImportError(
                "Please install the 'beautifulsoup4' package to load"
                " Telegram HTML files. You can do this by running"
                " 'pip install beautifulsoup4' in your terminal."
            ) from e
        with open(file_path, "r", encoding="utf-8") as file:
            soup = BeautifulSoup(file, "html.parser")

        results: List[Union[HumanMessage, AIMessage]] = []
        previous_sender = None
        for message in soup.select(".message.default"):
            # Resolve the sender first so that messages we are going to skip
            # anyway can never fail on a missing date or text element.
            from_name_element = message.select_one(".from_name")
            if from_name_element is not None:
                from_name = from_name_element.text.strip()
            elif previous_sender is not None:
                from_name = previous_sender
            else:
                logger.debug("from_name not found in message")
                continue

            date_element = message.select_one(".pull_right.date.details")
            if date_element is None:
                timestamp = ""
            else:
                timestamp = date_element.get("title", "")

            # Fall back to empty content when there is no ``.text`` element,
            # mirroring the JSON loader's default for a missing "text" key.
            text_element = message.select_one(".text")
            text = "" if text_element is None else text_element.text.strip()

            results.append(
                HumanMessage(
                    content=text,
                    additional_kwargs={
                        "sender": from_name,
                        "events": [{"message_time": timestamp}],
                    },
                )
            )
            previous_sender = from_name

        return ChatSession(messages=results)

    def _load_single_chat_session_json(self, file_path: str) -> ChatSession:
        """Load a single chat session from a JSON file.

        Reads the top-level ``"messages"`` list and turns every entry into a
        ``HumanMessage`` whose content is the entry's ``"text"`` flattened to
        a plain string. Missing ``"text"``, ``"date"`` or ``"from"`` keys
        default to the empty string.

        Args:
            file_path (str): Path to the JSON file.

        Returns:
            ChatSession: The loaded chat session.
        """
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)

        messages = data.get("messages", [])
        results: List[BaseMessage] = []
        for message in messages:
            text = self._flatten_text(message.get("text", ""))
            timestamp = message.get("date", "")
            from_name = message.get("from", "")

            results.append(
                HumanMessage(
                    content=text,
                    additional_kwargs={
                        "sender": from_name,
                        "events": [{"message_time": timestamp}],
                    },
                )
            )

        return ChatSession(messages=results)

    def _iterate_files(self, path: str) -> Iterator[str]:
        """Iterate over the chat files in a directory or zip file.

        A path that is itself an ``.html``/``.json`` file is yielded as is.
        A directory is walked recursively. Matching members of a zip archive
        are extracted one at a time into a temporary directory.

        Args:
            path (str): Path to a single file, a directory, or a zip file.

        Yields:
            str: The path to each ``.html`` or ``.json`` file. For zip
                members the path points into a temporary directory that is
                deleted as soon as the generator is advanced, so each file
                must be consumed before requesting the next one.
        """
        suffixes = (".html", ".json")
        if os.path.isfile(path) and path.endswith(suffixes):
            yield path
        elif os.path.isdir(path):
            for root, _, files in os.walk(path):
                for file in files:
                    if file.endswith(suffixes):
                        yield os.path.join(root, file)
        elif zipfile.is_zipfile(path):
            with zipfile.ZipFile(path) as zip_file:
                for file in zip_file.namelist():
                    if file.endswith(suffixes):
                        # The directory lives only while the generator is
                        # suspended at this yield; resuming removes it.
                        with tempfile.TemporaryDirectory() as temp_dir:
                            yield zip_file.extract(file, path=temp_dir)

    def lazy_load(self) -> Iterator[ChatSession]:
        """Lazily load the chat files and yield them one session at a time.

        Each file found under ``self.path`` is parsed according to its
        extension (``.html`` or ``.json``) before the next one is requested.

        Yields:
            ChatSession: The loaded chat session.
        """
        for file_path in self._iterate_files(self.path):
            if file_path.endswith(".html"):
                yield self._load_single_chat_session_html(file_path)
            elif file_path.endswith(".json"):
                yield self._load_single_chat_session_json(file_path)