Source code for langchain_community.chat_loaders.telegram

import json
import logging
import os
import tempfile
import zipfile
from pathlib import Path
from typing import Iterator, List, Union

from langchain_core.chat_loaders import BaseChatLoader
from langchain_core.chat_sessions import ChatSession
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage

logger = logging.getLogger(__name__)


class TelegramChatLoader(BaseChatLoader):
    """Load `telegram` conversations into LangChain chat messages.

    To export, use the Telegram Desktop app from
    https://desktop.telegram.org/, select a conversation, click the three
    dots in the top right corner, and select "Export chat history". Then
    select "Machine-readable JSON" (preferred) to export.

    Note: the "lite" versions of the desktop app (like "Telegram for MacOS")
    do not support exporting chat history.
    """

    def __init__(
        self,
        path: Union[str, Path],
    ):
        """Initialize the TelegramChatLoader.

        Args:
            path (Union[str, Path]): Path to the exported Telegram chat zip
                file, directory, json file, or HTML file.
        """
        # Stored as a plain string because the helpers below rely on
        # ``str.endswith`` and ``os.path`` functions.
        self.path = path if isinstance(path, str) else str(path)

    def _load_single_chat_session_html(self, file_path: str) -> ChatSession:
        """Load a single chat session from an HTML file.

        Every element matching ``.message.default`` becomes a
        ``HumanMessage`` whose ``additional_kwargs`` carry the sender name
        and the message timestamp. A message without a ``.from_name``
        element is attributed to the sender of the previous message; if no
        sender has been seen yet, the message is skipped.

        Args:
            file_path (str): Path to the HTML file.

        Returns:
            ChatSession: The loaded chat session.

        Raises:
            ImportError: If the ``beautifulsoup4`` package is not installed.
        """
        try:
            from bs4 import BeautifulSoup
        except ImportError as e:
            raise ImportError(
                "Please install the 'beautifulsoup4' package to load"
                " Telegram HTML files. You can do this by running"
                " 'pip install beautifulsoup4' in your terminal."
            ) from e

        with open(file_path, "r", encoding="utf-8") as file:
            soup = BeautifulSoup(file, "html.parser")

        results: List[BaseMessage] = []
        previous_sender = None
        for message in soup.select(".message.default"):
            # ``select_one`` returns None when nothing matches, so guard
            # every lookup and fall back to "" (the same default the JSON
            # loader uses) instead of crashing on an incomplete message.
            date_element = message.select_one(".pull_right.date.details")
            timestamp = (
                date_element.get("title", "") if date_element is not None else ""
            )

            from_name_element = message.select_one(".from_name")
            if from_name_element is None and previous_sender is None:
                logger.debug("from_name not found in message")
                continue
            elif from_name_element is None:
                from_name = previous_sender
            else:
                from_name = from_name_element.text.strip()

            # NOTE(review): messages without a ".text" element are
            # presumably media-only entries; they are kept with empty
            # content rather than dropped -- confirm this is desired.
            text_element = message.select_one(".text")
            text = text_element.text.strip() if text_element is not None else ""

            results.append(
                HumanMessage(
                    content=text,
                    additional_kwargs={
                        "sender": from_name,
                        "events": [{"message_time": timestamp}],
                    },
                )
            )
            previous_sender = from_name

        return ChatSession(messages=results)

    def _load_single_chat_session_json(self, file_path: str) -> ChatSession:
        """Load a single chat session from a JSON file.

        Each entry of the top-level ``"messages"`` list becomes a
        ``HumanMessage``; its ``"text"``, ``"from"`` and ``"date"`` fields
        (each defaulting to ``""`` when absent) supply the content, sender
        and timestamp respectively.

        Args:
            file_path (str): Path to the JSON file.

        Returns:
            ChatSession: The loaded chat session.
        """
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)

        messages = data.get("messages", [])
        results: List[BaseMessage] = []
        for message in messages:
            # NOTE(review): "text" is passed through unchanged; if the
            # export can contain non-string values here (e.g. a list of
            # fragments), they reach HumanMessage as-is -- confirm.
            text = message.get("text", "")
            timestamp = message.get("date", "")
            from_name = message.get("from", "")

            results.append(
                HumanMessage(
                    content=text,
                    additional_kwargs={
                        "sender": from_name,
                        "events": [{"message_time": timestamp}],
                    },
                )
            )

        return ChatSession(messages=results)

    def _iterate_files(self, path: str) -> Iterator[str]:
        """Iterate over the chat files in a directory or zip file.

        Only files whose names end in ``.html`` or ``.json`` are yielded.
        A path that is neither such a file, a directory, nor a zip archive
        yields nothing.

        Args:
            path (str): Path to a single file, a directory, or a zip file.

        Yields:
            str: The path to each matching file.
        """
        if os.path.isfile(path) and path.endswith((".html", ".json")):
            yield path
        elif os.path.isdir(path):
            for root, _, files in os.walk(path):
                for file in files:
                    if file.endswith((".html", ".json")):
                        yield os.path.join(root, file)
        elif zipfile.is_zipfile(path):
            with zipfile.ZipFile(path) as zip_file:
                for file in zip_file.namelist():
                    if file.endswith((".html", ".json")):
                        # The yield sits inside the ``with`` so the
                        # extracted file still exists while the consumer
                        # reads it; the temporary directory is removed
                        # when the generator is resumed or closed.
                        with tempfile.TemporaryDirectory() as temp_dir:
                            yield zip_file.extract(file, path=temp_dir)

    def lazy_load(self) -> Iterator[ChatSession]:
        """Lazily load the chat files and yield them one session at a time.

        Yields:
            ChatSession: The loaded chat session for each chat file found
                under ``self.path``.
        """
        for file_path in self._iterate_files(self.path):
            if file_path.endswith(".html"):
                yield self._load_single_chat_session_html(file_path)
            elif file_path.endswith(".json"):
                yield self._load_single_chat_session_json(file_path)