Source code for langchain_community.chat_loaders.telegram
import json
import logging
import os
import tempfile
import zipfile
from pathlib import Path
from typing import Iterator, List, Union
from langchain_core.chat_loaders import BaseChatLoader
from langchain_core.chat_sessions import ChatSession
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
class TelegramChatLoader(BaseChatLoader):
    """Load `telegram` conversations into LangChain chat messages.

    To export, use the Telegram Desktop app from
    https://desktop.telegram.org/, select a conversation, click the three
    dots in the top right corner, and select "Export chat history". Then
    select "Machine-readable JSON" (preferred) to export. Note: the "lite"
    versions of the desktop app (like "Telegram for MacOS") do not support
    exporting chat history.
    """

    def __init__(
        self,
        path: Union[str, Path],
    ):
        """Initialize the TelegramChatLoader.

        Args:
            path (Union[str, Path]): Path to the exported Telegram chat zip,
                directory, json, or HTML file.
        """
        # The rest of the class works on plain string paths.
        self.path = path if isinstance(path, str) else str(path)

    def _load_single_chat_session_html(self, file_path: str) -> ChatSession:
        """Load a single chat session from an HTML file.

        Every element matching ``.message.default`` becomes one
        ``HumanMessage`` whose ``additional_kwargs`` carry the sender name and
        the message timestamp. A message without a ``.from_name`` element
        inherits the sender of the previous message; such messages are skipped
        while no sender has been seen yet.

        Args:
            file_path (str): Path to the HTML file.

        Returns:
            ChatSession: The loaded chat session.

        Raises:
            ImportError: If the ``beautifulsoup4`` package is not installed.
        """
        try:
            from bs4 import BeautifulSoup
        except ImportError as err:
            raise ImportError(
                "Please install the 'beautifulsoup4' package to load"
                " Telegram HTML files. You can do this by running"
                " 'pip install beautifulsoup4' in your terminal."
            ) from err

        with open(file_path, "r", encoding="utf-8") as file:
            soup = BeautifulSoup(file, "html.parser")

        results: List[Union[HumanMessage, AIMessage]] = []
        previous_sender = None
        for message in soup.select(".message.default"):
            # ``select_one`` returns None when nothing matches, so guard each
            # lookup instead of letting one odd message abort the whole load.
            date_element = message.select_one(".pull_right.date.details")
            if date_element is None:
                timestamp = ""
            else:
                timestamp = date_element.get("title", "")

            from_name_element = message.select_one(".from_name")
            if from_name_element is None and previous_sender is None:
                logger.debug("from_name not found in message")
                continue
            elif from_name_element is None:
                # Follow-up message: reuse the sender seen most recently.
                from_name = previous_sender
            else:
                from_name = from_name_element.text.strip()

            # A message may have no ``.text`` element; fall back to an empty
            # string, the same default the JSON loader uses for missing text.
            text_element = message.select_one(".text")
            text = "" if text_element is None else text_element.text.strip()

            results.append(
                HumanMessage(
                    content=text,
                    additional_kwargs={
                        "sender": from_name,
                        "events": [{"message_time": timestamp}],
                    },
                )
            )
            previous_sender = from_name

        return ChatSession(messages=results)

    def _load_single_chat_session_json(self, file_path: str) -> ChatSession:
        """Load a single chat session from a JSON file.

        Reads the top-level ``"messages"`` list and turns every entry into a
        ``HumanMessage``. The ``"text"``, ``"date"`` and ``"from"`` fields
        default to an empty string when absent.

        Args:
            file_path (str): Path to the JSON file.

        Returns:
            ChatSession: The loaded chat session.
        """
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)

        messages = data.get("messages", [])
        results: List[BaseMessage] = []
        for message in messages:
            text = message.get("text", "")
            timestamp = message.get("date", "")
            from_name = message.get("from", "")

            results.append(
                HumanMessage(
                    content=text,
                    additional_kwargs={
                        "sender": from_name,
                        "events": [{"message_time": timestamp}],
                    },
                )
            )

        return ChatSession(messages=results)

    def _iterate_files(self, path: str) -> Iterator[str]:
        """Iterate over the chat files found at ``path``.

        Only names ending in ``.html`` or ``.json`` are yielded.

        Args:
            path (str): Path to a single file, a directory (walked
                recursively), or a zip archive.

        Yields:
            str: Path to each matching file. For zip archives this is a
                temporary extraction that is removed as soon as iteration
                resumes, so consume each file before requesting the next one.
        """
        if os.path.isfile(path) and path.endswith((".html", ".json")):
            yield path
        elif os.path.isdir(path):
            for root, _, files in os.walk(path):
                for file in files:
                    if file.endswith((".html", ".json")):
                        yield os.path.join(root, file)
        elif zipfile.is_zipfile(path):
            with zipfile.ZipFile(path) as zip_file:
                for file in zip_file.namelist():
                    if file.endswith((".html", ".json")):
                        # The temporary directory stays alive only while the
                        # generator is suspended at this ``yield``.
                        with tempfile.TemporaryDirectory() as temp_dir:
                            yield zip_file.extract(file, path=temp_dir)

    def lazy_load(self) -> Iterator[ChatSession]:
        """Lazily load the chat files and yield them one session at a time.

        Yields:
            ChatSession: The chat session loaded from each ``.html`` or
                ``.json`` file found under ``self.path``.
        """
        for file_path in self._iterate_files(self.path):
            if file_path.endswith(".html"):
                yield self._load_single_chat_session_html(file_path)
            elif file_path.endswith(".json"):
                yield self._load_single_chat_session_json(file_path)