Source code for langchain_community.chat_loaders.whatsapp

import logging
import os
import re
import zipfile
from typing import Iterator, List, Union

from langchain_core.chat_loaders import BaseChatLoader
from langchain_core.chat_sessions import ChatSession
from langchain_core.messages import AIMessage, HumanMessage

logger = logging.getLogger(__name__)


class WhatsAppChatLoader(BaseChatLoader):
    """Load `WhatsApp` conversations from an exported zip file, folder, or text file."""

    def __init__(self, path: str):
        """Initialize the WhatsAppChatLoader.

        Args:
            path (str): Path to the exported WhatsApp chat zip archive,
                folder, or single text file.

        To generate the dump, open the chat, tap the three dots in the top
        right corner and select "More". Then select "Export chat" and choose
        "Without media".
        """
        self.path = path
        ignore_lines = [
            "This message was deleted",
            "<Media omitted>",
            "image omitted",
            "Messages and calls are end-to-end encrypted. No one outside of this chat,"
            " not even WhatsApp, can read or listen to them.",
        ]
        # Message bodies starting with any of these notices (optionally
        # preceded by U+200E left-to-right marks) are dropped when loading.
        self._ignore_lines = re.compile(
            r"(" + "|".join([r"\u200E*" + line for line in ignore_lines]) + r")",
            flags=re.IGNORECASE,
        )
        # Captures (timestamp, sender, text) from a line shaped like
        # "[M/D/YY, H:MM:SS AM] Sender: text"; the brackets are optional.
        self._message_line_regex = re.compile(
            r"\u200E*\[?(\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2}:\d{2} (?:AM|PM))\]?[ \u200E]*([^:]+): (.+)",  # noqa
            flags=re.IGNORECASE,
        )

    def _load_single_chat_session(self, file_path: str) -> ChatSession:
        """Load a single chat session from a text file.

        Args:
            file_path (str): Path to the exported chat text file.

        Returns:
            ChatSession: The loaded chat session. Every parsed message is
            stored as a ``HumanMessage`` whose ``additional_kwargs`` carry the
            sender and the raw timestamp string.
        """
        with open(file_path, "r", encoding="utf-8") as file:
            txt = file.read()

        # Split messages by newlines, but keep multi-line messages grouped:
        # a line that does not start a new message is appended (stripped and
        # space-separated) to the message currently being accumulated.
        chat_lines: List[str] = []
        current_message = ""
        for line in txt.split("\n"):
            if self._message_line_regex.match(line):
                if current_message:
                    chat_lines.append(current_message)
                current_message = line
            else:
                current_message += " " + line.strip()
        if current_message:
            chat_lines.append(current_message)

        results: List[Union[HumanMessage, AIMessage]] = []
        for line in chat_lines:
            result = self._message_line_regex.match(line.strip())
            if result:
                timestamp, sender, text = result.groups()
                # Skip system notices such as deleted-message placeholders.
                if not self._ignore_lines.match(text.strip()):
                    results.append(
                        HumanMessage(  # type: ignore[call-arg]
                            role=sender,
                            content=text,
                            additional_kwargs={
                                "sender": sender,
                                "events": [{"message_time": timestamp}],
                            },
                        )
                    )
            else:
                logger.debug("Could not parse line: %s", line)
        return ChatSession(messages=results)

    def _iterate_files(self, path: str) -> Iterator[str]:
        """Iterate over the chat text files found at ``path``.

        Args:
            path (str): Path to a directory, a zip archive, or a single file.

        Yields:
            str: Path to each chat file. For a directory or a zip archive only
            ``.txt`` entries are yielded. Paths produced from a zip archive
            point into a temporary directory that is removed once iteration
            moves past the archive, so consume each path before advancing.
        """
        # Local import: only the zip branch needs it.
        import tempfile

        if os.path.isdir(path):
            for root, _, files in os.walk(path):
                for file in files:
                    if file.endswith(".txt"):
                        yield os.path.join(root, file)
        # A zip archive is also a regular file, so it has to be recognised
        # before the plain-file branch; otherwise the archive itself would be
        # yielded and later read as if it were UTF-8 text.
        elif zipfile.is_zipfile(path):
            # Extract into a temporary directory rather than the current
            # working directory, so nothing is left behind or overwritten.
            with tempfile.TemporaryDirectory() as tmp_dir:
                with zipfile.ZipFile(path) as zip_file:
                    for file in zip_file.namelist():
                        if file.endswith(".txt"):
                            yield zip_file.extract(file, path=tmp_dir)
        elif os.path.isfile(path):
            yield path

    def lazy_load(self) -> Iterator[ChatSession]:
        """Lazily load the chat file(s) and yield them as chat sessions.

        Yields:
            Iterator[ChatSession]: One chat session per chat file found at
            ``self.path`` (a single file, or each ``.txt`` file inside a
            directory or zip archive).

        Raises:
            FileNotFoundError: If ``self.path`` does not exist.
        """
        # Keep failing loudly on a missing path instead of yielding nothing.
        if not os.path.exists(self.path):
            raise FileNotFoundError(f"WhatsApp chat export not found: {self.path}")
        for file_path in self._iterate_files(self.path):
            yield self._load_single_chat_session(file_path)