# Source code for langchain_community.chat_loaders.whatsapp
import logging
import os
import re
import tempfile
import zipfile
from typing import Iterator, List, Union

from langchain_core.chat_loaders import BaseChatLoader
from langchain_core.chat_sessions import ChatSession
from langchain_core.messages import AIMessage, HumanMessage
logger = logging.getLogger(__name__)
class WhatsAppChatLoader(BaseChatLoader):
    """Load `WhatsApp` conversations from a dump zip file or directory."""

    def __init__(self, path: str):
        """Initialize the WhatsAppChatLoader.

        Args:
            path (str): Path to the exported WhatsApp chat: a zip archive,
                a folder, or a single text file.

        To generate the dump, open the chat, click the three dots in the
        top right corner, and select "More". Then select "Export chat"
        and choose "Without media".
        """
        self.path = path
        ignore_lines = [
            "This message was deleted",
            "<Media omitted>",
            "image omitted",
            "Messages and calls are end-to-end encrypted. No one outside of this chat,"
            " not even WhatsApp, can read or listen to them.",
        ]
        # The phrases are literal text, so escape them: otherwise the "."
        # characters would act as regex wildcards. Each alternative may be
        # preceded by any number of U+200E (left-to-right mark) characters.
        ignore_pattern = "|".join(
            r"\u200E*" + re.escape(phrase) for phrase in ignore_lines
        )
        self._ignore_lines = re.compile(
            r"(" + ignore_pattern + r")",
            flags=re.IGNORECASE,
        )
        # Groups: (1) timestamp, (2) sender, (3) message text.
        self._message_line_regex = re.compile(
            r"\u200E*\[?(\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{2}:\d{2} (?:AM|PM))\]?[ \u200E]*([^:]+): (.+)",  # noqa
            flags=re.IGNORECASE,
        )

    def _load_single_chat_session(self, file_path: str) -> ChatSession:
        """Load a single chat session from a file.

        The file is read as UTF-8. Lines that do not start a new message are
        appended (stripped, separated by one space) to the preceding message.
        Messages whose text starts with one of the ignored phrases are
        dropped; every other message becomes a ``HumanMessage`` whose
        ``role`` is the sender name.

        Args:
            file_path (str): Path to the chat file.

        Returns:
            ChatSession: The loaded chat session.
        """
        with open(file_path, "r", encoding="utf-8") as file:
            txt = file.read()

        # Split messages by newlines, but keep multi-line messages grouped.
        chat_lines: List[str] = []
        current_message = ""
        for line in txt.split("\n"):
            if self._message_line_regex.match(line):
                if current_message:
                    chat_lines.append(current_message)
                current_message = line
            else:
                current_message += " " + line.strip()
        if current_message:
            chat_lines.append(current_message)

        results: List[Union[HumanMessage, AIMessage]] = []
        for line in chat_lines:
            result = self._message_line_regex.match(line.strip())
            if result:
                timestamp, sender, text = result.groups()
                if not self._ignore_lines.match(text.strip()):
                    results.append(
                        HumanMessage(  # type: ignore[call-arg]
                            role=sender,
                            content=text,
                            additional_kwargs={
                                "sender": sender,
                                "events": [{"message_time": timestamp}],
                            },
                        )
                    )
            else:
                # Lazy %-formatting: the message is only built when DEBUG
                # logging is actually enabled.
                logger.debug("Could not parse line: %s", line)
        return ChatSession(messages=results)

    def _iterate_files(self, path: str) -> Iterator[str]:
        """Iterate over the chat files found at a path.

        A directory is walked recursively and a zip archive is searched for
        members; in both cases only names ending in ``.txt`` are yielded.
        Any other path is yielded unchanged, so a missing file still raises
        from ``open()`` when the caller tries to read it.

        Zip members are extracted into a temporary directory that is removed
        once the generator is exhausted or closed, so each yielded path must
        be consumed before the iteration finishes.

        Args:
            path (str): Path to a directory, zip archive, or chat file.

        Yields:
            str: The path to each chat file.
        """
        if os.path.isdir(path):
            for root, _, files in os.walk(path):
                for file in files:
                    if file.endswith(".txt"):
                        yield os.path.join(root, file)
        elif zipfile.is_zipfile(path):
            # Must be tested before treating the path as a plain file: a zip
            # archive is itself a regular file.
            with zipfile.ZipFile(
                path
            ) as zip_file, tempfile.TemporaryDirectory() as temp_dir:
                for member in zip_file.namelist():
                    if member.endswith(".txt"):
                        yield zip_file.extract(member, path=temp_dir)
        else:
            yield path

    def lazy_load(self) -> Iterator[ChatSession]:
        """Lazily load the chat files and yield them as chat sessions.

        One session is produced per chat file found at ``self.path``.

        Yields:
            Iterator[ChatSession]: The loaded chat sessions.
        """
        for file_path in self._iterate_files(self.path):
            yield self._load_single_chat_session(file_path)