# Source code for langchain_community.vectorstores.singlestoredb

from __future__ import annotations

import json
import re
from enum import Enum
from typing import (
    Any,
    Callable,
    Iterable,
    List,
    Optional,
    Tuple,
    Type,
)

from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore, VectorStoreRetriever
from sqlalchemy.pool import QueuePool

from langchain_community.vectorstores.utils import DistanceStrategy

# Distance metric used when the caller does not pick one explicitly.
DEFAULT_DISTANCE_STRATEGY = DistanceStrategy.DOT_PRODUCT

# SQL sort direction placed after "ORDER BY <score>" for each supported metric:
# Euclidean distance keeps the default ordering (empty directive), while
# dot product is sorted with DESC.
ORDERING_DIRECTIVE: dict = {
    DistanceStrategy.EUCLIDEAN_DISTANCE: "",
    DistanceStrategy.DOT_PRODUCT: "DESC",
}


class SingleStoreDB(VectorStore):
    """`SingleStore DB` vector store.

    The prerequisite for using this class is the installation of the
    ``singlestoredb`` Python package.

    A SingleStoreDB vector store is created by providing an embedding function
    together with the parameters for the database connection and the connection
    pool, and optionally the names of the table and the fields to use.
    """

    class SearchStrategy(str, Enum):
        """Enumerator of the search strategies for searching in the vector store."""

        # Rank by vector similarity only.
        VECTOR_ONLY = "VECTOR_ONLY"
        # Rank by full-text match score only.
        TEXT_ONLY = "TEXT_ONLY"
        # Filter rows by full-text match score, rank by vector similarity.
        FILTER_BY_TEXT = "FILTER_BY_TEXT"
        # Filter rows by vector similarity, rank by full-text match score.
        FILTER_BY_VECTOR = "FILTER_BY_VECTOR"
        # Rank by a weighted sum of the text and vector scores.
        WEIGHTED_SUM = "WEIGHTED_SUM"

    def _get_connection(self: SingleStoreDB) -> Any:
        try:
            import singlestoredb as s2
        except ImportError:
            raise ImportError(
                "Could not import singlestoredb python package. "
                "Please install it with `pip install singlestoredb`."
            )
        return s2.connect(**self.connection_kwargs)

[docs]    def __init__(
        self,
        embedding: Embeddings,
        *,
        distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
        table_name: str = "embeddings",
        content_field: str = "content",
        metadata_field: str = "metadata",
        vector_field: str = "vector",
        id_field: str = "id",
        use_vector_index: bool = False,
        vector_index_name: str = "",
        vector_index_options: Optional[dict] = None,
        vector_size: int = 1536,
        use_full_text_search: bool = False,
        pool_size: int = 5,
        max_overflow: int = 10,
        timeout: float = 30,
        **kwargs: Any,
    ):
        """初始化所需的组件。

参数：
    embedding (Embeddings): 文本嵌入模型。

    distance_strategy (DistanceStrategy, 可选):
        确定用于计算嵌入空间中向量之间距离的策略。
        默认为DOT_PRODUCT。
        可用选项为：
        - DOT_PRODUCT: 计算两个向量的数量积。
            这是默认行为。
        - EUCLIDEAN_DISTANCE: 计算两个向量之间的欧氏距离。
            此度量考虑向量空间中的几何距离，可能更适合依赖空间关系的嵌入。
            此度量不兼容WEIGHTED_SUM搜索策略。

    table_name (str, 可选): 指定正在使用的表的名称。
        默认为"embeddings"。
    content_field (str, 可选): 指定存储内容的字段。
        默认为"content"。
    metadata_field (str, 可选): 指定存储元数据的字段。
        默认为"metadata"。
    vector_field (str, 可选): 指定存储向量的字段。
        默认为"vector"。
    id_field (str, 可选): 指定存储id的字段。
        默认为"id"。

    use_vector_index (bool, 可选): 切换使用向量索引。
        仅适用于SingleStoreDB 8.5或更高版本。默认为False。
        如果设置为True，则需要将vector_size参数设置为适当的值。

    vector_index_name (str, 可选): 指定向量索引的名称。
        默认为空。如果use_vector_index设置为False，则将被忽略。

    vector_index_options (dict, 可选): 指定向量索引的选项。
        默认为{}。
        如果use_vector_index设置为False，则将被忽略。选项包括：
        index_type (str, 可选): 指定索引的类型。
            默认为IVF_PQFS。
        更多选项，请参考SingleStoreDB文档：
        https://docs.singlestore.com/cloud/reference/sql-reference/vector-functions/vector-indexing/

    vector_size (int, 可选): 指定向量的大小。
        默认为1536。如果use_vector_index设置为True，则需要此参数。
        应设置为与存储在vector_field中的向量大小相同的值。

    use_full_text_search (bool, 可选): 切换使用文本全文索引。
        默认为False。如果设置为True，表将在内容字段上创建全文索引，
        并且simularity_search方法将使用TEXT_ONLY、FILTER_BY_TEXT、FILTER_BY_VECTOR和WIGHTED_SUM搜索策略。
        如果设置为False，simularity_search方法将仅允许VECTOR_ONLY搜索策略。

    以下参数与连接池有关：

    pool_size (int, 可选): 确定连接池中活动连接的数量。
        默认为5。
    max_overflow (int, 可选): 确定连接池大小之外允许的最大连接数。
        默认为10。
    timeout (float, 可选): 指定建立连接的最大等待时间（秒）。
        默认为30。

    以下参数与数据库连接有关：

    host (str, 可选): 指定数据库连接的主机名、IP地址或URL。
        默认方案为"mysql"。
    user (str, 可选): 数据库用户名。
    password (str, 可选): 数据库密码。
    port (int, 可选): 数据库端口。对于非HTTP连接，默认为3306，HTTP连接为80，HTTPS连接为443。
    database (str, 可选): 数据库名称。

    其他可选参数可进一步定制数据库连接：

    pure_python (bool, 可选): 切换连接器模式。如果为True，则以纯Python模式运行。
    local_infile (bool, 可选): 允许本地文件上传。
    charset (str, 可选): 指定字符串值的字符集。
    ssl_key (str, 可选): 指定包含SSL密钥的文件路径。
    ssl_cert (str, 可选): 指定包含SSL证书的文件路径。
    ssl_ca (str, 可选): 指定包含SSL证书颁发机构的文件路径。
    ssl_cipher (str, 可选): 设置SSL密码列表。
    ssl_disabled (bool, 可选): 禁用SSL使用。
    ssl_verify_cert (bool, 可选): 验证服务器的证书。
        如果指定了``ssl_ca``，则会自动启用。
    ssl_verify_identity (bool, 可选): 验证服务器的身份。
    conv (dict[int, Callable], 可选): 数据转换函数的字典。
    credential_type (str, 可选): 指定要使用的身份验证类型：auth.PASSWORD、auth.JWT或auth.BROWSER_SSO。
    autocommit (bool, 可选): 启用自动提交。
    results_type (str, 可选): 确定查询结果的结构：
        元组、命名元组、字典。
    results_format (str, 可选): 已弃用。此选项已更名为results_type。

示例：
    基本用法：

    .. code-block:: python

        from langchain_openai import OpenAIEmbeddings
        from langchain_community.vectorstores import SingleStoreDB

        vectorstore = SingleStoreDB(
            OpenAIEmbeddings(),
            host="https://user:password@127.0.0.1:3306/database"
        )

    高级用法：

    .. code-block:: python

        from langchain_openai import OpenAIEmbeddings
        from langchain_community.vectorstores import SingleStoreDB

        vectorstore = SingleStoreDB(
            OpenAIEmbeddings(),
            distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
            host="127.0.0.1",
            port=3306,
            user="user",
            password="password",
            database="db",
            table_name="my_custom_table",
            pool_size=10,
            timeout=60,
        )

    使用环境变量：

    .. code-block:: python

        from langchain_openai import OpenAIEmbeddings
        from langchain_community.vectorstores import SingleStoreDB

        os.environ['SINGLESTOREDB_URL'] = 'me:p455w0rd@s2-host.com/my_db'
        vectorstore = SingleStoreDB(OpenAIEmbeddings())

    使用向量索引：

    .. code-block:: python

        from langchain_openai import OpenAIEmbeddings
        from langchain_community.vectorstores import SingleStoreDB

        os.environ['SINGLESTOREDB_URL'] = 'me:p455w0rd@s2-host.com/my_db'
        vectorstore = SingleStoreDB(
            OpenAIEmbeddings(),
            use_vector_index=True,
        )

    使用全文索引：

    .. code-block:: python
        from langchain_openai import OpenAIEmbeddings
        from langchain_community.vectorstores import SingleStoreDB

        os.environ['SINGLESTOREDB_URL'] = 'me:p455w0rd@s2-host.com/my_db'
        vectorstore = SingleStoreDB(
            OpenAIEmbeddings(),
            use_full_text_search=True,
        )
"""

        self.embedding = embedding
        self.distance_strategy = distance_strategy
        self.table_name = self._sanitize_input(table_name)
        self.content_field = self._sanitize_input(content_field)
        self.metadata_field = self._sanitize_input(metadata_field)
        self.vector_field = self._sanitize_input(vector_field)
        self.id_field = self._sanitize_input(id_field)

        self.use_vector_index = bool(use_vector_index)
        self.vector_index_name = self._sanitize_input(vector_index_name)
        self.vector_index_options = dict(vector_index_options or {})
        self.vector_index_options["metric_type"] = self.distance_strategy
        self.vector_size = int(vector_size)

        self.use_full_text_search = bool(use_full_text_search)

        # Pass the rest of the kwargs to the connection.
        self.connection_kwargs = kwargs

        # Add program name and version to connection attributes.
        if "conn_attrs" not in self.connection_kwargs:
            self.connection_kwargs["conn_attrs"] = dict()

        self.connection_kwargs["conn_attrs"]["_connector_name"] = "langchain python sdk"
        self.connection_kwargs["conn_attrs"]["_connector_version"] = "2.0.0"

        # Create connection pool.
        self.connection_pool = QueuePool(
            self._get_connection,
            max_overflow=max_overflow,
            pool_size=pool_size,
            timeout=timeout,
        )
        self._create_table()

    @property
    def embeddings(self) -> Embeddings:
        """Return the embedding model stored on this instance."""
        return self.embedding

    def _sanitize_input(self, input_str: str) -> str:
        # Remove characters that are not alphanumeric or underscores
        return re.sub(r"[^a-zA-Z0-9_]", "", input_str)

    def _select_relevance_score_fn(self) -> Callable[[float], float]:
        """Return the function used to convert a raw score to a relevance score.

        Always returns ``self._max_inner_product_relevance_score_fn``; the
        configured ``distance_strategy`` is not consulted.
        """
        # NOTE(review): the same function is returned when the store is
        # configured with EUCLIDEAN_DISTANCE -- confirm this is intended.
        return self._max_inner_product_relevance_score_fn

    def _create_table(self: SingleStoreDB) -> None:
        """如果表不存在，则创建表。"""
        conn = self.connection_pool.connect()
        try:
            cur = conn.cursor()
            try:
                full_text_index = ""
                if self.use_full_text_search:
                    full_text_index = ", FULLTEXT({})".format(self.content_field)
                if self.use_vector_index:
                    index_options = ""
                    if self.vector_index_options and len(self.vector_index_options) > 0:
                        index_options = "INDEX_OPTIONS '{}'".format(
                            json.dumps(self.vector_index_options)
                        )
                    cur.execute(
                        """CREATE TABLE IF NOT EXISTS {}
                        ({} BIGINT AUTO_INCREMENT PRIMARY KEY, {} LONGTEXT CHARACTER
                        SET utf8mb4 COLLATE utf8mb4_general_ci, {} VECTOR({}, F32)
                        NOT NULL, {} JSON, VECTOR INDEX {} ({}) {}{});""".format(
                            self.table_name,
                            self.id_field,
                            self.content_field,
                            self.vector_field,
                            self.vector_size,
                            self.metadata_field,
                            self.vector_index_name,
                            self.vector_field,
                            index_options,
                            full_text_index,
                        ),
                    )
                else:
                    cur.execute(
                        """CREATE TABLE IF NOT EXISTS {}
                        ({} BIGINT AUTO_INCREMENT PRIMARY KEY, {} LONGTEXT CHARACTER
                        SET utf8mb4 COLLATE utf8mb4_general_ci, {} BLOB, {} JSON{});
                        """.format(
                            self.table_name,
                            self.id_field,
                            self.content_field,
                            self.vector_field,
                            self.metadata_field,
                            full_text_index,
                        ),
                    )
            finally:
                cur.close()
        finally:
            conn.close()

[docs]    def add_images(
        self,
        uris: List[str],
        metadatas: Optional[List[dict]] = None,
        embeddings: Optional[List[List[float]]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """将图像通过嵌入运行并添加到向量存储中。

参数：
    uris List[str]：图像文件路径。
        每个URI将作为文档内容添加到向量存储中。
    metadatas（可选[List[dict]]，可选）：元数据的可选列表。
        默认为None。
    embeddings（可选[List[List[float]]]，可选）：可选的预生成的
        嵌入。默认为None。

返回：
    List[str]：空列表
"""
        # Set embeddings
        if (
            embeddings is None
            and self.embedding is not None
            and hasattr(self.embedding, "embed_image")
        ):
            embeddings = self.embedding.embed_image(uris=uris)
        return self.add_texts(uris, metadatas, embeddings, **kwargs)

[docs]    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        embeddings: Optional[List[List[float]]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """将更多文本添加到向量存储中。

参数：
    texts（Iterable[str]）：要添加到向量存储中的字符串/文本的可迭代对象。
    metadatas（Optional[List[dict]，可选）：元数据的可选列表。默认为None。
    embeddings（Optional[List[List[float]]]，可选）：可选的预生成的嵌入。默认为None。

返回：
    List[str]：空列表
"""
        conn = self.connection_pool.connect()
        try:
            cur = conn.cursor()
            try:
                # Write data to singlestore db
                for i, text in enumerate(texts):
                    # Use provided values by default or fallback
                    metadata = metadatas[i] if metadatas else {}
                    embedding = (
                        embeddings[i]
                        if embeddings
                        else self.embedding.embed_documents([text])[0]
                    )
                    cur.execute(
                        """INSERT INTO {}({}, {}, {})
                        VALUES (%s, JSON_ARRAY_PACK(%s), %s)""".format(
                            self.table_name,
                            self.content_field,
                            self.vector_field,
                            self.metadata_field,
                        ),
                        (
                            text,
                            "[{}]".format(",".join(map(str, embedding))),
                            json.dumps(metadata),
                        ),
                    )
                if self.use_vector_index or self.use_full_text_search:
                    cur.execute("OPTIMIZE TABLE {} FLUSH;".format(self.table_name))
            finally:
                cur.close()
        finally:
            conn.close()
        return []

[docs]    def similarity_search(
        self,
        query: str,
        k: int = 4,
        filter: Optional[dict] = None,
        search_strategy: SearchStrategy = SearchStrategy.VECTOR_ONLY,
        filter_threshold: float = 0,
        text_weight: float = 0.5,
        vector_weight: float = 0.5,
        vector_select_count_multiplier: int = 10,
        **kwargs: Any,
    ) -> List[Document]:
        """返回与查询文本最相似的索引文档。

使用余弦相似度。

参数：
    query（str）：要查找相似文档的查询文本。
    k（int）：要返回的文档数量。默认为4。
    filter（dict）：要按元数据字段和值进行过滤的字典。默认为None。
    search_strategy（SearchStrategy）：要使用的搜索策略。
        默认为SearchStrategy.VECTOR_ONLY。
        可用选项包括：
        - SearchStrategy.VECTOR_ONLY：仅按矢量相似性搜索。
        - SearchStrategy.TEXT_ONLY：仅按文本相似性搜索。仅当use_full_text_search为True时才可用。
        - SearchStrategy.FILTER_BY_TEXT：按文本相似性过滤并按矢量相似性搜索。仅当use_full_text_search为True时才可用。
        - SearchStrategy.FILTER_BY_VECTOR：按矢量相似性过滤并按文本相似性搜索。仅当use_full_text_search为True时才可用。
        - SearchStrategy.WEIGHTED_SUM：按文本和矢量相似性的加权和搜索。仅当use_full_text_search为True且distance_strategy为DOT_PRODUCT时才可用。
    filter_threshold（float）：按文本或矢量相似性进行过滤的阈值。默认为0。仅当search_strategy为SearchStrategy.FILTER_BY_TEXT或SearchStrategy.FILTER_BY_VECTOR时才有效。
    text_weight（float）：加权和搜索策略中文本相似性的权重。默认为0.5。仅当search_strategy为SearchStrategy.WEIGHTED_SUM时才有效。
    vector_weight（float）：加权和搜索策略中矢量相似性的权重。默认为0.5。仅当search_strategy为SearchStrategy.WEIGHTED_SUM时才有效。
    vector_select_count_multiplier（int）：在使用矢量索引时选择矢量数量的乘数。默认为10。
        仅当use_vector_index为True且search_strategy为SearchStrategy.WEIGHTED_SUM或SearchStrategy.FILTER_BY_TEXT时才有效。
        选择的矢量数量将为k * vector_select_count_multiplier。
        这是由于矢量索引的限制而需要的。

返回：
    List[Document]：与查询文本最相似的文档列表。

示例：

    基本用法：
    .. code-block:: python

        from langchain_community.vectorstores import SingleStoreDB
        from langchain_openai import OpenAIEmbeddings

        s2 = SingleStoreDB.from_documents(
            docs,
            OpenAIEmbeddings(),
            host="username:password@localhost:3306/database"
        )
        results = s2.similarity_search("query text", 1,
                            {"metadata_field": "metadata_value"})

    不同的搜索策略：
    .. code-block:: python

        from langchain_community.vectorstores import SingleStoreDB
        from langchain_openai import OpenAIEmbeddings

        s2 = SingleStoreDB.from_documents(
            docs,
            OpenAIEmbeddings(),
            host="username:password@localhost:3306/database",
            use_full_text_search=True,
            use_vector_index=True,
        )
        results = s2.similarity_search("query text", 1,
                search_strategy=SingleStoreDB.SearchStrategy.FILTER_BY_TEXT,
                filter_threshold=0.5)

    加权和搜索策略：
    .. code-block:: python

        from langchain_community.vectorstores import SingleStoreDB
        from langchain_openai import OpenAIEmbeddings

        s2 = SingleStoreDB.from_documents(
            docs,
            OpenAIEmbeddings(),
            host="username:password@localhost:3306/database",
            use_full_text_search=True,
            use_vector_index=True,
        )
        results = s2.similarity_search("query text", 1,
            search_strategy=SingleStoreDB.SearchStrategy.WEIGHTED_SUM,
            text_weight=0.3,
            vector_weight=0.7)
"""
        docs_and_scores = self.similarity_search_with_score(
            query=query,
            k=k,
            filter=filter,
            search_strategy=search_strategy,
            filter_threshold=filter_threshold,
            text_weight=text_weight,
            vector_weight=vector_weight,
            vector_select_count_multiplier=vector_select_count_multiplier,
            **kwargs,
        )
        return [doc for doc, _ in docs_and_scores]

[docs]    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        filter: Optional[dict] = None,
        search_strategy: SearchStrategy = SearchStrategy.VECTOR_ONLY,
        filter_threshold: float = 1,
        text_weight: float = 0.5,
        vector_weight: float = 0.5,
        vector_select_count_multiplier: int = 10,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """返回与查询最相似的文档。使用余弦相似度。

参数：
    query：要查找相似文档的文本。
    k：要返回的文档数量。默认为4。
    filter：要按元数据字段和值进行过滤的字典。默认为None。
    search_strategy（SearchStrategy）：要使用的搜索策略。
        默认为SearchStrategy.VECTOR_ONLY。
        可用选项包括：
        - SearchStrategy.VECTOR_ONLY：仅按矢量相似性搜索。
        - SearchStrategy.TEXT_ONLY：仅按文本相似性搜索。仅当use_full_text_search为True时才可用。
        - SearchStrategy.FILTER_BY_TEXT：按文本相似性过滤并按矢量相似性搜索。仅当use_full_text_search为True时才可用。
        - SearchStrategy.FILTER_BY_VECTOR：按矢量相似性过滤并按文本相似性搜索。仅当use_full_text_search为True时才可用。
        - SearchStrategy.WEIGHTED_SUM：按文本和矢量相似性的加权和搜索。仅当use_full_text_search为True且distance_strategy为DOT_PRODUCT时才可用。
    filter_threshold（float）：按文本或矢量相似性过滤的阈值。默认为0。仅当search_strategy为SearchStrategy.FILTER_BY_TEXT或SearchStrategy.FILTER_BY_VECTOR时才生效。
    text_weight（float）：加权和搜索策略中文本相似性的权重。默认为0.5。仅当search_strategy为SearchStrategy.WEIGHTED_SUM时才生效。
    vector_weight（float）：加权和搜索策略中矢量相似性的权重。默认为0.5。仅当search_strategy为SearchStrategy.WEIGHTED_SUM时才生效。
    vector_select_count_multiplier（int）：在使用矢量索引时选择矢量数量的乘数。默认为10。
        该参数仅在use_vector_index为True且search_strategy为SearchStrategy.WEIGHTED_SUM或SearchStrategy.FILTER_BY_TEXT时生效。
        选择的矢量数量将为k * vector_select_count_multiplier。
        这是由于矢量索引的限制而需要的。
返回：
    返回与查询最相似的文档列表，以及每个文档的分数。

异常：
    ValueError：如果搜索策略与距离策略不受支持。

示例：
    基本用法：
    .. code-block:: python

        from langchain_community.vectorstores import SingleStoreDB
        from langchain_openai import OpenAIEmbeddings

        s2 = SingleStoreDB.from_documents(
            docs,
            OpenAIEmbeddings(),
            host="username:password@localhost:3306/database"
        )
        results = s2.similarity_search_with_score("query text", 1,
                            {"metadata_field": "metadata_value"})

    不同的搜索策略：

    .. code-block:: python

        from langchain_community.vectorstores import SingleStoreDB
        from langchain_openai import OpenAIEmbeddings

        s2 = SingleStoreDB.from_documents(
            docs,
            OpenAIEmbeddings(),
            host="username:password@localhost:3306/database",
            use_full_text_search=True,
            use_vector_index=True,
        )
        results = s2.similarity_search_with_score("query text", 1,
                search_strategy=SingleStoreDB.SearchStrategy.FILTER_BY_VECTOR,
                filter_threshold=0.5)

    加权和搜索策略：
    .. code-block:: python

        from langchain_community.vectorstores import SingleStoreDB
        from langchain_openai import OpenAIEmbeddings

        s2 = SingleStoreDB.from_documents(
            docs,
            OpenAIEmbeddings(),
            host="username:password@localhost:3306/database",
            use_full_text_search=True,
            use_vector_index=True,
        )
        results = s2.similarity_search_with_score("query text", 1,
            search_strategy=SingleStoreDB.SearchStrategy.WEIGHTED_SUM,
            text_weight=0.3,
            vector_weight=0.7)
"""

        if (
            search_strategy != SingleStoreDB.SearchStrategy.VECTOR_ONLY
            and not self.use_full_text_search
        ):
            raise ValueError(
                """Search strategy {} is not supported
                when use_full_text_search is False""".format(search_strategy)
            )

        if (
            search_strategy == SingleStoreDB.SearchStrategy.WEIGHTED_SUM
            and self.distance_strategy != DistanceStrategy.DOT_PRODUCT
        ):
            raise ValueError(
                "Search strategy {} is not supported with distance strategy {}".format(
                    search_strategy, self.distance_strategy
                )
            )

        # Creates embedding vector from user query
        embedding = []
        if search_strategy != SingleStoreDB.SearchStrategy.TEXT_ONLY:
            embedding = self.embedding.embed_query(query)

        self.embedding.embed_query(query)
        conn = self.connection_pool.connect()
        result = []
        where_clause: str = ""
        where_clause_values: List[Any] = []
        if filter or search_strategy in [
            SingleStoreDB.SearchStrategy.FILTER_BY_TEXT,
            SingleStoreDB.SearchStrategy.FILTER_BY_VECTOR,
        ]:
            where_clause = "WHERE "
            arguments = []

            if search_strategy == SingleStoreDB.SearchStrategy.FILTER_BY_TEXT:
                arguments.append(
                    "MATCH ({}) AGAINST (%s) > %s".format(self.content_field)
                )
                where_clause_values.append(query)
                where_clause_values.append(float(filter_threshold))

            if search_strategy == SingleStoreDB.SearchStrategy.FILTER_BY_VECTOR:
                condition = "{}({}, JSON_ARRAY_PACK(%s)) ".format(
                    self.distance_strategy.name
                    if isinstance(self.distance_strategy, DistanceStrategy)
                    else self.distance_strategy,
                    self.vector_field,
                )
                if self.distance_strategy == DistanceStrategy.EUCLIDEAN_DISTANCE:
                    condition += "< %s"
                else:
                    condition += "> %s"
                arguments.append(condition)
                where_clause_values.append("[{}]".format(",".join(map(str, embedding))))
                where_clause_values.append(float(filter_threshold))

            def build_where_clause(
                where_clause_values: List[Any],
                sub_filter: dict,
                prefix_args: Optional[List[str]] = None,
            ) -> None:
                prefix_args = prefix_args or []
                for key in sub_filter.keys():
                    if isinstance(sub_filter[key], dict):
                        build_where_clause(
                            where_clause_values, sub_filter[key], prefix_args + [key]
                        )
                    else:
                        arguments.append(
                            "JSON_EXTRACT_JSON({}, {}) = %s".format(
                                self.metadata_field,
                                ", ".join(["%s"] * (len(prefix_args) + 1)),
                            )
                        )
                        where_clause_values += prefix_args + [key]
                        where_clause_values.append(json.dumps(sub_filter[key]))

            if filter:
                build_where_clause(where_clause_values, filter)
            where_clause += " AND ".join(arguments)

        try:
            cur = conn.cursor()
            try:
                if (
                    search_strategy == SingleStoreDB.SearchStrategy.VECTOR_ONLY
                    or search_strategy == SingleStoreDB.SearchStrategy.FILTER_BY_TEXT
                ):
                    search_options = ""
                    if (
                        self.use_vector_index
                        and search_strategy
                        == SingleStoreDB.SearchStrategy.FILTER_BY_TEXT
                    ):
                        search_options = "SEARCH_OPTIONS '{\"k\":%d}'" % (
                            k * vector_select_count_multiplier
                        )
                    cur.execute(
                        """SELECT {}, {}, {}({}, JSON_ARRAY_PACK(%s)) as __score
                        FROM {} {} ORDER BY __score {}{} LIMIT %s""".format(
                            self.content_field,
                            self.metadata_field,
                            self.distance_strategy.name
                            if isinstance(self.distance_strategy, DistanceStrategy)
                            else self.distance_strategy,
                            self.vector_field,
                            self.table_name,
                            where_clause,
                            search_options,
                            ORDERING_DIRECTIVE[self.distance_strategy],
                        ),
                        ("[{}]".format(",".join(map(str, embedding))),)
                        + tuple(where_clause_values)
                        + (k,),
                    )
                elif (
                    search_strategy == SingleStoreDB.SearchStrategy.FILTER_BY_VECTOR
                    or search_strategy == SingleStoreDB.SearchStrategy.TEXT_ONLY
                ):
                    cur.execute(
                        """SELECT {}, {}, MATCH ({}) AGAINST (%s) as __score
                        FROM {} {} ORDER BY __score DESC LIMIT %s""".format(
                            self.content_field,
                            self.metadata_field,
                            self.content_field,
                            self.table_name,
                            where_clause,
                        ),
                        (query,) + tuple(where_clause_values) + (k,),
                    )
                elif search_strategy == SingleStoreDB.SearchStrategy.WEIGHTED_SUM:
                    cur.execute(
                        """SELECT {}, {}, __score1 * %s + __score2 * %s as __score
                        FROM (
                            SELECT {}, {}, {}, MATCH ({}) AGAINST (%s) as __score1 
                        FROM {} {}) r1 FULL OUTER JOIN (
                            SELECT {}, {}({}, JSON_ARRAY_PACK(%s)) as __score2
                            FROM {} {} ORDER BY __score2 {} LIMIT %s
                        ) r2 ON r1.{} = r2.{} ORDER BY __score {} LIMIT %s""".format(
                            self.content_field,
                            self.metadata_field,
                            self.id_field,
                            self.content_field,
                            self.metadata_field,
                            self.content_field,
                            self.table_name,
                            where_clause,
                            self.id_field,
                            self.distance_strategy.name
                            if isinstance(self.distance_strategy, DistanceStrategy)
                            else self.distance_strategy,
                            self.vector_field,
                            self.table_name,
                            where_clause,
                            ORDERING_DIRECTIVE[self.distance_strategy],
                            self.id_field,
                            self.id_field,
                            ORDERING_DIRECTIVE[self.distance_strategy],
                        ),
                        (text_weight, vector_weight, query)
                        + tuple(where_clause_values)
                        + ("[{}]".format(",".join(map(str, embedding))),)
                        + tuple(where_clause_values)
                        + (k * vector_select_count_multiplier, k),
                    )
                else:
                    raise ValueError(
                        "Invalid search strategy: {}".format(search_strategy)
                    )

                for row in cur.fetchall():
                    doc = Document(page_content=row[0], metadata=row[1])
                    result.append((doc, float(row[2])))
            finally:
                cur.close()
        finally:
            conn.close()
        return result

[docs]    @classmethod
    def from_texts(
        cls: Type[SingleStoreDB],
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
        table_name: str = "embeddings",
        content_field: str = "content",
        metadata_field: str = "metadata",
        vector_field: str = "vector",
        id_field: str = "id",
        use_vector_index: bool = False,
        vector_index_name: str = "",
        vector_index_options: Optional[dict] = None,
        vector_size: int = 1536,
        use_full_text_search: bool = False,
        pool_size: int = 5,
        max_overflow: int = 10,
        timeout: float = 30,
        **kwargs: Any,
    ) -> SingleStoreDB:
        """从原始文档创建一个SingleStoreDB向量存储。
这是一个用户友好的接口，可以：
    1. 嵌入文档。
    2. 在SingleStoreDB中为嵌入创建一个新表。
    3. 将文档添加到新创建的表中。
这旨在是一个快速入门的方式。
参数：
    texts（List[str]）：要添加到向量存储中的文本列表。
    embedding（Embeddings）：文本嵌入模型。
    metadatas（Optional[List[dict]，可选）：元数据的可选列表。
        默认为None。
    distance_strategy（DistanceStrategy，可选）：
        确定用于计算嵌入空间中向量之间距离的策略。
        默认为DOT_PRODUCT。
        可用选项包括：
        - DOT_PRODUCT：计算两个向量的数量积。
            这是默认行为。
        - EUCLIDEAN_DISTANCE：计算两个向量之间的欧氏距离。
            此度量考虑向量空间中的几何距离，可能更适用于依赖空间关系的嵌入。
            此度量与WEIGHTED_SUM搜索策略不兼容。
    table_name（str，可选）：指定要使用的表的名称。
        默认为"embeddings"。
    content_field（str，可选）：指定存储内容的字段。
        默认为"content"。
    metadata_field（str，可选）：指定存储元数据的字段。
        默认为"metadata"。
    vector_field（str，可选）：指定存储向量的字段。
        默认为"vector"。
    id_field（str，可选）：指定存储id的字段。
        默认为"id"。
    use_vector_index（bool，可选）：切换使用向量索引。
        仅适用于SingleStoreDB 8.5或更高版本。默认为False。
        如果设置为True，则需要将vector_size参数设置为适当的值。
    vector_index_name（str，可选）：指定向量索引的名称。
        默认为空。如果use_vector_index设置为False，则将被忽略。
    vector_index_options（dict，可选）：指定向量索引的选项。
        默认为{}。
        如果use_vector_index设置为False，则将被忽略。选项包括：
        index_type（str，可选）：指定索引的类型。
            默认为IVF_PQFS。
        更多选项，请参阅SingleStoreDB文档：
        https://docs.singlestore.com/cloud/reference/sql-reference/vector-functions/vector-indexing/
    vector_size（int，可选）：指定向量的大小。
        默认为1536。如果use_vector_index设置为True，则需要设置。
        应设置为与存储在vector_field中的向量大小相同的值。
    use_full_text_search（bool，可选）：切换是否在文档内容上使用全文索引。
        默认为False。如果设置为True，则表将创建具有全文索引的内容字段，
        并且simularity_search方法将使用TEXT_ONLY、FILTER_BY_TEXT、FILTER_BY_VECTOR和WIGHTED_SUM搜索策略。
        如果设置为False，则simularity_search方法将仅允许VECTOR_ONLY搜索策略。

    pool_size（int，可选）：确定池中活动连接的数量。
        默认为5。
    max_overflow（int，可选）：确定池大小之外允许的最大连接数。
        默认为10。
    timeout（float，可选）：指定建立连接的最大等待时间（秒）。
        默认为30。

    其他可选参数可进一步定制数据库连接：

    pure_python（bool，可选）：切换连接器模式。如果为True，则以纯Python模式运行。
    local_infile（bool，可选）：允许本地文件上传。
    charset（str，可选）：指定字符串值的字符集。
    ssl_key（str，可选）：指定包含SSL密钥的文件路径。
    ssl_cert（str，可选）：指定包含SSL证书的文件路径。
    ssl_ca（str，可选）：指定包含SSL证书颁发机构的文件路径。
    ssl_cipher（str，可选）：设置SSL密码列表。
    ssl_disabled（bool，可选）：禁用SSL使用。
    ssl_verify_cert（bool，可选）：验证服务器的证书。
        如果指定了``ssl_ca``，则自动启用。
    ssl_verify_identity（bool，可选）：验证服务器的身份。
    conv（dict[int，Callable]，可选）：数据转换函数的字典。
    credential_type（str，可选）：指定要使用的身份验证类型：auth.PASSWORD、auth.JWT或auth.BROWSER_SSO。
    autocommit（bool，可选）：启用自动提交。
    results_type（str，可选）：确定查询结果的结构：元组、命名元组、字典。
    results_format（str，可选）：已弃用。此选项已更名为results_type。

示例：
    .. code-block:: python

        from langchain_community.vectorstores import SingleStoreDB
        from langchain_openai import OpenAIEmbeddings

        s2 = SingleStoreDB.from_texts(
            texts,
            OpenAIEmbeddings(),
            host="username:password@localhost:3306/database"
        )
"""

        instance = cls(
            embedding,
            distance_strategy=distance_strategy,
            table_name=table_name,
            content_field=content_field,
            metadata_field=metadata_field,
            vector_field=vector_field,
            id_field=id_field,
            pool_size=pool_size,
            max_overflow=max_overflow,
            timeout=timeout,
            use_vector_index=use_vector_index,
            vector_index_name=vector_index_name,
            vector_index_options=vector_index_options,
            vector_size=vector_size,
            use_full_text_search=use_full_text_search,
            **kwargs,
        )
        instance.add_texts(texts, metadatas, embedding.embed_documents(texts), **kwargs)
        return instance


# SingleStoreDBRetriever is not needed, but we keep it for backwards compatibility:
# the name is a plain alias of VectorStoreRetriever.
SingleStoreDBRetriever = VectorStoreRetriever