Clickhouse

ClickHouseReader #

Bases: BaseReader

ClickHouse阅读器。

Parameters:

Name	Type	Description	Default
`clickhouse_host`	`str`	用于连接到ClickHouse后端的URL。默认为"localhost"。	`'localhost'`
`username`	`str`	登录的用户名。默认为"default"。	`'default'`
`password`	`str`	登录的密码。默认为空字符串""。	`''`
`clickhouse_port`	`int`	用于通过HTTP连接的URL端口。默认为8123。	`8123`
`database`	`str`	要查找表的数据库名称。默认为'default'。	`'default'`
`engine`	`str`	引擎。选项为"MergeTree"和"Memory"。默认为"MergeTree"。	`'MergeTree'`
`table`	`str`	要操作的表名称。默认为'llama_index'。	`'llama_index'`
`index_type`	`str`	索引类型字符串。默认为"NONE"，支持的类型有("NONE", "HNSW", "ANNOY")。	`'NONE'`
`metric`	`str`	计算距离的度量，支持的有('l2', 'cosine', 'dot')。默认为'cosine'。	`'cosine'`
`batch_size`	`int`	要插入的文档大小。默认为1000。	`1000`
`index_params`	`dict`	ClickHouse的索引参数。默认为None。	`None`
`search_params`	`dict`	ClickHouse查询的搜索参数。默认为None。	`None`

Source code in llama_index/readers/clickhouse/base.py

class ClickHouseReader(BaseReader):
    """Reader that queries a ClickHouse table and returns the rows as documents.

    Args:
        clickhouse_host (str): URL used to connect to the ClickHouse backend.
            Defaults to "localhost".
        username (str): User name for login. Defaults to "default".
        password (str): Password for login. Defaults to the empty string "".
        clickhouse_port (int): Port used for the HTTP connection.
            Defaults to 8123.
        database (str): Name of the database in which to look up the table.
            Defaults to "default".
        engine (str): Table engine. Options are "MergeTree" and "Memory".
            Defaults to "MergeTree".
        table (str): Name of the table to operate on.
            Defaults to "llama_index".
        index_type (str): Index type string. Supported values are
            ("NONE", "HNSW", "ANNOY"). Defaults to "NONE".
        metric (str): Metric used to compute distance. Supported values are
            ('l2', 'cosine', 'dot'). Defaults to "cosine".
        batch_size (int, optional): Size of the documents to insert.
            Defaults to 1000.
        index_params (dict, optional): Index parameters for ClickHouse.
            Defaults to None.
        search_params (dict, optional): Search parameters for ClickHouse
            queries. Defaults to None.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``ClickHouseSettings``.
    """

    def __init__(
        self,
        clickhouse_host: str = "localhost",
        username: str = "default",
        password: str = "",
        clickhouse_port: Optional[int] = 8123,
        database: str = "default",
        engine: str = "MergeTree",
        table: str = "llama_index",
        index_type: str = "NONE",
        metric: str = "cosine",
        batch_size: int = 1000,
        index_params: Optional[dict] = None,
        search_params: Optional[dict] = None,
        **kwargs: Any,
    ) -> None:
        # Open the connection first; the settings object is only built once
        # the client has been created successfully.
        connection = clickhouse_connect.get_client(
            host=clickhouse_host,
            port=clickhouse_port,
            username=username,
            password=password,
        )
        self.client = connection

        # Table/index options named by this constructor. They are splatted
        # separately from ``kwargs`` so that a duplicated key still raises
        # TypeError instead of being silently overridden.
        named_settings = {
            "table": table,
            "database": database,
            "engine": engine,
            "index_type": index_type,
            "metric": metric,
            "batch_size": batch_size,
            "index_params": index_params,
            "search_params": search_params,
        }
        self.config = ClickHouseSettings(**named_settings, **kwargs)

    def load_data(
        self,
        query_vector: List[float],
        where_str: Optional[str] = None,
        limit: int = 10,
    ) -> List[Document]:
        """Load data from ClickHouse.

        Args:
            query_vector (List[float]): Query vector.
            where_str (Optional[str], optional): WHERE condition string.
                Defaults to None.
            limit (int): Number of results to return. Defaults to 10.

        Returns:
            List[Document]: A list of documents, one per result row.
        """
        # The settings object is responsible for turning the search inputs
        # into the statement text that is sent to the server.
        statement = self.config.build_query_statement(
            query_embed=query_vector,
            where_str=where_str,
            limit=limit,
        )
        rows = self.client.query(statement).named_results()

        documents = []
        for row in rows:
            # Each row is read by column name: doc_id, text and metadata.
            documents.append(
                Document(
                    id_=row["doc_id"],
                    text=row["text"],
                    metadata=row["metadata"],
                )
            )
        return documents

load_data #

load_data(
    query_vector: List[float],
    where_str: Optional[str] = None,
    limit: int = 10,
) -> List[Document]

从ClickHouse加载数据。

Returns:

Type	Description
`List[Document]`	文档列表。

Source code in llama_index/readers/clickhouse/base.py

    def load_data(
        self,
        query_vector: List[float],
        where_str: Optional[str] = None,
        limit: int = 10,
    ) -> List[Document]:
        """Load data from ClickHouse.

        Args:
            query_vector (List[float]): Query vector.
            where_str (Optional[str], optional): WHERE condition string.
                Defaults to None.
            limit (int): Number of results to return. Defaults to 10.

        Returns:
            List[Document]: A list of documents, one per result row.
        """
        # The settings object is responsible for turning the search inputs
        # into the statement text that is sent to the server.
        statement = self.config.build_query_statement(
            query_embed=query_vector,
            where_str=where_str,
            limit=limit,
        )
        rows = self.client.query(statement).named_results()

        documents = []
        for row in rows:
            # Each row is read by column name: doc_id, text and metadata.
            documents.append(
                Document(
                    id_=row["doc_id"],
                    text=row["text"],
                    metadata=row["metadata"],
                )
            )
        return documents