class Neo4jGraphStore(GraphStore):
    """Simple Neo4j-backed triplet graph store.

    Stores (subject)-[RELATION]->(object) triplets as nodes labelled
    ``node_label`` connected by typed relationships, with the entity name
    kept in the node's ``id`` property. Relies on the APOC plugin for
    schema introspection (``refresh_schema``) and multi-hop traversal
    (``get_rel_map``).
    """

    def __init__(
        self,
        username: str,
        password: str,
        url: str,
        database: str = "neo4j",
        node_label: str = "Entity",
        refresh_schema: bool = True,
        timeout: Optional[float] = None,
        **kwargs: Any,
    ) -> None:
        """Connect to Neo4j, verify connectivity, optionally load the schema,
        and create a uniqueness constraint on ``id`` for faster upserts.

        Args:
            username: Neo4j username.
            password: Neo4j password.
            url: Bolt URL of the Neo4j server.
            database: Database name to run all queries against.
            node_label: Label applied to every entity node.
            refresh_schema: If True, introspect the schema at startup
                (requires APOC's ``apoc.meta.data()``).
            timeout: Per-query transaction timeout in seconds; None means
                no timeout.
            **kwargs: Accepted for interface compatibility; not used here.

        Raises:
            ValueError: If the server is unreachable, credentials are
                wrong, or APOC procedures are unavailable.
        """
        self.node_label = node_label
        self._driver = neo4j.GraphDatabase.driver(url, auth=(username, password))
        self._database = database
        self._timeout = timeout
        self.schema = ""
        self.structured_schema: Dict[str, Any] = {}
        # Verify connection
        try:
            # NOTE(review): entering the driver as a context manager invokes
            # neo4j.Driver.__exit__, which calls close() on exit — yet the
            # driver is used for queries afterwards. Confirm against the
            # installed neo4j driver version that this does not invalidate
            # the connection pool; a plain self._driver.verify_connectivity()
            # may be the intended call.
            with self._driver as driver:
                driver.verify_connectivity()
        except neo4j.exceptions.ServiceUnavailable:
            raise ValueError(
                "Could not connect to Neo4j database. "
                "Please ensure that the url is correct"
            )
        except neo4j.exceptions.AuthError:
            raise ValueError(
                "Could not connect to Neo4j database. "
                "Please ensure that the username and password are correct"
            )
        # Set schema
        self.schema = ""
        self.structured_schema = {}
        if refresh_schema:
            try:
                self.refresh_schema()
            except neo4j.exceptions.ClientError:
                raise ValueError(
                    "Could not use APOC procedures. "
                    "Please ensure the APOC plugin is installed in Neo4j and that "
                    "'apoc.meta.data()' is allowed in Neo4j configuration "
                )
        # Create constraint for faster insert and retrieval
        try:
            # Using Neo4j 5 syntax (FOR ... REQUIRE)
            self.query(
                """ CREATE CONSTRAINT IF NOT EXISTS FOR (n:%s) REQUIRE n.id IS UNIQUE; """
                % (self.node_label)
            )
        except Exception:
            # Using Neo4j <5 syntax (ON ... ASSERT)
            self.query(
                """ CREATE CONSTRAINT IF NOT EXISTS ON (n:%s) ASSERT n.id IS UNIQUE; """
                % (self.node_label)
            )

    @property
    def client(self) -> Any:
        """The underlying ``neo4j.Driver`` instance."""
        return self._driver

    def get(self, subj: str) -> List[List[str]]:
        """Get triplets.

        Returns a list of ``[relation_type, object_id]`` pairs for every
        outgoing relationship of the node whose ``id`` equals ``subj``.
        """
        query = """ MATCH (n1:%s)-[r]->(n2:%s) WHERE n1.id = $subj RETURN type(r), n2.id; """
        # Only the label is interpolated via %; the subject id is passed as
        # a proper query parameter.
        prepared_statement = query % (self.node_label, self.node_label)
        with self._driver.session(database=self._database) as session:
            data = session.run(prepared_statement, {"subj": subj})
            return [record.values() for record in data]

    def get_rel_map(
        self, subjs: Optional[List[str]] = None, depth: int = 2, limit: int = 30
    ) -> Dict[str, List[List[str]]]:
        """Get flat rel map."""
        # The flat means for multi-hop relation path, we could get
        # knowledge like: subj -> rel -> obj -> rel -> obj -> rel -> obj.
        # This type of knowledge is useful for some tasks.
        # +-------------+------------------------------------+
        # | subj        | flattened_rels                     |
        # +-------------+------------------------------------+
        # | "player101" | [95, "player125", 2002, "team204"] |
        # | "player100" | [1997, "team204"]                  |
        # ...
        # +-------------+------------------------------------+
        rel_map: Dict[Any, List[Any]] = {}
        if subjs is None or len(subjs) == 0:
            # unlike simple graph_store, we don't do get_all here
            return rel_map
        # NOTE: the lowered subject ids are interpolated directly into the
        # query as a Python list literal (which Cypher parses as a list),
        # rather than passed via the $subjs parameter that is also supplied.
        query = (
            f"""MATCH p=(n1:{self.node_label})-[*1..{depth}]->() """
            f"""WHERE toLower(n1.id) IN {[subj.lower() for subj in subjs] if subjs else []}"""
            "UNWIND relationships(p) AS rel "
            "WITH n1.id AS subj, p, apoc.coll.flatten(apoc.coll.toSet("
            "collect([type(rel), endNode(rel).id]))) AS flattened_rels "
            f"RETURN subj, collect(flattened_rels) AS flattened_rels LIMIT {limit}"
        )
        data = list(self.query(query, {"subjs": subjs}))
        if not data:
            return rel_map
        for record in data:
            rel_map[record["subj"]] = record["flattened_rels"]
        return rel_map

    def upsert_triplet(self, subj: str, rel: str, obj: str) -> None:
        """Add triplet.

        MERGEs both endpoint nodes and the relationship, so repeated calls
        with the same triplet are idempotent. Spaces in the relation name
        are replaced with underscores and the name is upper-cased to form
        a valid Cypher relationship type.
        """
        query = """ MERGE (n1:`%s` {id:$subj}) MERGE (n2:`%s` {id:$obj}) MERGE (n1)-[:`%s`]->(n2) """
        prepared_statement = query % (
            self.node_label,
            self.node_label,
            rel.replace(" ", "_").upper(),
        )
        with self._driver.session(database=self._database) as session:
            session.run(prepared_statement, {"subj": subj, "obj": obj})

    def delete(self, subj: str, rel: str, obj: str) -> None:
        """Delete triplet.

        Removes the relationship, then deletes either endpoint node if it
        no longer has any remaining edges.
        """

        def delete_rel(subj: str, obj: str, rel: str) -> None:
            # Delete the single typed relationship between subj and obj.
            with self._driver.session(database=self._database) as session:
                session.run(
                    (
                        "MATCH (n1:{})-[r:{}]->(n2:{}) WHERE n1.id = $subj AND n2.id"
                        " = $obj DELETE r"
                    ).format(self.node_label, rel, self.node_label),
                    {"subj": subj, "obj": obj},
                )

        def delete_entity(entity: str) -> None:
            # Plain DELETE (not DETACH): only called once the node is
            # known to be edge-free via check_edges.
            with self._driver.session(database=self._database) as session:
                session.run(
                    "MATCH (n:%s) WHERE n.id = $entity DELETE n" % self.node_label,
                    {"entity": entity},
                )

        def check_edges(entity: str) -> bool:
            # True if the node still has any incident relationship.
            with self._driver.session(database=self._database) as session:
                is_exists_result = session.run(
                    "MATCH (n1:%s)--() WHERE n1.id = $entity RETURN count(*)"
                    % (self.node_label),
                    {"entity": entity},
                )
                return bool(list(is_exists_result))

        delete_rel(subj, obj, rel)
        if not check_edges(subj):
            delete_entity(subj)
        if not check_edges(obj):
            delete_entity(obj)

    def refresh_schema(self) -> None:
        """
        Refreshes the Neo4j graph schema information.

        Runs the module-level APOC introspection queries
        (``node_properties_query``, ``rel_properties_query``, ``rel_query``),
        stores the structured result in ``self.structured_schema``, and
        renders a human-readable summary into ``self.schema``.
        """
        node_properties = [el["output"] for el in self.query(node_properties_query)]
        rel_properties = [el["output"] for el in self.query(rel_properties_query)]
        relationships = [el["output"] for el in self.query(rel_query)]
        self.structured_schema = {
            "node_props": {el["labels"]: el["properties"] for el in node_properties},
            "rel_props": {el["type"]: el["properties"] for el in rel_properties},
            "relationships": relationships,
        }
        # Format node properties
        formatted_node_props = []
        for el in node_properties:
            props_str = ", ".join(
                [f"{prop['property']}: {prop['type']}" for prop in el["properties"]]
            )
            formatted_node_props.append(f"{el['labels']}{{{props_str}}}")
        # Format relationship properties
        formatted_rel_props = []
        for el in rel_properties:
            props_str = ", ".join(
                [f"{prop['property']}: {prop['type']}" for prop in el["properties"]]
            )
            formatted_rel_props.append(f"{el['type']}{{{props_str}}}")
        # Format relationships
        formatted_rels = [
            f"(:{el['start']})-[:{el['type']}]->(:{el['end']})" for el in relationships
        ]
        self.schema = "\n".join(
            [
                "Node properties are the following:",
                ",".join(formatted_node_props),
                "Relationship properties are the following:",
                ",".join(formatted_rel_props),
                "The relationships are the following:",
                ",".join(formatted_rels),
            ]
        )

    def get_schema(self, refresh: bool = False) -> str:
        """Get the schema of the Neo4jGraph store.

        Returns the cached schema string unless it is empty or ``refresh``
        is True, in which case it is rebuilt first.
        """
        if self.schema and not refresh:
            return self.schema
        self.refresh_schema()
        logger.debug(f"get_schema() schema:\n{self.schema}")
        return self.schema

    def query(self, query: str, param_map: Optional[Dict[str, Any]] = None) -> Any:
        """Execute a Cypher query and return the records as dicts.

        First attempts ``Driver.execute_query`` (managed transaction);
        for the specific error codes that indicate the statement must run
        in an implicit transaction (e.g. ``CALL { ... } IN TRANSACTIONS``
        or periodic commit), falls back to ``Session.run``.
        """
        param_map = param_map or {}
        try:
            data, _, _ = self._driver.execute_query(
                neo4j.Query(text=query, timeout=self._timeout),
                database_=self._database,
                parameters_=param_map,
            )
            return [r.data() for r in data]
        except neo4j.exceptions.Neo4jError as e:
            if not (
                (
                    (  # isCallInTransactionError
                        e.code == "Neo.DatabaseError.Statement.ExecutionFailed"
                        or e.code
                        == "Neo.DatabaseError.Transaction.TransactionStartFailed"
                    )
                    and "in an implicit transaction" in e.message
                )
                or (  # isPeriodicCommitError
                    e.code == "Neo.ClientError.Statement.SemanticError"
                    and (
                        "in an open transaction is not possible" in e.message
                        or "tried to execute in an explicit transaction" in e.message
                    )
                )
            ):
                # Any other Neo4j error is not retryable here.
                raise
            # Fallback to allow implicit transactions
            with self._driver.session(database=self._database) as session:
                data = session.run(
                    neo4j.Query(text=query, timeout=self._timeout), param_map
                )
                return [r.data() for r in data]

    def close(self) -> None:
        """
        Explicitly close the Neo4j driver connection.

        Delegates connection management to the Neo4j driver.
        """
        if hasattr(self, "_driver"):
            self._driver.close()
            # Remove the driver attribute to indicate closure
            delattr(self, "_driver")

    def __enter__(self) -> "Neo4jGraphStore":
        """
        Enter the runtime context for the Neo4j graph connection.

        Enables use of the graph connection with the 'with' statement.
        This method allows for automatic resource management and ensures
        that the connection is properly handled.

        Returns:
            Neo4jGraphStore: The current graph connection instance
        """
        return self

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[TracebackType],
    ) -> None:
        """
        Exit the runtime context for the Neo4j graph connection.

        This method is automatically called when exiting a 'with' statement.
        It ensures that the database connection is closed, regardless of
        whether an exception occurred during the context's execution.

        Args:
            exc_type: The type of exception that caused the context to exit
                (None if no exception occurred)
            exc_val: The exception instance that caused the context to exit
                (None if no exception occurred)
            exc_tb: The traceback for the exception (None if no exception
                occurred)

        Note:
            Any exception is re-raised after the connection is closed.
        """
        self.close()

    def __del__(self) -> None:
        """
        Destructor for the Neo4j graph connection.

        This method is called during garbage collection to ensure that
        database resources are released if not explicitly closed.

        Caution:
            - Do not rely on this method for deterministic resource cleanup
            - Always prefer explicit .close() or context manager

        Best practices:
            1. Use context manager:
               with Neo4jGraph(...) as graph: ...
            2. Explicitly close:
               graph = Neo4jGraph(...)
               try: ...
               finally: graph.close()
        """
        try:
            self.close()
        except Exception:
            # Suppress any exceptions during garbage collection
            pass
def refresh_schema(self) -> None:
    """
    Refreshes the Neo4j graph schema information.

    Runs the module-level introspection queries (``node_properties_query``,
    ``rel_properties_query``, ``rel_query``) through ``self.query``, stores
    the structured result in ``self.structured_schema``, and renders a
    human-readable summary string into ``self.schema``.
    """
    node_properties = [el["output"] for el in self.query(node_properties_query)]
    rel_properties = [el["output"] for el in self.query(rel_properties_query)]
    relationships = [el["output"] for el in self.query(rel_query)]
    self.structured_schema = {
        "node_props": {el["labels"]: el["properties"] for el in node_properties},
        "rel_props": {el["type"]: el["properties"] for el in rel_properties},
        "relationships": relationships,
    }
    # Format node properties
    formatted_node_props = []
    for el in node_properties:
        props_str = ", ".join(
            [f"{prop['property']}: {prop['type']}" for prop in el["properties"]]
        )
        formatted_node_props.append(f"{el['labels']}{{{props_str}}}")
    # Format relationship properties
    formatted_rel_props = []
    for el in rel_properties:
        props_str = ", ".join(
            [f"{prop['property']}: {prop['type']}" for prop in el["properties"]]
        )
        formatted_rel_props.append(f"{el['type']}{{{props_str}}}")
    # Format relationships
    formatted_rels = [
        f"(:{el['start']})-[:{el['type']}]->(:{el['end']})" for el in relationships
    ]
    self.schema = "\n".join(
        [
            "Node properties are the following:",
            ",".join(formatted_node_props),
            "Relationship properties are the following:",
            ",".join(formatted_rel_props),
            "The relationships are the following:",
            ",".join(formatted_rels),
        ]
    )
Source code in llama-index-integrations/graph_stores/llama-index-graph-stores-neo4j/llama_index/graph_stores/neo4j/base.py
Lines 252–258
def get_schema(self, refresh: bool = False) -> str:
    """Get the schema of the Neo4jGraph store.

    Returns the cached schema string when one is available and ``refresh``
    is False; otherwise rebuilds the schema first and logs it.
    """
    needs_rebuild = refresh or not self.schema
    if needs_rebuild:
        self.refresh_schema()
        logger.debug(f"get_schema() schema:\n{self.schema}")
    return self.schema
Source code in llama-index-integrations/graph_stores/llama-index-graph-stores-neo4j/llama_index/graph_stores/neo4j/base.py
Lines 295–304
def close(self) -> None:
    """
    Explicitly close the Neo4j driver connection.

    Delegates connection management to the Neo4j driver. Safe to call
    more than once: after the first call the driver attribute is gone
    and subsequent calls are no-ops.
    """
    if not hasattr(self, "_driver"):
        return
    self._driver.close()
    # Drop the attribute to mark the store as closed.
    delattr(self, "_driver")
fromllama_index.core.indices.property_graphimportPropertyGraphIndexfromllama_index.graph_stores.neo4jimportNeo4jPropertyGraphStore# Create a Neo4jPropertyGraphStore instancegraph_store=Neo4jPropertyGraphStore(username="neo4j",password="neo4j",url="bolt://localhost:7687",database="neo4j")# create the indexindex=PropertyGraphIndex.from_documents(documents,property_graph_store=graph_store,)# Close the neo4j connection explicitly.graph_store.close()
Source code in llama-index-integrations/graph_stores/llama-index-graph-stores-neo4j/llama_index/graph_stores/neo4j/neo4j_property_graph.py
classNeo4jPropertyGraphStore(PropertyGraphStore):r""" Neo4j Property Graph Store. This class implements a Neo4j property graph store. If you are using local Neo4j instead of aura, here's a helpful command for launching the docker container: ```bash docker run \ -p 7474:7474 -p 7687:7687 \ -v $PWD/data:/data -v $PWD/plugins:/plugins \ --name neo4j-apoc \ -e NEO4J_apoc_export_file_enabled=true \ -e NEO4J_apoc_import_file_enabled=true \ -e NEO4J_apoc_import_file_use__neo4j__config=true \ -e NEO4JLABS_PLUGINS=\\[\"apoc\"\\] \ neo4j:latest ``` Args: username (str): The username for the Neo4j database. password (str): The password for the Neo4j database. url (str): The URL for the Neo4j database. database (Optional[str]): The name of the database to connect to. Defaults to "neo4j". timeout (Optional[float]): The timeout for transactions in seconds. Useful for terminating long-running queries. By default, there is no timeout set. Examples: `pip install llama-index-graph-stores-neo4j` ```python from llama_index.core.indices.property_graph import PropertyGraphIndex from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore # Create a Neo4jPropertyGraphStore instance graph_store = Neo4jPropertyGraphStore( username="neo4j", password="neo4j", url="bolt://localhost:7687", database="neo4j" ) # create the index index = PropertyGraphIndex.from_documents( documents, property_graph_store=graph_store, ) # Close the neo4j connection explicitly. 
graph_store.close() ``` """supports_structured_queries:bool=Truesupports_vector_queries:bool=Truetext_to_cypher_template:PromptTemplate=DEFAULT_CYPHER_TEMPALTEdef__init__(self,username:str,password:str,url:str,database:Optional[str]="neo4j",refresh_schema:bool=True,sanitize_query_output:bool=True,enhanced_schema:bool=False,create_indexes:bool=True,timeout:Optional[float]=None,**neo4j_kwargs:Any,)->None:self.sanitize_query_output=sanitize_query_outputself.enhanced_schema=enhanced_schemaself._driver=neo4j.GraphDatabase.driver(url,auth=(username,password),notifications_min_severity="OFF",**neo4j_kwargs,)self._async_driver=neo4j.AsyncGraphDatabase.driver(url,auth=(username,password),notifications_min_severity="OFF",**neo4j_kwargs,)self._database=databaseself._timeout=timeoutself.structured_schema={}ifrefresh_schema:self.refresh_schema()# Verify version to check if we can use vector indexself.verify_version()# Create index for faster imports and retrievalifcreate_indexes:self.structured_query(f"""CREATE CONSTRAINT IF NOT EXISTS FOR (n:`{BASE_NODE_LABEL}`) REQUIRE n.id IS UNIQUE;""")self.structured_query(f"""CREATE CONSTRAINT IF NOT EXISTS FOR (n:`{BASE_ENTITY_LABEL}`) REQUIRE n.id IS UNIQUE;""")ifself._supports_vector_index:self.structured_query(f"CREATE VECTOR INDEX {VECTOR_INDEX_NAME} IF NOT EXISTS ""FOR (m:__Entity__) ON m.embedding")@propertydefclient(self):returnself._driverdefclose(self)->None:self._driver.close()defrefresh_schema(self)->None:"""Refresh the 
schema."""node_query_results=self.structured_query(node_properties_query,param_map={"EXCLUDED_LABELS":[*EXCLUDED_LABELS,BASE_ENTITY_LABEL,BASE_NODE_LABEL,]},)node_properties=([el["output"]forelinnode_query_results]ifnode_query_resultselse[])rels_query_result=self.structured_query(rel_properties_query,param_map={"EXCLUDED_LABELS":EXCLUDED_RELS})rel_properties=([el["output"]forelinrels_query_result]ifrels_query_resultelse[])rel_objs_query_result=self.structured_query(rel_query,param_map={"EXCLUDED_LABELS":[*EXCLUDED_LABELS,BASE_ENTITY_LABEL,BASE_NODE_LABEL,]},)relationships=([el["output"]forelinrel_objs_query_result]ifrel_objs_query_resultelse[])# Get constraints & indexestry:constraint=self.structured_query("SHOW CONSTRAINTS")index=self.structured_query("CALL apoc.schema.nodes() YIELD label, properties, type, size, ""valuesSelectivity WHERE type = 'RANGE' RETURN *, ""size * valuesSelectivity as distinctValues")except(neo4j.exceptions.ClientError):# Read-only user might not have access to schema informationconstraint=[]index=[]self.structured_schema={"node_props":{el["labels"]:el["properties"]forelinnode_properties},"rel_props":{el["type"]:el["properties"]forelinrel_properties},"relationships":relationships,"metadata":{"constraint":constraint,"index":index},}schema_counts=self.structured_query("CALL apoc.meta.subGraph({}) YIELD nodes, relationships ""RETURN nodes, [rel in relationships | {name:apoc.any.property""(rel, 'type'), count: apoc.any.property(rel, 'count')}]"" AS relationships")# Update node infofornodeinschema_counts[0].get("nodes",[]):# Skip bloom labelsifnode["name"]inEXCLUDED_LABELS:continuenode_props=self.structured_schema["node_props"].get(node["name"])ifnotnode_props:# The node has no propertiescontinueenhanced_cypher=self._enhanced_schema_cypher(node["name"],node_props,node["count"]<EXHAUSTIVE_SEARCH_LIMIT)enhanced_info=self.structured_query(enhanced_cypher)[0]["output"]forpropinnode_props:# Map to custom types# 
Textifprop["type"]=="STRING"andany(len(value)>=LONG_TEXT_THRESHOLDforvalueinenhanced_info[prop["property"]]["values"]):enhanced_info[prop["property"]]["type"]="TEXT"# Embeddingif(prop["type"]=="LIST"andenhanced_info[prop["property"]]["max_size"]>LIST_LIMIT):enhanced_info[prop["property"]]["type"]="EMBEDDING"ifprop["property"]inenhanced_info:prop.update(enhanced_info[prop["property"]])# Update rel infoforrelinschema_counts[0].get("relationships",[]):# Skip bloom labelsifrel["name"]inEXCLUDED_RELS:continuerel_props=self.structured_schema["rel_props"].get(rel["name"])ifnotrel_props:# The rel has no propertiescontinueenhanced_cypher=self._enhanced_schema_cypher(rel["name"],rel_props,rel["count"]<EXHAUSTIVE_SEARCH_LIMIT,is_relationship=True,)try:enhanced_info=self.structured_query(enhanced_cypher)[0]["output"]forpropinrel_props:ifprop["property"]inenhanced_info:prop.update(enhanced_info[prop["property"]])exceptneo4j.exceptions.ClientError:# Sometimes the types are not consistent in the dbpassdefupsert_nodes(self,nodes:List[LabelledNode])->None:# Lists to hold separated typesentity_dicts:List[dict]=[]chunk_dicts:List[dict]=[]# Sort by typeforiteminnodes:ifisinstance(item,EntityNode):entity_dicts.append({**item.dict(),"id":item.id})elifisinstance(item,ChunkNode):chunk_dicts.append({**item.dict(),"id":item.id})else:# Log that we do not support these types of nodes# Or raise an error?passifchunk_dicts:forindexinrange(0,len(chunk_dicts),CHUNK_SIZE):chunked_params=chunk_dicts[index:index+CHUNK_SIZE]self.structured_query(f""" UNWIND $data AS row MERGE (c:{BASE_NODE_LABEL}{{id: row.id}}) SET c.text = row.text, c:Chunk WITH c, row SET c += row.properties WITH c, row.embedding AS embedding WHERE embedding IS NOT NULL CALL db.create.setNodeVectorProperty(c, 'embedding', embedding) RETURN count(*) """,param_map={"data":chunked_params},)ifentity_dicts:forindexinrange(0,len(entity_dicts),CHUNK_SIZE):chunked_params=entity_dicts[index:index+CHUNK_SIZE]self.structured_query(f""" UNWIND 
$data AS row MERGE (e:{BASE_NODE_LABEL}{{id: row.id}}) SET e += apoc.map.clean(row.properties, [], []) SET e.name = row.name, e:`{BASE_ENTITY_LABEL}` WITH e, row CALL apoc.create.addLabels(e, [row.label]) YIELD node WITH e, row CALL (e, row) {{ WITH e, row WHERE row.embedding IS NOT NULL CALL db.create.setNodeVectorProperty(e, 'embedding', row.embedding) RETURN count(*) AS count}} WITH e, row WHERE row.properties.triplet_source_id IS NOT NULL MERGE (c:{BASE_NODE_LABEL}{{id: row.properties.triplet_source_id}}) MERGE (e)<-[:MENTIONS]-(c) """,param_map={"data":chunked_params},)defupsert_relations(self,relations:List[Relation])->None:"""Add relations."""params=[r.dict()forrinrelations]forindexinrange(0,len(params),CHUNK_SIZE):chunked_params=params[index:index+CHUNK_SIZE]self.structured_query(f""" UNWIND $data AS row MERGE (source: {BASE_NODE_LABEL}{{id: row.source_id}}) ON CREATE SET source:Chunk MERGE (target: {BASE_NODE_LABEL}{{id: row.target_id}}) ON CREATE SET target:Chunk WITH source, target, row CALL apoc.merge.relationship(source, row.label, {{}}, row.properties, target) YIELD rel RETURN count(*) """,param_map={"data":chunked_params},)defget(self,properties:Optional[dict]=None,ids:Optional[List[str]]=None,)->List[LabelledNode]:"""Get nodes."""cypher_statement=f"MATCH (e: {BASE_NODE_LABEL}) "params={}cypher_statement+="WHERE e.id IS NOT NULL "ifids:cypher_statement+="AND e.id in $ids "params["ids"]=idsifproperties:prop_list=[]fori,propinenumerate(properties):prop_list.append(f"e.`{prop}` = $property_{i}")params[f"property_{i}"]=properties[prop]cypher_statement+=" AND "+" AND ".join(prop_list)return_statement=""" WITH e RETURN e.id AS name, [l in labels(e) WHERE l <> '__Entity__' | l][0] AS type, e{.* , embedding: Null, id: Null} AS properties """cypher_statement+=return_statementresponse=self.structured_query(cypher_statement,param_map=params)response=responseifresponseelse[]nodes=[]forrecordinresponse:# text indicates a chunk node# none on the type indicates an 
implicit node, likely a chunk nodeif"text"inrecord["properties"]orrecord["type"]isNone:text=record["properties"].pop("text","")nodes.append(ChunkNode(id_=record["name"],text=text,properties=remove_empty_values(record["properties"]),))else:nodes.append(EntityNode(name=record["name"],label=record["type"],properties=remove_empty_values(record["properties"]),))returnnodesdefget_triplets(self,entity_names:Optional[List[str]]=None,relation_names:Optional[List[str]]=None,properties:Optional[dict]=None,ids:Optional[List[str]]=None,)->List[Triplet]:# TODO: handle ids of chunk nodescypher_statement=f"MATCH (e:`{BASE_ENTITY_LABEL}`) "params={}ifentity_namesorpropertiesorids:cypher_statement+="WHERE "ifentity_names:cypher_statement+="e.name in $entity_names "params["entity_names"]=entity_namesifids:cypher_statement+="e.id in $ids "params["ids"]=idsifproperties:prop_list=[]fori,propinenumerate(properties):prop_list.append(f"e.`{prop}` = $property_{i}")params[f"property_{i}"]=properties[prop]cypher_statement+=" AND ".join(prop_list)return_statement=f""" WITH e CALL (e) {{ WITH e MATCH (e)-[r{":`"+"`|`".join(relation_names)+"`"ifrelation_nameselse""}]->(t:`{BASE_ENTITY_LABEL}`) RETURN e.name AS source_id, [l in labels(e) WHERE NOT l IN ['{BASE_ENTITY_LABEL}', '{BASE_NODE_LABEL}'] | l][0] AS source_type, e{{.* , embedding: Null, name: Null}} AS source_properties, type(r) AS type, r{{.*}} AS rel_properties, t.name AS target_id, [l in labels(t) WHERE NOT l IN ['{BASE_ENTITY_LABEL}', '{BASE_NODE_LABEL}'] | l][0] AS target_type, t{{.* , embedding: Null, name: Null}} AS target_properties UNION ALL WITH e MATCH (e)<-[r{":`"+"`|`".join(relation_names)+"`"ifrelation_nameselse""}]-(t:`{BASE_ENTITY_LABEL}`) RETURN t.name AS source_id, [l in labels(t) WHERE NOT l IN ['{BASE_ENTITY_LABEL}', '{BASE_NODE_LABEL}'] | l][0] AS source_type, t{{.* , embedding: Null, name: Null}} AS source_properties, type(r) AS type, r{{.*}} AS rel_properties, e.name AS target_id, [l in labels(e) WHERE NOT l IN 
['{BASE_ENTITY_LABEL}', '{BASE_NODE_LABEL}'] | l][0] AS target_type, e{{.* , embedding: Null, name: Null}} AS target_properties}} RETURN source_id, source_type, type, rel_properties, target_id, target_type, source_properties, target_properties"""cypher_statement+=return_statementdata=self.structured_query(cypher_statement,param_map=params)data=dataifdataelse[]triples=[]forrecordindata:source=EntityNode(name=record["source_id"],label=record["source_type"],properties=remove_empty_values(record["source_properties"]),)target=EntityNode(name=record["target_id"],label=record["target_type"],properties=remove_empty_values(record["target_properties"]),)rel=Relation(source_id=record["source_id"],target_id=record["target_id"],label=record["type"],properties=remove_empty_values(record["rel_properties"]),)triples.append([source,rel,target])returntriplesdefget_rel_map(self,graph_nodes:List[LabelledNode],depth:int=2,limit:int=30,ignore_rels:Optional[List[str]]=None,)->List[Triplet]:"""Get depth-aware rel map."""triples=[]ids=[node.idfornodeingraph_nodes]# Needs some optimizationresponse=self.structured_query(f""" WITH $ids AS id_list UNWIND range(0, size(id_list) - 1) AS idx MATCH (e:`{BASE_ENTITY_LABEL}`) WHERE e.id = id_list[idx] MATCH p=(e)-[r*1..{depth}]-(other) WHERE ALL(rel in relationships(p) WHERE type(rel) <> 'MENTIONS') UNWIND relationships(p) AS rel WITH distinct rel, idx WITH startNode(rel) AS source, type(rel) AS type, rel{{.*}} AS rel_properties, endNode(rel) AS endNode, idx LIMIT toInteger($limit) RETURN source.id AS source_id, [l in labels(source) WHERE NOT l IN ['{BASE_ENTITY_LABEL}', '{BASE_NODE_LABEL}'] | l][0] AS source_type, source{{.* , embedding: Null, id: Null}} AS source_properties, type, rel_properties, endNode.id AS target_id, [l in labels(endNode) WHERE NOT l IN ['{BASE_ENTITY_LABEL}', '{BASE_NODE_LABEL}'] | l][0] AS target_type, endNode{{.* , embedding: Null, id: Null}} AS target_properties, idx ORDER BY idx LIMIT toInteger($limit) 
""",param_map={"ids":ids,"limit":limit},)response=responseifresponseelse[]ignore_rels=ignore_relsor[]forrecordinresponse:ifrecord["type"]inignore_rels:continuesource=EntityNode(name=record["source_id"],label=record["source_type"],properties=remove_empty_values(record["source_properties"]),)target=EntityNode(name=record["target_id"],label=record["target_type"],properties=remove_empty_values(record["target_properties"]),)rel=Relation(source_id=record["source_id"],target_id=record["target_id"],label=record["type"],properties=remove_empty_values(record["rel_properties"]),)triples.append([source,rel,target])returntriplesdefstructured_query(self,query:str,param_map:Optional[Dict[str,Any]]=None,)->Any:param_map=param_mapor{}try:data,_,_=self._driver.execute_query(neo4j.Query(text=query,timeout=self._timeout),database_=self._database,parameters_=param_map,)full_result=[d.data()fordindata]ifself.sanitize_query_output:return[value_sanitize(el)forelinfull_result]returnfull_resultexceptneo4j.exceptions.Neo4jErrorase:ifnot(((# isCallInTransactionErrore.code=="Neo.DatabaseError.Statement.ExecutionFailed"ore.code=="Neo.DatabaseError.Transaction.TransactionStartFailed")and"in an implicit transaction"ine.message)or(# isPeriodicCommitErrore.code=="Neo.ClientError.Statement.SemanticError"and("in an open transaction is not possible"ine.messageor"tried to execute in an explicit transaction"ine.message))):raise# Fallback to allow implicit transactionswithself._driver.session(database=self._database)assession:data=session.run(neo4j.Query(text=query,timeout=self._timeout),param_map)full_result=[d.data()fordindata]ifself.sanitize_query_output:return[value_sanitize(el)forelinfull_result]returnfull_resultdefvector_query(self,query:VectorStoreQuery,**kwargs:Any)->Tuple[List[LabelledNode],List[float]]:"""Query the graph store with a vector store 
query."""conditions=[]filter_params={}ifquery.filters:forindex,filterinenumerate(query.filters.filters):conditions.append(f"{'NOT'iffilter.operator.valuein['nin']else''} e.`{filter.key}` "f"{convert_operator(filter.operator.value)} $param_{index}")filter_params[f"param_{index}"]=filter.valuefilters=(f" {query.filters.condition.value} ".join(conditions)ifconditionselse"1 = 1")ifnotquery.filtersandself._supports_vector_index:data=self.structured_query(f"""CALL db.index.vector.queryNodes('{VECTOR_INDEX_NAME}', $limit, $embedding) YIELD node, score RETURN node.id AS name, [l in labels(node) WHERE NOT l IN ['{BASE_ENTITY_LABEL}', '{BASE_NODE_LABEL}'] | l][0] AS type, node{{.* , embedding: Null, name: Null, id: Null}} AS properties, score """,param_map={"embedding":query.query_embedding,"limit":query.similarity_top_k,},)else:data=self.structured_query(f"""MATCH (e:`{BASE_ENTITY_LABEL}`) WHERE e.embedding IS NOT NULL AND size(e.embedding) = $dimension AND ({filters}) WITH e, vector.similarity.cosine(e.embedding, $embedding) AS score ORDER BY score DESC LIMIT toInteger($limit) RETURN e.id AS name, [l in labels(e) WHERE NOT l IN ['{BASE_ENTITY_LABEL}', '{BASE_NODE_LABEL}'] | l][0] AS type, e{{.* , embedding: Null, name: Null, id: Null}} AS properties, score""",param_map={"embedding":query.query_embedding,"dimension":len(query.query_embedding),"limit":query.similarity_top_k,**filter_params,},)data=dataifdataelse[]nodes=[]scores=[]forrecordindata:node=EntityNode(name=record["name"],label=record["type"],properties=remove_empty_values(record["properties"]),)nodes.append(node)scores.append(record["score"])return(nodes,scores)defdelete(self,entity_names:Optional[List[str]]=None,relation_names:Optional[List[str]]=None,properties:Optional[dict]=None,ids:Optional[List[str]]=None,)->None:"""Delete matching data."""ifentity_names:self.structured_query("MATCH (n) WHERE n.name IN $entity_names DETACH DELETE n",param_map={"entity_names":entity_names},)ifids:self.structured_query("MATCH 
(n) WHERE n.id IN $ids DETACH DELETE n",param_map={"ids":ids},)ifrelation_names:forrelinrelation_names:self.structured_query(f"MATCH ()-[r:`{rel}`]->() DELETE r")ifproperties:cypher="MATCH (e) WHERE "prop_list=[]params={}fori,propinenumerate(properties):prop_list.append(f"e.`{prop}` = $property_{i}")params[f"property_{i}"]=properties[prop]cypher+=" AND ".join(prop_list)self.structured_query(cypher+" DETACH DELETE e",param_map=params)def_enhanced_schema_cypher(self,label_or_type:str,properties:List[Dict[str,Any]],exhaustive:bool,is_relationship:bool=False,)->str:ifis_relationship:match_clause=f"MATCH ()-[n:`{label_or_type}`]->()"else:match_clause=f"MATCH (n:`{label_or_type}`)"with_clauses=[]return_clauses=[]output_dict={}ifexhaustive:forpropinproperties:prop_name=prop["property"]prop_type=prop["type"]ifprop_type=="STRING":with_clauses.append(f"collect(distinct substring(toString(coalesce(n.`{prop_name}`, '')), 0, {LONG_TEXT_THRESHOLD})) "f"AS `{prop_name}_values`")return_clauses.append(f"values:`{prop_name}_values`[..{DISTINCT_VALUE_LIMIT}],"f" distinct_count: size(`{prop_name}_values`)")elifprop_typein["INTEGER","FLOAT","DATE","DATE_TIME","LOCAL_DATE_TIME",]:with_clauses.append(f"min(n.`{prop_name}`) AS `{prop_name}_min`")with_clauses.append(f"max(n.`{prop_name}`) AS `{prop_name}_max`")with_clauses.append(f"count(distinct n.`{prop_name}`) AS `{prop_name}_distinct`")return_clauses.append(f"min: toString(`{prop_name}_min`), "f"max: toString(`{prop_name}_max`), "f"distinct_count: `{prop_name}_distinct`")elifprop_type=="LIST":with_clauses.append(f"min(size(coalesce(n.`{prop_name}`, []))) AS `{prop_name}_size_min`, "f"max(size(coalesce(n.`{prop_name}`, []))) AS `{prop_name}_size_max`, "# Get first 3 sub-elements of the first element as sample valuesf"collect(n.`{prop_name}`)[0][..3] AS `{prop_name}_values`")return_clauses.append(f"min_size: `{prop_name}_size_min`, "f"max_size: `{prop_name}_size_max`, 
# NOTE(review): documentation-extraction artifact -- all indentation/spacing was
# stripped, so several method definitions share each physical line below.
# Code is kept byte-identical; only comment lines are inserted between lines.
# Tail of _enhanced_schema_cypher (its `def` line lies before this chunk):
# builds per-property WITH/RETURN clause fragments for STRING, numeric,
# and LIST property types, sampling 5 nodes when not exhaustive.
"f"values:`{prop_name}_values`")elifprop_typein["BOOLEAN","POINT","DURATION"]:continueoutput_dict[prop_name]="{"+return_clauses.pop()+"}"else:# Just sample 5 random nodesmatch_clause+=" WITH n LIMIT 5"forpropinproperties:prop_name=prop["property"]prop_type=prop["type"]# Check if indexed property, we can still do exhaustiveprop_index=[elforelinself.structured_schema["metadata"]["index"]ifel["label"]==label_or_typeandel["properties"]==[prop_name]andel["type"]=="RANGE"]ifprop_type=="STRING":if(prop_indexandprop_index[0].get("size")>0andprop_index[0].get("distinctValues")<=DISTINCT_VALUE_LIMIT):distinct_values=self.query(f"CALL apoc.schema.properties.distinct("f"'{label_or_type}', '{prop_name}') YIELD value")[0]["value"]return_clauses.append(f"values: {distinct_values},"f" distinct_count: {len(distinct_values)}")else:with_clauses.append(f"collect(distinct substring(toString(n.`{prop_name}`), 0, {LONG_TEXT_THRESHOLD})) "f"AS `{prop_name}_values`")return_clauses.append(f"values: `{prop_name}_values`")elifprop_typein["INTEGER","FLOAT","DATE","DATE_TIME","LOCAL_DATE_TIME",]:ifnotprop_index:with_clauses.append(f"collect(distinct toString(coalesce(n.`{prop_name}`, ''))) "f"AS `{prop_name}_values`")return_clauses.append(f"values: `{prop_name}_values`")else:with_clauses.append(f"min(n.`{prop_name}`) AS `{prop_name}_min`")with_clauses.append(f"max(n.`{prop_name}`) AS `{prop_name}_max`")with_clauses.append(f"count(distinct n.`{prop_name}`) AS `{prop_name}_distinct`")return_clauses.append(f"min: toString(`{prop_name}_min`), "f"max: toString(`{prop_name}_max`), "f"distinct_count: `{prop_name}_distinct`")elifprop_type=="LIST":with_clauses.append(f"min(size(coalesce(n.`{prop_name}`, []))) AS `{prop_name}_size_min`, "f"max(size(coalesce(n.`{prop_name}`, []))) AS `{prop_name}_size_max`, "# Get first 3 sub-elements of the first element as sample valuesf"collect(n.`{prop_name}`)[0][..3] AS `{prop_name}_values`")return_clauses.append(f"min_size: `{prop_name}_size_min`, "f"max_size: 
# End of _enhanced_schema_cypher (assembles match/with/return fragments into a
# single Cypher string); then get_schema (returns structured_schema, refreshing
# first when refresh=True); then the head of get_schema_str, which filters
# node_props / rel_props / relationships through include/exclude lists and,
# when self.enhanced_schema is set, formats per-property example values.
`{prop_name}_size_max`, "f"values:`{prop_name}_values`")elifprop_typein["BOOLEAN","POINT","DURATION"]:continueoutput_dict[prop_name]="{"+return_clauses.pop()+"}"with_clause="WITH "+",\n ".join(with_clauses)return_clause=("RETURN {"+", ".join(f"`{k}`: {v}"fork,vinoutput_dict.items())+"} AS output")# Combine all parts of the Cypher queryreturnf"{match_clause}\n{with_clause}\n{return_clause}"defget_schema(self,refresh:bool=False)->Any:ifrefresh:self.refresh_schema()returnself.structured_schemadefget_schema_str(self,refresh:bool=False,exclude_types:List[str]=[],include_types:List[str]=[],)->str:schema=self.get_schema(refresh=refresh)deffilter_func(x:str)->bool:returnxininclude_typesifinclude_typeselsexnotinexclude_typesfiltered_schema:Dict[str,Any]={"node_props":{k:vfork,vinschema.get("node_props",{}).items()iffilter_func(k)},"rel_props":{k:vfork,vinschema.get("rel_props",{}).items()iffilter_func(k)},"relationships":[rforrinschema.get("relationships",[])ifall(filter_func(r[t])fortin["start","end","type"])],}formatted_node_props=[]formatted_rel_props=[]ifself.enhanced_schema:# Enhanced formatting for nodesfornode_type,propertiesinfiltered_schema["node_props"].items():formatted_node_props.append(f"- **{node_type}**")forpropinproperties:example=""ifprop["type"]=="STRING"andprop.get("values"):ifprop.get("distinct_count",11)>DISTINCT_VALUE_LIMIT:example=(f'Example: "{clean_string_values(prop["values"][0])}"'ifprop["values"]else"")else:# If less than 10 possible values return allexample=(("Available options: "f"{[clean_string_values(el)forelinprop['values']]}")ifprop["values"]else"")elifprop["type"]=="TEXT":example=(f'Example: "{clean_string_values(prop["values"][0])}"'ifprop["values"]else"")elifprop["type"]in["INTEGER","FLOAT","DATE","DATE_TIME","LOCAL_DATE_TIME",]:ifprop.get("min")isnotNone:example=f"Min: {prop['min']}, Max: {prop['max']}"else:example=(f'Example: "{prop["values"][0]}"'ifprop.get("values")else"")elifprop["type"]=="LIST":# Skip embeddings# if not 
# get_schema_str continued: enhanced per-property formatting (examples,
# min/max ranges, "Available options") for node and relationship properties,
# plus the plain non-enhanced "Label {prop: TYPE}" fallback formatting.
prop.get("min_size") or prop["min_size"] > LIST_LIMIT:# continueexample=(f"Min Size: {prop.get('min_size','N/A')}, "f"Max Size: {prop.get('max_size','N/A')}, "+(f"Example: [{prop['values'][0]}]"ifprop.get("values")andlen(prop["values"])>0else""))formatted_node_props.append(f" - `{prop['property']}`: {prop['type']}{example}")# Enhanced formatting for relationshipsforrel_type,propertiesinfiltered_schema["rel_props"].items():formatted_rel_props.append(f"- **{rel_type}**")forpropinproperties:example=""ifprop["type"]=="STRING":ifprop.get("distinct_count",11)>DISTINCT_VALUE_LIMIT:example=(f'Example: "{clean_string_values(prop["values"][0])}"'ifprop.get("values")else"")else:# If less than 10 possible values return allexample=(("Available options: "f"{[clean_string_values(el)forelinprop['values']]}")ifprop.get("values")else"")elifprop["type"]in["INTEGER","FLOAT","DATE","DATE_TIME","LOCAL_DATE_TIME",]:ifprop.get("min"):# If we have min/maxexample=f"Min: {prop['min']}, Max: {prop['max']}"else:# return a single valueexample=(f'Example: "{prop["values"][0]}"'ifprop.get("values")else"")elifprop["type"]=="LIST":# Skip embeddingsifprop["min_size"]>LIST_LIMIT:continueexample=f"Min Size: {prop['min_size']}, Max Size: {prop['max_size']}"formatted_rel_props.append(f" - `{prop['property']}: {prop['type']}` {example}")else:# Format node propertiesforlabel,propsinfiltered_schema["node_props"].items():props_str=", ".join([f"{prop['property']}: {prop['type']}"forpropinprops])formatted_node_props.append(f"{label}{{{props_str}}}")# Format relationship properties using structured_schemafortype,propsinfiltered_schema["rel_props"].items():props_str=", ".join([f"{prop['property']}: {prop['type']}"forpropinprops])formatted_rel_props.append(f"{type}{{{props_str}}}")# Format relationshipsformatted_rels=[f"(:{el['start']})-[:{el['type']}]->(:{el['end']})"forelinfiltered_schema["relationships"]]return"\n".join(["Node properties:","\n".join(formatted_node_props),"Relationship 
# End of get_schema_str (joins the formatted sections into one string); then
# verify_version -- note it only SETS self._supports_vector_index by comparing
# the reported server version against (5, 23, 0); despite its docstring it
# does not raise -- then close (closes the driver and removes the attribute),
# __enter__ (returns self), and the head of __exit__.
properties:","\n".join(formatted_rel_props),"The relationships:","\n".join(formatted_rels),])defverify_version(self)->None:""" Check if the connected Neo4j database version supports vector indexing without specifying embedding dimension. Queries the Neo4j database to retrieve its version and compares it against a target version (5.23.0) that is known to support vector indexing. Raises a ValueError if the connected Neo4j version is not supported. """db_data=self.structured_query("CALL dbms.components()")version=db_data[0]["versions"][0]if"aura"inversion:version_tuple=(*map(int,version.split("-")[0].split(".")),0)else:version_tuple=tuple(map(int,version.split(".")))target_version=(5,23,0)ifversion_tuple>=target_version:self._supports_vector_index=Trueelse:self._supports_vector_index=Falsedefclose(self)->None:""" Explicitly close the Neo4j driver connection. Delegates connection management to the Neo4j driver. """ifhasattr(self,"_driver"):self._driver.close()# Remove the driver attribute to indicate closuredelattr(self,"_driver")def__enter__(self)->"Neo4jPropertyGraphStore":""" Enter the runtime context for the Neo4j graph connection. Enables use of the graph connection with the 'with' statement. This method allows for automatic resource management and ensures that the connection is properly handled. Returns: Neo4jPropertyGraphStore: The current graph connection instance """returnselfdef__exit__(self,exc_type:Optional[Type[BaseException]],exc_val:Optional[BaseException],exc_tb:Optional[TracebackType],)->None:""" Exit the runtime context for the Neo4j graph connection. This method is automatically called when exiting a 'with' statement. It ensures that the database connection is closed, regardless of whether an exception occurred during the context's execution. 
# End of __exit__ (delegates cleanup to self.close()) and __del__ (best-effort
# close during garbage collection, with all exceptions suppressed).
Args: exc_type: The type of exception that caused the context to exit (None if no exception occurred) exc_val: The exception instance that caused the context to exit (None if no exception occurred) exc_tb: The traceback for the exception (None if no exception occurred) Note: Any exception is re-raised after the connection is closed. """self.close()def__del__(self)->None:""" Destructor for the Neo4j graph connection. This method is called during garbage collection to ensure that database resources are released if not explicitly closed. Caution: - Do not rely on this method for deterministic resource cleanup - Always prefer explicit .close() or context manager Best practices: 1. Use context manager: with Neo4jGraph(...) as graph: ... 2. Explicitly close: graph = Neo4jGraph(...) try: ... finally: graph.close() """try:self.close()exceptException:# Suppress any exceptions during garbage collectionpass
Source code in llama-index-integrations/graph_stores/llama-index-graph-stores-neo4j/llama_index/graph_stores/neo4j/neo4j_property_graph.py
(source lines 1101–1110)
def close(self) -> None:
    """
    Shut down the underlying Neo4j driver and release its connections.

    Safe to call repeatedly: the first call closes the driver and removes
    the ``_driver`` attribute; later calls find no driver and do nothing.
    """
    if not hasattr(self, "_driver"):
        # Already closed (or never initialized) -- nothing to release.
        return
    self._driver.close()
    # Drop the attribute so repeated close() calls (and __del__) become no-ops.
    delattr(self, "_driver")
def refresh_schema(self) -> None:
    """Rebuild ``self.structured_schema`` by introspecting the database."""
    excluded_node_labels = [*EXCLUDED_LABELS, BASE_ENTITY_LABEL, BASE_NODE_LABEL]

    # Raw node / relationship property metadata.
    node_rows = self.structured_query(
        node_properties_query,
        param_map={"EXCLUDED_LABELS": excluded_node_labels},
    )
    node_properties = [row["output"] for row in (node_rows or [])]

    rel_prop_rows = self.structured_query(
        rel_properties_query, param_map={"EXCLUDED_LABELS": EXCLUDED_RELS}
    )
    rel_properties = [row["output"] for row in (rel_prop_rows or [])]

    rel_rows = self.structured_query(
        rel_query,
        param_map={"EXCLUDED_LABELS": excluded_node_labels},
    )
    relationships = [row["output"] for row in (rel_rows or [])]

    # Constraints and RANGE indexes (consulted later when deciding whether a
    # property can be summarized exhaustively). A read-only user may lack
    # access to schema information, in which case fall back to empty lists.
    try:
        constraint = self.structured_query("SHOW CONSTRAINTS")
        index = self.structured_query(
            "CALL apoc.schema.nodes() YIELD label, properties, type, size, "
            "valuesSelectivity WHERE type = 'RANGE' RETURN *, "
            "size * valuesSelectivity as distinctValues"
        )
    except neo4j.exceptions.ClientError:
        constraint = []
        index = []

    self.structured_schema = {
        "node_props": {row["labels"]: row["properties"] for row in node_properties},
        "rel_props": {row["type"]: row["properties"] for row in rel_properties},
        "relationships": relationships,
        "metadata": {"constraint": constraint, "index": index},
    }

    # Per-label / per-type element counts, used to choose between an
    # exhaustive scan and sampling when enhancing the schema.
    schema_counts = self.structured_query(
        "CALL apoc.meta.subGraph({}) YIELD nodes, relationships "
        "RETURN nodes, [rel in relationships | {name:apoc.any.property"
        "(rel, 'type'), count: apoc.any.property(rel, 'count')}]"
        " AS relationships"
    )

    # Enrich node property metadata with sampled values / ranges.
    for node_info in schema_counts[0].get("nodes", []):
        # Skip bloom labels.
        if node_info["name"] in EXCLUDED_LABELS:
            continue
        node_props = self.structured_schema["node_props"].get(node_info["name"])
        if not node_props:
            # The label has no properties to enhance.
            continue
        enhanced_cypher = self._enhanced_schema_cypher(
            node_info["name"],
            node_props,
            node_info["count"] < EXHAUSTIVE_SEARCH_LIMIT,
        )
        enhanced_info = self.structured_query(enhanced_cypher)[0]["output"]
        for prop in node_props:
            # Map to custom types: long strings become TEXT ...
            if prop["type"] == "STRING" and any(
                len(value) >= LONG_TEXT_THRESHOLD
                for value in enhanced_info[prop["property"]]["values"]
            ):
                enhanced_info[prop["property"]]["type"] = "TEXT"
            # ... and oversized lists are treated as embeddings.
            if (
                prop["type"] == "LIST"
                and enhanced_info[prop["property"]]["max_size"] > LIST_LIMIT
            ):
                enhanced_info[prop["property"]]["type"] = "EMBEDDING"
            if prop["property"] in enhanced_info:
                prop.update(enhanced_info[prop["property"]])

    # Enrich relationship property metadata.
    for rel_info in schema_counts[0].get("relationships", []):
        # Skip bloom relationship types.
        if rel_info["name"] in EXCLUDED_RELS:
            continue
        rel_props = self.structured_schema["rel_props"].get(rel_info["name"])
        if not rel_props:
            # The relationship type has no properties to enhance.
            continue
        enhanced_cypher = self._enhanced_schema_cypher(
            rel_info["name"],
            rel_props,
            rel_info["count"] < EXHAUSTIVE_SEARCH_LIMIT,
            is_relationship=True,
        )
        try:
            enhanced_info = self.structured_query(enhanced_cypher)[0]["output"]
            for prop in rel_props:
                if prop["property"] in enhanced_info:
                    prop.update(enhanced_info[prop["property"]])
        except neo4j.exceptions.ClientError:
            # Sometimes the types are not consistent in the db; best effort.
            pass
def get(
    self,
    properties: Optional[dict] = None,
    ids: Optional[List[str]] = None,
) -> List[LabelledNode]:
    """
    Fetch nodes, optionally filtered by exact property values and/or ids.

    Args:
        properties: Mapping of property name -> required value; all must match.
        ids: If given, restrict to nodes whose ``id`` is in this list.

    Returns:
        ChunkNode for records that carry a ``text`` property (or no label),
        EntityNode for everything else.
    """
    query_params: dict = {}
    cypher_statement = f"MATCH (e: {BASE_NODE_LABEL}) "
    cypher_statement += "WHERE e.id IS NOT NULL "
    if ids:
        cypher_statement += "AND e.id in $ids "
        query_params["ids"] = ids
    if properties:
        predicates = []
        for idx, key in enumerate(properties):
            predicates.append(f"e.`{key}` = $property_{idx}")
            query_params[f"property_{idx}"] = properties[key]
        cypher_statement += " AND " + " AND ".join(predicates)
    # Strip the embedding and internal id out of the returned property map.
    cypher_statement += """
    WITH e
    RETURN e.id AS name,
        [l in labels(e) WHERE l <> '__Entity__' | l][0] AS type,
        e{.* , embedding: Null, id: Null} AS properties
    """
    records = self.structured_query(cypher_statement, param_map=query_params) or []
    nodes: List[LabelledNode] = []
    for record in records:
        # A `text` property, or a missing label, marks an implicit chunk node.
        if "text" in record["properties"] or record["type"] is None:
            chunk_text = record["properties"].pop("text", "")
            nodes.append(
                ChunkNode(
                    id_=record["name"],
                    text=chunk_text,
                    properties=remove_empty_values(record["properties"]),
                )
            )
        else:
            nodes.append(
                EntityNode(
                    name=record["name"],
                    label=record["type"],
                    properties=remove_empty_values(record["properties"]),
                )
            )
    return nodes
def get_rel_map(
    self,
    graph_nodes: List[LabelledNode],
    depth: int = 2,
    limit: int = 30,
    ignore_rels: Optional[List[str]] = None,
) -> List[Triplet]:
    """
    Collect (source, relation, target) triplets reachable from the given
    nodes within ``depth`` hops.

    Args:
        graph_nodes: Seed nodes whose ids anchor the traversal.
        depth: Maximum number of hops to follow.
        limit: Maximum number of relationships to return.
        ignore_rels: Relationship types to drop from the result.

    Returns:
        A list of [EntityNode, Relation, EntityNode] triplets.
    """
    seed_ids = [node.id for node in graph_nodes]
    # Needs some optimization: one variable-length expansion per seed id,
    # excluding MENTIONS edges, capped twice by $limit.
    records = self.structured_query(
        f"""
        WITH $ids AS id_list
        UNWIND range(0, size(id_list) - 1) AS idx
        MATCH (e:`{BASE_ENTITY_LABEL}`)
        WHERE e.id = id_list[idx]
        MATCH p=(e)-[r*1..{depth}]-(other)
        WHERE ALL(rel in relationships(p) WHERE type(rel) <> 'MENTIONS')
        UNWIND relationships(p) AS rel
        WITH distinct rel, idx
        WITH startNode(rel) AS source,
            type(rel) AS type,
            rel{{.*}} AS rel_properties,
            endNode(rel) AS endNode,
            idx
        LIMIT toInteger($limit)
        RETURN source.id AS source_id,
            [l in labels(source) WHERE NOT l IN ['{BASE_ENTITY_LABEL}', '{BASE_NODE_LABEL}'] | l][0] AS source_type,
            source{{.* , embedding: Null, id: Null}} AS source_properties,
            type,
            rel_properties,
            endNode.id AS target_id,
            [l in labels(endNode) WHERE NOT l IN ['{BASE_ENTITY_LABEL}', '{BASE_NODE_LABEL}'] | l][0] AS target_type,
            endNode{{.* , embedding: Null, id: Null}} AS target_properties,
            idx
        ORDER BY idx
        LIMIT toInteger($limit)
        """,
        param_map={"ids": seed_ids, "limit": limit},
    ) or []
    skipped = ignore_rels or []
    triplets: List[Triplet] = []
    for row in records:
        if row["type"] in skipped:
            continue
        src = EntityNode(
            name=row["source_id"],
            label=row["source_type"],
            properties=remove_empty_values(row["source_properties"]),
        )
        dst = EntityNode(
            name=row["target_id"],
            label=row["target_type"],
            properties=remove_empty_values(row["target_properties"]),
        )
        edge = Relation(
            source_id=row["source_id"],
            target_id=row["target_id"],
            label=row["type"],
            properties=remove_empty_values(row["rel_properties"]),
        )
        triplets.append([src, edge, dst])
    return triplets
def vector_query(
    self, query: VectorStoreQuery, **kwargs: Any
) -> Tuple[List[LabelledNode], List[float]]:
    """
    Run a similarity search over stored node embeddings.

    Uses the native vector index when no metadata filters are supplied and
    the server supports it; otherwise falls back to an exhaustive cosine
    similarity scan with the filters translated into Cypher predicates.

    Returns:
        Tuple of (matched entity nodes, similarity scores).
    """
    predicates = []
    filter_params: dict = {}
    if query.filters:
        for i, flt in enumerate(query.filters.filters):
            # 'nin' is expressed as a negated membership test.
            negation = "NOT" if flt.operator.value in ["nin"] else ""
            predicates.append(
                f"{negation} e.`{flt.key}` "
                f"{convert_operator(flt.operator.value)} $param_{i}"
            )
            filter_params[f"param_{i}"] = flt.value
    if predicates:
        where_filters = f" {query.filters.condition.value} ".join(predicates)
    else:
        where_filters = "1 = 1"
    if not query.filters and self._supports_vector_index:
        # Fast path: native vector index lookup.
        data = self.structured_query(
            f"""CALL db.index.vector.queryNodes('{VECTOR_INDEX_NAME}', $limit, $embedding)
            YIELD node, score
            RETURN node.id AS name,
              [l in labels(node) WHERE NOT l IN ['{BASE_ENTITY_LABEL}', '{BASE_NODE_LABEL}'] | l][0] AS type,
              node{{.* , embedding: Null, name: Null, id: Null}} AS properties,
              score
            """,
            param_map={
                "embedding": query.query_embedding,
                "limit": query.similarity_top_k,
            },
        )
    else:
        # Fallback: scan candidate nodes and rank by cosine similarity.
        data = self.structured_query(
            f"""MATCH (e:`{BASE_ENTITY_LABEL}`)
            WHERE e.embedding IS NOT NULL AND size(e.embedding) = $dimension AND ({where_filters})
            WITH e, vector.similarity.cosine(e.embedding, $embedding) AS score
            ORDER BY score DESC LIMIT toInteger($limit)
            RETURN e.id AS name,
              [l in labels(e) WHERE NOT l IN ['{BASE_ENTITY_LABEL}', '{BASE_NODE_LABEL}'] | l][0] AS type,
              e{{.* , embedding: Null, name: Null, id: Null}} AS properties,
              score""",
            param_map={
                "embedding": query.query_embedding,
                "dimension": len(query.query_embedding),
                "limit": query.similarity_top_k,
                **filter_params,
            },
        )
    nodes: List[LabelledNode] = []
    scores: List[float] = []
    for row in data or []:
        nodes.append(
            EntityNode(
                name=row["name"],
                label=row["type"],
                properties=remove_empty_values(row["properties"]),
            )
        )
        scores.append(row["score"])
    return (nodes, scores)
def verify_version(self) -> None:
    """
    Determine whether the connected Neo4j server supports creating a
    vector index without specifying the embedding dimension.

    Queries ``dbms.components()`` for the server version and compares it
    against 5.23.0, the first release known to support this. Aura versions
    are reported with a suffix (e.g. ``"5.23-aura"``) and no patch
    component, so the suffix is stripped and a ``0`` patch is appended
    before comparing.

    Sets ``self._supports_vector_index`` to True or False. Note: this
    method does NOT raise for unsupported versions; it only records the
    capability flag (the previous docstring incorrectly claimed a
    ValueError was raised).
    """
    db_data = self.structured_query("CALL dbms.components()")
    version = db_data[0]["versions"][0]
    if "aura" in version:
        # e.g. "5.23-aura" -> (5, 23, 0)
        version_tuple = (*map(int, version.split("-")[0].split(".")), 0)
    else:
        version_tuple = tuple(map(int, version.split(".")))
    self._supports_vector_index = version_tuple >= (5, 23, 0)