ObjectIndex
类¶
ObjectIndex
类允许对任意的 Python 对象进行索引。因此,它非常灵活,适用于各种用例。例如:使用 ObjectIndex 为 Tool(工具)对象建立索引,供智能体(agent)使用;或者使用 ObjectIndex 为 SQLTableSchema 对象建立索引。
要构建一个 ObjectIndex
,我们需要一个索引以及另一个抽象,即 ObjectNodeMapping
。正如其名称所示,该映射提供了在节点与其关联对象之间相互转换的方法。此外,还有一个 from_objects()
类方法,可以方便地从一组对象构建一个 ObjectIndex
。
在这个笔记本中,我们将快速介绍如何使用 SimpleObjectNodeMapping
构建一个 ObjectIndex
。
from llama_index.core import Settings
Settings.embed_model = "local"
from llama_index.core import VectorStoreIndex
from llama_index.core.objects import ObjectIndex, SimpleObjectNodeMapping

# Some really arbitrary objects. The literals are kept in English so that they
# match the retrieval and mapping outputs shown elsewhere in this document.
obj1 = {"input": "Hey, how's it going"}
obj2 = ["a", "b", "c", "d"]
obj3 = "llamaindex is an awesome library!"
arbitrary_objects = [obj1, obj2, obj3]

# (Optional) object-node mapping
obj_node_mapping = SimpleObjectNodeMapping.from_objects(arbitrary_objects)
nodes = obj_node_mapping.to_nodes(arbitrary_objects)

# Object index built from an explicit index plus the mapping
object_index = ObjectIndex(
    index=VectorStoreIndex(nodes=nodes),
    object_node_mapping=obj_node_mapping,
)

# Object index built directly from the objects (default index_cls=VectorStoreIndex)
object_index = ObjectIndex.from_objects(
    arbitrary_objects, index_cls=VectorStoreIndex
)
作为检索器¶
有了object_index
,我们可以将其用作检索器,针对索引对象进行检索。
object_retriever = object_index.as_retriever(similarity_top_k=1)
object_retriever.retrieve("llamaindex")
['llamaindex is an awesome library!']
我们还可以为对象索引检索器添加节点后处理器,以便轻松地进行重排序(rerank)等操作。
%pip install llama-index-postprocessor-colbert-rerank
from llama_index.postprocessor.colbert_rerank import ColbertRerank
retriever = object_index.as_retriever(
similarity_top_k=2, node_postprocessors=[ColbertRerank(top_n=1)]
)
retriever.retrieve("a random list object")
['llamaindex is an awesome library!']
%pip install llama-index-vector-stores-chroma
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb
# Open a Chroma client whose data is persisted under ./chroma_db.
db = chromadb.PersistentClient(path="./chroma_db")
# Fetch the "quickstart2" collection, creating it if it does not exist yet.
chroma_collection = db.get_or_create_collection("quickstart2")
# Wrap the collection as a vector store and route index storage through it.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# Build the object index on top of the Chroma-backed storage context.
object_index = ObjectIndex.from_objects(
arbitrary_objects,
index_cls=VectorStoreIndex,
storage_context=storage_context,
)
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) Cell In[31], line 5 2 from llama_index.vector_stores.chroma import ChromaVectorStore 3 import chromadb ----> 5 db = chromadb.PersistentClient(path="./chroma_db2") 6 chroma_collection = db.get_or_create_collection("quickstart2") 7 vector_store = ChromaVectorStore(chroma_collection=chroma_collection) File ~/giant_change/llama_index/venv/lib/python3.10/site-packages/chromadb/__init__.py:146, in PersistentClient(path, settings, tenant, database) 143 tenant = str(tenant) 144 database = str(database) --> 146 return ClientCreator(tenant=tenant, database=database, settings=settings) File ~/giant_change/llama_index/venv/lib/python3.10/site-packages/chromadb/api/client.py:139, in Client.__init__(self, tenant, database, settings) 133 def __init__( 134 self, 135 tenant: str = DEFAULT_TENANT, 136 database: str = DEFAULT_DATABASE, 137 settings: Settings = Settings(), 138 ) -> None: --> 139 super().__init__(settings=settings) 140 self.tenant = tenant 141 self.database = database File ~/giant_change/llama_index/venv/lib/python3.10/site-packages/chromadb/api/client.py:43, in SharedSystemClient.__init__(self, settings) 38 def __init__( 39 self, 40 settings: Settings = Settings(), 41 ) -> None: 42 self._identifier = SharedSystemClient._get_identifier_from_settings(settings) ---> 43 SharedSystemClient._create_system_if_not_exists(self._identifier, settings) File ~/giant_change/llama_index/venv/lib/python3.10/site-packages/chromadb/api/client.py:54, in SharedSystemClient._create_system_if_not_exists(cls, identifier, settings) 51 cls._identifer_to_system[identifier] = new_system 53 new_system.instance(ProductTelemetryClient) ---> 54 new_system.instance(ServerAPI) 56 new_system.start() 57 else: File ~/giant_change/llama_index/venv/lib/python3.10/site-packages/chromadb/config.py:382, in System.instance(self, type) 379 type = get_class(fqn, type) 381 if type not 
in self._instances: --> 382 impl = type(self) 383 self._instances[type] = impl 384 if self._running: File ~/giant_change/llama_index/venv/lib/python3.10/site-packages/chromadb/api/segment.py:102, in SegmentAPI.__init__(self, system) 100 super().__init__(system) 101 self._settings = system.settings --> 102 self._sysdb = self.require(SysDB) 103 self._manager = self.require(SegmentManager) 104 self._quota = self.require(QuotaEnforcer) File ~/giant_change/llama_index/venv/lib/python3.10/site-packages/chromadb/config.py:281, in Component.require(self, type) 278 def require(self, type: Type[T]) -> T: 279 """Get a Component instance of the given type, and register as a dependency of 280 that instance.""" --> 281 inst = self._system.instance(type) 282 self._dependencies.add(inst) 283 return inst File ~/giant_change/llama_index/venv/lib/python3.10/site-packages/chromadb/config.py:382, in System.instance(self, type) 379 type = get_class(fqn, type) 381 if type not in self._instances: --> 382 impl = type(self) 383 self._instances[type] = impl 384 if self._running: File ~/giant_change/llama_index/venv/lib/python3.10/site-packages/chromadb/db/impl/sqlite.py:88, in SqliteDB.__init__(self, system) 84 self._db_file = ( 85 self._settings.require("persist_directory") + "/chroma.sqlite3" 86 ) 87 if not os.path.exists(self._db_file): ---> 88 os.makedirs(os.path.dirname(self._db_file), exist_ok=True) 89 self._conn_pool = PerThreadPool(self._db_file) 90 self._tx_stack = local() File ~/miniforge3/lib/python3.10/os.py:225, in makedirs(name, mode, exist_ok) 223 return 224 try: --> 225 mkdir(name, mode) 226 except OSError: 227 # Cannot rely on checking for EEXIST, since the operating system 228 # could give priority to other errors like EACCES or EROFS 229 if not exist_ok or not path.isdir(name): FileNotFoundError: [Errno 2] No such file or directory: './chroma_db2'
object_retriever = object_index.as_retriever(similarity_top_k=1)
object_retriever.retrieve("llamaindex")
['llamaindex is an awesome library!']
现在,让我们“重新加载”索引。
# Re-open the same on-disk Chroma database that the index was written to.
db = chromadb.PersistentClient(path="./chroma_db")
# The collection name must match the one used when the index was built
# ("quickstart2"); any other name gets or creates a different collection,
# which would not contain the stored embeddings.
chroma_collection = db.get_or_create_collection("quickstart2")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

# The objects themselves are passed in again alongside the reloaded index.
object_index = ObjectIndex.from_objects_and_index(arbitrary_objects, index)

object_retriever = object_index.as_retriever(similarity_top_k=1)
object_retriever.retrieve("llamaindex")
['llamaindex is an awesome library!']
请注意,当我们重新加载索引时,仍然需要传递对象,因为这些对象并未保存在实际的索引/向量数据库中。
[高级] 自定义映射¶
对于希望完全控制对象如何映射到节点的特殊情况,您还可以提供 to_node_fn()
和 from_node_fn()
钩子函数。
当您要转换特殊对象或希望在运行时动态创建对象而不是将它们保存在内存中时,这将非常有用。
下面是一个小例子。
from llama_index.core.schema import TextNode

# Lookup table from node id to the original object.
# NOTE: hash() of a str is salted per interpreter process (unless PYTHONHASHSEED
# is fixed), so these ids are only meaningful within the current session.
my_objects = {str(hash(str(obj))): obj for obj in arbitrary_objects}


def from_node_fn(node):
    """Return the original object whose id matches the given node's id."""
    return my_objects[node.node_id]


def to_node_fn(obj):
    """Build a TextNode for ``obj``, keyed by the hash of its string form."""
    return TextNode(id_=str(hash(str(obj))), text=str(obj))


object_index = ObjectIndex.from_objects(
    arbitrary_objects,
    index_cls=VectorStoreIndex,
    from_node_fn=from_node_fn,
    to_node_fn=to_node_fn,
)
object_retriever = object_index.as_retriever(similarity_top_k=1)
object_retriever.retrieve("llamaindex")
['llamaindex is an awesome library!']
将 ObjectIndex
连同对象一起持久化到磁盘¶
在持久化ObjectIndex
时,我们需要处理索引以及对象节点映射。持久化索引很简单,可以通过通常的方式处理(例如,参见这个指南)。然而,当涉及持久化ObjectNodeMapping
时情况就有些不同了。由于我们正在使用ObjectIndex
对任意Python对象进行索引,可能会出现这样的情况(也许比我们想象的更常见),即这些任意对象是不可序列化的。在这种情况下,您可以持久化索引,但用户必须维护一种方法来重新构建ObjectNodeMapping
,以便能够重新构建ObjectIndex
。为了方便起见,ObjectIndex
上有persist
和from_persist_dir
方法,它们将尝试持久化和加载先前保存的ObjectIndex
。
正常工作的示例¶
# Persist to disk (no path provided, so the default ./storage is used)
object_index.persist()
# Reload (no path provided, so loading is attempted from the default ./storage)
reloaded_object_index = ObjectIndex.from_persist_dir()
reloaded_object_index._object_node_mapping.obj_node_mapping
{7981070310142320670: {'input': "Hey, how's it going"}, -5984737625581842527: ['a', 'b', 'c', 'd'], -8305186196625446821: 'llamaindex is an awesome library!'}
object_index._object_node_mapping.obj_node_mapping
{7981070310142320670: {'input': "Hey, how's it going"}, -5984737625581842527: ['a', 'b', 'c', 'd'], -8305186196625446821: 'llamaindex is an awesome library!'}
无法正常工作的示例¶
from llama_index.core.tools import FunctionTool
from llama_index.core import SummaryIndex
from llama_index.core.objects import SimpleToolNodeMapping


def add(a: int, b: int) -> int:
    """Add two integers and return the result integer."""
    return a + b


def multiply(a: int, b: int) -> int:
    """Multiply two integers and return the result integer."""
    return a * b


multiply_tool = FunctionTool.from_defaults(fn=multiply)
add_tool = FunctionTool.from_defaults(fn=add)

# Tool objects wrap plain Python functions; they are indexed through a
# tool-specific node mapping.
object_mapping = SimpleToolNodeMapping.from_objects([add_tool, multiply_tool])
object_index = ObjectIndex.from_objects(
    [add_tool, multiply_tool], object_mapping
)
# Trying to persist the object_mapping directly will raise an error
object_mapping.persist()
--------------------------------------------------------------------------- NotImplementedError Traceback (most recent call last) Cell In[4], line 2 1 # trying to persist the object_mapping directly will raise an error ----> 2 object_mapping.persist() File ~/Projects/llama_index/llama_index/objects/tool_node_mapping.py:47, in BaseToolNodeMapping.persist(self, persist_dir, obj_node_mapping_fname) 43 def persist( 44 self, persist_dir: str = ..., obj_node_mapping_fname: str = ... 45 ) -> None: 46 """Persist objs.""" ---> 47 raise NotImplementedError("Subclasses should implement this!") NotImplementedError: Subclasses should implement this!
# Persisting the object index here will emit a warning to the user
object_index.persist()
/var/folders/0g/wd11bmkd791fz7hvgy1kqyp00000gn/T/ipykernel_77363/46708458.py:2: UserWarning: Unable to persist ObjectNodeMapping. You will need to reconstruct the same object node mapping to build this ObjectIndex object_index.persist()
在这种情况下,只有索引被持久化了。 为了按照上面提到的重新构建ObjectIndex
,我们需要手动重新构建ObjectNodeMapping
,并将其提供给ObjectIndex.from_persist_dir
方法。
reloaded_object_index = ObjectIndex.from_persist_dir(
    object_node_mapping=object_mapping  # without this, an error will be thrown
)