Custom Embeddings¶
LlamaIndex supports embeddings from OpenAI, Azure, and LangChain. If those aren't enough, you can also implement any embedding model yourself!
The example below uses Instructor Embeddings (install/setup details here) and implements a custom embedding class. Instructor embeddings work by providing the text to embed together with an "instruction" describing its domain, which is very helpful when the text comes from a highly specific, specialized topic; a short sketch of this pairing follows the install cells below.
If you're opening this notebook on Colab, you will probably need to install LlamaIndex 🦙.
!pip install llama-index
# install dependencies
# !pip install InstructorEmbedding torch transformers sentence-transformers
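To see the instruction + text pairing in isolation before we wrap it in a class, here is a minimal sketch. It assumes the InstructorEmbedding package from the cell above is installed; the instruction string and sample text are just illustrative values.

# minimal sketch: INSTRUCTOR takes [instruction, text] pairs as input
from InstructorEmbedding import INSTRUCTOR

model = INSTRUCTOR("hkunlp/instructor-large")
pairs = [
    ["Represent a document for semantic search:", "LlamaIndex is a data framework."]
]
vectors = model.encode(pairs)
print(vectors[0].shape)  # e.g. (768,) for instructor-large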
import openai
import os
os.environ["OPENAI_API_KEY"] = "YOUR_API_KEY"
openai.api_key = os.environ["OPENAI_API_KEY"]
Custom Embeddings Implementation¶
from typing import Any, List

from InstructorEmbedding import INSTRUCTOR

from llama_index.core.bridge.pydantic import PrivateAttr
from llama_index.core.embeddings import BaseEmbedding


class InstructorEmbeddings(BaseEmbedding):
    _model: INSTRUCTOR = PrivateAttr()
    _instruction: str = PrivateAttr()

    def __init__(
        self,
        instructor_model_name: str = "hkunlp/instructor-large",
        instruction: str = "Represent a document for semantic search:",
        **kwargs: Any,
    ) -> None:
        # initialize the pydantic base class first, then set private attributes
        super().__init__(**kwargs)
        self._model = INSTRUCTOR(instructor_model_name)
        self._instruction = instruction

    @classmethod
    def class_name(cls) -> str:
        return "instructor"

    async def _aget_query_embedding(self, query: str) -> List[float]:
        return self._get_query_embedding(query)

    async def _aget_text_embedding(self, text: str) -> List[float]:
        return self._get_text_embedding(text)

    def _get_query_embedding(self, query: str) -> List[float]:
        # each input to INSTRUCTOR is an [instruction, text] pair
        embeddings = self._model.encode([[self._instruction, query]])
        return embeddings[0]

    def _get_text_embedding(self, text: str) -> List[float]:
        embeddings = self._model.encode([[self._instruction, text]])
        return embeddings[0]

    def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        # batch version: pair every text with the same instruction
        embeddings = self._model.encode(
            [[self._instruction, text] for text in texts]
        )
        return embeddings
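Before wiring the class into an index, it can be handy to sanity-check it directly. A minimal sketch (the model weights are downloaded on first instantiation; the sample sentence is arbitrary):

# instantiate the custom class and embed a single string
embed_model = InstructorEmbeddings(embed_batch_size=2)

embeddings = embed_model.get_text_embedding("It is raining cats and dogs here!")
print(len(embeddings))  # embedding dimensionality, e.g. 768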
Usage Example¶
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core import Settings
Download Data¶
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'
Load Documents¶
documents = SimpleDirectoryReader("./data/paul_graham/").load_data()
embed_model = InstructorEmbeddings(embed_batch_size=2)

Settings.embed_model = embed_model
Settings.chunk_size = 512

# if running for the first time, the model weights will be downloaded first!
index = VectorStoreIndex.from_documents(documents)
load INSTRUCTOR_Transformer max_seq_length 512
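As an aside, if you prefer not to change the global Settings object, the embedding model can also be scoped to a single index; from_documents accepts an embed_model argument (a sketch under that assumption, reusing the documents loaded above):

# alternative: scope the embedding model to this index only
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)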
response = index.as_query_engine().query("What did the author do growing up?")
print(response)
The author wrote short stories and also worked on programming, specifically on an IBM 1401 computer in 9th grade. They used an early version of Fortran and had to type programs on punch cards. Later on, they got a microcomputer, a TRS-80, and started programming more extensively, writing simple games and a word processor. They initially planned to study philosophy in college but eventually switched to AI.