要使用Pip安装依赖项,请运行以下命令:
pip install -r requirements.txt
其中,requirements.txt
是包含所有依赖项名称及其版本的文件。
In [ ]:
Copied!
# 使用pip安装llama-index llama-index-embeddings-huggingface llama-index-llms-openai llama-index-readers-file llama-index-vector-stores-kdbai
# 使用pip安装kdbai_client pandas
# 使用pip安装llama-index llama-index-embeddings-huggingface llama-index-llms-openai llama-index-readers-file llama-index-vector-stores-kdbai
# 使用pip安装kdbai_client pandas
导入依赖库¶
In [ ]:
Copied!
from getpass import getpass
import logging  # FIX: used by download_file's error path below, but was never imported (NameError)
import re
import os
import shutil
import time
import urllib.request  # FIX: `urlopen` lives in the submodule; bare `import urllib` does not expose it
import urllib.error

import pandas as pd

from llama_index.core import (
    Settings,
    SimpleDirectoryReader,
    ServiceContext,
    StorageContext,
    VectorStoreIndex,
)
from llama_index.core import Settings  # NOTE(review): redundant — Settings already imported above
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.kdbai import KDBAIVectorStore

import pykx as kx
import kdbai_client as kdbai

# Directory that downloaded PDFs are written to; RESET wipes and recreates it.
OUTDIR = "pdf"
RESET = True

# LLM = 'gpt-3.5-turbo'
LLM = "gpt-4-turbo-preview"  # expensive!!!
EMBEDDING = "sentence-transformers/all-mpnet-base-v2"

# Prompt interactively so the key never lands in the notebook source.
os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
from getpass import getpass
import logging  # FIX: used by download_file's error path below, but was never imported (NameError)
import re
import os
import shutil
import time
import urllib.request  # FIX: `urlopen` lives in the submodule; bare `import urllib` does not expose it
import urllib.error

import pandas as pd

from llama_index.core import (
    Settings,
    SimpleDirectoryReader,
    ServiceContext,
    StorageContext,
    VectorStoreIndex,
)
from llama_index.core import Settings  # NOTE(review): redundant — Settings already imported above
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.kdbai import KDBAIVectorStore

import pykx as kx
import kdbai_client as kdbai

# Directory that downloaded PDFs are written to; RESET wipes and recreates it.
OUTDIR = "pdf"
RESET = True

# LLM = 'gpt-3.5-turbo'
LLM = "gpt-4-turbo-preview"  # expensive!!!
EMBEDDING = "sentence-transformers/all-mpnet-base-v2"

# Prompt interactively so the key never lands in the notebook source.
os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
创建 KDB.AI 会话和表格¶
In [ ]:
Copied!
# Connect to a local KDB.AI server and (re)create the "reports" table.
KDBAI_ENDPOINT = "http://localhost:8082"
KDBAI_API_KEY = None
KDBAI_TABLE_NAME = "reports"

session = kdbai.Session(endpoint=KDBAI_ENDPOINT, api_key=KDBAI_API_KEY)

# Drop any leftover table of the same name so the demo starts from scratch.
if KDBAI_TABLE_NAME in session.list():
    session.table(KDBAI_TABLE_NAME).drop()

# dims=768 — presumably matches the sentence-transformers/all-mpnet-base-v2
# embedding size configured above; confirm if the embedding model changes.
schema = {
    "columns": [
        {"name": "document_id", "pytype": "bytes"},
        {"name": "text", "pytype": "bytes"},
        {
            "name": "embedding",
            "vectorIndex": {"type": "flat", "metric": "L2", "dims": 768},
        },
        {"name": "title", "pytype": "bytes"},
        {"name": "publication_date", "pytype": "datetime64[ns]"},
    ]
}

table = session.create_table(KDBAI_TABLE_NAME, schema)
# Connect to a local KDB.AI server and (re)create the "reports" table.
KDBAI_ENDPOINT = "http://localhost:8082"
KDBAI_API_KEY = None
KDBAI_TABLE_NAME = "reports"

session = kdbai.Session(endpoint=KDBAI_ENDPOINT, api_key=KDBAI_API_KEY)

# Drop any leftover table of the same name so the demo starts from scratch.
if KDBAI_TABLE_NAME in session.list():
    session.table(KDBAI_TABLE_NAME).drop()

# dims=768 — presumably matches the sentence-transformers/all-mpnet-base-v2
# embedding size configured above; confirm if the embedding model changes.
schema = {
    "columns": [
        {"name": "document_id", "pytype": "bytes"},
        {"name": "text", "pytype": "bytes"},
        {
            "name": "embedding",
            "vectorIndex": {"type": "flat", "metric": "L2", "dims": 768},
        },
        {"name": "title", "pytype": "bytes"},
        {"name": "publication_date", "pytype": "datetime64[ns]"},
    ]
}

table = session.create_table(KDBAI_TABLE_NAME, schema)
财务报告链接和元数据¶
In [ ]:
Copied!
# Source documents: two landmark US financial-regulation statutes.
INPUT_URLS = [
    "https://www.govinfo.gov/content/pkg/PLAW-106publ102/pdf/PLAW-106publ102.pdf",
    "https://www.govinfo.gov/content/pkg/PLAW-111publ203/pdf/PLAW-111publ203.pdf",
]

# Per-file metadata, keyed by the local path the download step produces.
METADATA = {
    "pdf/PLAW-106publ102.pdf": {
        "title": "GRAMM–LEACH–BLILEY ACT, 1999",
        "publication_date": pd.Timestamp("1999-11-12"),
    },
    "pdf/PLAW-111publ203.pdf": {
        "title": "DODD-FRANK WALL STREET REFORM AND CONSUMER PROTECTION ACT, 2010",
        "publication_date": pd.Timestamp("2010-07-21"),
    },
}
# Source documents: two landmark US financial-regulation statutes.
INPUT_URLS = [
    "https://www.govinfo.gov/content/pkg/PLAW-106publ102/pdf/PLAW-106publ102.pdf",
    "https://www.govinfo.gov/content/pkg/PLAW-111publ203/pdf/PLAW-111publ203.pdf",
]

# Per-file metadata, keyed by the local path the download step produces.
METADATA = {
    "pdf/PLAW-106publ102.pdf": {
        "title": "GRAMM–LEACH–BLILEY ACT, 1999",
        "publication_date": pd.Timestamp("1999-11-12"),
    },
    "pdf/PLAW-111publ203.pdf": {
        "title": "DODD-FRANK WALL STREET REFORM AND CONSUMER PROTECTION ACT, 2010",
        "publication_date": pd.Timestamp("2010-07-21"),
    },
}
将PDF文件下载到本地¶
In [ ]:
Copied!
%%time
CHUNK_SIZE = 512 * 1024
def download_file(url):
print("Downloading %s..." % url)
out = os.path.join(OUTDIR, os.path.basename(url))
try:
response = urllib.request.urlopen(url)
except urllib.error.URLError as e:
logging.exception("Failed to download %s !" % url)
else:
with open(out, "wb") as f:
while True:
chunk = response.read(CHUNK_SIZE)
if chunk:
f.write(chunk)
else:
break
return out
if RESET:
if os.path.exists(OUTDIR):
shutil.rmtree(OUTDIR)
os.mkdir(OUTDIR)
local_files = [download_file(x) for x in INPUT_URLS]
local_files[:10]
%%time
CHUNK_SIZE = 512 * 1024
def download_file(url):
print("Downloading %s..." % url)
out = os.path.join(OUTDIR, os.path.basename(url))
try:
response = urllib.request.urlopen(url)
except urllib.error.URLError as e:
logging.exception("Failed to download %s !" % url)
else:
with open(out, "wb") as f:
while True:
chunk = response.read(CHUNK_SIZE)
if chunk:
f.write(chunk)
else:
break
return out
if RESET:
if os.path.exists(OUTDIR):
shutil.rmtree(OUTDIR)
os.mkdir(OUTDIR)
local_files = [download_file(x) for x in INPUT_URLS]
local_files[:10]
Downloading https://www.govinfo.gov/content/pkg/PLAW-106publ102/pdf/PLAW-106publ102.pdf... Downloading https://www.govinfo.gov/content/pkg/PLAW-111publ203/pdf/PLAW-111publ203.pdf... CPU times: user 64.6 ms, sys: 4.44 ms, total: 69 ms Wall time: 4.98 s
使用LlamaIndex加载本地PDF文件¶
In [ ]:
Copied!
%%time
def get_metadata(filepath):
return METADATA[filepath]
documents = SimpleDirectoryReader(
input_files=local_files,
file_metadata=get_metadata,
)
docs = documents.load_data()
len(docs)
%%time
def get_metadata(filepath):
return METADATA[filepath]
documents = SimpleDirectoryReader(
input_files=local_files,
file_metadata=get_metadata,
)
docs = documents.load_data()
len(docs)
CPU times: user 11.1 s, sys: 56 ms, total: 11.1 s Wall time: 11.2 s
Out[ ]:
994
使用KDB.AI向量存储设置LlamaIndex RAG管道¶
在这个示例中,我们将演示如何设置使用KDB.AI向量存储的LlamaIndex RAG(检索增强生成,Retrieval-Augmented Generation)管道。LlamaIndex是一个用于将自定义数据接入大语言模型应用的数据框架,而KDB.AI向量存储则提供了高效的向量存储和检索功能。通过将它们结合起来,我们可以构建一个强大的检索增强生成管道。
In [ ]:
Copied!
%%time
embed_model = HuggingFaceEmbedding(model_name=EMBEDDING)
llm = OpenAI(temperature=0, model=LLM)
vector_store = KDBAIVectorStore(table)
Settings.embed_model = embed_model
Settings.llm = llm
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
docs,
storage_context=storage_context,
transformations=[SentenceSplitter(chunk_size=2048, chunk_overlap=0)],
)
%%time
embed_model = HuggingFaceEmbedding(model_name=EMBEDDING)
llm = OpenAI(temperature=0, model=LLM)
vector_store = KDBAIVectorStore(table)
Settings.embed_model = embed_model
Settings.llm = llm
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
docs,
storage_context=storage_context,
transformations=[SentenceSplitter(chunk_size=2048, chunk_overlap=0)],
)
CPU times: user 3min 32s, sys: 3.72 s, total: 3min 35s Wall time: 4min 41s
设置LlamaIndex查询引擎¶
In [ ]:
Copied!
%%时间# 使用gpt-3.5-turbo,16k标记的上下文大小只能容纳大约15页的文档。# 使用gpt-4-turbo-preview,128k标记的上下文大小可以达到100页。K = 100query_engine = index.as_query_engine( similarity_top_k=K, filter=[("<", "publication_date", "2008-09-15")], sort_by="publication_date",)
%%时间# 使用gpt-3.5-turbo,16k标记的上下文大小只能容纳大约15页的文档。# 使用gpt-4-turbo-preview,128k标记的上下文大小可以达到100页。K = 100query_engine = index.as_query_engine( similarity_top_k=K, filter=[("<", "publication_date", "2008-09-15")], sort_by="publication_date",)
CPU times: user 60.2 ms, sys: 766 µs, total: 61 ms Wall time: 79.1 ms
2008年危机之前¶
In [ ]:
Copied!
%%时间# 查询引擎查询result = query_engine.query( """在2008年金融危机之前,美国的主要金融监管是什么?""")print(result.response)
%%时间# 查询引擎查询result = query_engine.query( """在2008年金融危机之前,美国的主要金融监管是什么?""")print(result.response)
The main financial regulation in the US before the 2008 financial crisis was the Gramm-Leach-Bliley Act. CPU times: user 2.28 s, sys: 666 µs, total: 2.28 s Wall time: 56.9 s
In [ ]:
Copied!
%%时间result = query_engine.query( """1999年的格拉姆-利奇-布莱利法案足以防止2008年的危机吗?搜索文档并解释其调节美国股市的优势和劣势。""")print(result.response)
%%时间result = query_engine.query( """1999年的格拉姆-利奇-布莱利法案足以防止2008年的危机吗?搜索文档并解释其调节美国股市的优势和劣势。""")print(result.response)
The Gramm-Leach-Bliley Act of 1999, also known as the Financial Services Modernization Act, aimed to modernize financial services by removing barriers between banking, securities, and insurance companies, allowing them to offer each other's services. While the Act contributed to financial services integration and competition, its effectiveness in preventing crises like that of 2008 is debatable due to its strengths and weaknesses in regulating the US stock market. Strengths: 1. Enhanced Competition: By allowing financial institutions to merge and offer a broader range of services, the Act fostered competition, innovation, and efficiency in the financial sector. 2. Functional Regulation: The Act maintained that activities within financial institutions would be regulated by the appropriate functional regulator (e.g., securities activities by the SEC), aiming for expertise-based oversight. Weaknesses: 1. Increased Systemic Risk: The Act's facilitation of larger, more complex financial institutions may have contributed to systemic risk, as failures of these institutions could have more significant impacts on the financial system. 2. Regulatory Gaps and Oversight Challenges: The integration of different financial services under one roof made it challenging for regulators to oversee and manage the risks of these conglomerates effectively. The Act did not fully address the need for a systemic risk regulator or enhance oversight of the shadow banking system, which played a significant role in the 2008 crisis. 3. Weakened Consumer Privacy Protections: While the Act included provisions for protecting consumers' personal financial information, critics argue that it also allowed for increased sharing of this information among financial entities, potentially undermining consumer privacy. 
In summary, while the Gramm-Leach-Bliley Act of 1999 had the potential to foster innovation and efficiency in the financial sector by breaking down barriers between different types of financial services, its weaknesses in addressing systemic risk and regulatory oversight challenges may have limited its effectiveness in preventing financial crises like that of 2008. CPU times: user 177 ms, sys: 45.6 ms, total: 223 ms Wall time: 31.6 s
2008年危机之后¶
In [ ]:
Copied!
%%时间# 使用gpt-3.5-turbo,16k标记的上下文大小只能容纳大约15页的文档。# 使用gpt-4-turbo-preview,128k标记的上下文大小可以容纳100页。K = 100query_engine = index.as_query_engine( similarity_top_k=K, filter=[(">=", "publication_date", "2008-09-15")], sort_by="publication_date",)
%%时间# 使用gpt-3.5-turbo,16k标记的上下文大小只能容纳大约15页的文档。# 使用gpt-4-turbo-preview,128k标记的上下文大小可以容纳100页。K = 100query_engine = index.as_query_engine( similarity_top_k=K, filter=[(">=", "publication_date", "2008-09-15")], sort_by="publication_date",)
CPU times: user 217 µs, sys: 99 µs, total: 316 µs Wall time: 320 µs
In [ ]:
Copied!
%%时间result = query_engine.query( """2008年9月15日发生了什么?仅凭自己的知识回答。""")print(result.response)
%%时间result = query_engine.query( """2008年9月15日发生了什么?仅凭自己的知识回答。""")print(result.response)
I'm unable to provide an answer based on the given instructions. CPU times: user 151 ms, sys: 22 ms, total: 173 ms Wall time: 12.7 s
In [ ]:
Copied!
%%时间result = query_engine.query( """在2008年危机后颁布的新美国金融监管法规是什么,以增加市场监管并改善消费者情绪?""")print(result.response)
%%时间result = query_engine.query( """在2008年危机后颁布的新美国金融监管法规是什么,以增加市场监管并改善消费者情绪?""")print(result.response)
The Dodd-Frank Wall Street Reform and Consumer Protection Act, 2010. CPU times: user 184 ms, sys: 23.1 ms, total: 207 ms Wall time: 17.1 s
深入分析¶
In [ ]:
Copied!
%%时间# 使用gpt-3.5-turbo,16k标记的上下文大小只能容纳大约15页的文档。# 使用gpt-4-turbo-preview,128k标记的上下文大小可以容纳100页。K = 100query_engine = index.as_query_engine( similarity_top_k=K, sort_by="publication_date")
%%时间# 使用gpt-3.5-turbo,16k标记的上下文大小只能容纳大约15页的文档。# 使用gpt-4-turbo-preview,128k标记的上下文大小可以容纳100页。K = 100query_engine = index.as_query_engine( similarity_top_k=K, sort_by="publication_date")
CPU times: user 381 µs, sys: 2 µs, total: 383 µs Wall time: 399 µs
In [ ]:
Copied!
%%时间result = query_engine.query( """分析2008年危机前后的美国金融监管,并生成所有相关论点的报告,以解释发生了什么,并确保不会再次发生。使用提供的背景和您自己的知识,但请明确说明您使用了哪一个。""")print(result.response)
%%时间result = query_engine.query( """分析2008年危机前后的美国金融监管,并生成所有相关论点的报告,以解释发生了什么,并确保不会再次发生。使用提供的背景和您自己的知识,但请明确说明您使用了哪一个。""")print(result.response)
Before the 2008 financial crisis, the US financial system was characterized by deregulation and an increase in complex financial products such as mortgage-backed securities and derivatives. The Gramm-Leach-Bliley Act of 1999 repealed the Glass-Steagall Act, allowing banks to engage in investment activities, which led to increased risk-taking. The lack of transparency and understanding of these complex financial products, coupled with inadequate oversight, contributed to the financial crisis. After the 2008 crisis, the Dodd-Frank Wall Street Reform and Consumer Protection Act was enacted in 2010 to address the regulatory gaps and weaknesses revealed by the crisis. The Act aimed to increase transparency, protect consumers, and prevent the occurrence of a similar crisis. Key provisions included the creation of the Financial Stability Oversight Council to monitor systemic risk, the establishment of the Consumer Financial Protection Bureau to protect consumers from abusive financial practices, and the introduction of the Volcker Rule to limit speculative investments by banks. Additionally, the Act imposed stricter capital requirements and introduced mechanisms for the orderly liquidation of failing financial institutions to prevent bailouts. To ensure that a similar crisis does not happen again, it is crucial to maintain vigilant regulatory oversight, promote transparency in financial markets, and ensure that financial institutions have robust risk management practices in place. Continuous monitoring of systemic risks and the ability to adapt regulations in response to evolving financial products and practices are also essential. This analysis is based on the context provided and my own knowledge of the US financial regulations before and after the 2008 crisis. CPU times: user 1.11 s, sys: 1.99 s, total: 3.1 s Wall time: 29.8 s