# Source code for langchain_experimental.graph_transformers.diffbot
from enum import Enum
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
import requests
from langchain.utils import get_from_env
from langchain_community.graphs.graph_document import GraphDocument, Node, Relationship
from langchain_core.documents import Document
def format_property_key(s: str) -> str:
    """Format a whitespace-separated string as a camelCase property key.

    The first word is lower-cased and every subsequent word is capitalized,
    e.g. ``"Job title"`` becomes ``"jobTitle"``. A string containing no
    words is returned unchanged.
    """
    words = s.split()
    if not words:
        return s
    head, *tail = words
    return head.lower() + "".join(word.capitalize() for word in tail)
[docs]class NodesList:
"""具有关联属性的节点列表。
属性:
节点(Dict[Tuple, Any]):将节点存储为键,将其属性存储为值。
每个键都是一个元组,其中第一个元素是节点ID,第二个是节点类型。"""
[docs] def add_node_property(
self, node: Tuple[Union[str, int], str], properties: Dict[str, Any]
) -> None:
"""添加或更新节点属性。
如果节点在列表中不存在,则将其与其属性一起添加。
如果节点已经存在,则使用新值更新其属性。
参数:
node(元组):包含节点ID和节点类型的元组。
properties(字典):要为节点添加或更新的属性字典。
"""
if node not in self.nodes:
self.nodes[node] = properties
else:
self.nodes[node].update(properties)
[docs] def return_node_list(self) -> List[Node]:
"""返回节点作为节点对象列表。
每个节点对象将具有其ID、类型和属性填充。
返回:
List[Node]:一个节点对象列表。
"""
nodes = [
Node(id=key[0], type=key[1], properties=self.nodes[key])
for key in self.nodes
]
return nodes
# Diffbot fact target types that are folded into the source node as
# properties (via format_property_key) rather than emitted as separate
# relationship edges. Compared against the capitalized target label.
FACT_TO_PROPERTY_TYPE = [
    "Date",
    "Number",
    "Job title",
    "Cause of death",
    "Organization type",
    "Academic title",
]
# (raw Diffbot relationship type, simplified relationship type) pairs;
# consumed by SimplifiedSchema to collapse the raw schema.
schema_mapping = [
    ("HEADQUARTERS", "ORGANIZATION_LOCATIONS"),
    ("RESIDENCE", "PERSON_LOCATION"),
    ("ALL_PERSON_LOCATIONS", "PERSON_LOCATION"),
    ("CHILD", "HAS_CHILD"),
    ("PARENT", "HAS_PARENT"),
    ("CUSTOMERS", "HAS_CUSTOMER"),
    ("SKILLED_AT", "INTERESTED_IN"),
]
class SimplifiedSchema:
    """Simplified schema mapping.

    Attributes:
        schema (Dict): Dictionary mapping raw schema types to their
            simplified counterparts.
    """

    def __init__(self) -> None:
        """Initialize the schema dictionary from the predefined mapping."""
        self.schema = dict(schema_mapping)

    def get_type(self, type: str) -> str:
        """Look up the simplified schema type for a raw type.

        Args:
            type (str): The original schema type to simplify.

        Returns:
            str: The simplified schema type when one is defined;
                otherwise the original type unchanged.
        """
        return self.schema.get(type, type)
class DiffbotGraphTransformer:
    """Transform documents into graph documents using the Diffbot NLP API.

    The graph-document transformation accepts a sequence of documents and
    returns a sequence of graph documents whose nodes and relationships were
    extracted by Diffbot's natural-language service.

    Example:
        .. code-block:: python

            from langchain_experimental.graph_transformers import DiffbotGraphTransformer
            from langchain_core.documents import Document

            diffbot_api_key = "DIFFBOT_API_KEY"
            diffbot_nlp = DiffbotGraphTransformer(diffbot_api_key=diffbot_api_key)

            document = Document(page_content="Mike Tunge is the CEO of Diffbot.")
            graph_documents = diffbot_nlp.convert_to_graph_documents([document])
    """

    def __init__(
        self,
        diffbot_api_key: Optional[str] = None,
        fact_confidence_threshold: float = 0.7,
        include_qualifiers: bool = True,
        include_evidence: bool = True,
        simplified_schema: bool = True,
        extract_types: Optional[List[TypeOption]] = None,
        *,
        include_confidence: bool = False,
    ) -> None:
        """Initialize the graph transformer with various options.

        Args:
            diffbot_api_key (str):
                The API key for Diffbot's NLP services. Falls back to the
                ``DIFFBOT_API_KEY`` environment variable when not given.
            fact_confidence_threshold (float):
                Minimum confidence level for facts to be included.
            include_qualifiers (bool):
                Whether to include qualifiers in the relationships.
            include_evidence (bool):
                Whether to include evidence passages in the relationships.
            include_confidence (bool):
                Whether to include confidence scores on nodes and relationships.
            simplified_schema (bool):
                Whether to use a simplified schema for relationship types.
            extract_types (Optional[List[TypeOption]]):
                Types of data to extract. Facts, entities, and sentiment are
                supported. When ``None`` (the default) only facts are
                extracted; a fact represents the combination of a source
                node, a target node, and a relationship type. (A ``None``
                sentinel replaces the previous mutable default argument.)

        Raises:
            ValueError: If ``extract_types`` is an empty list.
        """
        self.diffbot_api_key = diffbot_api_key or get_from_env(
            "diffbot_api_key", "DIFFBOT_API_KEY"
        )
        self.fact_threshold_confidence = fact_confidence_threshold
        self.include_qualifiers = include_qualifiers
        self.include_evidence = include_evidence
        self.include_confidence = include_confidence
        # Optional mapping from raw relationship types to the simplified schema.
        self.simplified_schema: Optional[SimplifiedSchema] = (
            SimplifiedSchema() if simplified_schema else None
        )
        if extract_types is None:
            extract_types = [TypeOption.FACTS]
        if not extract_types:
            raise ValueError(
                "`extract_types` cannot be an empty array. "
                "Allowed values are 'facts', 'entities', or both."
            )
        self.extract_types = extract_types

    def nlp_request(self, text: str) -> Dict[str, Any]:
        """Make an API request to the Diffbot NLP endpoint.

        Args:
            text (str): The text to process.

        Returns:
            Dict[str, Any]: The JSON response from the API.
        """
        # Relationship extraction only works for English.
        payload = {
            "content": text,
            "lang": "en",
        }
        # NOTE(review): assumes TypeOption is a str-valued Enum so that the
        # members join as e.g. "facts,entities" — confirm against its definition.
        fields = ",".join(self.extract_types)
        host = "nl.diffbot.com"
        url = (
            f"https://{host}/v1/?fields={fields}&"
            f"token={self.diffbot_api_key}&language=en"
        )
        result = requests.post(url, data=payload)
        return result.json()

    def _add_entities(
        self, entities: List[Dict[str, Any]], nodes_list: NodesList
    ) -> None:
        """Merge Diffbot entity records into ``nodes_list`` as deduplicated nodes.

        Args:
            entities: The ``entities`` array from the Diffbot response.
            nodes_list: Accumulator that deduplicates nodes by (id, type).
        """
        for record in entities:
            # Ignore entities that have no type at all.
            if not record["allTypes"]:
                continue
            # Prefer the canonical URI as the node id; fall back to the name.
            node_id = record["allUris"][0] if record["allUris"] else record["name"]
            node_label = record["allTypes"][0]["name"].capitalize()
            node_key = (node_id, node_label)
            nodes_list.add_node_property(node_key, {"name": record["name"]})
            if record.get("sentiment") is not None:
                nodes_list.add_node_property(
                    node_key, {"sentiment": record.get("sentiment")}
                )
            if self.include_confidence:
                nodes_list.add_node_property(
                    node_key, {"confidence": record.get("confidence")}
                )

    def _extract_relationships(
        self, facts: List[Dict[str, Any]], nodes_list: NodesList
    ) -> List[Relationship]:
        """Turn Diffbot fact records into relationships (or node properties).

        Facts whose target type appears in ``FACT_TO_PROPERTY_TYPE`` are
        stored as properties on the source node instead of becoming edges.

        Args:
            facts: The ``facts`` array from the Diffbot response.
            nodes_list: Accumulator that deduplicates nodes by (id, type).

        Returns:
            List[Relationship]: Relationships kept as graph edges
                (not deduplicated).
        """
        relationships: List[Relationship] = list()
        for record in facts:
            # Skip facts below the configured confidence threshold.
            if record["confidence"] < self.fact_threshold_confidence:
                continue
            # TODO: Facts with untyped values should probably be treated
            # as node properties.
            if not record["value"]["allTypes"]:
                continue
            # Define the source node.
            entity = record["entity"]
            source_id = entity["allUris"][0] if entity["allUris"] else entity["name"]
            source_label = entity["allTypes"][0]["name"].capitalize()
            nodes_list.add_node_property(
                (source_id, source_label), {"name": entity["name"]}
            )
            # Define the target node.
            value = record["value"]
            target_label = value["allTypes"][0]["name"].capitalize()
            target_name = value["name"]
            # Some facts are better suited as node properties.
            if target_label in FACT_TO_PROPERTY_TYPE:
                nodes_list.add_node_property(
                    (source_id, source_label),
                    {format_property_key(record["property"]["name"]): target_name},
                )
                continue
            # Otherwise define a relationship.
            target_id = value["allUris"][0] if value["allUris"] else value["name"]
            nodes_list.add_node_property(
                (target_id, target_label), {"name": target_name}
            )
            # Define the relationship type.
            rel_type = record["property"]["name"].replace(" ", "_").upper()
            if self.simplified_schema:
                rel_type = self.simplified_schema.get_type(rel_type)
            # Relationship qualifiers/properties.
            rel_properties: Dict[str, Any] = dict()
            if self.include_evidence:
                # Bug fix: the evidence passage was previously computed even
                # when include_evidence was False, wasting work and raising
                # IndexError on facts without any evidence.
                rel_properties["evidence"] = record["evidence"][0]["passage"]
            if self.include_confidence:
                rel_properties["confidence"] = record["confidence"]
            if self.include_qualifiers and record.get("qualifiers"):
                # Renamed loop variable to avoid shadowing builtin `property`.
                for qualifier in record["qualifiers"]:
                    prop_key = format_property_key(qualifier["property"]["name"])
                    rel_properties[prop_key] = qualifier["value"]["name"]
            relationships.append(
                Relationship(
                    source=Node(id=source_id, type=source_label),
                    target=Node(id=target_id, type=target_label),
                    type=rel_type,
                    properties=rel_properties,
                )
            )
        return relationships

    def process_response(
        self, payload: Dict[str, Any], document: Document
    ) -> GraphDocument:
        """Transform a Diffbot NLP response into a GraphDocument.

        Args:
            payload (Dict[str, Any]): The JSON response from Diffbot's NLP API.
            document (Document): The original document.

        Returns:
            GraphDocument: The transformed document as a graph.
        """
        # Return an empty result if there are neither facts nor entities.
        if not payload.get("facts") and not payload.get("entities"):
            return GraphDocument(nodes=[], relationships=[], source=document)
        # Nodes are a custom class because we need to deduplicate them.
        nodes_list = NodesList()
        if payload.get("entities"):
            self._add_entities(payload["entities"], nodes_list)
        # Relationships are a plain list; they are not deduplicated.
        relationships: List[Relationship] = list()
        if payload.get("facts"):
            relationships = self._extract_relationships(payload["facts"], nodes_list)
        return GraphDocument(
            nodes=nodes_list.return_node_list(),
            relationships=relationships,
            source=document,
        )

    def convert_to_graph_documents(
        self, documents: Sequence[Document]
    ) -> List[GraphDocument]:
        """Convert a sequence of documents into graph documents.

        Args:
            documents (Sequence[Document]): The original documents.

        Returns:
            List[GraphDocument]: The transformed documents as graphs.
        """
        results = []
        for document in documents:
            raw_results = self.nlp_request(document.page_content)
            graph_document = self.process_response(raw_results, document)
            results.append(graph_document)
        return results