"""Diffbot graph transformer (langchain_experimental.graph_transformers.diffbot)."""

from enum import Enum
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

import requests
from langchain.utils import get_from_env
from langchain_community.graphs.graph_document import GraphDocument, Node, Relationship
from langchain_core.documents import Document


class TypeOption(str, Enum):
    """Kinds of data the Diffbot NLP API can be asked to extract."""

    FACTS = "facts"
    ENTITIES = "entities"
    SENTIMENT = "sentiment"
def format_property_key(s: str) -> str:
    """Format a string for use as a property key (lowerCamelCase).

    The first word is lowercased and every following word is capitalized;
    an all-whitespace or empty string is returned unchanged.
    """
    words = s.split()
    if not words:
        return s
    head, *tail = words
    return "".join([head.lower(), *(word.capitalize() for word in tail)])
[docs]class NodesList: """具有关联属性的节点列表。 属性: 节点(Dict[Tuple, Any]):将节点存储为键,将其属性存储为值。 每个键都是一个元组,其中第一个元素是节点ID,第二个是节点类型。"""
[docs] def __init__(self) -> None: self.nodes: Dict[Tuple[Union[str, int], str], Any] = dict()
[docs] def add_node_property( self, node: Tuple[Union[str, int], str], properties: Dict[str, Any] ) -> None: """添加或更新节点属性。 如果节点在列表中不存在,则将其与其属性一起添加。 如果节点已经存在,则使用新值更新其属性。 参数: node(元组):包含节点ID和节点类型的元组。 properties(字典):要为节点添加或更新的属性字典。 """ if node not in self.nodes: self.nodes[node] = properties else: self.nodes[node].update(properties)
[docs] def return_node_list(self) -> List[Node]: """返回节点作为节点对象列表。 每个节点对象将具有其ID、类型和属性填充。 返回: List[Node]:一个节点对象列表。 """ nodes = [ Node(id=key[0], type=key[1], properties=self.nodes[key]) for key in self.nodes ] return nodes
# Properties that should be treated as node properties instead of relationships
FACT_TO_PROPERTY_TYPE = [
    "Date",
    "Number",
    "Job title",
    "Cause of death",
    "Organization type",
    "Academic title",
]

# (raw Diffbot relationship type, simplified type) pairs consumed by
# SimplifiedSchema.
schema_mapping = [
    ("HEADQUARTERS", "ORGANIZATION_LOCATIONS"),
    ("RESIDENCE", "PERSON_LOCATION"),
    ("ALL_PERSON_LOCATIONS", "PERSON_LOCATION"),
    ("CHILD", "HAS_CHILD"),
    ("PARENT", "HAS_PARENT"),
    ("CUSTOMERS", "HAS_CUSTOMER"),
    ("SKILLED_AT", "INTERESTED_IN"),
]
class SimplifiedSchema:
    """Simplified schema mapping.

    Attributes:
        schema (Dict): Dictionary containing the mapping to simplified schema types.
    """

    def __init__(self) -> None:
        """Initialize the schema dictionary from the predefined list."""
        self.schema = dict(schema_mapping)

    def get_type(self, type: str) -> str:
        """Get the simplified schema type for a given original type.

        Args:
            type (str): The original schema type to look up.

        Returns:
            str: The simplified schema type if it exists;
                 otherwise the original type.
        """
        return self.schema.get(type, type)
class DiffbotGraphTransformer:
    """Transform documents into graph documents using the Diffbot NLP API.

    The graph document transformation system takes a sequence of documents
    and returns a sequence of graph documents.

    Example:
        .. code-block:: python

            from langchain_experimental.graph_transformers import DiffbotGraphTransformer
            from langchain_core.documents import Document

            diffbot_api_key = "DIFFBOT_API_KEY"
            diffbot_nlp = DiffbotGraphTransformer(diffbot_api_key=diffbot_api_key)

            document = Document(page_content="Mike Tunge is the CEO of Diffbot.")
            graph_documents = diffbot_nlp.convert_to_graph_documents([document])
    """

    def __init__(
        self,
        diffbot_api_key: Optional[str] = None,
        fact_confidence_threshold: float = 0.7,
        include_qualifiers: bool = True,
        include_evidence: bool = True,
        simplified_schema: bool = True,
        extract_types: Optional[List[TypeOption]] = None,
        *,
        include_confidence: bool = False,
    ) -> None:
        """Initialize the graph transformer with various options.

        Args:
            diffbot_api_key (str): The API key for Diffbot's NLP service.
                Falls back to the ``DIFFBOT_API_KEY`` environment variable.
            fact_confidence_threshold (float): Minimum confidence level for
                facts to be included.
            include_qualifiers (bool): Whether to include qualifiers in the
                relationships.
            include_evidence (bool): Whether to include evidence in the
                relationships.
            simplified_schema (bool): Whether to use a simplified schema for
                relationships.
            extract_types (List[TypeOption]): Types of data to extract.
                Facts, entities, and sentiment are supported. By default
                (``None``), facts are extracted. A fact represents a
                combination of source and target nodes with a relationship
                type.
            include_confidence (bool): Whether to include confidence scores on
                nodes and relationships.

        Raises:
            ValueError: If ``extract_types`` is an explicit empty list.
        """
        self.diffbot_api_key = diffbot_api_key or get_from_env(
            "diffbot_api_key", "DIFFBOT_API_KEY"
        )
        self.fact_threshold_confidence = fact_confidence_threshold
        self.include_qualifiers = include_qualifiers
        self.include_evidence = include_evidence
        self.include_confidence = include_confidence
        self.simplified_schema = SimplifiedSchema() if simplified_schema else None
        # A ``None`` default avoids the mutable-default-argument pitfall while
        # keeping the documented "facts only" default behavior.
        if extract_types is None:
            extract_types = [TypeOption.FACTS]
        if not extract_types:
            raise ValueError(
                "`extract_types` cannot be an empty array. "
                "Allowed values are 'facts', 'entities', or both."
            )
        self.extract_types = extract_types

    def nlp_request(self, text: str) -> Dict[str, Any]:
        """Make an API request to the Diffbot NLP endpoint.

        Args:
            text (str): The text to be processed.

        Returns:
            Dict[str, Any]: The JSON response from the API.
        """
        # Relationship extraction only works for English
        payload = {
            "content": text,
            "lang": "en",
        }
        # TypeOption mixes in str, so the enum members join as their values.
        fields = ",".join(self.extract_types)
        host = "nl.diffbot.com"
        url = (
            f"https://{host}/v1/?fields={fields}&"
            f"token={self.diffbot_api_key}&language=en"
        )
        result = requests.post(url, data=payload)
        return result.json()

    def _add_entity_nodes(
        self, payload: Dict[str, Any], nodes_list: NodesList
    ) -> None:
        """Add a node (with name/sentiment/confidence properties) per typed entity."""
        for record in payload.get("entities") or []:
            # Ignore entities that don't have a type
            if not record["allTypes"]:
                continue
            # Prefer a stable URI as the node id; fall back to the surface name.
            source_id = record["allUris"][0] if record["allUris"] else record["name"]
            source_label = record["allTypes"][0]["name"].capitalize()
            node_key = (source_id, source_label)
            nodes_list.add_node_property(node_key, {"name": record["name"]})
            if record.get("sentiment") is not None:
                nodes_list.add_node_property(
                    node_key, {"sentiment": record.get("sentiment")}
                )
            if self.include_confidence:
                nodes_list.add_node_property(
                    node_key, {"confidence": record.get("confidence")}
                )

    def _extract_relationships(
        self, payload: Dict[str, Any], nodes_list: NodesList
    ) -> List[Relationship]:
        """Build relationships from facts; some fact types become node properties.

        Relationships are a plain list because they are not deduplicated.
        """
        relationships: List[Relationship] = []
        for record in payload.get("facts") or []:
            # Skip the fact if it is below the confidence threshold
            if record["confidence"] < self.fact_threshold_confidence:
                continue
            # TODO: An untyped value should probably be treated as a node property
            if not record["value"]["allTypes"]:
                continue

            # Define source node
            entity = record["entity"]
            source_id = entity["allUris"][0] if entity["allUris"] else entity["name"]
            source_label = entity["allTypes"][0]["name"].capitalize()
            nodes_list.add_node_property(
                (source_id, source_label), {"name": entity["name"]}
            )

            # Define target node
            value = record["value"]
            target_id = value["allUris"][0] if value["allUris"] else value["name"]
            target_label = value["allTypes"][0]["name"].capitalize()
            target_name = value["name"]

            # Some facts are better suited as node properties
            if target_label in FACT_TO_PROPERTY_TYPE:
                nodes_list.add_node_property(
                    (source_id, source_label),
                    {format_property_key(record["property"]["name"]): target_name},
                )
                continue

            # Otherwise the fact becomes a relationship to a target node
            nodes_list.add_node_property(
                (target_id, target_label), {"name": target_name}
            )
            rel_type = record["property"]["name"].replace(" ", "_").upper()
            if self.simplified_schema:
                rel_type = self.simplified_schema.get_type(rel_type)

            # Relationship qualifiers/properties
            rel_properties: Dict[str, Any] = {}
            # Guard against facts with missing/empty evidence instead of
            # raising IndexError (and don't compute it when it isn't wanted).
            if self.include_evidence and record.get("evidence"):
                rel_properties["evidence"] = record["evidence"][0]["passage"]
            if self.include_confidence:
                rel_properties["confidence"] = record["confidence"]
            if self.include_qualifiers and record.get("qualifiers"):
                # ``qualifier`` avoids shadowing the builtin ``property``.
                for qualifier in record["qualifiers"]:
                    prop_key = format_property_key(qualifier["property"]["name"])
                    rel_properties[prop_key] = qualifier["value"]["name"]

            relationships.append(
                Relationship(
                    source=Node(id=source_id, type=source_label),
                    target=Node(id=target_id, type=target_label),
                    type=rel_type,
                    properties=rel_properties,
                )
            )
        return relationships

    def process_response(
        self, payload: Dict[str, Any], document: Document
    ) -> GraphDocument:
        """Transform a Diffbot NLP response into a GraphDocument.

        Args:
            payload (Dict[str, Any]): The JSON response from the Diffbot NLP API.
            document (Document): The original document.

        Returns:
            GraphDocument: The transformed document as a graph.
        """
        # Return an empty result if there are neither facts nor entities
        if not payload.get("facts") and not payload.get("entities"):
            return GraphDocument(nodes=[], relationships=[], source=document)

        # Nodes are a custom class because we need to deduplicate
        nodes_list = NodesList()
        self._add_entity_nodes(payload, nodes_list)
        relationships = self._extract_relationships(payload, nodes_list)

        return GraphDocument(
            nodes=nodes_list.return_node_list(),
            relationships=relationships,
            source=document,
        )

    def convert_to_graph_documents(
        self, documents: Sequence[Document]
    ) -> List[GraphDocument]:
        """Convert a sequence of documents into graph documents.

        Args:
            documents (Sequence[Document]): The original documents.

        Returns:
            List[GraphDocument]: The transformed documents as graphs.
        """
        return [
            self.process_response(self.nlp_request(document.page_content), document)
            for document in documents
        ]