Bases: BaseNodePostprocessor
NER PII节点处理器。
使用HF transformers模型。
Source code in llama_index/core/postprocessor/pii.py
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
class NERPIINodePostprocessor(BaseNodePostprocessor):
    """NER PII node postprocessor.

    Uses a HF transformers model to find entities in each node's text and
    replaces them with placeholder tags. The tag -> original word mapping is
    stored in the node metadata under ``pii_node_info_key``.
    """

    # Metadata key under which the tag -> original-word mapping is stored.
    pii_node_info_key: str = "__pii_node_info__"

    @classmethod
    def class_name(cls) -> str:
        """Return the name identifying this postprocessor class."""
        return "NERPIINodePostprocessor"

    def mask_pii(self, ner: Callable, text: str) -> Tuple[str, Dict]:
        """Mask PII in text.

        Args:
            ner: Callable that takes the text and returns an iterable of
                dict entries with ``entity_group``, ``word`` and ``start``
                keys, and optionally ``end``.
            text: The text to mask.

        Returns:
            A ``(masked_text, mapping)`` tuple where ``mapping`` maps each
            inserted ``[<entity_group>_<start>]`` tag to the entry's word.
            When ``ner`` reports no entities the text is returned unchanged.
        """
        entries = list(ner(text))
        if not entries:
            return text, {}

        tags = [f"[{e['entity_group']}_{e['start']}]" for e in entries]
        mapping = {tag: e["word"] for tag, e in zip(tags, entries)}

        text_len = len(text)
        have_spans = all(
            isinstance(e.get("start"), int)
            and isinstance(e.get("end"), int)
            and 0 <= e["start"] < e["end"] <= text_len
            for e in entries
        )

        if have_spans:
            # Replace exactly the reported character spans. A plain
            # str.replace on the word would also rewrite unrelated
            # occurrences (e.g. "Al" inside "Alice") and corrupt the text.
            pieces = []
            cursor = 0
            for idx in sorted(range(len(entries)), key=lambda i: entries[i]["start"]):
                # Clamp overlapping spans so already-masked text is not reused.
                start = max(entries[idx]["start"], cursor)
                end = entries[idx]["end"]
                if start >= end:
                    continue
                pieces.append(text[cursor:start])
                pieces.append(tags[idx])
                cursor = end
            pieces.append(text[cursor:])
            new_text = "".join(pieces)
        else:
            # No usable offsets: fall back to replacing by word.
            new_text = text
            for tag, entry in zip(tags, entries):
                word = entry["word"]
                if not word:
                    # replace("", tag) would insert the tag between every
                    # character of the text.
                    continue
                new_text = new_text.replace(word, tag).strip()

        return new_text.strip(), mapping

    def _postprocess_nodes(
        self,
        nodes: List[NodeWithScore],
        query_bundle: Optional[QueryBundle] = None,
    ) -> List[NodeWithScore]:
        """Postprocess nodes.

        Returns copies of the given nodes whose content has been masked with
        ``mask_pii``; the input nodes are not modified. ``query_bundle`` is
        not used.
        """
        if not nodes:
            # Avoid loading the NER pipeline when there is nothing to mask.
            return []

        from transformers import pipeline  # pants: no-infer-dep

        ner = pipeline("ner", grouped_entities=True)

        # swap out text from nodes, with the original node mappings
        new_nodes = []
        for node_with_score in nodes:
            node = node_with_score.node
            new_text, mapping_info = self.mask_pii(
                ner, node.get_content(metadata_mode=MetadataMode.LLM)
            )
            new_node = deepcopy(node)
            # Keep the mapping out of embedding and LLM metadata; skip keys
            # that are already excluded so repeated runs do not duplicate them.
            for excluded_keys in (
                new_node.excluded_embed_metadata_keys,
                new_node.excluded_llm_metadata_keys,
            ):
                if self.pii_node_info_key not in excluded_keys:
                    excluded_keys.append(self.pii_node_info_key)
            new_node.metadata[self.pii_node_info_key] = mapping_info
            new_node.set_content(new_text)
            new_nodes.append(NodeWithScore(node=new_node, score=node_with_score.score))

        return new_nodes
|
mask_pii
mask_pii(ner: Callable, text: str) -> Tuple[str, Dict]
在文本中对个人身份信息进行脱敏。
Source code in llama_index/core/postprocessor/pii.py
105
106
107
108
109
110
111
112
113
def mask_pii(self, ner: Callable, text: str) -> Tuple[str, Dict]:
    """Mask PII in text.

    Args:
        ner: Callable that takes the text and returns an iterable of dict
            entries with ``entity_group``, ``word`` and ``start`` keys, and
            optionally ``end``.
        text: The text to mask.

    Returns:
        A ``(masked_text, mapping)`` tuple where ``mapping`` maps each
        inserted ``[<entity_group>_<start>]`` tag to the entry's word.
        When ``ner`` reports no entities the text is returned unchanged.
    """
    entries = list(ner(text))
    if not entries:
        return text, {}

    tags = [f"[{e['entity_group']}_{e['start']}]" for e in entries]
    mapping = {tag: e["word"] for tag, e in zip(tags, entries)}

    text_len = len(text)
    have_spans = all(
        isinstance(e.get("start"), int)
        and isinstance(e.get("end"), int)
        and 0 <= e["start"] < e["end"] <= text_len
        for e in entries
    )

    if have_spans:
        # Replace exactly the reported character spans. A plain str.replace
        # on the word would also rewrite unrelated occurrences (e.g. "Al"
        # inside "Alice") and corrupt the text.
        pieces = []
        cursor = 0
        for idx in sorted(range(len(entries)), key=lambda i: entries[i]["start"]):
            # Clamp overlapping spans so already-masked text is not reused.
            start = max(entries[idx]["start"], cursor)
            end = entries[idx]["end"]
            if start >= end:
                continue
            pieces.append(text[cursor:start])
            pieces.append(tags[idx])
            cursor = end
        pieces.append(text[cursor:])
        new_text = "".join(pieces)
    else:
        # No usable offsets: fall back to replacing by word.
        new_text = text
        for tag, entry in zip(tags, entries):
            word = entry["word"]
            if not word:
                # replace("", tag) would insert the tag between every
                # character of the text.
                continue
            new_text = new_text.replace(word, tag).strip()

    return new_text.strip(), mapping
|