Source code for langchain_experimental.data_anonymizer.deanonymizer_mapping
import re
from collections import defaultdict
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Dict, List
if TYPE_CHECKING:
from presidio_analyzer import RecognizerResult
from presidio_anonymizer.entities import EngineResult
# Two-level lookup table: entity type -> {key string -> value string}.
MappingDataType = Dict[str, Dict[str, str]]
@dataclass
class DeanonymizerMapping:
    """Deanonymizer mapping.

    Holds a two-level table keyed by entity type; each entity type maps
    key strings to value strings.
    """

    # Defaults to nested defaultdicts, so indexing an unseen entity type on
    # the default mapping creates an empty bucket on demand.
    mapping: MappingDataType = field(
        default_factory=lambda: defaultdict(lambda: defaultdict(str))
    )

    @property
    def data(self) -> MappingDataType:
        """Return the deanonymizer mapping as plain nested dicts."""
        return {k: dict(v) for k, v in self.mapping.items()}

    def update(self, new_mapping: MappingDataType) -> None:
        """Update the deanonymizer mapping with new values.

        Duplicated values will not be added.

        If there are multiple entities of the same type, the mapping will
        include a count to differentiate them. For example, if there are
        two names in the input text, the mapping will include NAME_1 and
        NAME_2.

        Args:
            new_mapping: Entries to merge in, grouped by entity type.
        """
        # Shared across all entity types: a value added under one type during
        # this call is skipped if it shows up again under any other type.
        seen_values = set()

        for entity_type, values in new_mapping.items():
            # A defaultdict creates the missing bucket through its factory;
            # a caller-supplied plain dict raises KeyError instead, so the
            # bucket is created explicitly in that case.
            try:
                bucket = self.mapping[entity_type]
            except KeyError:
                bucket = self.mapping.setdefault(entity_type, {})

            count = len(bucket) + 1

            for key, value in values.items():
                if value in seen_values or value in bucket.values():
                    continue

                # A key that is already taken is rewritten with the running
                # counter so the existing entry is not overwritten.
                new_key = (
                    format_duplicated_operator(key, count)
                    if key in bucket
                    else key
                )

                bucket[new_key] = value
                seen_values.add(value)
                count += 1
def create_anonymizer_mapping(
    original_text: str,
    analyzer_results: List["RecognizerResult"],
    anonymizer_results: "EngineResult",
    is_reversed: bool = False,
) -> MappingDataType:
    """Create the mapping used to anonymize and/or deanonymize text.

    This function exploits the results returned by the analysis and
    anonymization processes.

    If ``is_reversed`` is False, it constructs a mapping from each original
    text value to its anonymized value.

    If ``is_reversed`` is True, it constructs a mapping from each anonymized
    value back to its original text value.

    If there are multiple entities of the same type, the mapping will
    include a count to differentiate them. For example, if there are
    two names in the input text, the mapping will include NAME_1 and NAME_2.

    Example of mapping (``is_reversed=False``):
    {
        "PERSON": {
            "<original>": "<anonymized>",
            "John Doe": "Slim Shady"
        },
        "PHONE_NUMBER": {
            "111-111-1111": "555-555-5555"
        }
        ...
    }

    Args:
        original_text: The text that was analyzed; original values are
            sliced out of it using each analyzer result's ``start``/``end``.
        analyzer_results: Analyzer hits exposing ``start`` and ``end``.
        anonymizer_results: Anonymizer output whose ``items`` expose
            ``start``, ``entity_type`` and ``text``.
        is_reversed: Selects the direction of the mapping (see above).

    Returns:
        A mapping grouped by entity type.
    """
    # We are able to zip and loop through both lists because we expect
    # them to return corresponding entities for each identified piece
    # of analyzable data from our input.

    # We sort them by their 'start' attribute because it allows us to
    # match corresponding entities by their position in the input text.
    # sorted() builds new lists, so the caller's objects are not reordered.
    ordered_analyzed = sorted(analyzer_results, key=lambda d: d.start)
    ordered_anonymized = sorted(anonymizer_results.items, key=lambda d: d.start)

    mapping: MappingDataType = defaultdict(dict)
    count: dict = defaultdict(int)

    for analyzed, anonymized in zip(ordered_analyzed, ordered_anonymized):
        original_value = original_text[analyzed.start : analyzed.end]
        entity_type = anonymized.entity_type
        entity_mapping = mapping[entity_type]

        # Original values live on the value side when reversed and on the
        # key side otherwise; an original already recorded is skipped.
        if is_reversed:
            already_mapped = original_value in entity_mapping.values()
        else:
            already_mapped = original_value in entity_mapping

        if already_mapped:
            continue

        # The same anonymized text was already used for a different original
        # of this entity type: disambiguate it with a running counter.
        if (
            anonymized.text in entity_mapping.values()
            or anonymized.text in entity_mapping
        ):
            anonymized_value = format_duplicated_operator(
                anonymized.text, count[entity_type] + 2
            )
            count[entity_type] += 1
        else:
            anonymized_value = anonymized.text

        mapping_key, mapping_value = (
            (anonymized_value, original_value)
            if is_reversed
            else (original_value, anonymized_value)
        )

        entity_mapping[mapping_key] = mapping_value

    return mapping