Source code for langchain_experimental.data_anonymizer.deanonymizer_mapping

import re
from collections import defaultdict
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Dict, List

if TYPE_CHECKING:
    from presidio_analyzer import RecognizerResult
    from presidio_anonymizer.entities import EngineResult

# Two-level mapping: the outer key is an entity type and each inner dict pairs
# one string with another. Which side holds the original text and which the
# anonymized text depends on the code that builds the mapping.
MappingDataType = Dict[str, Dict[str, str]]


def format_duplicated_operator(operator_name: str, count: int) -> str:
    """Format the operator name with the count.

    Any ``<`` / ``>`` characters and an existing trailing ``_<digits>``
    suffix are removed from ``operator_name`` before ``_<count>`` is appended.
    When the incoming name was wrapped in angle brackets, the result is
    wrapped in them again.

    Args:
        operator_name: Name to number, e.g. ``"<PERSON>"`` or ``"PERSON_1"``.
        count: Number to append to the cleaned name.

    Returns:
        The numbered name, e.g. ``"<PERSON_2>"`` or ``"PERSON_2"``.
    """
    # Decide on the wrapping first, based on the untouched input.
    is_bracketed = operator_name.startswith("<") and operator_name.endswith(">")

    # Remove the brackets, then any previous numeric suffix.
    base_name = re.sub(r"_\d+$", "", re.sub(r"[<>]", "", operator_name))

    numbered = f"{base_name}_{count}"
    return f"<{numbered}>" if is_bracketed else numbered
@dataclass
class DeanonymizerMapping:
    """Deanonymizer mapping.

    Holds a two-level mapping ``{entity_type: {key: value}}`` and offers a
    merge operation that avoids storing the same value twice.
    """

    # By default both levels are defaultdicts, so indexing an unknown entity
    # type creates an empty inner mapping. A plain dict may also be supplied.
    mapping: MappingDataType = field(
        default_factory=lambda: defaultdict(lambda: defaultdict(str))
    )

    @property
    def data(self) -> MappingDataType:
        """Return the deanonymizer mapping as plain nested dictionaries."""
        return {k: dict(v) for k, v in self.mapping.items()}

    def update(self, new_mapping: MappingDataType) -> None:
        """Update the deanonymizer mapping with new values.

        Duplicated values will not be added: a value is skipped when it is
        already stored for the same entity type, or when it was already added
        earlier during this call (for any entity type).

        If there are multiple entities of the same type, the mapping will
        include a count to differentiate them. For example, if there are
        two names in the input text, the mapping will include NAME_1 and
        NAME_2.

        Args:
            new_mapping: Entries to merge, shaped ``{entity_type: {key: value}}``.
        """
        seen_values = set()

        for entity_type, values in new_mapping.items():
            try:
                known = self.mapping[entity_type]
            except KeyError:
                # ``mapping`` was supplied as a plain dict, so an unseen
                # entity type has to be registered explicitly.
                known = self.mapping[entity_type] = {}

            count = len(known) + 1

            for key, value in values.items():
                if value in seen_values or value in known.values():
                    continue

                new_key = key
                if key in known:
                    new_key = format_duplicated_operator(key, count)
                    # The numbered key may itself be taken already (numbered
                    # keys can be inserted verbatim); keep counting upwards
                    # instead of overwriting the existing entry.
                    while new_key in known:
                        count += 1
                        new_key = format_duplicated_operator(key, count)

                known[new_key] = value
                seen_values.add(value)
                count += 1
def create_anonymizer_mapping(
    original_text: str,
    analyzer_results: List["RecognizerResult"],
    anonymizer_results: "EngineResult",
    is_reversed: bool = False,
) -> MappingDataType:
    """Create a mapping used to anonymize and/or deanonymize a text.

    The mapping is built from the results returned by the analysis and
    anonymization steps.

    If ``is_reversed`` is False, each original text value is mapped to its
    anonymized value. If ``is_reversed`` is True, each anonymized value is
    mapped back to its original text value.

    If there are multiple entities of the same type that would get the same
    anonymized text, a count is added to differentiate them. For example, if
    there are two names in the input text, the mapping will include NAME and
    NAME_2.

    Example of the mapping (``is_reversed=False``):
    {
        "PERSON": {
            "<original>": "<anonymized>",
            "John Doe": "Slim Shady"
        },
        "PHONE_NUMBER": {
            "111-111-1111": "555-555-5555"
        }
        ...
    }

    Args:
        original_text: Text the analyzer results refer to.
        analyzer_results: Detected entities; ``start``/``end`` are read from
            each and used to slice ``original_text``.
        anonymizer_results: Anonymization result; ``start``, ``text`` and
            ``entity_type`` are read from each element of its ``items``.
        is_reversed: Direction of the mapping, see above.

    Returns:
        A ``defaultdict`` shaped ``{entity_type: {key: value}}``.
    """
    # Both sequences are expected to describe the same entities, so ordering
    # each by its 'start' position lets us pair them up positionally.
    # Sorted copies are used so the caller's list and result object are not
    # reordered as a side effect.
    # NOTE(review): zip() stops at the shorter sequence, so this assumes one
    # anonymized item per analyzer result -- confirm against the caller.
    ordered_analyzed = sorted(analyzer_results, key=lambda d: d.start)
    ordered_anonymized = sorted(anonymizer_results.items, key=lambda d: d.start)

    mapping: MappingDataType = defaultdict(dict)
    count: dict = defaultdict(int)

    for analyzed, anonymized in zip(ordered_analyzed, ordered_anonymized):
        original_value = original_text[analyzed.start : analyzed.end]
        entity_type = anonymized.entity_type
        entries = mapping[entity_type]

        # Skip originals already recorded; they sit on the value side when
        # reversed and on the key side otherwise.
        if is_reversed:
            already_mapped = original_value in entries.values()
        else:
            already_mapped = original_value in entries
        if already_mapped:
            continue

        if anonymized.text in entries.values() or anonymized.text in entries:
            # Same anonymized text for a different original: number it,
            # starting at 2 for the first duplicate of this entity type.
            anonymized_value = format_duplicated_operator(
                anonymized.text, count[entity_type] + 2
            )
            count[entity_type] += 1
        else:
            anonymized_value = anonymized.text

        if is_reversed:
            entries[anonymized_value] = original_value
        else:
            entries[original_value] = anonymized_value

    return mapping