# Source code for langchain_experimental.data_anonymizer.deanonymizer_mapping

import re
from collections import defaultdict
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Dict, List

if TYPE_CHECKING:
    from presidio_analyzer import RecognizerResult
    from presidio_anonymizer.entities import EngineResult

# Two-level string mapping used throughout this module:
# outer key is the entity type, inner dict maps one text value to another
# (which side is the original and which the anonymized value depends on
# the function that produced it).
MappingDataType = Dict[str, Dict[str, str]]


def format_duplicated_operator(operator_name: str, count: int) -> str:
    """Format an operator name with a numeric count suffix.

    All ``<`` and ``>`` characters are removed from ``operator_name`` and any
    existing trailing ``_<digits>`` suffix is dropped before ``_<count>`` is
    appended. If the original name both started with ``<`` and ended with
    ``>``, the result is wrapped in angle brackets again.

    Args:
        operator_name: Name to decorate, e.g. ``"<PERSON>"`` or ``"John"``.
        count: Number to append as the suffix.

    Returns:
        The suffixed name, e.g. ``"<PERSON_2>"`` or ``"John_2"``.
    """
    # Decide on re-wrapping from the untouched input, before brackets are
    # stripped below.
    is_bracketed = operator_name.startswith("<") and operator_name.endswith(">")

    base_name = re.sub(r"[<>]", "", operator_name)
    # Drop a previous counter so "<PERSON_2>" becomes "<PERSON_3>", not
    # "<PERSON_2_3>".
    base_name = re.sub(r"_\d+$", "", base_name)

    if is_bracketed:
        return f"<{base_name}_{count}>"
    return f"{base_name}_{count}"


@dataclass
class DeanonymizerMapping:
    """Deanonymizer mapping, grouped by entity type.

    The mapping is a two-level dict: entity type -> {key: value}.
    """

    # Defaults to nested defaultdicts so that an unseen entity type is
    # created on first access; a caller may also supply a plain dict.
    mapping: MappingDataType = field(
        default_factory=lambda: defaultdict(lambda: defaultdict(str))
    )

    @property
    def data(self) -> MappingDataType:
        """Return the deanonymizer mapping as plain nested ``dict`` objects."""
        return {k: dict(v) for k, v in self.mapping.items()}

    def update(self, new_mapping: MappingDataType) -> None:
        """Merge ``new_mapping`` into the deanonymizer mapping in place.

        A value is skipped when it was already added earlier in this call
        (under any entity type) or when it is already stored under the same
        entity type. If an incoming key is already used for that entity
        type, the entry is stored under a renamed key produced by
        ``format_duplicated_operator`` with a running counter that starts at
        the current number of entries for that entity type plus one; the
        counter is advanced until the renamed key is unused, so existing
        entries are never overwritten.

        Args:
            new_mapping: Entries to merge, keyed by entity type.
        """
        seen_values = set()

        for entity_type, values in new_mapping.items():
            try:
                # On the default defaultdict this creates the entry on demand.
                entity_mapping = self.mapping[entity_type]
            except KeyError:
                # ``mapping`` was supplied as a plain dict (or a defaultdict
                # without a factory): create the per-type dict explicitly.
                entity_mapping = self.mapping.setdefault(entity_type, {})

            count = len(entity_mapping) + 1

            for key, value in values.items():
                if value in seen_values or value in entity_mapping.values():
                    continue

                new_key = key
                if key in entity_mapping:
                    new_key = format_duplicated_operator(key, count)
                    # The renamed key may itself be taken already; keep
                    # counting instead of clobbering the stored value.
                    while new_key in entity_mapping:
                        count += 1
                        new_key = format_duplicated_operator(key, count)

                entity_mapping[new_key] = value
                seen_values.add(value)
                count += 1


def create_anonymizer_mapping(
    original_text: str,
    analyzer_results: List["RecognizerResult"],
    anonymizer_results: "EngineResult",
    is_reversed: bool = False,
) -> MappingDataType:
    """Build a per-entity-type mapping between original and anonymized values.

    Analyzer results and anonymizer items are each ordered by their ``start``
    attribute and paired positionally; pairing stops at the shorter of the
    two sequences. The inputs themselves are not reordered.

    For each pair, the original value is the slice of ``original_text``
    delimited by the analyzer result, and the entity type and anonymized
    text are taken from the anonymizer item.

    If ``is_reversed`` is False (default), each entry maps the original text
    value to its anonymized value. If ``is_reversed`` is True, each entry
    maps the anonymized value back to the original text value.

    An original value that is already present for its entity type is
    skipped. If an anonymized text is already present for the entity type
    (as a key or as a value), it is renamed with
    ``format_duplicated_operator`` using a per-entity-type counter that
    starts at 2, e.g. ``<PERSON>`` and then ``<PERSON_2>``.

    Example of the returned mapping with ``is_reversed=False``::

        {
            "PERSON": {
                "<original>": "<anonymized>",
                "John Doe": "Slim Shady"
            },
            "PHONE_NUMBER": {
                "111-111-1111": "555-555-5555"
            }
            ...
        }

    Args:
        original_text: Text that was analyzed and anonymized.
        analyzer_results: Analyzer results; ``start`` and ``end`` are read.
        anonymizer_results: Anonymizer output; each element of its ``items``
            provides ``start``, ``entity_type`` and ``text``.
        is_reversed: Direction of the mapping, see above.

    Returns:
        A ``defaultdict`` keyed by entity type whose values are dicts.
    """
    # Both sequences are expected to hold corresponding entities for each
    # identified piece of data, so ordering them by position lets us zip
    # them. Sorted copies are used so the caller's objects stay untouched.
    ordered_analyzed = sorted(analyzer_results, key=lambda d: d.start)
    ordered_anonymized = sorted(anonymizer_results.items, key=lambda d: d.start)

    mapping: MappingDataType = defaultdict(dict)
    count: dict = defaultdict(int)

    for analyzed, anonymized in zip(ordered_analyzed, ordered_anonymized):
        original_value = original_text[analyzed.start : analyzed.end]
        entity_type = anonymized.entity_type
        entity_mapping = mapping[entity_type]

        # Originals are stored as values when reversed, as keys otherwise.
        if is_reversed:
            already_mapped = original_value in entity_mapping.values()
        else:
            already_mapped = original_value in entity_mapping

        if already_mapped:
            continue

        if (
            anonymized.text in entity_mapping.values()
            or anonymized.text in entity_mapping
        ):
            # The first occurrence keeps the bare text, so the first
            # duplicate is numbered 2.
            anonymized_value = format_duplicated_operator(
                anonymized.text, count[entity_type] + 2
            )
            count[entity_type] += 1
        else:
            anonymized_value = anonymized.text

        if is_reversed:
            entity_mapping[anonymized_value] = original_value
        else:
            entity_mapping[original_value] = anonymized_value

    return mapping