Source code for langchain_experimental.data_anonymizer.deanonymizer_matching_strategies

import re
from typing import List

from langchain_experimental.data_anonymizer.deanonymizer_mapping import MappingDataType


def exact_matching_strategy(text: str, deanonymizer_mapping: MappingDataType) -> str:
    """Exact matching strategy for deanonymization.

    It replaces all the anonymized entities with the original ones.

    Args:
        text: text to deanonymize
        deanonymizer_mapping: mapping between anonymized entities and original ones
    """
    # Iterate over all the entities (PERSON, EMAIL_ADDRESS, etc.)
    for entity_type in deanonymizer_mapping:
        for anonymized, original in deanonymizer_mapping[entity_type].items():
            text = text.replace(anonymized, original)
    return text
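
A minimal usage sketch, assuming MappingDataType is a nested dict of entity type to {anonymized: original} pairs (as the loop above implies); the concrete names and values are invented for illustration:

from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
    exact_matching_strategy,
)

# Hypothetical mapping: entity type -> {anonymized value: original value}
mapping = {
    "PERSON": {"James Bond": "Keanu Reeves"},
    "EMAIL_ADDRESS": {"bond@example.com": "keanu@example.com"},
}

print(exact_matching_strategy("Write to James Bond at bond@example.com.", mapping))
# Write to Keanu Reeves at keanu@example.com.
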
def case_insensitive_matching_strategy(
    text: str, deanonymizer_mapping: MappingDataType
) -> str:
    """Case-insensitive matching strategy for deanonymization.

    It replaces all the anonymized entities with the original ones,
    regardless of their letter case.

    Args:
        text: text to deanonymize
        deanonymizer_mapping: mapping between anonymized entities and original ones

    Examples of matching:
        keanu reeves -> Keanu Reeves
        JOHN F. KENNEDY -> John F. Kennedy
    """
    # Iterate over all the entities (PERSON, EMAIL_ADDRESS, etc.)
    for entity_type in deanonymizer_mapping:
        for anonymized, original in deanonymizer_mapping[entity_type].items():
            # Escape the anonymized string so regex metacharacters in it
            # (e.g. "+" or "." in phone numbers and emails) are matched
            # literally, then replace case-insensitively.
            text = re.sub(re.escape(anonymized), original, text, flags=re.IGNORECASE)
    return text
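
A sketch of the case-insensitive behavior, with the same illustrative mapping shape as above:

from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
    case_insensitive_matching_strategy,
)

mapping = {"PERSON": {"James Bond": "Keanu Reeves"}}
print(case_insensitive_matching_strategy("Lunch with JAMES BOND at noon.", mapping))
# Lunch with Keanu Reeves at noon.
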
def fuzzy_matching_strategy(
    text: str, deanonymizer_mapping: MappingDataType, max_l_dist: int = 3
) -> str:
    """Fuzzy matching strategy for deanonymization.

    It uses fuzzy matching to find the position of the anonymized entity
    in the text, then replaces all the anonymized entities with the
    original ones.

    Args:
        text: text to deanonymize
        deanonymizer_mapping: mapping between anonymized entities and original ones
        max_l_dist: maximum Levenshtein distance between the anonymized entity
            and the text segment for it to count as a match

    Examples of matching:
        Kaenu Reves -> Keanu Reeves
        John F. Kennedy -> John Kennedy
    """
    try:
        from fuzzysearch import find_near_matches
    except ImportError as e:
        raise ImportError(
            "Could not import fuzzysearch, please install with "
            "`pip install fuzzysearch`."
        ) from e

    for entity_type in deanonymizer_mapping:
        for anonymized, original in deanonymizer_mapping[entity_type].items():
            matches = find_near_matches(anonymized, text, max_l_dist=max_l_dist)
            new_text = ""
            last_end = 0
            for m in matches:
                # add the text that isn't part of a match
                new_text += text[last_end : m.start]
                # add the replacement text
                new_text += original
                last_end = m.end
            # add the remaining text that wasn't part of a match
            new_text += text[last_end:]
            text = new_text
    return text
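
A sketch of the fuzzy strategy on a misspelled entity (requires `pip install fuzzysearch`; the values are illustrative, and the printed result assumes fuzzysearch locates the typo within the default max_l_dist of 3):

from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
    fuzzy_matching_strategy,
)

mapping = {"PERSON": {"James Bond": "Keanu Reeves"}}
# "Jmes Bond" is Levenshtein distance 1 from "James Bond", within max_l_dist=3
print(fuzzy_matching_strategy("I saw Jmes Bond yesterday.", mapping))
# expected: I saw Keanu Reeves yesterday.
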
def combined_exact_fuzzy_matching_strategy(
    text: str, deanonymizer_mapping: MappingDataType, max_l_dist: int = 3
) -> str:
    """Combined exact and fuzzy matching strategy for deanonymization.

    It is a recommended strategy.

    Args:
        text: text to deanonymize
        deanonymizer_mapping: mapping between anonymized entities and original ones
        max_l_dist: maximum Levenshtein distance between the anonymized entity
            and the text segment for it to count as a match

    Examples of matching:
        Kaenu Reves -> Keanu Reeves
        John F. Kennedy -> John Kennedy
    """
    text = exact_matching_strategy(text, deanonymizer_mapping)
    text = fuzzy_matching_strategy(text, deanonymizer_mapping, max_l_dist)
    return text
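
The combined strategy first applies exact replacement, then a fuzzy pass picks up misspelled leftovers. A sketch with illustrative values (requires `pip install fuzzysearch` for the fuzzy pass):

from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
    combined_exact_fuzzy_matching_strategy,
)

mapping = {"PERSON": {"James Bond": "Keanu Reeves"}}
text = "James Bond (also spelled Jmes Bond) left."
print(combined_exact_fuzzy_matching_strategy(text, mapping))
# expected: Keanu Reeves (also spelled Keanu Reeves) left.
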
def ngram_fuzzy_matching_strategy(
    text: str,
    deanonymizer_mapping: MappingDataType,
    fuzzy_threshold: int = 85,
    use_variable_length: bool = True,
) -> str:
    """N-gram fuzzy matching strategy for deanonymization.

    It replaces all the anonymized entities with the original ones.
    It generates n-grams of the same length as the anonymized entity
    from the text and uses fuzzy matching to find the position of the
    anonymized entity in the text.

    Args:
        text: text to deanonymize
        deanonymizer_mapping: mapping between anonymized entities and original ones
        fuzzy_threshold: fuzzy matching threshold
        use_variable_length: whether to use (n-1, n, n+1)-grams or just n-grams
    """

    def generate_ngrams(words_list: List[str], n: int) -> List[str]:
        """Generate n-grams from a list of words."""
        return [
            " ".join(words_list[i : i + n]) for i in range(len(words_list) - (n - 1))
        ]

    try:
        from fuzzywuzzy import fuzz
    except ImportError as e:
        raise ImportError(
            "Could not import fuzzywuzzy, please install with "
            "`pip install fuzzywuzzy`."
        ) from e

    text_words = text.split()
    replacements = []
    matched_indices: List[int] = []

    for entity_type in deanonymizer_mapping:
        for anonymized, original in deanonymizer_mapping[entity_type].items():
            anonymized_words = anonymized.split()

            if use_variable_length:
                gram_lengths = [
                    len(anonymized_words) - 1,
                    len(anonymized_words),
                    len(anonymized_words) + 1,
                ]
            else:
                gram_lengths = [len(anonymized_words)]

            for n in gram_lengths:
                if n > 0:  # Take only positive values
                    segments = generate_ngrams(text_words, n)
                    for i, segment in enumerate(segments):
                        if (
                            fuzz.ratio(anonymized.lower(), segment.lower())
                            > fuzzy_threshold
                            and i not in matched_indices
                        ):
                            replacements.append((i, n, original))
                            # Add the matched segment indices to the list
                            matched_indices.extend(range(i, i + n))

    # Sort replacements by index in reverse order
    replacements.sort(key=lambda x: x[0], reverse=True)

    # Apply replacements in reverse order to not affect subsequent indices
    for start, length, replacement in replacements:
        text_words[start : start + length] = replacement.split()

    return " ".join(text_words)
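
A sketch of the n-gram strategy (requires `pip install fuzzywuzzy`; the values are illustrative). With the default threshold of 85, the 2-gram "Jmes Bnd" should score high enough against "James Bond" to be replaced. Note that the result is re-joined with single spaces, so the original whitespace, and punctuation attached to matched words, may change:

from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
    ngram_fuzzy_matching_strategy,
)

mapping = {"PERSON": {"James Bond": "Keanu Reeves"}}
print(ngram_fuzzy_matching_strategy("Hello Jmes Bnd", mapping))
# expected: Hello Keanu Reeves
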