"""Deanonymizer matching strategies for langchain_experimental.data_anonymizer."""
import re
from typing import List
from langchain_experimental.data_anonymizer.deanonymizer_mapping import MappingDataType
def exact_matching_strategy(text: str, deanonymizer_mapping: "MappingDataType") -> str:
    """Exact matching strategy for deanonymization.

    Replaces every anonymized entity found in ``text`` with its
    original counterpart, using plain (case-sensitive) substring
    replacement.

    Args:
        text: text to deanonymize.
        deanonymizer_mapping: mapping between anonymized entities and
            the original ones, grouped by entity type.

    Returns:
        The deanonymized text.
    """
    # Iterate over all the entity types (PERSON, EMAIL_ADDRESS, etc.)
    for entity_type in deanonymizer_mapping:
        for anonymized, original in deanonymizer_mapping[entity_type].items():
            text = text.replace(anonymized, original)
    return text
def case_insensitive_matching_strategy(
    text: str, deanonymizer_mapping: "MappingDataType"
) -> str:
    """Case-insensitive matching strategy for deanonymization.

    Replaces every anonymized entity with the original one, regardless
    of the letter case in which it appears in the text.

    Args:
        text: text to deanonymize.
        deanonymizer_mapping: mapping between anonymized entities and
            the original ones, grouped by entity type.

    Returns:
        The deanonymized text.

    Examples of matching:
        keanu reeves -> Keanu Reeves
        JOHN F. KENNEDY -> John F. Kennedy
    """
    # Iterate over all the entity types (PERSON, EMAIL_ADDRESS, etc.)
    for entity_type in deanonymizer_mapping:
        for anonymized, original in deanonymizer_mapping[entity_type].items():
            # re.escape makes the anonymized entity match literally:
            # entities such as e-mail addresses contain regex
            # metacharacters ("." etc.) that would otherwise match
            # arbitrary characters and cause false replacements.
            pattern = re.compile(re.escape(anonymized), flags=re.IGNORECASE)
            # Use a callable replacement so `original` is inserted
            # verbatim; passing it as a template string would treat
            # backslashes as group references/escapes and can raise.
            text = pattern.sub(lambda _match, repl=original: repl, text)
    return text
def fuzzy_matching_strategy(
    text: str, deanonymizer_mapping: "MappingDataType", max_l_dist: int = 3
) -> str:
    """Fuzzy matching strategy for deanonymization.

    Uses fuzzy (approximate) matching to locate anonymized entities in
    the text and replaces all of them with the original entities.

    Args:
        text: text to deanonymize.
        deanonymizer_mapping: mapping between anonymized entities and
            the original ones, grouped by entity type.
        max_l_dist: maximum Levenshtein distance between an anonymized
            entity and a text segment for it to be considered a match.

    Returns:
        The deanonymized text.

    Raises:
        ImportError: if the ``fuzzysearch`` package is not installed.

    Examples of matching:
        Kaenu Reves -> Keanu Reeves
        John F. Kennedy -> John Kennedy
    """
    try:
        from fuzzysearch import find_near_matches
    except ImportError as e:
        raise ImportError(
            "Could not import fuzzysearch, please install with "
            "`pip install fuzzysearch`."
        ) from e

    for entity_type in deanonymizer_mapping:
        for anonymized, original in deanonymizer_mapping[entity_type].items():
            matches = find_near_matches(anonymized, text, max_l_dist=max_l_dist)

            # Rebuild the text, splicing the original entity over each
            # matched span. Matches are returned in order of position.
            new_text = ""
            last_end = 0
            for m in matches:
                # Add the text that isn't part of a match.
                new_text += text[last_end : m.start]
                # Add the replacement text.
                new_text += original
                last_end = m.end

            # Add the remaining text that wasn't part of a match.
            new_text += text[last_end:]
            text = new_text

    return text
def combined_exact_fuzzy_matching_strategy(
    text: str, deanonymizer_mapping: "MappingDataType", max_l_dist: int = 3
) -> str:
    """Combined exact and fuzzy matching strategy for deanonymization.

    First applies the exact matching strategy, then the fuzzy one to
    catch entities that were slightly altered (typos, reformatting).
    This is the recommended strategy.

    Args:
        text: text to deanonymize.
        deanonymizer_mapping: mapping between anonymized entities and
            the original ones, grouped by entity type.
        max_l_dist: maximum Levenshtein distance between an anonymized
            entity and a text segment for it to be considered a match.

    Returns:
        The deanonymized text.

    Examples of matching:
        Kaenu Reves -> Keanu Reeves
        John F. Kennedy -> John Kennedy
    """
    # Exact pass first: cheap and never produces false positives.
    text = exact_matching_strategy(text, deanonymizer_mapping)
    # Fuzzy pass second: picks up what the exact pass missed.
    text = fuzzy_matching_strategy(text, deanonymizer_mapping, max_l_dist)
    return text
def ngram_fuzzy_matching_strategy(
    text: str,
    deanonymizer_mapping: "MappingDataType",
    fuzzy_threshold: int = 85,
    use_variable_length: bool = True,
) -> str:
    """N-gram fuzzy matching strategy for deanonymization.

    Replaces all anonymized entities with the original ones. It
    generates n-grams from the text with the same word length as the
    anonymized entity and uses fuzzy matching to locate the entity in
    the text.

    Args:
        text: text to deanonymize.
        deanonymizer_mapping: mapping between anonymized entities and
            the original ones, grouped by entity type.
        fuzzy_threshold: fuzzy matching threshold (0-100); a segment is
            a match when ``fuzz.ratio`` is strictly greater than this.
        use_variable_length: whether to try (n-1, n, n+1)-grams or only
            n-grams of exactly the entity's word length.

    Returns:
        The deanonymized text.

    Raises:
        ImportError: if the ``fuzzywuzzy`` package is not installed.
    """

    def generate_ngrams(words_list: List[str], n: int) -> list:
        """Generate n-grams (as space-joined strings) from a list of words."""
        return [
            " ".join(words_list[i : i + n]) for i in range(len(words_list) - (n - 1))
        ]

    try:
        from fuzzywuzzy import fuzz
    except ImportError as e:
        raise ImportError(
            "Could not import fuzzywuzzy, please install with "
            "`pip install fuzzywuzzy`."
        ) from e

    text_words = text.split()
    replacements = []
    matched_indices: List[int] = []

    for entity_type in deanonymizer_mapping:
        for anonymized, original in deanonymizer_mapping[entity_type].items():
            anonymized_words = anonymized.split()

            if use_variable_length:
                gram_lengths = [
                    len(anonymized_words) - 1,
                    len(anonymized_words),
                    len(anonymized_words) + 1,
                ]
            else:
                gram_lengths = [len(anonymized_words)]

            for n in gram_lengths:
                if n > 0:  # Take only positive values
                    segments = generate_ngrams(text_words, n)
                    for i, segment in enumerate(segments):
                        # NOTE(review): only the segment's *start* index is
                        # checked against matched_indices, so a new match
                        # could still overlap the tail of a previous one —
                        # confirm whether that is intended.
                        if (
                            fuzz.ratio(anonymized.lower(), segment.lower())
                            > fuzzy_threshold
                            and i not in matched_indices
                        ):
                            replacements.append((i, n, original))
                            # Add the matched segment indices to the list.
                            matched_indices.extend(range(i, i + n))

    # Sort replacements by index in reverse order.
    replacements.sort(key=lambda x: x[0], reverse=True)

    # Apply replacements in reverse order so earlier indices stay valid.
    for start, length, replacement in replacements:
        text_words[start : start + length] = replacement.split()

    return " ".join(text_words)