高级自定义原语指南#
[1]:
import re
import numpy as np
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import Datetime, NaturalLanguage
import featuretools as ft
from featuretools.primitives import TransformPrimitive
from featuretools.tests.testing_utils import make_ecommerce_entityset
2024-10-11 14:49:27,094 featuretools - WARNING While loading primitives via "premium_primitives" entry point, ignored primitive "DiversityScore" from "premium_primitives.diversity_score" because a primitive with that name already exists in "nlp_primitives.diversity_score"
2024-10-11 14:49:27,094 featuretools - WARNING While loading primitives via "premium_primitives" entry point, ignored primitive "LSA" from "premium_primitives.lsa" because a primitive with that name already exists in "nlp_primitives.lsa"
2024-10-11 14:49:27,094 featuretools - WARNING While loading primitives via "premium_primitives" entry point, ignored primitive "MeanCharactersPerSentence" from "premium_primitives.mean_characters_per_sentence" because a primitive with that name already exists in "nlp_primitives.mean_characters_per_sentence"
2024-10-11 14:49:27,094 featuretools - WARNING While loading primitives via "premium_primitives" entry point, ignored primitive "NumberOfSentences" from "premium_primitives.number_of_sentences" because a primitive with that name already exists in "nlp_primitives.number_of_sentences"
2024-10-11 14:49:27,095 featuretools - WARNING While loading primitives via "premium_primitives" entry point, ignored primitive "PartOfSpeechCount" from "premium_primitives.part_of_speech_count" because a primitive with that name already exists in "nlp_primitives.part_of_speech_count"
2024-10-11 14:49:27,095 featuretools - WARNING While loading primitives via "premium_primitives" entry point, ignored primitive "PolarityScore" from "premium_primitives.polarity_score" because a primitive with that name already exists in "nlp_primitives.polarity_score"
2024-10-11 14:49:27,095 featuretools - WARNING While loading primitives via "premium_primitives" entry point, ignored primitive "StopwordCount" from "premium_primitives.stopword_count" because a primitive with that name already exists in "nlp_primitives.stopword_count"
2024-10-11 14:49:27,112 featuretools - WARNING Featuretools failed to load plugin tsfresh from library featuretools_tsfresh_primitives.__init__. For a full stack trace, set logging to debug.
带有额外参数的基本元素#
有些特征需要比其他特征更复杂的计算。高级特征通常需要额外的参数来帮助输出所需的值。通过自定义基本元素,您可以使用基本元素参数来帮助您创建高级特征。
字符串计数示例#
在这个示例中,您将学习如何创建接受额外参数的自定义基本元素。您将创建一个基本元素来计算特定字符串值在文本中出现的次数。
首先,使用TransformPrimitive
作为基础派生一个新的转换基本元素类。该基本元素将接受文本列作为输入,并返回一个数值列作为输出,因此将输入类型设置为Woodwork的ColumnSchema
,逻辑类型为NaturalLanguage
,返回类型设置为Woodwork的ColumnSchema
,语义标签为'numeric'
。特定的字符串值是额外的参数,因此在__init__
中将其定义为一个关键字参数。然后,覆盖get_function
以返回一个将计算特征的基本函数。
Featuretools的基本元素使用Woodwork的ColumnSchema
来控制基本元素的列的输入和返回类型。有关在Featuretools中使用Woodwork类型系统的更多信息,请参阅Featuretools中的Woodwork类型指南。
[2]:
class StringCount(TransformPrimitive):
    """Count how many times a string value occurs in each text entry.

    Matching is case-insensitive: both the text and the search string are
    lowercased before counting.

    Args:
        string: The substring to count. It must be set (not ``None``) by the
            time the primitive function is called.
    """

    name = "string_count"
    input_types = [ColumnSchema(logical_type=NaturalLanguage)]
    return_type = ColumnSchema(semantic_tags={"numeric"})

    def __init__(self, string=None):
        self.string = string

    def get_function(self):
        """Return the function that computes the per-entry counts."""

        def string_count(column):
            assert self.string is not None, "string to count needs to be defined"
            # The text is lowercased below, so the search string has to be
            # lowercased as well; otherwise a value such as "The" could
            # never match anything.
            needle = self.string.lower()
            # Simple implementation kept for clarity.
            return [text.lower().count(needle) for text in column]

        return string_count
现在您有一个可重复使用于不同字符串值的原语。例如,您可以基于单词“the”在文本中出现的次数创建特征。创建一个原语的实例,其中字符串值为“the”,并将该原语传递给DFS以生成特征。特征名称将自动反映原语的字符串值。
[3]:
# Build the demo EntitySet and run DFS with a StringCount primitive that
# counts occurrences of "the".
es = make_ecommerce_entityset()
feature_matrix, features = ft.dfs(
    entityset=es,
    target_dataframe_name="sessions",
    agg_primitives=["sum", "mean", "std"],
    trans_primitives=[StringCount(string="the")],
)

# Aggregations of the string-count feature; the feature names carry the
# primitive's `string` argument.
string_count_columns = [
    "STD(log.STRING_COUNT(comments, string=the))",
    "SUM(log.STRING_COUNT(comments, string=the))",
    "MEAN(log.STRING_COUNT(comments, string=the))",
]
feature_matrix[string_count_columns]
/Users/code/fin_tool/github/featuretools/featuretools/computational_backends/feature_set_calculator.py:756: FutureWarning: The provided callable <function sum at 0x104ba07c0> is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "sum" instead.
).agg(to_agg)
/Users/code/fin_tool/github/featuretools/featuretools/computational_backends/feature_set_calculator.py:756: FutureWarning: The provided callable <function std at 0x104ba19e0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
).agg(to_agg)
/Users/code/fin_tool/github/featuretools/featuretools/computational_backends/feature_set_calculator.py:756: FutureWarning: The provided callable <function mean at 0x104ba18a0> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
).agg(to_agg)
/Users/code/fin_tool/github/featuretools/featuretools/computational_backends/feature_set_calculator.py:756: FutureWarning: The provided callable <function sum at 0x104ba07c0> is currently using SeriesGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "sum" instead.
).agg(to_agg)
/Users/code/fin_tool/github/featuretools/featuretools/computational_backends/feature_set_calculator.py:756: FutureWarning: The provided callable <function std at 0x104ba19e0> is currently using SeriesGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.
).agg(to_agg)
/Users/code/fin_tool/github/featuretools/featuretools/computational_backends/feature_set_calculator.py:756: FutureWarning: The provided callable <function mean at 0x104ba18a0> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.
).agg(to_agg)
[3]:
STD(log.STRING_COUNT(comments, string=the)) | SUM(log.STRING_COUNT(comments, string=the)) | MEAN(log.STRING_COUNT(comments, string=the)) | |
---|---|---|---|
id | |||
0 | 47.124304 | 209.0 | 41.80 |
1 | 36.509131 | 109.0 | 27.25 |
2 | NaN | 29.0 | 29.00 |
3 | 49.497475 | 70.0 | 35.00 |
4 | 0.000000 | 0.0 | 0.00 |
5 | 1.414214 | 4.0 | 2.00 |
具有多个输出的特征#
有些计算会输出多个值。通过自定义基元,您可以利用这些计算创建每个输出值的特征。
大小写计数示例#
在这个示例中,您将学习如何创建输出多个特征的自定义基元。您将创建一个基元,输出文本中大写字母和小写字母的计数。
首先,使用TransformPrimitive
作为基础派生一个新的转换基元类。该基元将以文本列作为输入,并返回两个数值列作为输出,因此将输入类型设置为Woodwork中逻辑类型为NaturalLanguage
的ColumnSchema
,将返回类型设置为Woodwork中语义标签为'numeric'
的ColumnSchema
。由于此基元返回两列,还需将number_output_features
设置为2。然后,重写get_function
以返回一个将计算特征并返回列列表的基元函数。
[4]:
class CaseCount(TransformPrimitive):
    """Return the number of uppercase and lowercase letters in the text.

    Letters are matched with the patterns ``[A-Z]`` and ``[a-z]``. The
    primitive produces two output features: uppercase count first, lowercase
    count second.
    """

    name = "case_count"
    input_types = [ColumnSchema(logical_type=NaturalLanguage)]
    return_type = ColumnSchema(semantic_tags={"numeric"})
    number_output_features = 2

    def get_function(self):
        """Return the function that computes both count columns."""

        def case_count(array):
            # Simple implementation kept for clarity.
            def matches_per_entry(pattern):
                # One count per entry of `array` for the given pattern.
                return np.array([len(re.findall(pattern, text)) for text in array])

            return matches_per_entry("[A-Z]"), matches_per_entry("[a-z]")

        return case_count
现在你有一个输出两列的原语。一列包含大写字母的计数,另一列包含小写字母的计数。将这个原语传递给DFS以生成特征。默认情况下,特征名称将反映输出的索引。
[5]:
# Run DFS with only the CaseCount transform primitive.
feature_matrix, features = ft.dfs(
    entityset=es,
    target_dataframe_name="sessions",
    agg_primitives=[],
    trans_primitives=[CaseCount],
)

# By default the output features are named by their index: [0] and [1].
case_count_columns = [
    "customers.CASE_COUNT(favorite_quote)[0]",
    "customers.CASE_COUNT(favorite_quote)[1]",
]
feature_matrix[case_count_columns]
[5]:
customers.CASE_COUNT(favorite_quote)[0] | customers.CASE_COUNT(favorite_quote)[1] | |
---|---|---|
id | ||
0 | 1.0 | 44.0 |
1 | 1.0 | 44.0 |
2 | 1.0 | 44.0 |
3 | 1.0 | 41.0 |
4 | 1.0 | 41.0 |
5 | 1.0 | 57.0 |
为多个输出定义自定义命名#
当您创建一个输出多个特征的基元时,您还可以为这些特征中的每一个定义自定义命名。
小时正弦和余弦示例#
在这个示例中,您将学习如何为多个输出应用自定义命名。您将创建一个基元,输出小时的正弦和余弦。
首先,使用TransformPrimitive
作为基类派生一个新的转换基元类。该基元将以时间索引作为输入,并返回两个数值列作为输出。将输入类型设置为Woodwork ColumnSchema
,逻辑类型为Datetime
,语义标签为'time_index'
。接下来,将返回类型设置为Woodwork ColumnSchema
,语义标签为'numeric'
,并将number_output_features
设置为2。然后,重写get_function
以返回一个将计算特征并返回列列表的基元函数。还要重写generate_names
以返回您定义的特征名称列表。
[6]:
class HourlySineAndCosine(TransformPrimitive):
    """Return the sine and cosine of the hour.

    The hour value is passed to ``np.sin`` / ``np.cos`` as-is, without any
    scaling. The primitive produces two output features, named with the
    ``[sine]`` and ``[cosine]`` suffixes.
    """

    name = "hourly_sine_and_cosine"
    input_types = [ColumnSchema(logical_type=Datetime, semantic_tags={"time_index"})]
    return_type = ColumnSchema(semantic_tags={"numeric"})
    number_output_features = 2

    def get_function(self):
        """Return the function that computes the sine and cosine columns."""

        def hourly_sine_and_cosine(column):
            # Read the hour component once and reuse it for both outputs.
            hours = column.dt.hour
            return np.sin(hours), np.cos(hours)

        return hourly_sine_and_cosine

    def generate_names(self, base_feature_names):
        """Return the custom names for the two output features."""
        prefix = self.generate_name(base_feature_names)
        return f"{prefix}[sine]", f"{prefix}[cosine]"
现在你有一个输出两列的原语。一列包含小时的正弦值,另一列包含小时的余弦值。将这个原语传递给DFS以生成特征。特征的名称将反映你定义的自定义命名。
[7]:
# Run DFS on the "log" dataframe with only the HourlySineAndCosine primitive.
feature_matrix, features = ft.dfs(
    entityset=es,
    target_dataframe_name="log",
    agg_primitives=[],
    trans_primitives=[HourlySineAndCosine],
)

# The feature names use the custom suffixes defined in generate_names.
sine_cosine_columns = [
    "HOURLY_SINE_AND_COSINE(datetime)[sine]",
    "HOURLY_SINE_AND_COSINE(datetime)[cosine]",
]
feature_matrix.head()[sine_cosine_columns]
[7]:
HOURLY_SINE_AND_COSINE(datetime)[sine] | HOURLY_SINE_AND_COSINE(datetime)[cosine] | |
---|---|---|
id | ||
0 | -0.544021 | -0.839072 |
1 | -0.544021 | -0.839072 |
2 | -0.544021 | -0.839072 |
3 | -0.544021 | -0.839072 |
4 | -0.544021 | -0.839072 |