时间序列¶

设置¶

In [ ]:

Copied!

# pip install ydf temporian -U
# pip install ydf temporian -U

In [8]:

Copied!





import ydf
import temporian as tp

import pandas as pd
import datetime
import os
import ydf
import temporian as tp

import pandas as pd
import datetime
import os

什么是时间序列？¶

时间序列或事件序列是带有时间戳的事件，具有非均匀采样的特征数据。这种类型的数据有很多例子：银行交易日志、销售日志、网络日志、患者测量、治疗日志等。决策森林非常适合处理事件序列，因为事件序列预处理的输出是表格形式，可以是稀疏和嘈杂的。

当处理时间数据时，YDF模型从特征预处理中获益良多。基本操作可以使用像Pandas这样的通用数据处理工具来完成。对于更复杂的用例，时间特征预处理工具非常有用。本教程演示了如何将YDF与Temporian数据预处理库集成。

在本笔记本中，我们的目标是确定来自可重现的信用卡欺诈检测机器学习数据集的交易是否存在欺诈行为。每笔交易都是一个事件，包含金额、客户ID和终端ID。我们的模型将考虑交易特征以及相关交易（例如，来自同一客户的交易和来自同一终端的交易）。为此，我们将预处理事件数据，以便YDF模型能够检测欺诈。

注意： 本教程是使用Temporian和TensorFlow Decision Forests检测支付卡欺诈教程的简化改编版。

注意2： Yggdrasil决策森林和Temporian共享部分开发团队。

下载数据集¶

交易数据以CSV格式提供，从2018年4月1日到2018年9月30日的每一天都有一个文件。我们使用2018年4月1日到2018年8月31日（包括8月31日）的交易数据进行训练，以及2018年9月1日至2018年9月30日的交易数据进行评估。

In [46]:

Copied!





start_date = datetime.date(2018, 4, 1)
end_date = datetime.date(2018, 9, 30)
train_test_split = datetime.datetime(2018, 9, 1)

# 列出输入的CSV文件
filenames = []
while start_date <= end_date:
    filenames.append(f"{start_date}.pkl")
    start_date += datetime.timedelta(days=1)
print(f"{len(filenames)} dates")

# 下载并加载文件
print("Loading files (this step can take a minute)")
def load_date(idx_and_filename):
    if (idx_and_filename[0]%10) == 0:
        print(f"[{idx_and_filename[0]}/{len(filenames)}]", end="", flush=True)
    return pd.read_pickle(f"https://github.com/Fraud-Detection-Handbook/simulated-data-raw/raw/main/data/{idx_and_filename[1]}")
transactions_pd = pd.concat(map(load_date, enumerate(filenames)))
print("Done!")

print(f"Found {len(transactions_pd)} transactions")
start_date = datetime.date(2018, 4, 1)
end_date = datetime.date(2018, 9, 30)
train_test_split = datetime.datetime(2018, 9, 1)

# 列出输入的CSV文件
filenames = []
while start_date <= end_date:
    filenames.append(f"{start_date}.pkl")
    start_date += datetime.timedelta(days=1)
print(f"{len(filenames)} dates")

# 下载并加载文件
print("Loading files (this step can take a minute)")
def load_date(idx_and_filename):
    if (idx_and_filename[0]%10) == 0:
        print(f"[{idx_and_filename[0]}/{len(filenames)}]", end="", flush=True)
    return pd.read_pickle(f"https://github.com/Fraud-Detection-Handbook/simulated-data-raw/raw/main/data/{idx_and_filename[1]}")
transactions_pd = pd.concat(map(load_date, enumerate(filenames)))
print("Done!")

print(f"Found {len(transactions_pd)} transactions")

183 dates
Loading files (this step can take a minute)
[0/183][10/183][20/183][30/183][40/183][50/183][60/183][70/183][80/183][90/183][100/183][110/183][120/183][130/183][140/183][150/183][160/183][170/183][180/183]Done!
Found 1754155 transactions

In [57]:

Copied!

transactions_pd
transactions_pd

Out[57]:

	TRANSACTION_ID	TX_DATETIME	CUSTOMER_ID	TERMINAL_ID	TX_AMOUNT	TX_TIME_SECONDS	TX_TIME_DAYS	TX_FRAUD	TX_FRAUD_SCENARIO
0	0	2018-04-01 00:00:31	596	3156	57.16	31	0	0	0
1	1	2018-04-01 00:02:10	4961	3412	81.51	130	0	0	0
2	2	2018-04-01 00:07:56	2	1365	146.00	476	0	0	0
3	3	2018-04-01 00:09:29	4128	8737	64.49	569	0	0	0
4	4	2018-04-01 00:10:34	927	9906	50.99	634	0	0	0
...	...	...	...	...	...	...	...	...	...
1754150	1754150	2018-09-30 23:56:36	161	655	54.24	15810996	182	0	0
1754151	1754151	2018-09-30 23:57:38	4342	6181	1.23	15811058	182	0	0
1754152	1754152	2018-09-30 23:58:21	618	1502	6.62	15811101	182	0	0
1754153	1754153	2018-09-30 23:59:52	4056	3067	55.40	15811192	182	0	0
1754154	1754154	2018-09-30 23:59:57	3542	9849	23.59	15811197	182	0	0

1754155 rows × 9 columns

TX_FRAUD 表示一个交易是否为欺诈。这是标签。

准备数据¶

我们目前拥有的数据存储在一个 Pandas DataFrame 中。我们希望将其转换为 Temporian EventSet，以便进行预处理。

In [58]:

Copied!





transactions_es = tp.from_pandas(
    transactions_pd[["TX_DATETIME", "CUSTOMER_ID", "TERMINAL_ID",
                     "TX_AMOUNT", "TX_FRAUD"]],
    timestamps="TX_DATETIME")

transactions_es
transactions_es = tp.from_pandas(
    transactions_pd[["TX_DATETIME", "CUSTOMER_ID", "TERMINAL_ID",
                     "TX_AMOUNT", "TX_FRAUD"]],
    timestamps="TX_DATETIME")

transactions_es

WARNING:root:Feature "CUSTOMER_ID" is an array of numpy.object_ and will be casted to numpy.string_ (Note: numpy.string_ is equivalent to numpy.bytes_).
WARNING:root:Feature "TERMINAL_ID" is an array of numpy.object_ and will be casted to numpy.string_ (Note: numpy.string_ is equivalent to numpy.bytes_).

Out[58]:

features [4]: CUSTOMER_ID (str_) , TERMINAL_ID (str_) , TX_AMOUNT (float64) , TX_FRAUD (int64)

indexes [0]: none

events: 1754155

index values: 1

memory usage: 56.1 MB

index ( ) with 1754155 events

timestamp	CUSTOMER_ID	TERMINAL_ID	TX_AMOUNT	TX_FRAUD
2018-04-01 00:00:31+00:00	596	3156	57.16	0
2018-04-01 00:02:10+00:00	4961	3412	81.51	0
2018-04-01 00:07:56+00:00	2	1365	146	0
2018-04-01 00:09:29+00:00	4128	8737	64.49	0
2018-04-01 00:10:34+00:00	927	9906	50.99	0
…	…	…	…	…

让我们绘制CUSTOMER_ID=3774的交易记录。我们可以看到一些欺诈行为。

In [59]:

Copied!

transactions_es.filter(tp.equal(transactions_es["CUSTOMER_ID"],"3774")).plot()
transactions_es.filter(tp.equal(transactions_es["CUSTOMER_ID"],"3774")).plot()

No description has been provided for this image

在这个数据集中，确定一笔交易是否为欺诈交易需要一周的时间。对于每笔交易，我们将计算在过去1到4周内同一终端的欺诈交易次数。我们还将计算该终端在过去一天、星期和四周内的交易次数及交易金额总和。最后，我们将提取每笔交易的小时和星期几。

In [61]:

Copied!





# @tp.compile  取消注释 tp.compile 以加快运行速度
def extract_features(transactions: tp.types.EventSetOrNode) -> dict[str, tp.types.EventSetOrNode]:

    per_terminal_features = []

    # 按客户索引交易
    per_terminal = transactions.add_index("TERMINAL_ID")

    # 1. 先前欺诈案件数量
    # 将欺诈行为延迟一周
    lagged_fraud_per_terminal = per_terminal["TX_FRAUD"].lag(tp.duration.weeks(1))
    # 过去4周的交易总额
    per_terminal_features.append(
        lagged_fraud_per_terminal
        .moving_sum(tp.duration.weeks(4), sampling=per_terminal)
        .rename("per_terminal.moving_sum_frauds"))


    # 2. 过往交易次数及金额
    for day in [1, 7, 4*7]:
        per_terminal_features.append(
            per_terminal
            .moving_count(tp.duration.days(day))
            .rename(f"per_terminal.moving_count_transaction_{day}d"))
        
        per_terminal_features.append(
            per_terminal["TX_AMOUNT"]
            .moving_sum(tp.duration.days(day))
            .rename(f"per_terminal.moving_sum_transaction_{day}d"))

    # 3. 日历功能
    # 每笔交易的具体时间（小时和星期几）。
    per_terminal_features.append(per_terminal.calendar_hour())
    per_terminal_features.append(per_terminal.calendar_day_of_week())

    # 汇总原始数据和特征
    return tp.glue(
        per_terminal,
        *per_terminal_features).drop_index("TERMINAL_ID")

feature_transactions_es = extract_features(transactions_es)
feature_transactions_es
# @tp.compile  取消注释 tp.compile 以加快运行速度
def extract_features(transactions: tp.types.EventSetOrNode) -> dict[str, tp.types.EventSetOrNode]:

    per_terminal_features = []

    # 按客户索引交易
    per_terminal = transactions.add_index("TERMINAL_ID")

    # 1. 先前欺诈案件数量
    # 将欺诈行为延迟一周
    lagged_fraud_per_terminal = per_terminal["TX_FRAUD"].lag(tp.duration.weeks(1))
    # 过去4周的交易总额
    per_terminal_features.append(
        lagged_fraud_per_terminal
        .moving_sum(tp.duration.weeks(4), sampling=per_terminal)
        .rename("per_terminal.moving_sum_frauds"))


    # 2. 过往交易次数及金额
    for day in [1, 7, 4*7]:
        per_terminal_features.append(
            per_terminal
            .moving_count(tp.duration.days(day))
            .rename(f"per_terminal.moving_count_transaction_{day}d"))
        
        per_terminal_features.append(
            per_terminal["TX_AMOUNT"]
            .moving_sum(tp.duration.days(day))
            .rename(f"per_terminal.moving_sum_transaction_{day}d"))

    # 3. 日历功能
    # 每笔交易的具体时间（小时和星期几）。
    per_terminal_features.append(per_terminal.calendar_hour())
    per_terminal_features.append(per_terminal.calendar_day_of_week())

    # 汇总原始数据和特征
    return tp.glue(
        per_terminal,
        *per_terminal_features).drop_index("TERMINAL_ID")

feature_transactions_es = extract_features(transactions_es)
feature_transactions_es

Out[61]:

features [13]: CUSTOMER_ID (str_) , TX_AMOUNT (float64) , TX_FRAUD (int64) , per_terminal.moving_sum_frauds (int64) , per_terminal.moving_count_transaction_1d (int32) , per_terminal.moving_sum_transaction_1d (float64) , per_terminal.moving_count_transaction_7d (int32) , per_terminal.moving_sum_transaction_7d (float64) , per_terminal.moving_count_transaction_28d (int32) , per_terminal.moving_sum_transaction_28d (float64) , calendar_hour (int32) , calendar_day_of_week (int32) , TERMINAL_ID (str_)

indexes [0]: none

events: 1754155

index values: 1

memory usage: 147.4 MB

index ( ) with 1754155 events

timestamp	CUSTOMER_ID	TX_AMOUNT	TX_FRAUD	per_terminal.moving_sum_frauds	per_terminal.moving_count_transaction_1d	per_terminal.moving_sum_transaction_1d	per_terminal.moving_count_transaction_7d	per_terminal.moving_sum_transaction_7d	per_terminal.moving_count_transaction_28d	per_terminal.moving_sum_transaction_28d	calendar_hour	calendar_day_of_week	TERMINAL_ID
2018-04-01 00:00:31+00:00	596	57.16	0	0	1	57.16	1	57.16	1	57.16	0	6	3156
2018-04-01 00:02:10+00:00	4961	81.51	0	0	1	81.51	1	81.51	1	81.51	0	6	3412
2018-04-01 00:07:56+00:00	2	146	0	0	1	146	1	146	1	146	0	6	1365
2018-04-01 00:09:29+00:00	4128	64.49	0	0	1	64.49	1	64.49	1	64.49	0	6	8737
2018-04-01 00:10:34+00:00	927	50.99	0	0	1	50.99	1	50.99	1	50.99	0	6	9906
…	…	…	…	…	…	…	…	…	…	…	…	…	…

我们为我们最喜欢的客户绘制特征：

In [62]:

Copied!

feature_transactions_es.filter(tp.equal(feature_transactions_es["CUSTOMER_ID"],"3774")).plot()
feature_transactions_es.filter(tp.equal(feature_transactions_es["CUSTOMER_ID"],"3774")).plot()

训练和评估模型¶

我们将数据集分为训练集和测试集，将 Temporian EventSet 转换回 Pandas DataFrame，并训练一个 YDF 模型。

In [63]:

Copied!





is_train = feature_transactions_es.timestamps() < train_test_split.timestamp()
is_test = ~is_train

train_feature_transactions_es = feature_transactions_es.filter(is_train)
test_feature_transactions_es = feature_transactions_es.filter(is_test)

print(f"{train_feature_transactions_es.num_events()} training transactions")
print(f"{test_feature_transactions_es.num_events()} testing transactions")
is_train = feature_transactions_es.timestamps() < train_test_split.timestamp()
is_test = ~is_train

train_feature_transactions_es = feature_transactions_es.filter(is_train)
test_feature_transactions_es = feature_transactions_es.filter(is_test)

print(f"{train_feature_transactions_es.num_events()} training transactions")
print(f"{test_feature_transactions_es.num_events()} testing transactions")

1466091 training transactions
288064 testing transactions

In [64]:

Copied!

train_ds_pd = tp.to_pandas(train_feature_transactions_es)
test_ds_pd = tp.to_pandas(test_feature_transactions_es)

print(f"{len(train_ds_pd)} training examples")
print(f"{len(test_ds_pd)} testing examples")
train_ds_pd = tp.to_pandas(train_feature_transactions_es)
test_ds_pd = tp.to_pandas(test_feature_transactions_es)

print(f"{len(train_ds_pd)} training examples")
print(f"{len(test_ds_pd)} testing examples")

1466091 training examples
288064 testing examples

In [65]:

Copied!

train_ds_pd
train_ds_pd

Out[65]:

	CUSTOMER_ID	TX_AMOUNT	TX_FRAUD	per_terminal.moving_sum_frauds	per_terminal.moving_count_transaction_1d	per_terminal.moving_sum_transaction_1d	per_terminal.moving_count_transaction_7d	per_terminal.moving_sum_transaction_7d	per_terminal.moving_count_transaction_28d	per_terminal.moving_sum_transaction_28d	calendar_hour	calendar_day_of_week	TERMINAL_ID	timestamp
0	596	57.16	0	0	1	57.16	1	57.16	1	57.16	0	6	3156	1.522541e+09
1	4961	81.51	0	0	1	81.51	1	81.51	1	81.51	0	6	3412	1.522541e+09
2	2	146.00	0	0	1	146.00	1	146.00	1	146.00	0	6	1365	1.522541e+09
3	4128	64.49	0	0	1	64.49	1	64.49	1	64.49	0	6	8737	1.522541e+09
4	927	50.99	0	0	1	50.99	1	50.99	1	50.99	0	6	9906	1.522541e+09
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1466086	1321	79.06	0	0	2	123.09	8	342.55	26	941.59	21	4	229	1.535753e+09
1466087	4202	13.53	0	0	3	216.03	5	315.35	24	1233.40	21	4	5559	1.535753e+09
1466088	1879	19.64	0	0	1	19.64	7	274.47	26	1046.89	21	4	601	1.535753e+09
1466089	1864	73.29	0	0	4	192.22	13	598.69	25	1106.18	21	4	4571	1.535753e+09
1466090	409	56.21	0	0	2	145.79	8	568.48	33	1959.01	21	4	8457	1.535753e+09

1466091 rows × 14 columns

In [66]:

Copied!





learner = ydf.GradientBoostedTreesLearner(label="TX_FRAUD",
                                          features=["per_terminal.moving_sum_frauds",
                                                    "per_terminal.moving_count_transaction_1d",
                                                    "per_terminal.moving_count_transaction_7d",
                                                    "per_terminal.moving_count_transaction_28d",
                                                    "calendar_hour",
                                                    "calendar_day_of_week",
                                                    ],
                                          num_trees=100,  # 加快训练速度
                                          )
model = learner.train(train_ds_pd)
learner = ydf.GradientBoostedTreesLearner(label="TX_FRAUD",
                                          features=["per_terminal.moving_sum_frauds",
                                                    "per_terminal.moving_count_transaction_1d",
                                                    "per_terminal.moving_count_transaction_7d",
                                                    "per_terminal.moving_count_transaction_28d",
                                                    "calendar_hour",
                                                    "calendar_day_of_week",
                                                    ],
                                          num_trees=100,  # 加快训练速度
                                          )
model = learner.train(train_ds_pd)

Train model on 1466091 examples
Model trained in 0:00:40.887605

我们可以在测试数据集上评估模型。

In [67]:

Copied!

model.evaluate(test_ds_pd)
model.evaluate(test_ds_pd)

Out[67]:

accuracy:

0.991033

AUC: '1' vs others:

0.74193

PR-AUC: '1' vs others:

0.216602

loss:

0.0379502

num examples:

288064

num examples (weighted):

288064

Confusion matrix

Label \ Pred	0	1
0	284702	813
1	1770	779

理解模型¶

尽管模型并不是直接对原始交易数据进行操作，而是对预处理后的特征进行操作，但它是可以被解释的。例如，通过检查变量的重要性，我们可以看到特征 per_terminal.moving_sum_frauds 对模型来说是最重要的，而 calendar_day_of_week 并不是很重要。我们可以将其解释为，如果一个客户有过去的欺诈交易记录，那么这对于未来的欺诈交易是一个很好的指示。

In [68]:

Copied!

model.describe()
model.describe()

Out[68]:

Name : GRADIENT_BOOSTED_TREES
Task : CLASSIFICATION
Label : TX_FRAUD
Features (6) : per_terminal.moving_sum_frauds per_terminal.moving_count_transaction_1d per_terminal.moving_count_transaction_7d per_terminal.moving_count_transaction_28d calendar_hour calendar_day_of_week
Weights : None
Trained with tuner : No
Model size : 1406 kB

Number of records: 1466091
Number of columns: 7

Number of columns by type:
	NUMERICAL: 6 (85.7143%)
	CATEGORICAL: 1 (14.2857%)

Columns:

NUMERICAL: 6 (85.7143%)
	0: "per_terminal.moving_sum_frauds" NUMERICAL mean:0.197974 min:0 max:53 sd:1.59577
	1: "per_terminal.moving_count_transaction_1d" NUMERICAL mean:1.99715 min:1 max:12 sd:1.02114
	2: "per_terminal.moving_count_transaction_7d" NUMERICAL mean:7.84226 min:1 max:28 sd:3.1002
	3: "per_terminal.moving_count_transaction_28d" NUMERICAL mean:26.4539 min:1 max:79 sd:9.81157
	4: "calendar_hour" NUMERICAL mean:11.4991 min:0 max:23 sd:5.05594
	5: "calendar_day_of_week" NUMERICAL mean:2.9861 min:0 max:6 sd:2.00096

CATEGORICAL: 1 (14.2857%)
	6: "TX_FRAUD" CATEGORICAL has-dict vocab-size:3 zero-ood-items most-frequent:"0" 1453959 (99.1725%)

Terminology:
	nas: Number of non-available (i.e. missing) values.
	ood: Out of dictionary.
	manually-defined: Attribute whose type is manually defined by the user, i.e., the type was not automatically inferred.
	tokenized: The attribute value is obtained through tokenization.
	has-dict: The attribute is attached to a string dictionary e.g. a categorical attribute stored as a string.
	vocab-size: Number of unique values.

The following evaluation is computed on the validation or out-of-bag dataset.

Task: CLASSIFICATION
Label: TX_FRAUD
Loss (BINOMIAL_LOG_LIKELIHOOD): 0.0692174

Accuracy: 0.99269  CI95[W][0 1]
ErrorRate: : 0.00730968


Confusion Table:
truth\prediction
            0    1
    0  145251  236
    1     836  332
Total: 146655

Variable importances measure the importance of an input feature for a model.

    1.            "per_terminal.moving_sum_frauds"  0.900000 ################
    2. "per_terminal.moving_count_transaction_28d"  0.290564 ##
    3.  "per_terminal.moving_count_transaction_7d"  0.215782 
    4.                             "calendar_hour"  0.190089 
    5.                      "calendar_day_of_week"  0.188707 
    6.  "per_terminal.moving_count_transaction_1d"  0.180135

    1.            "per_terminal.moving_sum_frauds" 88.000000 ################
    2. "per_terminal.moving_count_transaction_28d"  8.000000 #
    3.  "per_terminal.moving_count_transaction_7d"  2.000000 
    4.                      "calendar_day_of_week"  1.000000

    1. "per_terminal.moving_count_transaction_28d" 859.000000 ################
    2.            "per_terminal.moving_sum_frauds" 824.000000 ###############
    3.  "per_terminal.moving_count_transaction_7d" 525.000000 ########
    4.                             "calendar_hour" 306.000000 ###
    5.                      "calendar_day_of_week" 214.000000 #
    6.  "per_terminal.moving_count_transaction_1d" 132.000000

    1.            "per_terminal.moving_sum_frauds" 2574.658942 ################
    2. "per_terminal.moving_count_transaction_28d" 692.886033 ####
    3.  "per_terminal.moving_count_transaction_7d" 305.556677 #
    4.                             "calendar_hour" 83.875978 
    5.                      "calendar_day_of_week" 79.349436 
    6.  "per_terminal.moving_count_transaction_1d" 34.315195

Those variable importances are computed during training. More, and possibly more informative, variable importances are available when analyzing a model on a test dataset.

Num trees : 99

Only printing the first tree.

Tree #0:
    "per_terminal.moving_sum_frauds">=2.5 [s:0.00122566 n:1319436 np:10761 miss:0] ; pred:3.22407e-10
        ├─(pos)─ "per_terminal.moving_sum_frauds">=18.5 [s:0.0176001 n:10761 np:3536 miss:0] ; pred:4.68509
        |        ├─(pos)─ "per_terminal.moving_count_transaction_28d">=29.5 [s:0.00771428 n:3536 np:2178 miss:0] ; pred:2.38384
        |        |        ├─(pos)─ "per_terminal.moving_sum_frauds">=24.5 [s:0.0209201 n:2178 np:1388 miss:0] ; pred:3.22545
        |        |        |        ├─(pos)─ "per_terminal.moving_count_transaction_28d">=37.5 [s:0.00556346 n:1388 np:633 miss:0] ; pred:1.90128
        |        |        |        |        ├─(pos)─ pred:2.8898
        |        |        |        |        └─(neg)─ pred:1.07249
        |        |        |        └─(neg)─ "per_terminal.moving_count_transaction_28d">=36.5 [s:0.0096319 n:790 np:206 miss:0] ; pred:5
        |        |        |                 ├─(pos)─ pred:5
        |        |        |                 └─(neg)─ pred:4.84463
        |        |        └─(neg)─ "per_terminal.moving_sum_frauds">=22.5 [s:0.00322549 n:1358 np:543 miss:0] ; pred:1.03403
        |        |                 ├─(pos)─ "per_terminal.moving_sum_frauds">=23.5 [s:0.000536586 n:543 np:399 miss:0] ; pred:0.189689
        |        |                 |        ├─(pos)─ pred:0.0208172
        |        |                 |        └─(neg)─ pred:0.657606
        |        |                 └─(neg)─ "per_terminal.moving_count_transaction_28d">=25.5 [s:0.00624813 n:815 np:378 miss:1] ; pred:1.59659
        |        |                          ├─(pos)─ pred:2.62795
        |        |                          └─(neg)─ pred:0.704466
        |        └─(neg)─ "per_terminal.moving_sum_frauds">=3.5 [s:0.00906797 n:7225 np:6188 miss:0] ; pred:5
        |                 ├─(pos)─ "per_terminal.moving_sum_frauds">=9.5 [s:0.0036483 n:6188 np:3617 miss:0] ; pred:5
        |                 |        ├─(pos)─ "per_terminal.moving_count_transaction_28d">=22.5 [s:0.0138705 n:3617 np:2748 miss:1] ; pred:5
        |                 |        |        ├─(pos)─ pred:5
        |                 |        |        └─(neg)─ pred:3.12495
        |                 |        └─(neg)─ "per_terminal.moving_count_transaction_28d">=27.5 [s:0.00442462 n:2571 np:1083 miss:0] ; pred:5
        |                 |                 ├─(pos)─ pred:5
        |                 |                 └─(neg)─ pred:5
        |                 └─(neg)─ "per_terminal.moving_count_transaction_28d">=28.5 [s:0.0251879 n:1037 np:616 miss:0] ; pred:2.98852
        |                          ├─(pos)─ "per_terminal.moving_count_transaction_28d">=47.5 [s:0.00377678 n:616 np:147 miss:0] ; pred:1.39635
        |                          |        ├─(pos)─ pred:0.0642654
        |                          |        └─(neg)─ pred:1.81387
        |                          └─(neg)─ "per_terminal.moving_count_transaction_28d">=22.5 [s:0.0124898 n:421 np:251 miss:1] ; pred:5
        |                                   ├─(pos)─ pred:4.20204
        |                                   └─(neg)─ pred:5
        └─(neg)─ "per_terminal.moving_sum_frauds">=1.5 [s:6.26323e-06 n:1308675 np:7935 miss:0] ; pred:-0.0385246
                 ├─(pos)─ "per_terminal.moving_count_transaction_28d">=11.5 [s:0.000800429 n:7935 np:7915 miss:1] ; pred:0.350309
                 |        ├─(pos)─ "per_terminal.moving_count_transaction_28d">=15.5 [s:0.000222715 n:7915 np:7832 miss:1] ; pred:0.333051
                 |        |        ├─(pos)─ "per_terminal.moving_count_transaction_28d">=33.5 [s:0.000176741 n:7832 np:2934 miss:0] ; pred:0.314408
                 |        |        |        ├─(pos)─ pred:0.105963
                 |        |        |        └─(neg)─ pred:0.43927
                 |        |        └─(neg)─ "per_terminal.moving_count_transaction_7d">=9.5 [s:0.0112688 n:83 np:5 miss:0] ; pred:2.09225
                 |        |                 ├─(pos)─ pred:5
                 |        |                 └─(neg)─ pred:1.7661
                 |        └─(neg)─ "per_terminal.moving_count_transaction_28d">=9.5 [s:0.0533333 n:20 np:15 miss:1] ; pred:5
                 |                 ├─(pos)─ "calendar_hour">=13.5 [s:0.0355556 n:15 np:5 miss:0] ; pred:5
                 |                 |        ├─(pos)─ pred:2.32618
                 |                 |        └─(neg)─ pred:5
                 |                 └─(neg)─ pred:5
                 └─(neg)─ "per_terminal.moving_sum_frauds">=0.5 [s:6.94014e-07 n:1300740 np:84741 miss:0] ; pred:-0.0408967
                          ├─(pos)─ "per_terminal.moving_count_transaction_28d">=23.5 [s:5.77336e-06 n:84741 np:68762 miss:1] ; pred:-0.00260123
                          |        ├─(pos)─ "per_terminal.moving_count_transaction_28d">=31.5 [s:7.97811e-07 n:68762 np:35992 miss:0] ; pred:-0.0166571
                          |        |        ├─(pos)─ pred:-0.0269997
                          |        |        └─(neg)─ pred:-0.00529765
                          |        └─(neg)─ "per_terminal.moving_count_transaction_7d">=13.5 [s:2.81166e-05 n:15979 np:8 miss:0] ; pred:0.0578851
                          |                 ├─(pos)─ pred:2.93294
                          |                 └─(neg)─ pred:0.0564449
                          └─(neg)─ "per_terminal.moving_count_transaction_28d">=10.5 [s:3.48456e-07 n:1215999 np:1124099 miss:1] ; pred:-0.0435654
                                   ├─(pos)─ "per_terminal.moving_count_transaction_28d">=20.5 [s:3.55088e-08 n:1124099 np:910721 miss:1] ; pred:-0.0415172
                                   |        ├─(pos)─ pred:-0.0404104
                                   |        └─(neg)─ pred:-0.0462414
                                   └─(neg)─ "per_terminal.moving_count_transaction_28d">=3.5 [s:6.14969e-07 n:91900 np:64839 miss:1] ; pred:-0.0686185
                                            ├─(pos)─ pred:-0.0624707
                                            └─(neg)─ pred:-0.083349

例如，如果客户的卡被盗，我们可以查看PDP模型，以了解过去4周欺诈次数与交易欺诈概率之间的关系。

In [69]:

Copied!

model.analyze(test_ds_pd, sampling=0.1)
model.analyze(test_ds_pd, sampling=0.1)

Out[69]:

Variable importances measure the importance of an input feature for a model.

    1.            "per_terminal.moving_sum_frauds"  0.005308 ################
    2. "per_terminal.moving_count_transaction_28d"  0.001118 ###
    3.  "per_terminal.moving_count_transaction_7d"  0.000358 #
    4.                             "calendar_hour"  0.000000 
    5.  "per_terminal.moving_count_transaction_1d" -0.000007 
    6.                      "calendar_day_of_week" -0.000007

    1.            "per_terminal.moving_sum_frauds"  0.207669 ################
    2. "per_terminal.moving_count_transaction_28d"  0.058980 ####
    3.  "per_terminal.moving_count_transaction_7d"  0.023943 #
    4.                             "calendar_hour"  0.000144 
    5.                      "calendar_day_of_week" -0.000243 
    6.  "per_terminal.moving_count_transaction_1d" -0.001815

    1.            "per_terminal.moving_sum_frauds"  0.231211 ################
    2. "per_terminal.moving_count_transaction_28d"  0.015302 #
    3.                             "calendar_hour"  0.001647 
    4.  "per_terminal.moving_count_transaction_7d"  0.001499 
    5.  "per_terminal.moving_count_transaction_1d"  0.000363 
    6.                      "calendar_day_of_week" -0.000701

    1.            "per_terminal.moving_sum_frauds"  0.207691 ################
    2. "per_terminal.moving_count_transaction_28d"  0.059061 ####
    3.  "per_terminal.moving_count_transaction_7d"  0.023948 #
    4.                             "calendar_hour"  0.000115 
    5.                      "calendar_day_of_week" -0.000204 
    6.  "per_terminal.moving_count_transaction_1d" -0.001902

    1.            "per_terminal.moving_sum_frauds"  0.900000 ################
    2. "per_terminal.moving_count_transaction_28d"  0.290564 ##
    3.  "per_terminal.moving_count_transaction_7d"  0.215782 
    4.                             "calendar_hour"  0.190089 
    5.                      "calendar_day_of_week"  0.188707 
    6.  "per_terminal.moving_count_transaction_1d"  0.180135

    1.            "per_terminal.moving_sum_frauds" 88.000000 ################
    2. "per_terminal.moving_count_transaction_28d"  8.000000 #
    3.  "per_terminal.moving_count_transaction_7d"  2.000000 
    4.                      "calendar_day_of_week"  1.000000

    1. "per_terminal.moving_count_transaction_28d" 859.000000 ################
    2.            "per_terminal.moving_sum_frauds" 824.000000 ###############
    3.  "per_terminal.moving_count_transaction_7d" 525.000000 ########
    4.                             "calendar_hour" 306.000000 ###
    5.                      "calendar_day_of_week" 214.000000 #
    6.  "per_terminal.moving_count_transaction_1d" 132.000000

    1.            "per_terminal.moving_sum_frauds" 2574.658942 ################
    2. "per_terminal.moving_count_transaction_28d" 692.886033 ####
    3.  "per_terminal.moving_count_transaction_7d" 305.556677 #
    4.                             "calendar_hour" 83.875978 
    5.                      "calendar_day_of_week" 79.349436 
    6.  "per_terminal.moving_count_transaction_1d" 34.315195