# pip install ydf temporian -U
import ydf
import temporian as tp
import pandas as pd
import datetime
import os
注意:本教程是《使用 Temporian 和 TensorFlow Decision Forests 检测支付卡欺诈》教程的简化改编版。
注意 2:YDF(Yggdrasil Decision Forests)与 Temporian 的开发团队有部分成员重叠。
# Date range covered by the dataset (both ends inclusive) and the
# train/test cut-off used later for the chronological split.
start_date = datetime.date(2018, 4, 1)
end_date = datetime.date(2018, 9, 30)
train_test_split = datetime.datetime(2018, 9, 1)

# List the input files: one pickle file per day in the date range.
# NOTE(review): the "<ISO date>.pkl" naming is inferred from the
# pd.read_pickle call that consumes these names and from the recorded
# "183 dates" output -- confirm against the data repository.
filenames = []
while start_date <= end_date:
    filenames.append(f"{start_date}.pkl")
    # start_date is advanced in place, so after the loop it is one day
    # past end_date.
    start_date += datetime.timedelta(days=1)
print(f"{len(filenames)} dates")
# Download and load the files.
print("Loading files (this step can take a minute)")


def load_date(idx_and_filename):
    """Download one daily transaction file and load it with pandas.

    Args:
        idx_and_filename: An ``(index, filename)`` pair, as produced by
            ``enumerate(filenames)``. The index is only used for the
            progress display.

    Returns:
        The object returned by ``pd.read_pickle`` for the remote file.
    """
    idx, filename = idx_and_filename
    # Print a progress marker for every 10th file, all on one line.
    if idx % 10 == 0:
        print(f"[{idx}/{len(filenames)}]", end="", flush=True)
    # SECURITY: unpickling can execute arbitrary code. Only load pickles
    # from a source you trust.
    return pd.read_pickle(f"https://github.com/Fraud-Detection-Handbook/simulated-data-raw/raw/main/data/{filename}")


transactions_pd = pd.concat(map(load_date, enumerate(filenames)))
# Terminates the progress line; the recorded output shows this message.
print("Done!")
print(f"Found {len(transactions_pd)} transactions")
183 dates Loading files (this step can take a minute) [0/183][10/183][20/183][30/183][40/183][50/183][60/183][70/183][80/183][90/183][100/183][110/183][120/183][130/183][140/183][150/183][160/183][170/183][180/183]Done! Found 1754155 transactions
0 | 0 | 2018-04-01 00:00:31 | 596 | 3156 | 57.16 | 31 | 0 | 0 | 0 |
1 | 1 | 2018-04-01 00:02:10 | 4961 | 3412 | 81.51 | 130 | 0 | 0 | 0 |
2 | 2 | 2018-04-01 00:07:56 | 2 | 1365 | 146.00 | 476 | 0 | 0 | 0 |
3 | 3 | 2018-04-01 00:09:29 | 4128 | 8737 | 64.49 | 569 | 0 | 0 | 0 |
4 | 4 | 2018-04-01 00:10:34 | 927 | 9906 | 50.99 | 634 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1754150 | 1754150 | 2018-09-30 23:56:36 | 161 | 655 | 54.24 | 15810996 | 182 | 0 | 0 |
1754151 | 1754151 | 2018-09-30 23:57:38 | 4342 | 6181 | 1.23 | 15811058 | 182 | 0 | 0 |
1754152 | 1754152 | 2018-09-30 23:58:21 | 618 | 1502 | 6.62 | 15811101 | 182 | 0 | 0 |
1754153 | 1754153 | 2018-09-30 23:59:52 | 4056 | 3067 | 55.40 | 15811192 | 182 | 0 | 0 |
1754154 | 1754154 | 2018-09-30 23:59:57 | 3542 | 9849 | 23.59 | 15811197 | 182 | 0 | 0 |
1754155 rows × 9 columns
我们目前拥有的数据存储在一个 Pandas DataFrame 中。我们希望将其转换为 Temporian EventSet,以便进行预处理。
# Convert the Pandas DataFrame into a Temporian EventSet, keeping only the
# columns used downstream and using TX_DATETIME as the event timestamps.
# NOTE(review): the original call was truncated after "TERMINAL_ID". The
# remaining columns and the `timestamps` argument are restored from the
# EventSet preview (customer, terminal, amount, fraud) and from the later
# uses of "TX_FRAUD" -- confirm.
transactions_es = tp.from_pandas(
    transactions_pd[
        ["TX_DATETIME", "CUSTOMER_ID", "TERMINAL_ID", "TX_AMOUNT", "TX_FRAUD"]
    ],
    timestamps="TX_DATETIME",
)
WARNING:root:Feature "CUSTOMER_ID" is an array of numpy.object_ and will be casted to numpy.string_ (Note: numpy.string_ is equivalent to numpy.bytes_). WARNING:root:Feature "TERMINAL_ID" is an array of numpy.object_ and will be casted to numpy.string_ (Note: numpy.string_ is equivalent to numpy.bytes_).
2018-04-01 00:00:31+00:00 | 596 | 3156 | 57.16 | 0 |
2018-04-01 00:02:10+00:00 | 4961 | 3412 | 81.51 | 0 |
2018-04-01 00:07:56+00:00 | 2 | 1365 | 146 | 0 |
2018-04-01 00:09:29+00:00 | 4128 | 8737 | 64.49 | 0 |
2018-04-01 00:10:34+00:00 | 927 | 9906 | 50.99 | 0 |
… | … | … | … | … |
在这个数据集中,一笔交易是否为欺诈需要一周后才能确定。因此,对于每笔交易,我们先将欺诈标签滞后一周,再统计同一终端在随后 4 周滑动窗口内的欺诈交易次数。我们还将计算该终端在过去一天、一周和四周内的交易次数及交易金额总和。最后,我们将提取每笔交易发生的小时和星期几。
# Uncomment the next line to make the function run faster.
# @tp.compile
def extract_features(
    transactions: tp.types.EventSetOrNode,
) -> tp.types.EventSetOrNode:
    """Augment raw transactions with per-terminal and calendar features.

    NOTE(review): the original body was truncated (dangling method chain,
    empty loop, unclosed ``tp.glue`` call). The missing statements are
    restored from the surviving comments and from the feature names in the
    recorded output -- confirm against the upstream tutorial.

    Args:
        transactions: Transactions with at least the "TERMINAL_ID",
            "TX_FRAUD" and "TX_AMOUNT" features.

    Returns:
        The result of ``tp.glue``: the input features plus the computed
        ones (the original ``dict[...]`` annotation did not match this),
        with the "TERMINAL_ID" index dropped again.
    """
    per_terminal_features = []

    # Index the transactions by terminal so that the window operators
    # below are computed independently for each terminal.
    per_terminal = transactions.add_index("TERMINAL_ID")

    # 1. Number of previous frauds.
    # Lag the fraud labels by one week.
    lagged_fraud_per_terminal = per_terminal["TX_FRAUD"].lag(tp.duration.weeks(1))
    # Sum of the lagged frauds over the last 4 weeks. The lag produces a
    # new sampling, so the sum is re-sampled on the transactions.
    per_terminal_features.append(
        lagged_fraud_per_terminal
        .moving_sum(tp.duration.weeks(4), sampling=per_terminal)
        .rename("per_terminal.moving_sum_frauds")
    )

    # 2. Count and total amount of past transactions.
    amount_per_terminal = per_terminal["TX_AMOUNT"]
    for day in [1, 7, 4 * 7]:
        window = tp.duration.days(day)
        per_terminal_features.append(
            amount_per_terminal
            .moving_count(window)
            .rename(f"per_terminal.moving_count_transaction_{day}d")
        )
        per_terminal_features.append(
            amount_per_terminal
            .moving_sum(window)
            .rename(f"per_terminal.moving_sum_transaction_{day}d")
        )

    # 3. Calendar features.
    # Hour and day of the week of each transaction. Computed on the indexed
    # EventSet so they share its sampling and can be glued with the rest.
    calendar_features = [
        per_terminal.calendar_hour(),
        per_terminal.calendar_day_of_week(),
    ]

    # Aggregate the raw data and the features, then drop the index.
    return tp.glue(
        per_terminal,
        *per_terminal_features,
        *calendar_features,
    ).drop_index()


feature_transactions_es = extract_features(transactions_es)
timestamp | CUSTOMER_ID | TX_AMOUNT | TX_FRAUD | per_terminal.moving_sum_frauds | per_terminal.moving_count_transaction_1d | per_terminal.moving_sum_transaction_1d | per_terminal.moving_count_transaction_7d | per_terminal.moving_sum_transaction_7d | per_terminal.moving_count_transaction_28d | per_terminal.moving_sum_transaction_28d | calendar_hour | calendar_day_of_week | TERMINAL_ID |
2018-04-01 00:00:31+00:00 | 596 | 57.16 | 0 | 0 | 1 | 57.16 | 1 | 57.16 | 1 | 57.16 | 0 | 6 | 3156 |
2018-04-01 00:02:10+00:00 | 4961 | 81.51 | 0 | 0 | 1 | 81.51 | 1 | 81.51 | 1 | 81.51 | 0 | 6 | 3412 |
2018-04-01 00:07:56+00:00 | 2 | 146 | 0 | 0 | 1 | 146 | 1 | 146 | 1 | 146 | 0 | 6 | 1365 |
2018-04-01 00:09:29+00:00 | 4128 | 64.49 | 0 | 0 | 1 | 64.49 | 1 | 64.49 | 1 | 64.49 | 0 | 6 | 8737 |
2018-04-01 00:10:34+00:00 | 927 | 50.99 | 0 | 0 | 1 | 50.99 | 1 | 50.99 | 1 | 50.99 | 0 | 6 | 9906 |
… | … | … | … | … | … | … | … | … | … | … | … | … | … |
我们将数据集分为训练集和测试集,将 Temporian EventSet 转换回 Pandas DataFrame,并训练一个 YDF 模型。
# Chronological split: events strictly before the cut-off are training data,
# all others are test data.
# `train_test_split` is a naive datetime, and `.timestamp()` on a naive
# datetime uses the machine's local timezone. The EventSet timestamps are
# UTC, so the cut-off is pinned to UTC. Otherwise the split point (and the
# train/test sizes) shifts by the local UTC offset.
split_timestamp = train_test_split.replace(
    tzinfo=datetime.timezone.utc
).timestamp()
is_train = feature_transactions_es.timestamps() < split_timestamp
is_test = ~is_train

train_feature_transactions_es = feature_transactions_es.filter(is_train)
test_feature_transactions_es = feature_transactions_es.filter(is_test)

print(f"{train_feature_transactions_es.num_events()} training transactions")
print(f"{test_feature_transactions_es.num_events()} testing transactions")
1466091 training transactions 288064 testing transactions
# Convert both EventSets back to Pandas DataFrames (training set first)
# and report how many examples each one holds.
train_ds_pd, test_ds_pd = (
    tp.to_pandas(event_set)
    for event_set in (
        train_feature_transactions_es,
        test_feature_transactions_es,
    )
)

num_train_examples = len(train_ds_pd)
num_test_examples = len(test_ds_pd)
print(f"{num_train_examples} training examples")
print(f"{num_test_examples} testing examples")
1466091 training examples 288064 testing examples
CUSTOMER_ID | TX_AMOUNT | TX_FRAUD | per_terminal.moving_sum_frauds | per_terminal.moving_count_transaction_1d | per_terminal.moving_sum_transaction_1d | per_terminal.moving_count_transaction_7d | per_terminal.moving_sum_transaction_7d | per_terminal.moving_count_transaction_28d | per_terminal.moving_sum_transaction_28d | calendar_hour | calendar_day_of_week | TERMINAL_ID | timestamp | |
0 | 596 | 57.16 | 0 | 0 | 1 | 57.16 | 1 | 57.16 | 1 | 57.16 | 0 | 6 | 3156 | 1.522541e+09 |
1 | 4961 | 81.51 | 0 | 0 | 1 | 81.51 | 1 | 81.51 | 1 | 81.51 | 0 | 6 | 3412 | 1.522541e+09 |
2 | 2 | 146.00 | 0 | 0 | 1 | 146.00 | 1 | 146.00 | 1 | 146.00 | 0 | 6 | 1365 | 1.522541e+09 |
3 | 4128 | 64.49 | 0 | 0 | 1 | 64.49 | 1 | 64.49 | 1 | 64.49 | 0 | 6 | 8737 | 1.522541e+09 |
4 | 927 | 50.99 | 0 | 0 | 1 | 50.99 | 1 | 50.99 | 1 | 50.99 | 0 | 6 | 9906 | 1.522541e+09 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1466086 | 1321 | 79.06 | 0 | 0 | 2 | 123.09 | 8 | 342.55 | 26 | 941.59 | 21 | 4 | 229 | 1.535753e+09 |
1466087 | 4202 | 13.53 | 0 | 0 | 3 | 216.03 | 5 | 315.35 | 24 | 1233.40 | 21 | 4 | 5559 | 1.535753e+09 |
1466088 | 1879 | 19.64 | 0 | 0 | 1 | 19.64 | 7 | 274.47 | 26 | 1046.89 | 21 | 4 | 601 | 1.535753e+09 |
1466089 | 1864 | 73.29 | 0 | 0 | 4 | 192.22 | 13 | 598.69 | 25 | 1106.18 | 21 | 4 | 4571 | 1.535753e+09 |
1466090 | 409 | 56.21 | 0 | 0 | 2 | 145.79 | 8 | 568.48 | 33 | 1959.01 | 21 | 4 | 8457 | 1.535753e+09 |
1466091 rows × 14 columns
# Train a gradient boosted trees model that predicts the fraud label.
# NOTE(review): the original constructor call was truncated after
# `num_trees` and never closed. The `features` list is restored from the
# six input features reported by the trained model's description -- confirm.
learner = ydf.GradientBoostedTreesLearner(
    label="TX_FRAUD",
    num_trees=100,  # Speed up training.
    features=[
        "per_terminal.moving_sum_frauds",
        "per_terminal.moving_count_transaction_1d",
        "per_terminal.moving_count_transaction_7d",
        "per_terminal.moving_count_transaction_28d",
        "calendar_hour",
        "calendar_day_of_week",
    ],
)
model = learner.train(train_ds_pd)
Train model on 1466091 examples Model trained in 0:00:40.887605
Label \ Pred | 0 | 1 |
0 | 284702 | 813 |
1 | 1770 | 779 |
尽管模型并不是直接对原始交易数据进行操作,而是对预处理后的特征进行操作,但它仍然是可解释的。例如,通过检查变量重要性,我们可以看到特征 per_terminal.moving_sum_frauds
对模型来说是最重要的,而 calendar_day_of_week 的重要性则很低。
Label : TX_FRAUD
Features (6) : per_terminal.moving_sum_frauds per_terminal.moving_count_transaction_1d per_terminal.moving_count_transaction_7d per_terminal.moving_count_transaction_28d calendar_hour calendar_day_of_week
Weights : None
Trained with tuner : No
Model size : 1406 kB
Number of records: 1466091 Number of columns: 7 Number of columns by type: NUMERICAL: 6 (85.7143%) CATEGORICAL: 1 (14.2857%) Columns: NUMERICAL: 6 (85.7143%) 0: "per_terminal.moving_sum_frauds" NUMERICAL mean:0.197974 min:0 max:53 sd:1.59577 1: "per_terminal.moving_count_transaction_1d" NUMERICAL mean:1.99715 min:1 max:12 sd:1.02114 2: "per_terminal.moving_count_transaction_7d" NUMERICAL mean:7.84226 min:1 max:28 sd:3.1002 3: "per_terminal.moving_count_transaction_28d" NUMERICAL mean:26.4539 min:1 max:79 sd:9.81157 4: "calendar_hour" NUMERICAL mean:11.4991 min:0 max:23 sd:5.05594 5: "calendar_day_of_week" NUMERICAL mean:2.9861 min:0 max:6 sd:2.00096 CATEGORICAL: 1 (14.2857%) 6: "TX_FRAUD" CATEGORICAL has-dict vocab-size:3 zero-ood-items most-frequent:"0" 1453959 (99.1725%) Terminology: nas: Number of non-available (i.e. missing) values. ood: Out of dictionary. manually-defined: Attribute whose type is manually defined by the user, i.e., the type was not automatically inferred. tokenized: The attribute value is obtained through tokenization. has-dict: The attribute is attached to a string dictionary e.g. a categorical attribute stored as a string. vocab-size: Number of unique values.
The following evaluation is computed on the validation or out-of-bag dataset.
Task: CLASSIFICATION Label: TX_FRAUD Loss (BINOMIAL_LOG_LIKELIHOOD): 0.0692174 Accuracy: 0.99269 CI95[W][0 1] ErrorRate: : 0.00730968 Confusion Table: truth\prediction 0 1 0 145251 236 1 836 332 Total: 146655
Variable importances measure the importance of an input feature for a model.
1. "per_terminal.moving_sum_frauds" 0.900000 ################ 2. "per_terminal.moving_count_transaction_28d" 0.290564 ## 3. "per_terminal.moving_count_transaction_7d" 0.215782 4. "calendar_hour" 0.190089 5. "calendar_day_of_week" 0.188707 6. "per_terminal.moving_count_transaction_1d" 0.180135
1. "per_terminal.moving_sum_frauds" 88.000000 ################ 2. "per_terminal.moving_count_transaction_28d" 8.000000 # 3. "per_terminal.moving_count_transaction_7d" 2.000000 4. "calendar_day_of_week" 1.000000
1. "per_terminal.moving_count_transaction_28d" 859.000000 ################ 2. "per_terminal.moving_sum_frauds" 824.000000 ############### 3. "per_terminal.moving_count_transaction_7d" 525.000000 ######## 4. "calendar_hour" 306.000000 ### 5. "calendar_day_of_week" 214.000000 # 6. "per_terminal.moving_count_transaction_1d" 132.000000
1. "per_terminal.moving_sum_frauds" 2574.658942 ################ 2. "per_terminal.moving_count_transaction_28d" 692.886033 #### 3. "per_terminal.moving_count_transaction_7d" 305.556677 # 4. "calendar_hour" 83.875978 5. "calendar_day_of_week" 79.349436 6. "per_terminal.moving_count_transaction_1d" 34.315195
Those variable importances are computed during training. More, and possibly more informative, variable importances are available when analyzing a model on a test dataset.
Only printing the first tree.
Tree #0: "per_terminal.moving_sum_frauds">=2.5 [s:0.00122566 n:1319436 np:10761 miss:0] ; pred:3.22407e-10 ├─(pos)─ "per_terminal.moving_sum_frauds">=18.5 [s:0.0176001 n:10761 np:3536 miss:0] ; pred:4.68509 | ├─(pos)─ "per_terminal.moving_count_transaction_28d">=29.5 [s:0.00771428 n:3536 np:2178 miss:0] ; pred:2.38384 | | ├─(pos)─ "per_terminal.moving_sum_frauds">=24.5 [s:0.0209201 n:2178 np:1388 miss:0] ; pred:3.22545 | | | ├─(pos)─ "per_terminal.moving_count_transaction_28d">=37.5 [s:0.00556346 n:1388 np:633 miss:0] ; pred:1.90128 | | | | ├─(pos)─ pred:2.8898 | | | | └─(neg)─ pred:1.07249 | | | └─(neg)─ "per_terminal.moving_count_transaction_28d">=36.5 [s:0.0096319 n:790 np:206 miss:0] ; pred:5 | | | ├─(pos)─ pred:5 | | | └─(neg)─ pred:4.84463 | | └─(neg)─ "per_terminal.moving_sum_frauds">=22.5 [s:0.00322549 n:1358 np:543 miss:0] ; pred:1.03403 | | ├─(pos)─ "per_terminal.moving_sum_frauds">=23.5 [s:0.000536586 n:543 np:399 miss:0] ; pred:0.189689 | | | ├─(pos)─ pred:0.0208172 | | | └─(neg)─ pred:0.657606 | | └─(neg)─ "per_terminal.moving_count_transaction_28d">=25.5 [s:0.00624813 n:815 np:378 miss:1] ; pred:1.59659 | | ├─(pos)─ pred:2.62795 | | └─(neg)─ pred:0.704466 | └─(neg)─ "per_terminal.moving_sum_frauds">=3.5 [s:0.00906797 n:7225 np:6188 miss:0] ; pred:5 | ├─(pos)─ "per_terminal.moving_sum_frauds">=9.5 [s:0.0036483 n:6188 np:3617 miss:0] ; pred:5 | | ├─(pos)─ "per_terminal.moving_count_transaction_28d">=22.5 [s:0.0138705 n:3617 np:2748 miss:1] ; pred:5 | | | ├─(pos)─ pred:5 | | | └─(neg)─ pred:3.12495 | | └─(neg)─ "per_terminal.moving_count_transaction_28d">=27.5 [s:0.00442462 n:2571 np:1083 miss:0] ; pred:5 | | ├─(pos)─ pred:5 | | └─(neg)─ pred:5 | └─(neg)─ "per_terminal.moving_count_transaction_28d">=28.5 [s:0.0251879 n:1037 np:616 miss:0] ; pred:2.98852 | ├─(pos)─ "per_terminal.moving_count_transaction_28d">=47.5 [s:0.00377678 n:616 np:147 miss:0] ; pred:1.39635 | | ├─(pos)─ pred:0.0642654 | | └─(neg)─ pred:1.81387 | └─(neg)─ 
"per_terminal.moving_count_transaction_28d">=22.5 [s:0.0124898 n:421 np:251 miss:1] ; pred:5 | ├─(pos)─ pred:4.20204 | └─(neg)─ pred:5 └─(neg)─ "per_terminal.moving_sum_frauds">=1.5 [s:6.26323e-06 n:1308675 np:7935 miss:0] ; pred:-0.0385246 ├─(pos)─ "per_terminal.moving_count_transaction_28d">=11.5 [s:0.000800429 n:7935 np:7915 miss:1] ; pred:0.350309 | ├─(pos)─ "per_terminal.moving_count_transaction_28d">=15.5 [s:0.000222715 n:7915 np:7832 miss:1] ; pred:0.333051 | | ├─(pos)─ "per_terminal.moving_count_transaction_28d">=33.5 [s:0.000176741 n:7832 np:2934 miss:0] ; pred:0.314408 | | | ├─(pos)─ pred:0.105963 | | | └─(neg)─ pred:0.43927 | | └─(neg)─ "per_terminal.moving_count_transaction_7d">=9.5 [s:0.0112688 n:83 np:5 miss:0] ; pred:2.09225 | | ├─(pos)─ pred:5 | | └─(neg)─ pred:1.7661 | └─(neg)─ "per_terminal.moving_count_transaction_28d">=9.5 [s:0.0533333 n:20 np:15 miss:1] ; pred:5 | ├─(pos)─ "calendar_hour">=13.5 [s:0.0355556 n:15 np:5 miss:0] ; pred:5 | | ├─(pos)─ pred:2.32618 | | └─(neg)─ pred:5 | └─(neg)─ pred:5 └─(neg)─ "per_terminal.moving_sum_frauds">=0.5 [s:6.94014e-07 n:1300740 np:84741 miss:0] ; pred:-0.0408967 ├─(pos)─ "per_terminal.moving_count_transaction_28d">=23.5 [s:5.77336e-06 n:84741 np:68762 miss:1] ; pred:-0.00260123 | ├─(pos)─ "per_terminal.moving_count_transaction_28d">=31.5 [s:7.97811e-07 n:68762 np:35992 miss:0] ; pred:-0.0166571 | | ├─(pos)─ pred:-0.0269997 | | └─(neg)─ pred:-0.00529765 | └─(neg)─ "per_terminal.moving_count_transaction_7d">=13.5 [s:2.81166e-05 n:15979 np:8 miss:0] ; pred:0.0578851 | ├─(pos)─ pred:2.93294 | └─(neg)─ pred:0.0564449 └─(neg)─ "per_terminal.moving_count_transaction_28d">=10.5 [s:3.48456e-07 n:1215999 np:1124099 miss:1] ; pred:-0.0435654 ├─(pos)─ "per_terminal.moving_count_transaction_28d">=20.5 [s:3.55088e-08 n:1124099 np:910721 miss:1] ; pred:-0.0415172 | ├─(pos)─ pred:-0.0404104 | └─(neg)─ pred:-0.0462414 └─(neg)─ "per_terminal.moving_count_transaction_28d">=3.5 [s:6.14969e-07 n:91900 np:64839 miss:1] ; 
pred:-0.0686185 ├─(pos)─ pred:-0.0624707 └─(neg)─ pred:-0.083349
# Analyze the trained model on the held-out test DataFrame.
# NOTE(review): `sampling=0.1` presumably restricts the analysis to about
# 10% of the examples to keep it fast -- confirm against the YDF docs.
model.analyze(test_ds_pd, sampling=0.1)
Variable importances measure the importance of an input feature for a model.
1. "per_terminal.moving_sum_frauds" 0.005308 ################ 2. "per_terminal.moving_count_transaction_28d" 0.001118 ### 3. "per_terminal.moving_count_transaction_7d" 0.000358 # 4. "calendar_hour" 0.000000 5. "per_terminal.moving_count_transaction_1d" -0.000007 6. "calendar_day_of_week" -0.000007
1. "per_terminal.moving_sum_frauds" 0.207669 ################ 2. "per_terminal.moving_count_transaction_28d" 0.058980 #### 3. "per_terminal.moving_count_transaction_7d" 0.023943 # 4. "calendar_hour" 0.000144 5. "calendar_day_of_week" -0.000243 6. "per_terminal.moving_count_transaction_1d" -0.001815
1. "per_terminal.moving_sum_frauds" 0.231211 ################ 2. "per_terminal.moving_count_transaction_28d" 0.015302 # 3. "calendar_hour" 0.001647 4. "per_terminal.moving_count_transaction_7d" 0.001499 5. "per_terminal.moving_count_transaction_1d" 0.000363 6. "calendar_day_of_week" -0.000701
1. "per_terminal.moving_sum_frauds" 0.207691 ################ 2. "per_terminal.moving_count_transaction_28d" 0.059061 #### 3. "per_terminal.moving_count_transaction_7d" 0.023948 # 4. "calendar_hour" 0.000115 5. "calendar_day_of_week" -0.000204 6. "per_terminal.moving_count_transaction_1d" -0.001902
1. "per_terminal.moving_sum_frauds" 0.900000 ################ 2. "per_terminal.moving_count_transaction_28d" 0.290564 ## 3. "per_terminal.moving_count_transaction_7d" 0.215782 4. "calendar_hour" 0.190089 5. "calendar_day_of_week" 0.188707 6. "per_terminal.moving_count_transaction_1d" 0.180135
1. "per_terminal.moving_sum_frauds" 88.000000 ################ 2. "per_terminal.moving_count_transaction_28d" 8.000000 # 3. "per_terminal.moving_count_transaction_7d" 2.000000 4. "calendar_day_of_week" 1.000000
1. "per_terminal.moving_count_transaction_28d" 859.000000 ################ 2. "per_terminal.moving_sum_frauds" 824.000000 ############### 3. "per_terminal.moving_count_transaction_7d" 525.000000 ######## 4. "calendar_hour" 306.000000 ### 5. "calendar_day_of_week" 214.000000 # 6. "per_terminal.moving_count_transaction_1d" 132.000000
1. "per_terminal.moving_sum_frauds" 2574.658942 ################ 2. "per_terminal.moving_count_transaction_28d" 692.886033 #### 3. "per_terminal.moving_count_transaction_7d" 305.556677 # 4. "calendar_hour" 83.875978 5. "calendar_day_of_week" 79.349436 6. "per_terminal.moving_count_transaction_1d" 34.315195