In [1]:
Copied!
import ydf
import pandas as pd

# Download a classification dataset and load its train and test splits as
# pandas DataFrames. Both CSV files live under the same remote directory.
ds_path = "https://raw.githubusercontent.com/google/yggdrasil-decision-forests/main/yggdrasil_decision_forests/test_data/dataset"
train_ds = pd.read_csv(f"{ds_path}/adult_train.csv")
test_ds = pd.read_csv(f"{ds_path}/adult_test.csv")

# Show the first 5 training examples (bare last expression -> rich display).
train_ds.head(5)
Out[1]:
age | workclass | fnlwgt | education | education_num | marital_status | occupation | relationship | race | sex | capital_gain | capital_loss | hours_per_week | native_country | income | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 44 | Private | 228057 | 7th-8th | 4 | Married-civ-spouse | Machine-op-inspct | Wife | White | Female | 0 | 0 | 40 | Dominican-Republic | <=50K |
1 | 20 | Private | 299047 | Some-college | 10 | Never-married | Other-service | Not-in-family | White | Female | 0 | 0 | 20 | United-States | <=50K |
2 | 40 | Private | 342164 | HS-grad | 9 | Separated | Adm-clerical | Unmarried | White | Female | 0 | 0 | 37 | United-States | <=50K |
3 | 30 | Private | 361742 | Some-college | 10 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 50 | United-States | <=50K |
4 | 67 | Self-emp-inc | 171564 | HS-grad | 9 | Married-civ-spouse | Prof-specialty | Wife | White | Female | 20051 | 0 | 30 | England | >50K |
In [3]:
Copied!
# Train a Random Forest on the training split to predict the "income" column.
# Only the label is specified; every other hyperparameter keeps its default.
model = ydf.RandomForestLearner(label="income").train(train_ds)

# Evaluate the trained model on the test split.
evaluation = model.evaluate(test_ds)

# Bare last expression: render the evaluation report as the cell output.
evaluation
Train model on 22792 examples Model trained in 0:00:01.169327
Out[3]:
accuracy:
0.866005
AUC: '>50K' vs others:
0.908676
PR-AUC: '>50K' vs others:
0.790029
loss:
0.394958
9769
9769
Label \ Pred | <=50K | >50K |
---|---|---|
<=50K | 6976 | 436 |
>50K | 873 | 1484 |
测试指标是比较模型质量的有用方法。我们已经训练了一个随机森林模型。现在,让我们训练一个梯度提升树模型,看看哪个模型表现更好。
注意: 在实际应用中,我们可能希望调整模型的超参数。然而,为了简单起见,我们将使用学习器的默认超参数。
In [12]:
Copied!
# Train a Gradient Boosted Trees model on the same training split and label,
# again with default hyperparameters, so it can be compared with the
# Random Forest.
model_2 = ydf.GradientBoostedTreesLearner(label="income").train(train_ds)

# Evaluate it on the same test split used for the Random Forest.
evaluation_2 = model_2.evaluate(test_ds)
Train model on 22792 examples Model trained in 0:00:03.561738
让我们来看一下测试指标:
In [14]:
Copied!
# Print accuracy and AUC of both models side by side, rounded to 4 decimals.
# `characteristics[0]` is the first per-class entry of each evaluation; its AUC
# matches the "'>50K' vs others" AUC shown in the Random Forest report.
print(f"""\
Test accuracy:
Random Forest: {evaluation.accuracy:.4f}
Gradient Boosted Trees: {evaluation_2.accuracy:.4f}
Test AUC:
Random Forest: {evaluation.characteristics[0].auc:.4f}
Gradient Boosted Trees: {evaluation_2.characteristics[0].auc:.4f}
""")
Test accuracy: Random Forest: 0.8660 Gradient Boosted Trees: 0.8739 Test AUC: Random Forest: 0.9087 Gradient Boosted Trees: 0.9296
看起来对于这个数据集,使用默认超参数的梯度提升树模型优于随机森林。