```python
#|hide
! [ -e /content ] && pip install -Uqq fastai  # upgrade fastai on Colab
```
# Tabular model
```python
from __future__ import annotations
from fastai.torch_basics import *
from fastai.tabular.core import *
from nbdev.showdoc import *
```
Basic models that can be used on tabular data.
## Embeddings
```python
def emb_sz_rule(
    n_cat:int # Cardinality of a category
) -> int:
    "Rule of thumb to pick embedding size corresponding to `n_cat`"
    return min(600, round(1.6 * n_cat**0.56))
```
```python
def _one_emb_sz(classes, n, sz_dict=None):
    "Pick an embedding size for `n` depending on `classes` if not given in `sz_dict`."
    sz_dict = ifnone(sz_dict, {})
    n_cat = len(classes[n])
    sz = sz_dict.get(n, int(emb_sz_rule(n_cat)))  # rule of thumb
    return n_cat,sz
```
Through trial and error, this general rule takes the lower of two values:

- a dimension space of 600
- a dimension space equal to 1.6 times the cardinality of the variable, raised to the power 0.56

This provides a good starting point for your variables' embedding space. Advanced users who want to lean into this practice can tweak these values at their discretion; it is not uncommon for slight adjustments to this general formula to yield better results.
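As a quick illustration (a sketch, not part of the original notebook), the 600 cap only kicks in for very high-cardinality variables:

```python
# Evaluate the rule of thumb at a few cardinalities
for n_cat in (2, 10, 100, 10_000, 1_000_000):
    print(n_cat, emb_sz_rule(n_cat))
# prints: (2, 2), (10, 6), (100, 21), (10000, 278), (1000000, 600)
```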
```python
def get_emb_sz(
    to:Tabular|TabularPandas,
    sz_dict:dict=None # Dictionary of {'class_name' : size, ...} to override default `emb_sz_rule`
) -> list: # List of embedding sizes for each category
    "Get embedding size for each cat_name in `Tabular` or `TabularPandas`, or populate embedding size manually using sz_dict"
    return [_one_emb_sz(to.classes, n, sz_dict) for n in to.cat_names]
```
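A minimal usage sketch (not part of the original notebook; the toy DataFrame and its column names are invented for illustration): after processing with `Categorify`, `get_emb_sz` returns one `(num_embeddings, embedding_dim)` pair per categorical column:

```python
import pandas as pd

df = pd.DataFrame({'color':   ['red','blue','red','green'],
                   'size_cm': [1.0, 2.5, 3.1, 0.7],
                   'label':   [0, 1, 0, 1]})
to = TabularPandas(df, procs=[Categorify], cat_names=['color'],
                   cont_names=['size_cm'], y_names='label')
get_emb_sz(to)  # e.g. [(4, 3)]: 3 colors plus '#na#' gives 4 classes, and emb_sz_rule(4) == 3
```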
```python
class TabularModel(Module):
    "Basic model for tabular data."
    def __init__(self,
        emb_szs:list, # Sequence of (num_embeddings, embedding_dim) for each categorical variable
        n_cont:int, # Number of continuous variables
        out_sz:int, # Number of outputs for the final `LinBnDrop` layer
        layers:list, # Sequence of ints used to specify the input and output size of each `LinBnDrop` layer
        ps:float|MutableSequence=None, # Sequence of dropout probabilities for `LinBnDrop`
        embed_p:float=0., # Dropout probability for the `Embedding` layer
        y_range=None, # Low and high for the `SigmoidRange` activation
        use_bn:bool=True, # Use `BatchNorm1d` in `LinBnDrop` layers
        bn_final:bool=False, # Use `BatchNorm1d` on the final layer
        bn_cont:bool=True, # Use `BatchNorm1d` on continuous variables
        act_cls=nn.ReLU(inplace=True), # Activation type for `LinBnDrop` layers
        lin_first:bool=True # Whether the linear layer comes first or last in `LinBnDrop` layers
    ):
        ps = ifnone(ps, [0]*len(layers))
        if not is_listy(ps): ps = [ps]*len(layers)
        self.embeds = nn.ModuleList([Embedding(ni, nf) for ni,nf in emb_szs])
        self.emb_drop = nn.Dropout(embed_p)
        self.bn_cont = nn.BatchNorm1d(n_cont) if bn_cont else None
        n_emb = sum(e.embedding_dim for e in self.embeds)
        self.n_emb,self.n_cont = n_emb,n_cont
        sizes = [n_emb + n_cont] + layers + [out_sz]
        actns = [act_cls for _ in range(len(sizes)-2)] + [None]
        _layers = [LinBnDrop(sizes[i], sizes[i+1], bn=use_bn and (i!=len(actns)-1 or bn_final), p=p, act=a, lin_first=lin_first)
                       for i,(p,a) in enumerate(zip(ps+[0.],actns))]
        if y_range is not None: _layers.append(SigmoidRange(*y_range))
        self.layers = nn.Sequential(*_layers)

    def forward(self, x_cat, x_cont=None):
        if self.n_emb != 0:
            x = [e(x_cat[:,i]) for i,e in enumerate(self.embeds)]
            x = torch.cat(x, 1)
            x = self.emb_drop(x)
        if self.n_cont != 0:
            if self.bn_cont is not None: x_cont = self.bn_cont(x_cont)
            x = torch.cat([x, x_cont], 1) if self.n_emb != 0 else x_cont
        return self.layers(x)
```
The model expects your `cat` and `cont` variables to be passed separately. The `cat` variables go through an `Embedding` layer and an optional `Dropout`, while the `cont` variables go through an optional `BatchNorm1d`. The two are then concatenated and passed through a series of `LinBnDrop` layers, ending with a `Linear` layer sized to the expected output.
```python
emb_szs = [(4,2), (17,8)]
m = TabularModel(emb_szs, n_cont=2, out_sz=2, layers=[200,100]).eval()
x_cat = torch.tensor([[2,12]]).long()
x_cont = torch.tensor([[0.7633, -0.1887]]).float()
out = m(x_cat, x_cont)
```
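As a sketch of one more option (not in the original notebook): for bounded regression targets, passing `y_range` appends a `SigmoidRange` activation after the final linear layer, squashing predictions into that interval:

```python
# Hypothetical regression variant with a single output bounded in (0, 10)
m_reg = TabularModel(emb_szs, n_cont=2, out_sz=1, layers=[200,100], y_range=(0,10)).eval()
pred = m_reg(x_cat, x_cont)
assert 0 < pred.item() < 10  # SigmoidRange keeps the output inside y_range
```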
```python
@delegates(TabularModel.__init__)
def tabular_config(**kwargs):
    "Convenience function to easily create a config for `TabularModel`"
    return kwargs
```
Any direct setup of `TabularModel`'s internals should be passed through here:
```python
config = tabular_config(embed_p=0.6, use_bn=False); config
```
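Since the config is a plain dict of keyword overrides, one way to use it (a sketch, not from the original notebook) is to expand it straight into `TabularModel`; `tabular_learner` likewise accepts such a dict through its `config` argument:

```python
# Apply the overrides (embed_p=0.6, use_bn=False) on top of the defaults
m = TabularModel(emb_szs, n_cont=2, out_sz=2, layers=[200,100], **config)
```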
## Export -
```python
from nbdev import nbdev_export
nbdev_export()
```