Tabular model

#|hide
! [ -e /content ] && pip install -Uqq fastai  # upgrade fastai on colab
from __future__ import annotations
from fastai.torch_basics import *
from fastai.tabular.core import *
from nbdev.showdoc import *

Basic model that can be used on tabular data

Embeddings

def emb_sz_rule(
    n_cat:int # Cardinality of a category
) -> int:
    "Rule of thumb to pick embedding size corresponding to `n_cat`"
    return min(600, round(1.6 * n_cat**0.56))
def _one_emb_sz(classes, n, sz_dict=None):
    "Pick an embedding size for `n` depending on `classes` if not given in `sz_dict`."
    sz_dict = ifnone(sz_dict, {})
    n_cat = len(classes[n])
    sz = sz_dict.get(n, int(emb_sz_rule(n_cat)))  # rule of thumb
    return n_cat,sz

Through trial and error, this general rule takes the lower of two values:

  • A dimension space of 600
  • A dimension space equal to 1.6 times the cardinality of the variable raised to the 0.56 power

This provides a good starting point for an embedding space for your variables. Advanced users who wish to lean into this practice can tweak these values at their discretion; it is not uncommon for slight adjustments to this general formula to yield better results.
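
As a quick sanity check (the numbers below follow directly from the formula), the size grows slowly with cardinality and is capped at 600:

[emb_sz_rule(n) for n in (2, 50, 1_000, 1_000_000)]  # [2, 14, 77, 600]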

def get_emb_sz(
    to:Tabular|TabularPandas, 
    sz_dict:dict=None # Dictionary of {'class_name' : size, ...} to override default `emb_sz_rule` 
) -> list: # The sizes of the embeddings for each categorical variable
    "Get embedding size for each cat_name in `Tabular` or `TabularPandas`, or populate embedding size manually using sz_dict"
    return [_one_emb_sz(to.classes, n, sz_dict) for n in to.cat_names]
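For example, on a small `TabularPandas` (a minimal sketch; note that `Categorify` adds a `#na#` placeholder class, so the cardinality is one more than the number of raw levels, and `sz_dict` lets you override the rule per column):

df = pd.DataFrame({'a': [1, 2, 3, 4, 2], 'b': [0.5, 1.5, 2.5, 3.5, 4.5]})
to = TabularPandas(df, procs=[Categorify], cat_names=['a'], cont_names=['b'])
get_emb_sz(to)             # [(5, 4)]: 4 levels + '#na#', and emb_sz_rule(5) == 4
get_emb_sz(to, {'a': 10})  # [(5, 10)]: size for 'a' overridden via sz_dict
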
class TabularModel(Module):
    "Basic model for tabular data."
    def __init__(self, 
        emb_szs:list, # Sequence of (num_embeddings, embedding_dim) for each categorical variable
        n_cont:int, # Number of continuous variables
        out_sz:int, # Number of outputs for final `LinBnDrop` layer
        layers:list, # Sequence of ints used to specify the input and output size of each `LinBnDrop` layer
        ps:float|MutableSequence=None, # Sequence of dropout probabilities for `LinBnDrop`
        embed_p:float=0., # Dropout probability for `Embedding` layer
        y_range=None, # Low and high for `SigmoidRange` activation
        use_bn:bool=True, # Use `BatchNorm1d` in `LinBnDrop` layers
        bn_final:bool=False, # Use `BatchNorm1d` on final layer
        bn_cont:bool=True, # Use `BatchNorm1d` on continuous variables
        act_cls=nn.ReLU(inplace=True), # Activation type for `LinBnDrop` layers
        lin_first:bool=True # Linear layer is first or last in `LinBnDrop` layers
    ):
        ps = ifnone(ps, [0]*len(layers))
        if not is_listy(ps): ps = [ps]*len(layers)
        self.embeds = nn.ModuleList([Embedding(ni, nf) for ni,nf in emb_szs])
        self.emb_drop = nn.Dropout(embed_p)
        self.bn_cont = nn.BatchNorm1d(n_cont) if bn_cont else None
        n_emb = sum(e.embedding_dim for e in self.embeds)
        self.n_emb,self.n_cont = n_emb,n_cont
        sizes = [n_emb + n_cont] + layers + [out_sz]
        actns = [act_cls for _ in range(len(sizes)-2)] + [None]
        _layers = [LinBnDrop(sizes[i], sizes[i+1], bn=use_bn and (i!=len(actns)-1 or bn_final), p=p, act=a, lin_first=lin_first)
                       for i,(p,a) in enumerate(zip(ps+[0.],actns))]
        if y_range is not None: _layers.append(SigmoidRange(*y_range))
        self.layers = nn.Sequential(*_layers)

    def forward(self, x_cat, x_cont=None):
        if self.n_emb != 0:
            x = [e(x_cat[:,i]) for i,e in enumerate(self.embeds)]
            x = torch.cat(x, 1)
            x = self.emb_drop(x)
        if self.n_cont != 0:
            if self.bn_cont is not None: x_cont = self.bn_cont(x_cont)
            x = torch.cat([x, x_cont], 1) if self.n_emb != 0 else x_cont
        return self.layers(x)

This model expects your `cat` and `cont` variables to be passed in separately. The `cat` variables go through an `Embedding` layer and potential `Dropout`, while the `cont` variables go through a potential `BatchNorm1d`. Afterwards both are concatenated and passed through a series of `LinBnDrop` layers, before a final `Linear` layer sized to the expected outputs.

emb_szs = [(4,2), (17,8)]  # two categorical variables with cardinalities 4 and 17
m = TabularModel(emb_szs, n_cont=2, out_sz=2, layers=[200,100]).eval()
x_cat = torch.tensor([[2,12]]).long()              # one row of categorical codes
x_cont = torch.tensor([[0.7633, -0.1887]]).float() # one row of continuous values
out = m(x_cat, x_cont)
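
The output has one row per input row and `out_sz` columns:

out.shape  # torch.Size([1, 2])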
@delegates(TabularModel.__init__)
def tabular_config(**kwargs):
    "Convenience function to easily create a config for `TabularModel`"
    return kwargs

Any direct setup of `TabularModel`'s internals should be passed through here:

config = tabular_config(embed_p=0.6, use_bn=False); config
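
The resulting dict is just keyword arguments, so it can be unpacked when the model is built; `tabular_learner` forwards its `config` argument to `TabularModel` in exactly this way (here reusing the `emb_szs` from above):

m = TabularModel(emb_szs, n_cont=2, out_sz=2, layers=[200,100], **config)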

Export -

from nbdev import nbdev_export
nbdev_export()