```python
! [ -e /content ] && pip install -Uqq fastai  # upgrade fastai on Colab
```
# Core text modules
```python
from __future__ import annotations
from fastai.data.all import *
from fastai.text.core import *
from fastai.text.models.awdlstm import *
from nbdev.showdoc import *
```
Contains the modules common between the different architectures and the generic functions to get models.
```python
_model_meta = {AWD_LSTM: {'hid_name':'emb_sz', 'url':URLs.WT103_FWD, 'url_bwd':URLs.WT103_BWD,
                          'config_lm':awd_lstm_lm_config, 'split_lm': awd_lstm_lm_split,
                          'config_clas':awd_lstm_clas_config, 'split_clas': awd_lstm_clas_split},}
              # Transformer: {'hid_name':'d_model', 'url':URLs.OPENAI_TRANSFORMER,
              #               'config_lm':tfmer_lm_config, 'split_lm': tfmer_lm_split,
              #               'config_clas':tfmer_clas_config, 'split_clas': tfmer_clas_split},
              # TransformerXL: {'hid_name':'d_model',
              #                 'config_lm':tfmerXL_lm_config, 'split_lm': tfmerXL_lm_split,
              #                 'config_clas':tfmerXL_clas_config, 'split_clas': tfmerXL_clas_split}}
```
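As a quick illustration (not part of the original notebook), the metadata for an architecture can be looked up directly from `_model_meta`; its entries are the defaults consumed by `get_language_model` and `get_text_classifier` below:

```python
# Illustrative lookup of the AWD_LSTM metadata defined above
meta = _model_meta[AWD_LSTM]
test_eq(meta['hid_name'], 'emb_sz')       # name of the hidden-size key inside the configs
lm_cfg   = meta['config_lm'].copy()       # default language-model config (awd_lstm_lm_config)
clas_cfg = meta['config_clas'].copy()     # default classifier config (awd_lstm_clas_config)
assert 'output_p' in lm_cfg and 'output_p' in clas_cfg
```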
## Language models
```python
class LinearDecoder(Module):
    "To go on top of a RNNCore module and create a Language Model."
    initrange=0.1

    def __init__(self,
        n_out:int, # Number of output channels
        n_hid:int, # Number of features in the encoder's last layer output
        output_p:float=0.1, # Input dropout probability
        tie_encoder:nn.Module=None, # If provided, the decoder weight is tied to `tie_encoder.weight`
        bias:bool=True # If `False`, the layer will not learn an additive bias
    ):
        self.decoder = nn.Linear(n_hid, n_out, bias=bias)
        self.decoder.weight.data.uniform_(-self.initrange, self.initrange)
        self.output_dp = RNNDropout(output_p)
        if bias: self.decoder.bias.data.zero_()
        if tie_encoder: self.decoder.weight = tie_encoder.weight

    def forward(self, input):
        dp_inp = self.output_dp(input)
        return self.decoder(dp_inp), input, dp_inp
```
```python
from fastai.text.models.awdlstm import *

enc = AWD_LSTM(100, 20, 10, 2)
x = torch.randint(0, 100, (10,5))
r = enc(x)

tst = LinearDecoder(100, 20, 0.1)
y = tst(r)
test_eq(y[1], r)
test_eq(y[2].shape, r.shape)
test_eq(y[0].shape, [10, 5, 100])

tst = LinearDecoder(100, 20, 0.1, tie_encoder=enc.encoder)
test_eq(tst.decoder.weight, enc.encoder.weight)
```
```python
class SequentialRNN(nn.Sequential):
    "A sequential module that passes the reset call to its children."
    def reset(self):
        for c in self.children(): getcallable(c, 'reset')()
```

```python
class _TstMod(Module):
    def reset(self): print('reset')

tst = SequentialRNN(_TstMod(), _TstMod())
test_stdout(tst.reset, 'reset\nreset')
```
```python
def get_language_model(
    arch, # Function or class that can generate a language model architecture
    vocab_sz:int, # Size of the vocabulary
    config:dict=None, # Model configuration dictionary
    drop_mult:float=1. # Multiplicative factor to scale all dropout probabilities in `config`
) -> SequentialRNN: # Language model with an `arch` encoder and a linear decoder
    "Create a language model from `arch` and its `config`."
    meta = _model_meta[arch]
    config = ifnone(config, meta['config_lm']).copy()
    for k in config.keys():
        if k.endswith('_p'): config[k] *= drop_mult
    tie_weights,output_p,out_bias = map(config.pop, ['tie_weights', 'output_p', 'out_bias'])
    init = config.pop('init') if 'init' in config else None
    encoder = arch(vocab_sz, **config)
    enc = encoder.encoder if tie_weights else None
    decoder = LinearDecoder(vocab_sz, config[meta['hid_name']], output_p, tie_encoder=enc, bias=out_bias)
    model = SequentialRNN(encoder, decoder)
    return model if init is None else model.apply(init)
```
The default `config` can be found in `_model_meta[arch]['config_lm']`. `drop_mult` is applied to all the dropout probabilities in that config.
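For illustration only (this mirrors the loop inside `get_language_model` rather than adding any new API), only the config keys ending in `_p` are dropout probabilities, so they are the only ones scaled by `drop_mult`:

```python
# Sketch of the drop_mult scaling applied to a copy of the default config
cfg = awd_lstm_lm_config.copy()
drop_mult = 0.5
for k in cfg:
    if k.endswith('_p'): cfg[k] *= drop_mult
test_eq(cfg['output_p'], awd_lstm_lm_config['output_p']*0.5)  # dropout keys are scaled
test_eq(cfg['emb_sz'],   awd_lstm_lm_config['emb_sz'])        # other keys are untouched
```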
```python
config = awd_lstm_lm_config.copy()
config.update({'n_hid':10, 'emb_sz':20})

tst = get_language_model(AWD_LSTM, 100, config=config)
x = torch.randint(0, 100, (10,5))
y = tst(x)
test_eq(y[0].shape, [10, 5, 100])
test_eq(y[1].shape, [10, 5, 20])
test_eq(y[2].shape, [10, 5, 20])
test_eq(tst[1].decoder.weight, tst[0].encoder.weight)
```

```python
# Test drop_mult
tst = get_language_model(AWD_LSTM, 100, config=config, drop_mult=0.5)
test_eq(tst[1].output_dp.p, config['output_p']*0.5)
for rnn in tst[0].rnns: test_eq(rnn.weight_p, config['weight_p']*0.5)
for dp in tst[0].hidden_dps: test_eq(dp.p, config['hidden_p']*0.5)
test_eq(tst[0].encoder_dp.embed_p, config['embed_p']*0.5)
test_eq(tst[0].input_dp.p, config['input_p']*0.5)
```
## Classifier models
```python
def _pad_tensor(t:Tensor, bs:int) -> Tensor:
    if t.size(0) < bs: return torch.cat([t, t.new_zeros(bs-t.size(0), *t.shape[1:])])
    return t
```
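A minimal sketch (values assumed, not from the original notebook) of what `_pad_tensor` does: it appends rows of zeros so a partial batch matches the requested batch size, and leaves an already full batch unchanged:

```python
t = torch.zeros(2, 4)
test_eq(_pad_tensor(t, 3).shape, [3, 4])  # one zero row appended to reach bs=3
test_eq(_pad_tensor(t, 2).shape, [2, 4])  # returned as-is when already large enough
```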
```python
class SentenceEncoder(Module):
    "Create an encoder over `module` that can process a full sentence."
    def __init__(self,
        bptt:int, # Backpropagation through time
        module:nn.Module, # A module that can process up to [`bs`, `bptt`] tokens
        pad_idx:int=1, # Padding token id
        max_len:int=None # Maximal output length
    ):
        store_attr('bptt,module,pad_idx,max_len')

    def reset(self): getcallable(self.module, 'reset')()

    def forward(self, input):
        bs,sl = input.size()
        self.reset()
        mask = input == self.pad_idx
        outs,masks = [],[]
        for i in range(0, sl, self.bptt):
            # Note: this expects that the sequence really begins on a round multiple of bptt
            real_bs = (input[:,i] != self.pad_idx).long().sum()
            o = self.module(input[:real_bs,i: min(i+self.bptt, sl)])
            if self.max_len is None or sl-i <= self.max_len:
                outs.append(o)
                masks.append(mask[:,i: min(i+self.bptt, sl)])
        outs = torch.cat([_pad_tensor(o, bs) for o in outs], dim=1)
        mask = torch.cat(masks, dim=1)
        return outs,mask
```
Warning: this module expects the input data to be padded with most of the padding first, so that each sequence begins at a round multiple of `bptt` (with the rest of the padding at the end). Use `pad_input_chunk` to get your data in the proper format.
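To make that layout concrete, here is a hand-built example (values assumed) matching the warning: with `bptt=5`, the shorter sentence gets one full chunk of padding at the front and the leftover padding at the end, which is the format `pad_input_chunk` produces:

```python
mod = nn.Embedding(5, 10)
enc = SentenceEncoder(5, mod, pad_idx=0)
long_seq  = torch.randint(1, 5, (15,))                     # 15 real tokens, no padding needed
short_tok = torch.randint(1, 5, (7,))                      # 7 real tokens
short_seq = torch.cat([torch.zeros(5, dtype=torch.long),   # one full bptt-sized chunk of padding first
                       short_tok,
                       torch.zeros(3, dtype=torch.long)])  # remaining padding at the end
# longer sequence first, so the rows that are padding at a given chunk are always the last ones
x = torch.stack([long_seq, short_seq])
out, mask = enc(x)
test_eq(mask, x==0)
test_eq(out.shape, [2, 15, 10])
```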
```python
mod = nn.Embedding(5, 10)
tst = SentenceEncoder(5, mod, pad_idx=0)
x = torch.randint(1, 5, (3, 15))
x[2,:5] = 0
out,mask = tst(x)

test_eq(out[:1], mod(x)[:1])
test_eq(out[2,5:], mod(x)[2,5:])
test_eq(mask, x==0)
```
```python
def masked_concat_pool(
    output:Tensor, # Output of sentence encoder
    mask:Tensor, # Boolean mask as returned by sentence encoder
    bptt:int # Backpropagation through time
) -> Tensor: # Concatenation of [last_hidden, max_pool, avg_pool]
    "Pool `MultiBatchEncoder` outputs into one vector [last_hidden, max_pool, avg_pool]"
    lens = output.shape[1] - mask.long().sum(dim=1)
    last_lens = mask[:,-bptt:].long().sum(dim=1)
    avg_pool = output.masked_fill(mask[:, :, None], 0).sum(dim=1)
    avg_pool.div_(lens.type(avg_pool.dtype)[:,None])
    max_pool = output.masked_fill(mask[:,:,None], -float('inf')).max(dim=1)[0]
    x = torch.cat([output[torch.arange(0, output.size(0)),-last_lens-1], max_pool, avg_pool], 1) # Concat pooling
    return x
```
```python
out = torch.randn(2,4,5)
mask = tensor([[True,True,False,False], [False,False,False,True]])
x = masked_concat_pool(out, mask, 2)

test_close(x[0,:5], out[0,-1])
test_close(x[1,:5], out[1,-2])
test_close(x[0,5:10], out[0,2:].max(dim=0)[0])
test_close(x[1,5:10], out[1,:3].max(dim=0)[0])
test_close(x[0,10:], out[0,2:].mean(dim=0))
test_close(x[1,10:], out[1,:3].mean(dim=0))
```

```python
# Test that the result is independent of padding by replacing the padded part with random content
out1 = torch.randn(2,4,5)
out1[0,2:] = out[0,2:].clone()
out1[1,:3] = out[1,:3].clone()
x1 = masked_concat_pool(out1, mask, 2)

test_eq(x, x1)
```
```python
class PoolingLinearClassifier(Module):
    "Create a linear classifier with pooling"
    def __init__(self,
        dims:list, # List of hidden sizes for the MLP as `int`s
        ps:list, # List of dropout probabilities as `float`s
        bptt:int, # Backpropagation through time
        y_range:tuple=None # Tuple of (low, high) output value bounds
    ):
        if len(ps) != len(dims)-1: raise ValueError("Number of layers and dropout values do not match.")
        acts = [nn.ReLU(inplace=True)] * (len(dims) - 2) + [None]
        layers = [LinBnDrop(i, o, p=p, act=a) for i,o,p,a in zip(dims[:-1], dims[1:], ps, acts)]
        if y_range is not None: layers.append(SigmoidRange(*y_range))
        self.layers = nn.Sequential(*layers)
        self.bptt = bptt

    def forward(self, input):
        out,mask = input
        x = masked_concat_pool(out, mask, self.bptt)
        x = self.layers(x)
        return x, out, out
```
```python
mod = nn.Embedding(5, 10)
tst = SentenceEncoder(5, mod, pad_idx=0)
x = torch.randint(1, 5, (3, 15))
x[2,:5] = 0
out,mask = tst(x)

test_eq(out[:1], mod(x)[:1])
test_eq(out[2,5:], mod(x)[2,5:])
test_eq(mask, x==0)
```
```python
mod = nn.Embedding(5, 10)
tst = nn.Sequential(SentenceEncoder(5, mod, pad_idx=0), PoolingLinearClassifier([10*3,4], [0.], 5))

x = torch.randint(1, 5, (3, 14))
x[2,:5] = 0
res,raw,out = tst(x)

test_eq(raw[:1], mod(x)[:1])
test_eq(raw[2,5:], mod(x)[2,5:])
test_eq(out[:1], mod(x)[:1])
test_eq(out[2,5:], mod(x)[2,5:])
test_eq(res.shape, [3,4])

x1 = torch.cat([x, tensor([0,0,0])[:,None]], dim=1)
res1,raw1,out1 = tst(x1)
test_eq(res, res1)
```
```python
def get_text_classifier(
    arch:callable, # Function or class that can generate a language model architecture
    vocab_sz:int, # Size of the vocabulary
    n_class:int, # Number of classes
    seq_len:int=72, # Backpropagation through time
    config:dict=None, # Encoder configuration dictionary
    drop_mult:float=1., # Multiplicative factor to scale all dropout probabilities in `config`
    lin_ftrs:list=None, # List of hidden sizes for the classifier head as `int`s
    ps:list=None, # List of dropout probabilities for the classifier head as `float`s
    pad_idx:int=1, # Padding token id
    max_len:int=72*20, # Maximal output length for `SentenceEncoder`
    y_range:tuple=None # Tuple of (low, high) output value bounds
):
    "Create a text classifier from `arch` and its `config`, maybe `pretrained`"
    meta = _model_meta[arch]
    cfg = meta['config_clas'].copy()
    cfg.update(ifnone(config, {}))
    config = cfg
    for k in config.keys():
        if k.endswith('_p'): config[k] *= drop_mult
    if lin_ftrs is None: lin_ftrs = [50]
    if ps is None: ps = [0.1]*len(lin_ftrs)
    layers = [config[meta['hid_name']] * 3] + lin_ftrs + [n_class]
    ps = [config.pop('output_p')] + ps
    init = config.pop('init') if 'init' in config else None
    encoder = SentenceEncoder(seq_len, arch(vocab_sz, **config), pad_idx=pad_idx, max_len=max_len)
    model = SequentialRNN(encoder, PoolingLinearClassifier(layers, ps, bptt=seq_len, y_range=y_range))
    return model if init is None else model.apply(init)
```
```python
config = awd_lstm_clas_config.copy()
config.update({'n_hid':10, 'emb_sz':20})

tst = get_text_classifier(AWD_LSTM, 100, 3, config=config)
x = torch.randint(2, 100, (10,5))
y = tst(x)
test_eq(y[0].shape, [10, 3])
test_eq(y[1].shape, [10, 5, 20])
test_eq(y[2].shape, [10, 5, 20])
```
```python
# Test that padding gives the same results
tst.eval()
y = tst(x)
x1 = torch.cat([x, tensor([2,1,1,1,1,1,1,1,1,1])[:,None]], dim=1)
y1 = tst(x1)
test_close(y[0][1:], y1[0][1:])
```
```python
# Test drop_mult
tst = get_text_classifier(AWD_LSTM, 100, 3, config=config, drop_mult=0.5)
test_eq(tst[1].layers[1][1].p, 0.1)
test_eq(tst[1].layers[0][1].p, config['output_p']*0.5)
for rnn in tst[0].module.rnns: test_eq(rnn.weight_p, config['weight_p']*0.5)
for dp in tst[0].module.hidden_dps: test_eq(dp.p, config['hidden_p']*0.5)
test_eq(tst[0].module.encoder_dp.embed_p, config['embed_p']*0.5)
test_eq(tst[0].module.input_dp.p, config['input_p']*0.5)
```
## Export -

```python
from nbdev import nbdev_export
nbdev_export()
```