Model hooks

! [ -e /content ] && pip install -Uqq fastai  # upgrade fastai on colab
from __future__ import annotations
from fastai.basics import *
from nbdev.showdoc import *
import math

Callbacks and helper functions to add hooks in models

from fastai.test_utils import *

What are hooks?

Hooks are functions you can attach to a particular layer in your model that will be executed in the forward pass (for forward hooks) or backward pass (for backward hooks). Here we begin with an introduction to the concept of hooks, but if you want to quickly implement one, you can skip to HookCallback (and read the example ActivationStats that follows).

A forward hook is a function that takes three arguments: the layer it's applied to, the input of that layer and the output of that layer.

tst_model = nn.Linear(5,3)
def example_forward_hook(m,i,o): print(m,i,o)
    
x = torch.randn(4,5)
hook = tst_model.register_forward_hook(example_forward_hook)
y = tst_model(x)
hook.remove()
Linear(in_features=5, out_features=3, bias=True) (tensor([[ 0.0117, -0.1157, -1.0055,  0.6962,  0.2329],
        [-0.6184,  0.3594,  2.2896, -0.8758,  0.2538],
        [-0.5746, -1.3166, -0.3460,  2.0481,  0.9366],
        [ 0.6335,  0.7545, -2.2502,  0.2476,  0.0433]]),) tensor([[-0.4212,  0.4666,  0.3218],
        [-0.2607,  0.3498, -0.4724],
        [-0.0859,  0.9967,  0.6624],
        [-0.6484,  0.2241,  0.2266]], grad_fn=<AddmmBackward0>)

A backward hook is a function that takes three arguments: the layer it's applied to, the gradient of the loss with respect to the input, and the gradient of the loss with respect to the output.

def example_backward_hook(m,gi,go): print(m,gi,go)
hook = tst_model.register_backward_hook(example_backward_hook)

x = torch.randn(4,5)
y = tst_model(x)
loss = y.pow(2).mean()
loss.backward()
hook.remove()
Linear(in_features=5, out_features=3, bias=True) (tensor([-0.3353,  0.0533,  0.1919]), None, tensor([[ 0.1835,  0.2476, -0.1970],
        [ 0.2397,  0.0214, -0.1315],
        [-0.0041, -0.2241,  0.0439],
        [ 0.2395,  0.1597, -0.1938],
        [ 0.3009,  0.1428, -0.1170]])) (tensor([[-0.0592,  0.0075,  0.0259],
        [-0.1145, -0.0367,  0.0386],
        [-0.1182, -0.0369,  0.1108],
        [-0.0435,  0.1193,  0.0165]]),)
/home/jhoward/miniconda3/lib/python3.9/site-packages/torch/nn/modules/module.py:1033: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior.
  warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes "

Hooks can change the input/output of a layer, or the gradients, and print values or shapes. If you want to store something related to these inputs/outputs, it's best to have your hook associated with a class, so that it can put it in the state of an instance of that class.
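
For instance, here is a minimal sketch of such a class-based hook (purely illustrative; the `_StoreOutput` name is made up for this example) that keeps the last output of a layer in its own state. The Hook class defined below generalizes this pattern:

class _StoreOutput:
    "Toy class-based hook: keep the last output of `m` in `self.stored`"
    def __init__(self, m): self.stored,self.hook = None,m.register_forward_hook(self.hook_fn)
    def hook_fn(self, m, i, o): self.stored = o.detach()
    def remove(self): self.hook.remove()

tst_model = nn.Linear(5,3)
sh = _StoreOutput(tst_model)
y = tst_model(torch.randn(4,5))
test_eq(sh.stored, y)
sh.remove()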

Hook -

@docs
class Hook():
    "Create a hook on `m` with `hook_func`."
    def __init__(self, m, hook_func, is_forward=True, detach=True, cpu=False, gather=False):
        store_attr('hook_func,detach,cpu,gather')
        f = m.register_forward_hook if is_forward else m.register_backward_hook
        self.hook = f(self.hook_fn)
        self.stored,self.removed = None,False

    def hook_fn(self, module, input, output):
        "Applies `hook_func` to `module`, `input`, `output`."
        if self.detach:
            input,output = to_detach(input, cpu=self.cpu, gather=self.gather),to_detach(output, cpu=self.cpu, gather=self.gather)
        self.stored = self.hook_func(module, input, output)

    def remove(self):
        "Remove the hook from the model."
        if not self.removed:
            self.hook.remove()
            self.removed=True

    def __enter__(self, *args): return self
    def __exit__(self, *args): self.remove()

    _docs = dict(__enter__="Register the hook",
                 __exit__="Remove the hook")

This function will be called during the forward pass if is_forward=True, during the backward pass otherwise, and will optionally detach, gather and put on the cpu the input/output of the model (or the gradients) before passing them to hook_func. The result of hook_func is stored in the stored attribute of the Hook.

tst_model = nn.Linear(5,3)
hook = Hook(tst_model, lambda m,i,o: o)
y = tst_model(x)
test_eq(hook.stored, y)
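
The same class handles backward hooks with is_forward=False; a minimal sketch, where stored ends up holding the detached gradients with respect to the layer's output (mirroring the gradient check done with hook_output further below):

tst_model = nn.Linear(5,3)
x = torch.randn(4,5)
hook = Hook(tst_model, lambda m,gi,go: go, is_forward=False)
y = tst_model(x)
y.pow(2).mean().backward()
test_close(hook.stored[0], 2*y/y.numel())  # gradient of the mean-squared loss w.r.t. the output
hook.remove()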
show_doc(Hook.hook_fn)

Hook.hook_fn[source]

Hook.hook_fn(module, input, output)

Applies hook_func to module, input, output.

show_doc(Hook.remove)

Hook.remove[source]

Hook.remove()

Remove the hook from the model.

Note

It's important to properly remove the hooks from your model once you're done with it, to avoid them being called again the next time the model is applied to some inputs, and to free the memory associated with their state.

tst_model = nn.Linear(5,10)
x = torch.randn(4,5)
y = tst_model(x)
hook = Hook(tst_model, example_forward_hook)
test_stdout(lambda: tst_model(x), f"{tst_model} ({x},) {y.detach()}")
hook.remove()
test_stdout(lambda: tst_model(x), "")

Context Manager

Since it's very important to remove your Hook even if your code is interrupted by some bug, Hook can be used as a context manager.

show_doc(Hook.__enter__)

Hook.__enter__[source]

Hook.__enter__(*args)

Register the hook

show_doc(Hook.__exit__)

Hook.__exit__[source]

Hook.__exit__(*args)

Remove the hook

tst_model = nn.Linear(5,10)
x = torch.randn(4,5)
y = tst_model(x)
with Hook(tst_model, example_forward_hook) as h:
    test_stdout(lambda: tst_model(x), f"{tst_model} ({x},) {y.detach()}")
test_stdout(lambda: tst_model(x), "")
def _hook_inner(m,i,o): return o if isinstance(o,Tensor) or is_listy(o) else list(o)

def hook_output(module, detach=True, cpu=False, grad=False):
    "Return a `Hook` that stores activations of `module` in `self.stored`"
    return Hook(module, _hook_inner, detach=detach, cpu=cpu, is_forward=not grad)

The activations stored are the gradients if grad=True, otherwise the output of module. If detach=True they are detached from their history, and if cpu=True, they're put on the CPU.

tst_model = nn.Linear(5,10)
x = torch.randn(4,5)
with hook_output(tst_model) as h:
    y = tst_model(x)
    test_eq(y, h.stored)
    assert not h.stored.requires_grad
    
with hook_output(tst_model, grad=True) as h:
    y = tst_model(x)
    loss = y.pow(2).mean()
    loss.backward()
    test_close(2*y / y.numel(), h.stored[0])

#|cuda
with hook_output(tst_model, cpu=True) as h:
    y = tst_model.cuda()(x.cuda())
    test_eq(h.stored.device, torch.device('cpu'))


Hooks -

@docs
class Hooks():
    "Create several hooks on the modules in `ms` with `hook_func`."
    def __init__(self, ms, hook_func, is_forward=True, detach=True, cpu=False):
        self.hooks = [Hook(m, hook_func, is_forward, detach, cpu) for m in ms]

    def __getitem__(self,i): return self.hooks[i]
    def __len__(self):       return len(self.hooks)
    def __iter__(self):      return iter(self.hooks)
    @property
    def stored(self):        return L(o.stored for o in self)

    def remove(self):
        "Remove the hooks from the model."
        for h in self.hooks: h.remove()

    def __enter__(self, *args): return self
    def __exit__ (self, *args): self.remove()

    _docs = dict(stored = "The states saved in each hook.",
                 __enter__="Register the hooks",
                 __exit__="Remove the hooks")
layers = [nn.Linear(5,10), nn.ReLU(), nn.Linear(10,3)]
tst_model = nn.Sequential(*layers)
hooks = Hooks(tst_model, lambda m,i,o: o)
y = tst_model(x)
test_eq(hooks.stored[0], layers[0](x))
test_eq(hooks.stored[1], F.relu(layers[0](x)))
test_eq(hooks.stored[2], y)
hooks.remove()
show_doc(Hooks.stored, name='Hooks.stored')

Hooks.stored[source]

The states saved in each hook.

show_doc(Hooks.remove)

Hooks.remove[source]

Hooks.remove()

Remove the hooks from the model.

Context Manager

Like Hook, you can use Hooks as a context manager.

show_doc(Hooks.__enter__)

Hooks.__enter__[source]

Hooks.__enter__(*args)

Register the hooks

show_doc(Hooks.__exit__)

Hooks.__exit__[source]

Hooks.__exit__(*args)

Remove the hooks

layers = [nn.Linear(5,10), nn.ReLU(), nn.Linear(10,3)]
tst_model = nn.Sequential(*layers)
with Hooks(layers, lambda m,i,o: o) as h:
    y = tst_model(x)
    test_eq(h.stored[0], layers[0](x))
    test_eq(h.stored[1], F.relu(layers[0](x)))
    test_eq(h.stored[2], y)
def hook_outputs(modules, detach=True, cpu=False, grad=False):
    "Return `Hooks` that store activations of all `modules` in `self.stored`"
    return Hooks(modules, _hook_inner, detach=detach, cpu=cpu, is_forward=not grad)

The activations stored are the gradients if grad=True, otherwise the output of modules. If detach=True they are detached from their history, and if cpu=True, they're put on the CPU.

layers = [nn.Linear(5,10), nn.ReLU(), nn.Linear(10,3)]
tst_model = nn.Sequential(*layers)
x = torch.randn(4,5)
with hook_outputs(layers) as h:
    y = tst_model(x)
    test_eq(h.stored[0], layers[0](x))
    test_eq(h.stored[1], F.relu(layers[0](x)))
    test_eq(h.stored[2], y)
    for s in h.stored: assert not s.requires_grad
    
with hook_outputs(layers, grad=True) as h:
    y = tst_model(x)
    loss = y.pow(2).mean()
    loss.backward()
    g = 2*y / y.numel()
    test_close(g, h.stored[2][0])
    g = g @ layers[2].weight.data
    test_close(g, h.stored[1][0])
    g = g * (layers[0](x) > 0).float()
    test_close(g, h.stored[0][0])

#|cuda
with hook_outputs(tst_model, cpu=True) as h:
    y = tst_model.cuda()(x.cuda())
    for s in h.stored: test_eq(s.device, torch.device('cpu'))


def dummy_eval(m, size=(64,64)):
    "Evaluate `m` on a dummy input of a certain `size`"
    ch_in = in_channels(m)
    x = one_param(m).new(1, ch_in, *size).requires_grad_(False).uniform_(-1.,1.)
    with torch.no_grad(): return m.eval()(x)
def model_sizes(m, size=(64,64)):
    "Pass a dummy input through the model `m` to get the various sizes of activations."
    with hook_outputs(m) as hooks:
        _ = dummy_eval(m, size=size)
        return [o.stored.shape for o in hooks]
m = nn.Sequential(ConvLayer(3, 16), ConvLayer(16, 32, stride=2), ConvLayer(32, 32))
test_eq(model_sizes(m), [[1, 16, 64, 64], [1, 32, 32, 32], [1, 32, 32, 32]])
def num_features_model(m):
    "Return the number of output features for `m`."
    sz,ch_in = 32,in_channels(m)
    while True:
        #Try a few sizes in case the model requires a big input size.
        try:
            return model_sizes(m, (sz,sz))[-1][1]
        except Exception as e:
            sz *= 2
            if sz > 2048: raise e
m = nn.Sequential(nn.Conv2d(5,4,3), nn.Conv2d(4,3,3))
test_eq(num_features_model(m), 3)
m = nn.Sequential(ConvLayer(3, 16), ConvLayer(16, 32, stride=2), ConvLayer(32, 32))
test_eq(num_features_model(m), 32)

HookCallback -

To make hooks easy to use, we wrap a version in a Callback where you just have to implement a hook function (plus any element you might need).

def has_params(m):
    "Check if `m` has at least one parameter"
    return len(list(m.parameters())) > 0
assert has_params(nn.Linear(3,4))
assert has_params(nn.LSTM(4,5,2))
assert not has_params(nn.ReLU())
@funcs_kwargs
class HookCallback(Callback):
    "`Callback` that can be used to register hooks on `modules`"
    _methods = ["hook"]
    hook = noops
    def __init__(self, modules=None, every=None, remove_end=True, is_forward=True, detach=True, cpu=True, include_paramless=False , **kwargs):
        store_attr('modules,every,remove_end,is_forward,detach,cpu, include_paramless')
        assert not kwargs

    def before_fit(self):
        "Register the `Hooks` on `self.modules`."
        if self.modules is None: self.modules = [m for m in flatten_model(self.model) if self.include_paramless or has_params(m)]
        if self.every is None: self._register()

    def before_batch(self):
        if self.every is None: return
        if self.training and self.train_iter%self.every==0: self._register()

    def after_batch(self):
        if self.every is None: return
        if self.training and self.train_iter%self.every==0: self._remove()

    def after_fit(self):
        "Remove the `Hooks`."
        if self.remove_end: self._remove()

    def _register(self): self.hooks = Hooks(self.modules, self.hook, self.is_forward, self.detach, self.cpu)
    def _remove(self):
        if getattr(self, 'hooks', None): self.hooks.remove()

    def __del__(self): self._remove()

You can either subclass this and implement a hook function (along with any event you want), or pass a hook function at init. Such a function needs to take three arguments: a layer, its input and its output (for a backward hook, the input is the gradients with respect to the inputs and the output is the gradients with respect to the output), and it can either modify them or update some state according to them.

If not provided, modules will default to the layers of self.model that have a weight attribute. (To also include the layers of self.model without a weight attribute, such as ReLU, Flatten etc., set include_paramless=True.) Depending on remove_end, the hooks are properly removed at the end of training (or in case of error). is_forward, detach and cpu are passed to Hooks.

The function called on each forward (or backward) pass is self.hook, and it must be implemented when subclassing this callback.
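
Alternatively, as mentioned above, the hook function can simply be passed at init instead of subclassing. A minimal sketch (the _mean_hook function and _collected_means list are hypothetical names used only for this example):

_collected_means = []
def _mean_hook(m, i, o): _collected_means.append(o.mean().item())  # record the mean activation of each hooked module

learn = synth_learner(n_trn=5, cbs=HookCallback(hook=_mean_hook))
learn.fit(1)
assert len(_collected_means) > 0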

class TstCallback(HookCallback):
    def hook(self, m, i, o): return o
    def after_batch(self): test_eq(self.hooks.stored[0], self.pred)
        
learn = synth_learner(n_trn=5, cbs = TstCallback())
learn.fit(1)
[0, 7.570430278778076, 7.6170854568481445, '00:00']
/home/jhoward/git/fastai/fastai/callback/core.py:67: UserWarning: You are shadowing an attribute (modules) that exists in the learner. Use `self.learn.modules` to avoid this
  warn(f"You are shadowing an attribute ({name}) that exists in the learner. Use `self.learn.{name}` to avoid this")
class TstCallback(HookCallback):
    def __init__(self, modules=None, remove_end=True, detach=True, cpu=False):
        super().__init__(modules, None, remove_end, False, detach, cpu)
    def hook(self, m, i, o): return o
    def after_batch(self):
        if self.training:
            test_eq(self.hooks.stored[0][0], 2*(self.pred-self.y)/self.pred.shape[0])
        
learn = synth_learner(n_trn=5, cbs = TstCallback())
learn.fit(1)
[0, 15.194129943847656, 15.124653816223145, '00:00']
show_doc(HookCallback.before_fit)

HookCallback.before_fit[source]

HookCallback.before_fit()

Register the Hooks on self.modules.

show_doc(HookCallback.after_fit)

HookCallback.after_fit[source]

HookCallback.after_fit()

Remove the Hooks.

Model summary

def total_params(m):
    "Give the number of parameters of a module and if it's trainable or not"
    params = sum([p.numel() for p in m.parameters()])
    trains = [p.requires_grad for p in m.parameters()]
    return params, (False if len(trains)==0 else trains[0])
test_eq(total_params(nn.Linear(10,32)), (32*10+32,True))
test_eq(total_params(nn.Linear(10,32, bias=False)), (32*10,True))
test_eq(total_params(nn.BatchNorm2d(20)), (20*2, True))
test_eq(total_params(nn.BatchNorm2d(20, affine=False)), (0,False))
test_eq(total_params(nn.Conv2d(16, 32, 3)), (16*32*3*3 + 32, True))
test_eq(total_params(nn.Conv2d(16, 32, 3, bias=False)), (16*32*3*3, True))
#First the ih layer is 20--10, all the rest are 10--10. *4 for the four gates
test_eq(total_params(nn.LSTM(20, 10, 2)), (4 * (20*10 + 10) + 3 * 4 * (10*10 + 10), True))
def layer_info(learn, *xb):
    "Return layer infos of `model` on `xb` (only support batch first inputs)"
    def _track(m, i, o): 
        params, trainable, shape = '', '', ''
        same = any((isinstance(x[0], torch.Tensor) and x[0].shape[1:] == x[1].shape for x in zip(i, o)))
        shape = apply(lambda x: x.shape, o)
        if hasattr(m, 'weight'): # non activation layer
            params, trainable = total_params(m)
        return (type(m).__name__, params, trainable, shape, same)
            
    with Hooks(flatten_model(learn.model), _track) as h:
        batch = apply(lambda o:o[:1], xb)
        train_only_cbs = [cb for cb in learn.cbs if hasattr(cb, '_only_train_loop')]
        with learn.removed_cbs(train_only_cbs), learn.no_logging(), learn as l:
            r = l.get_preds(dl=[batch], inner=True, reorder=False)
        return h.stored

The output of _track is expected to be a tuple of: the module's name, the number of parameters, the shape of the layer, whether it is trainable, which layer group it belongs to, and whether or not the size changed. Three potential groups can be shown:

  • Non-activation layers (Linear, Conv, etc.)
  • Activation layers
  • Pooling layers

Depending on the case, only part of the output is returned, the rest being ''. For non-activation layers, all the information is returned. Activation layers only return the name, the shape, and True for same. Pooling layers return the name, the new shape, and False for same.

def _m(): return nn.Sequential(nn.Linear(1,50), nn.ReLU(), nn.BatchNorm1d(50), nn.Linear(50, 1))
sample_input = torch.randn((16, 1))
test_eq(layer_info(synth_learner(model=_m()), sample_input), [
    ('Linear', 100, True, [1, 50], False),
    ('ReLU', '', '', [1,50], True),
    ('BatchNorm1d', 100, True, [1, 50], True),
    ('Linear', 51, True, [1, 1], False)
])
# test flatten
def _tst_m(): return nn.Sequential(
    nn.Conv2d(1, 2, kernel_size=3, padding=1, stride=2),
    nn.ReLU(),
    nn.Flatten(),
    nn.Linear(8,50), 
    nn.ReLU(), 
    nn.BatchNorm1d(50), 
    nn.Linear(50, 1)
)
                                                              
sample_input = torch.randn((1,1,4,4))
test_eq(layer_info(synth_learner(model=_tst_m()), sample_input), [
    ('Conv2d', 20, True, [1, 2, 2, 2], False),
    ('ReLU', '', '', [1, 2, 2, 2], True),
    ('Flatten', '', '', [1, 8], False),
    ('Linear', 450, True, [1, 50], False),
    ('ReLU', '', '', [1,50], True),
    ('BatchNorm1d', 100, True, [1, 50], True),
    ('Linear', 51, True, [1, 1], False)
])
# test multiple inputs model
class _2InpModel(Module):
    def __init__(self):
        super().__init__()
        self.seq = nn.Sequential(nn.Linear(2,50), nn.ReLU(), nn.BatchNorm1d(50), nn.Linear(50, 1))
    def forward(self, *inps):
        outputs = torch.cat(inps, dim=-1)
        return self.seq(outputs)

sample_inputs = (torch.randn(16, 1), torch.randn(16, 1))
learn = synth_learner(model=_2InpModel())
learn.dls.n_inp = 2
test_eq(layer_info(learn, *sample_inputs), [
    ('Linear', 150, True, [1, 50], False),
    ('ReLU', '', '', [1,50], True),
    ('BatchNorm1d', 100, True, [1, 50], True),
    ('Linear', 51, True, [1, 1], False)
])
def _get_shapes(o, bs): 
    inp = o[first(o)] if (isinstance(o, dict)) else o
    return ' x '.join([str(bs)] + [str(t) for t in inp[1:]])

def _print_shapes(o, bs):
    if isinstance(o, torch.Size): return _get_shapes(o, bs)
    elif isinstance(o, tuple): return _get_shapes(o[0], bs)
    else: return str([_print_shapes(x, bs) for x in o])
def module_summary(learn, *xb):
    "Print a summary of `model` using `xb`"
    #Individual parameters wrapped in ParameterModule aren't called through the hooks in `layer_info`,
    #  thus are not counted inside the summary
    #TODO: find a way to have them counted in param number somehow
    infos = layer_info(learn, *xb)
    n,bs = 76,find_bs(xb)
    inp_sz = _print_shapes(apply(lambda x:x.shape, xb), bs)
    res = f"{type(learn.model).__name__} (Input shape: {inp_sz})\n"
    res += "=" * n + "\n"
    res += f"{'Layer (type)':<20} {'Output Shape':<20} {'Param #':<10} {'Trainable':<10}\n"
    res += "=" * n
    ps,trn_ps,j = 0,0,0
    infos = [o for o in infos if o is not None] #see comment in previous cell
    prev_sz = None
    for typ,np,trn,sz,chnged in infos:
        if sz is None: continue
        if j == 0:
            res += f'\n{"":<20} {_print_shapes(sz, bs)[:19]:<20}' # to avoid a double line at the top
        if not chnged and not prev_sz == sz and j > 0: res += "\n" + "_" * n + "\n" + f'{"":<20} {_print_shapes(sz, bs)[:19]:<20}'
        j = 1
        res += f"\n{typ:<20} {'':<20} {np:<10} {str(trn):<10}"
        if np != '':
            ps += np
            if trn: trn_ps += np
        prev_sz = sz
    res += "\n" + "_" * n + "\n"
    res += f"\nTotal params: {ps:,}\n"
    res += f"Total trainable params: {trn_ps:,}\n"
    res += f"Total non-trainable params: {ps - trn_ps:,}\n\n"
    return PrettyString(res)
@patch
def summary(self:Learner):
    "Print a summary of the model, optimizer and loss function."
    xb = self.dls.train.one_batch()[:getattr(self.dls.train, "n_inp", 1)]
    res = module_summary(self, *xb)
    res += f"Optimizer used: {self.opt_func}\nLoss function: {self.loss_func}\n\n"
    if self.opt is not None:
        res += f"Model " + ("unfrozen\n\n" if self.opt.frozen_idx==0 else f"frozen up to parameter group #{self.opt.frozen_idx}\n\n")
    res += "Callbacks:\n" + '\n'.join(f"  - {cb}" for cb in self.cbs.sorted('order'))
    return PrettyString(res)
learn = synth_learner(model=_m())
learn.summary()
Sequential (Input shape: 16 x 1)
============================================================================
Layer (type)         Output Shape         Param #    Trainable 
============================================================================
                     16 x 50             
Linear                                    100        True      
ReLU                                                           
BatchNorm1d                               100        True      
____________________________________________________________________________
                     16 x 1              
Linear                                    51         True      
____________________________________________________________________________

Total params: 251
Total trainable params: 251
Total non-trainable params: 0

Optimizer used: functools.partial(<function SGD>, mom=0.9)
Loss function: FlattenedLoss of MSELoss()

Callbacks:
  - TrainEvalCallback
  - Recorder
#|cuda
learn = synth_learner(model=_m(), cuda=True)
learn.summary()
Sequential (Input shape: 16 x 1)
============================================================================
Layer (type)         Output Shape         Param #    Trainable 
============================================================================
                     16 x 50             
Linear                                    100        True      
ReLU                                                           
BatchNorm1d                               100        True      
____________________________________________________________________________
                     16 x 1              
Linear                                    51         True      
____________________________________________________________________________

Total params: 251
Total trainable params: 251
Total non-trainable params: 0

Optimizer used: functools.partial(<function SGD>, mom=0.9)
Loss function: FlattenedLoss of MSELoss()

Callbacks:
  - TrainEvalCallback
  - Recorder
# test multiple outputs
class _NOutModel(Module):
    def __init__(self): self.lin = nn.Linear(5, 6)
    def forward(self, x1):
        x = torch.randn((10, 5))
        return x,self.lin(x)

learn = synth_learner(model = _NOutModel())
learn.summary() # output shapes should be (50, 16, 256), (1, 16, 256)
_NOutModel (Input shape: 16 x 1)
============================================================================
Layer (type)         Output Shape         Param #    Trainable 
============================================================================
                     16 x 6              
Linear                                    36         True      
____________________________________________________________________________

Total params: 36
Total trainable params: 36
Total non-trainable params: 0

Optimizer used: functools.partial(<function SGD>, mom=0.9)
Loss function: FlattenedLoss of MSELoss()

Callbacks:
  - TrainEvalCallback
  - Recorder
# Test the case (as mentioned in the book) where learn.dls.train_ds is a list, not a fastai.data.core.Datasets object.
train_x = torch.rand((100, 4))
train_y = torch.rand((100, 1))

valid_x = torch.rand((100, 4))
valid_y = torch.rand((100,1))

dset = list(zip(train_x,train_y))
valid_dset = list(zip(valid_x,valid_y))

dl = DataLoader(dset, batch_size=16)
val_dl = DataLoader(valid_dset, batch_size=16)

dls = DataLoaders(dl, val_dl)

simple_net = nn.Sequential(
    nn.Linear(4, 2),
    nn.ReLU(),
    nn.Linear(2,1)
)
learn = Learner(dls, simple_net, loss_func=F.l1_loss)
learn.summary()
Sequential (Input shape: 16 x 4)
============================================================================
Layer (type)         Output Shape         Param #    Trainable 
============================================================================
                     16 x 2              
Linear                                    10         True      
ReLU                                                           
____________________________________________________________________________
                     16 x 1              
Linear                                    3          True      
____________________________________________________________________________

Total params: 13
Total trainable params: 13
Total non-trainable params: 0

Optimizer used: <function Adam>
Loss function: <function l1_loss>

Callbacks:
  - TrainEvalCallback
  - Recorder

Activation graphs

@delegates()
class ActivationStats(HookCallback):
    "Callback that record the mean and std of activations."
    order=-20
    def __init__(self, with_hist=False, **kwargs):
        super().__init__(**kwargs)
        self.with_hist = with_hist

    def before_fit(self):
        "Initialize stats."
        super().before_fit()
        self.stats = L()

    def hook(self, m, i, o):
        if isinstance(o, tuple): return self.hook_multi_ouput(o)
        o = o.float()
        res = {'mean': o.mean().item(), 'std': o.std().item(),
               'near_zero': (o<=0.05).long().sum().item()/o.numel()}
        if self.with_hist: res['hist'] = o.histc(40,0,10)
        return res
    
    def hook_multi_ouput(self,o_tuple):
        "For outputs of RNN which are [nested] tuples of tensors"
        res = []
        for o in self._flatten_tuple(o_tuple):
            if not(isinstance(o, Tensor)): continue
            res.append(self.hook(None, None, o))
        return res

    def _flatten_tuple(self, o_tuple):
        "Recursively flatten a [nested] tuple"
        res = []
        for it in o_tuple:
            if isinstance(it, tuple): res += self._flatten_tuple(it)
            else: res += [it]
        return tuple(res)

    def after_batch(self):
        "Take the stored results and puts it in `self.stats`"
        if self.training and (self.every is None or self.train_iter%self.every == 0): self.stats.append(self.hooks.stored)
        super().after_batch()

    def layer_stats(self, idx):
        lstats = self.stats.itemgot(idx)
        return L(lstats.itemgot(o) for o in ('mean','std','near_zero'))

    def hist(self, idx):
        res = self.stats.itemgot(idx).itemgot('hist')
        return torch.stack(tuple(res)).t().float().log1p()

    def color_dim(self, idx, figsize=(10,5), ax=None):
        "The 'colorful dimension' plot"
        res = self.hist(idx)
        if ax is None: ax = subplots(figsize=figsize)[1][0]
        ax.imshow(res, origin='lower')
        ax.axis('off')

    def plot_layer_stats(self, idx):
        _,axs = subplots(1, 3, figsize=(12,3))
        for o,ax,title in zip(self.layer_stats(idx),axs,('mean','std','% near zero')):
            ax.plot(o)
            ax.set_title(title)
learn = synth_learner(n_trn=5, cbs = ActivationStats(every=4))
learn.fit(1)
[0, 9.915902137756348, 10.236139297485352, '00:00']
learn.activation_stats.stats
(#2) [[{'mean': 1.0413528680801392, 'std': 0.4082348346710205, 'near_zero': 0.0}],[{'mean': 0.7963836193084717, 'std': 0.3677118122577667, 'near_zero': 0.0}]]

The first line contains the means of the outputs of the model for each batch in the training set, and the second line their standard deviations.
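
The recorded statistics can also be plotted per hooked module with the helpers defined above; a minimal sketch (passing with_hist=True so that the histogram needed by color_dim is recorded):

learn = synth_learner(n_trn=5, cbs=ActivationStats(with_hist=True))
learn.fit(1)
learn.activation_stats.plot_layer_stats(0)  # mean, std and % near zero of the first hooked module
learn.activation_stats.color_dim(0)         # the 'colorful dimension' histogram plot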

def test_activation_stats_include_paramless(include_paramless=False):
    "create a learner, fit, then check number of layers"
    modl = nn.Sequential(nn.Linear(1,50), nn.ReLU(), nn.BatchNorm1d(50), nn.Linear(50, 1), nn.Flatten())

    learn = synth_learner(model=modl, cbs=ActivationStats(every=4, include_paramless=include_paramless))
    learn.fit(1)
    
    expected_stats_len = 3  
    if include_paramless: expected_stats_len = 5 # include the ReLU and Flatten layers
    test_eq(expected_stats_len, len(learn.activation_stats.modules))

test_activation_stats_include_paramless(include_paramless=True)
test_activation_stats_include_paramless(include_paramless=False)
[0, 11.84472370147705, 7.684460639953613, '00:00']
[0, 10.660934448242188, 6.482079029083252, '00:00']
def test_every(n_tr, every):
    "create a learner, fit, then check number of stats collected"
    learn = synth_learner(n_trn=n_tr, cbs=ActivationStats(every=every))
    learn.fit(1)
    expected_stats_len = math.ceil(n_tr / every)
    test_eq(expected_stats_len, len(learn.activation_stats.stats))
    
for n_tr in [11, 12, 13]:
    test_every(n_tr, 4)
    test_every(n_tr, 1)
[0, 6.150048732757568, 4.771674156188965, '00:00']
[0, 17.470989227294922, 17.58202362060547, '00:00']
[0, 10.142230987548828, 9.362530708312988, '00:00']
[0, 3.4879150390625, 3.3121471405029297, '00:00']
[0, 14.660429000854492, 17.298110961914062, '00:00']
[0, 22.280864715576172, 18.45922088623047, '00:00']
class TstCallback(HookCallback):
    def hook(self, m, i, o): return o
    def before_fit(self):
        super().before_fit()
        self.means,self.stds = [],[]
    
    def after_batch(self):
        if self.training:
            self.means.append(self.hooks.stored[0].mean().item())
            self.stds.append (self.hooks.stored[0].std() .item())

learn = synth_learner(n_trn=5, cbs = [TstCallback(), ActivationStats()])
learn.fit(1)
test_eq(learn.activation_stats.stats.itemgot(0).itemgot("mean"), learn.tst.means)
test_eq(learn.activation_stats.stats.itemgot(0).itemgot("std"),  learn.tst.stds)
[0, 9.024697303771973, 6.801002025604248, '00:00']

Export -

from nbdev import nbdev_export
nbdev_export()
Converted 00_torch_core.ipynb.
Converted 01_layers.ipynb.
Converted 01a_losses.ipynb.
Converted 02_data.load.ipynb.
Converted 03_data.core.ipynb.
Converted 04_data.external.ipynb.
Converted 05_data.transforms.ipynb.
Converted 06_data.block.ipynb.
Converted 07_vision.core.ipynb.
Converted 08_vision.data.ipynb.
Converted 09_vision.augment.ipynb.
Converted 09b_vision.utils.ipynb.
Converted 09c_vision.widgets.ipynb.
Converted 10_tutorial.pets.ipynb.
Converted 10b_tutorial.albumentations.ipynb.
Converted 11_vision.models.xresnet.ipynb.
Converted 12_optimizer.ipynb.
Converted 13_callback.core.ipynb.
Converted 13a_learner.ipynb.
Converted 13b_metrics.ipynb.
Converted 14_callback.schedule.ipynb.
Converted 14a_callback.data.ipynb.
Converted 15_callback.hook.ipynb.
Converted 15a_vision.models.unet.ipynb.
Converted 16_callback.progress.ipynb.
Converted 17_callback.tracker.ipynb.
Converted 18_callback.fp16.ipynb.
Converted 18a_callback.training.ipynb.
Converted 18b_callback.preds.ipynb.
Converted 19_callback.mixup.ipynb.
Converted 20_interpret.ipynb.
Converted 20a_distributed.ipynb.
Converted 21_vision.learner.ipynb.
Converted 22_tutorial.imagenette.ipynb.
Converted 23_tutorial.vision.ipynb.
Converted 24_tutorial.image_sequence.ipynb.
Converted 24_tutorial.siamese.ipynb.
Converted 24_vision.gan.ipynb.
Converted 30_text.core.ipynb.
Converted 31_text.data.ipynb.
Converted 32_text.models.awdlstm.ipynb.
Converted 33_text.models.core.ipynb.
Converted 34_callback.rnn.ipynb.
Converted 35_tutorial.wikitext.ipynb.
Converted 37_text.learner.ipynb.
Converted 38_tutorial.text.ipynb.
Converted 39_tutorial.transformers.ipynb.
Converted 40_tabular.core.ipynb.
Converted 41_tabular.data.ipynb.
Converted 42_tabular.model.ipynb.
Converted 43_tabular.learner.ipynb.
Converted 44_tutorial.tabular.ipynb.
Converted 45_collab.ipynb.
Converted 46_tutorial.collab.ipynb.
Converted 50_tutorial.datablock.ipynb.
Converted 60_medical.imaging.ipynb.
Converted 61_tutorial.medical_imaging.ipynb.
Converted 65_medical.text.ipynb.
Converted 70_callback.wandb.ipynb.
Converted 71_callback.tensorboard.ipynb.
Converted 72_callback.neptune.ipynb.
Converted 73_callback.captum.ipynb.
Converted 74_callback.azureml.ipynb.
Converted 97_test_utils.ipynb.
Converted 99_pytorch_doc.ipynb.
Converted dev-setup.ipynb.
Converted app_examples.ipynb.
Converted camvid.ipynb.
Converted migrating_catalyst.ipynb.
Converted migrating_ignite.ipynb.
Converted migrating_lightning.ipynb.
Converted migrating_pytorch.ipynb.
Converted migrating_pytorch_verbose.ipynb.
Converted ulmfit.ipynb.
Converted index.ipynb.
Converted quick_start.ipynb.
Converted tutorial.ipynb.