! [ -e /content ] && pip install -Uqq fastai # upgrade fastai on Colab
Hyperparam schedule

Callback and helper functions to schedule any hyper-parameter

from __future__ import annotations
from fastai.basics import *
from fastai.callback.tracker import SaveModelCallback

_all_ = ['SuggestionMethod']

from nbdev.showdoc import *
from fastai.test_utils import *
Annealing
class _Annealer:
    def __init__(self, f, start, end): store_attr('f,start,end')
    def __call__(self, pos): return self.f(self.start, self.end, pos)

def annealer(f):
    "Decorator to make `f` return itself partially applied."
    @functools.wraps(f)
    def _inner(start, end): return _Annealer(f, start, end)
    return _inner
This is the decorator we will use for all of our scheduling functions, since it transforms a function taking `(start, end, pos)` into a function taking `(start, end)` that returns a function of `pos`.
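For example, here is a minimal usage sketch (the `sched_half` function below is hypothetical, purely for illustration):

```python
# Decorating a (start, end, pos) function with `annealer` gives back a factory that
# only needs (start, end); the returned object is then called with `pos` in [0, 1].
@annealer
def sched_half(start, end, pos): return start + 0.5*pos*(end-start)

f = sched_half(0., 2.)   # partially applied: only `pos` is left
f(1.0)                   # -> 1.0, i.e. 0. + 0.5*1.0*(2.-0.)
```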
#TODO Jeremy, make this pickle
#@annealer
#def SchedLin(start, end, pos): return start + pos*(end-start)
#@annealer
#def SchedCos(start, end, pos): return start + (1 + math.cos(math.pi*(1-pos))) * (end-start) / 2
#@annealer
#def SchedNo (start, end, pos): return start
#@annealer
#def SchedExp(start, end, pos): return start * (end/start) ** pos
#
#SchedLin.__doc__ = "Linear schedule function from `start` to `end`"
#SchedCos.__doc__ = "Cosine schedule function from `start` to `end`"
#SchedNo .__doc__ = "Constant schedule function with `start` value"
#SchedExp.__doc__ = "Exponential schedule function from `start` to `end`"
def sched_lin(start, end, pos): return start + pos*(end-start)
def sched_cos(start, end, pos): return start + (1 + math.cos(math.pi*(1-pos))) * (end-start) / 2
def sched_no (start, end, pos): return start
def sched_exp(start, end, pos): return start * (end/start) ** pos
def SchedLin(start, end): return _Annealer(sched_lin, start, end)
def SchedCos(start, end): return _Annealer(sched_cos, start, end)
def SchedNo (start, end): return _Annealer(sched_no, start, end)
def SchedExp(start, end): return _Annealer(sched_exp, start, end)
= "Linear schedule function from `start` to `end`"
SchedLin.__doc__ = "Cosine schedule function from `start` to `end`"
SchedCos.__doc__ = "Constant schedule function with `start` value"
SchedNo .__doc__ = "Exponential schedule function from `start` to `end`" SchedExp.__doc__
= pickle.dumps(SchedCos(0, 5)) tst
= "NO LINEAR COS EXP".split()
annealings = torch.linspace(0.,1,100)
p = [SchedNo, SchedLin, SchedCos, SchedExp] fns
def SchedPoly(start, end, power):
    "Polynomial schedule (of `power`) function from `start` to `end`"
    def _inner(pos): return start + (end - start) * pos ** power
    return _inner
for fn, t in zip(fns, annealings):
    plt.plot(p, [fn(2, 1e-2)(o) for o in p], label=t)
f = SchedPoly(2,1e-2,0.5)
plt.plot(p, [f(o) for o in p], label="POLY(0.5)")
plt.legend();
show_doc(SchedLin)
sched = SchedLin(0, 2)
test_eq(L(map(sched, [0., 0.25, 0.5, 0.75, 1.])), [0., 0.5, 1., 1.5, 2.])
show_doc(SchedCos)
sched = SchedCos(0, 2)
test_close(L(map(sched, [0., 0.25, 0.5, 0.75, 1.])), [0., 0.29289, 1., 1.70711, 2.])
show_doc(SchedNo)
sched = SchedNo(0, 2)
test_close(L(map(sched, [0., 0.25, 0.5, 0.75, 1.])), [0., 0., 0., 0., 0.])
show_doc(SchedExp)
sched = SchedExp(1, 2)
test_close(L(map(sched, [0., 0.25, 0.5, 0.75, 1.])), [1., 1.18921, 1.41421, 1.68179, 2.])
show_doc(SchedPoly)
SchedPoly [source]

SchedPoly(start, end, power)

Polynomial schedule (of `power`) function from `start` to `end`
sched = SchedPoly(0, 2, 2)
test_close(L(map(sched, [0., 0.25, 0.5, 0.75, 1.])), [0., 0.125, 0.5, 1.125, 2.])
p = torch.linspace(0.,1,100)

pows = [0.5,1.,2.]
for e in pows:
    f = SchedPoly(2, 0, e)
    plt.plot(p, [f(o) for o in p], label=f'power {e}')
plt.legend();
def combine_scheds(pcts, scheds):
    "Combine `scheds` according to `pcts` in one function"
    assert sum(pcts) == 1.
    pcts = tensor([0] + L(pcts))
    assert torch.all(pcts >= 0)
    pcts = torch.cumsum(pcts, 0)
    pct_lim = len(pcts) - 2
    def _inner(pos):
        idx = min((pos >= pcts).nonzero().max(), pct_lim)
        actual_pos = (pos-pcts[idx]) / (pcts[idx+1]-pcts[idx])
        return scheds[idx](actual_pos.item())
    return _inner
`pcts` must be a list of positive numbers that add up to 1 and has the same length as `scheds`. The generated function will use `scheds[0]` from 0 to `pcts[0]`, then `scheds[1]` from `pcts[0]` to `pcts[0]+pcts[1]`, and so forth.
p = torch.linspace(0.,1,100)
f = combine_scheds([0.3,0.7], [SchedCos(0.3,0.6), SchedCos(0.6,0.2)])
plt.plot(p, [f(o) for o in p]);
p = torch.linspace(0.,1,100)
f = combine_scheds([0.3,0.2,0.5], [SchedLin(0.,1.), SchedNo(1.,1.), SchedCos(1., 0.)])
plt.plot(p, [f(o) for o in p]);

test_close([f(0.), f(0.15), f(0.3), f(0.4), f(0.5), f(0.7), f(1.)],
           [0., 0.5, 1., 1., 1., 0.65451, 0.])
def combined_cos(pct, start, middle, end):
    "Return a scheduler with cosine annealing from `start`→`middle` & `middle`→`end`"
    return combine_scheds([pct,1-pct], [SchedCos(start, middle), SchedCos(middle, end)])
This is a useful helper function for the 1cycle policy. `pct` is used for the `start` to `middle` part, and `1-pct` for the `middle` to `end` part. It handles floats or collections of floats. For example:
f = combined_cos(0.25,0.5,1.,0.)
plt.plot(p, [f(o) for o in p]);

test_close([f(0.), f(0.1), f(0.25), f(0.5), f(1.)], [0.5, 0.67275, 1., 0.75, 0.])
f = combined_cos(0.25, np.array([0.25,0.5]), np.array([0.5,1.]), np.array([0.,0.]))
for a,b in zip([f(0.), f(0.1), f(0.25), f(0.5), f(1.)],
               [[0.25,0.5], [0.33638,0.67275], [0.5,1.], [0.375,0.75], [0.,0.]]):
    test_close(a,b)
ParamScheduler -
@docs
class ParamScheduler(Callback):
    "Schedule hyper-parameters according to `scheds`"
    order,run_valid = 60,False

    def __init__(self, scheds): self.scheds = scheds
    def before_fit(self): self.hps = {p:[] for p in self.scheds.keys()}
    def before_batch(self): self._update_val(self.pct_train)

    def _update_val(self, pct):
        for n,f in self.scheds.items(): self.opt.set_hyper(n, f(pct))

    def after_batch(self):
        for p in self.scheds.keys(): self.hps[p].append(self.opt.hypers[-1][p])

    def after_fit(self):
        if hasattr(self.learn, 'recorder') and hasattr(self, 'hps'): self.recorder.hps = self.hps

    _docs = {"before_fit": "Initialize container for hyper-parameters",
             "before_batch": "Set the proper hyper-parameters in the optimizer",
             "after_batch": "Record hyper-parameters of this batch",
             "after_fit": "Save the hyper-parameters in the recorder if there is one"}
`scheds` is a dictionary with one key per hyper-parameter you want to schedule, whose value is either a schedule or a list of schedules (in the second case, the list must have the same length as the number of parameter groups of the optimizer).
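Beyond the learning-rate example below, here is a hedged sketch (assumed schedules and values) of scheduling two hyper-parameters at once:

```python
# Minimal sketch, assuming the default synthetic learner: anneal the learning rate with a
# cosine and the weight decay linearly; both are recorded in `learn.recorder.hps`.
learn = synth_learner()
scheds = {'lr': SchedCos(1e-3, 1e-5), 'wd': SchedLin(1e-2, 1e-3)}
learn.fit(1, cbs=ParamScheduler(scheds))
learn.recorder.plot_sched()
```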
learn = synth_learner()
sched = {'lr': SchedLin(1e-3, 1e-2)}
learn.fit(1, cbs=ParamScheduler(sched))
n = len(learn.dls.train)
test_close(learn.recorder.hps['lr'], [1e-3 + (1e-2-1e-3) * i/n for i in range(n)])
epoch | train_loss | valid_loss | time |
---|---|---|---|
0 | 11.929138 | 4.039281 | 00:00 |
#test discriminative lrs
def _splitter(m): return [[m.a], [m.b]]
learn = synth_learner(splitter=_splitter)
sched = {'lr': combined_cos(0.5, np.array([1e-4,1e-3]), np.array([1e-3,1e-2]), np.array([1e-5,1e-4]))}
learn.fit(1, cbs=ParamScheduler(sched))
epoch | train_loss | valid_loss | time |
---|---|---|---|
0 | 21.979218 | 21.261951 | 00:00 |
show_doc(ParamScheduler.before_fit)
ParamScheduler.before_fit [source]

ParamScheduler.before_fit()

Initialize container for hyper-parameters

show_doc(ParamScheduler.before_batch)

ParamScheduler.before_batch [source]

ParamScheduler.before_batch()

Set the proper hyper-parameters in the optimizer

show_doc(ParamScheduler.after_batch)

ParamScheduler.after_batch [source]

ParamScheduler.after_batch()

Record hyper-parameters of this batch

show_doc(ParamScheduler.after_fit)

ParamScheduler.after_fit [source]

ParamScheduler.after_fit()

Save the hyper-parameters in the recorder if there is one
@patch
def fit_one_cycle(self:Learner, n_epoch, lr_max=None, div=25., div_final=1e5, pct_start=0.25, wd=None,
                  moms=None, cbs=None, reset_opt=False, start_epoch=0):
    "Fit `self.model` for `n_epoch` using the 1cycle policy."
    if self.opt is None: self.create_opt()
    self.opt.set_hyper('lr', self.lr if lr_max is None else lr_max)
    lr_max = np.array([h['lr'] for h in self.opt.hypers])
    scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
              'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
    self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd, start_epoch=start_epoch)
The 1cycle policy was introduced by Leslie N. Smith et al. in Super-Convergence: Very Fast Training of Neural Networks Using Large Learning Rates. It schedules the learning rate with cosine annealing from `lr_max/div` to `lr_max`, then back down to `lr_max/div_final` (pass an array to `lr_max` if you want to use differential learning rates), and the momentum with cosine annealing according to the values in `moms`. The first phase takes `pct_start` of the training. You can optionally pass additional `cbs` and `reset_opt`.
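For instance, a hedged usage sketch (synthetic learner, assumed values) showing the main knobs:

```python
# Minimal sketch with assumed values: peak lr of 1e-3, a custom (start, middle, end)
# momentum schedule, and a longer warm-up covering 30% of training.
learn = synth_learner()
learn.fit_one_cycle(2, lr_max=1e-3, moms=(0.95, 0.85, 0.95), pct_start=0.3)
learn.recorder.plot_sched()   # plots the lr and momentum schedules that were applied
```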
#Integration test: training a few epochs should make the model better
learn = synth_learner(lr=1e-2)
xb,yb = learn.dls.one_batch()
init_loss = learn.loss_func(learn.model(xb), yb)
learn.fit_one_cycle(2)
xb,yb = learn.dls.one_batch()
final_loss = learn.loss_func(learn.model(xb), yb)
assert final_loss < init_loss
epoch | train_loss | valid_loss | time |
---|---|---|---|
0 | 19.444899 | 6.755066 | 00:00 |
1 | 9.919473 | 1.044571 | 00:00 |
#Scheduler test
lrs,moms = learn.recorder.hps['lr'],learn.recorder.hps['mom']
test_close(lrs,  [combined_cos(0.25,1e-2/25,1e-2,1e-7)(i/20) for i in range(20)])
test_close(moms, [combined_cos(0.25,0.95,0.85,0.95)(i/20) for i in range(20)])
@patch
def plot_sched(self:Recorder, keys=None, figsize=None):
    keys = self.hps.keys() if keys is None else L(keys)
    rows,cols = (len(keys)+1)//2, min(2, len(keys))
    figsize = figsize or (6*cols,4*rows)
    _, axs = plt.subplots(rows, cols, figsize=figsize)
    axs = axs.flatten() if len(keys) > 1 else L(axs)
    for p,ax in zip(keys, axs):
        ax.plot(self.hps[p])
        ax.set_ylabel(p)
#test discriminative lrs
def _splitter(m): return [[m.a], [m.b]]
learn = synth_learner(splitter=_splitter)
learn.fit_one_cycle(1, lr_max=slice(1e-3,1e-2))
#n = len(learn.dls.train)
#test_close(learn.recorder.hps['lr'], [1e-3 + (1e-2-1e-3) * i/n for i in range(n)])
epoch | train_loss | valid_loss | time |
---|---|---|---|
0 | 17.738113 | 21.716423 | 00:00 |
learn = synth_learner()
learn.fit_one_cycle(2)
epoch | train_loss | valid_loss | time |
---|---|---|---|
0 | 5.406837 | 5.305011 | 00:00 |
1 | 5.058437 | 4.899223 | 00:00 |
learn.recorder.plot_sched()
@patch
def fit_flat_cos(self:Learner, n_epoch, lr=None, div_final=1e5, pct_start=0.75, wd=None,
                 cbs=None, reset_opt=False, start_epoch=0):
    "Fit `self.model` for `n_epoch` at flat `lr` before a cosine annealing."
    if self.opt is None: self.create_opt()
    self.opt.set_hyper('lr', self.lr if lr is None else lr)
    lr = np.array([h['lr'] for h in self.opt.hypers])
    scheds = {'lr': combined_cos(pct_start, lr, lr, lr/div_final)}
    self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd, start_epoch=0)
learn = synth_learner()
learn.fit_flat_cos(2)
epoch | train_loss | valid_loss | time |
---|---|---|---|
0 | 10.588930 | 7.106113 | 00:00 |
1 | 8.943380 | 5.016665 | 00:00 |
learn.recorder.plot_sched()
@patch
def fit_sgdr(self:Learner, n_cycles, cycle_len, lr_max=None, cycle_mult=2, cbs=None, reset_opt=False, wd=None,
             start_epoch=0):
    "Fit `self.model` for `n_cycles` of `cycle_len` using SGDR."
    if self.opt is None: self.create_opt()
    self.opt.set_hyper('lr', self.lr if lr_max is None else lr_max)
    lr_max = np.array([h['lr'] for h in self.opt.hypers])
    n_epoch = cycle_len * (cycle_mult**n_cycles-1)//(cycle_mult-1)
    pcts = [cycle_len * cycle_mult**i / n_epoch for i in range(n_cycles)]
    scheds = [SchedCos(lr_max, 0) for _ in range(n_cycles)]
    scheds = {'lr': combine_scheds(pcts, scheds)}
    self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd, start_epoch=start_epoch)
This schedule was introduced by Ilya Loshchilov et al. in SGDR: Stochastic Gradient Descent with Warm Restarts. It consists of `n_cycles` of cosine annealing from `lr_max` (which defaults to the `Learner` lr) to 0, where the `i`-th cycle has length `cycle_len * cycle_mult**i` (the first one is `cycle_len` long, then the length is multiplied by `cycle_mult` at each cycle). You can optionally pass additional `cbs` and `reset_opt`.
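To make the cycle-length arithmetic concrete, here is a small sketch (values chosen to match the test below):

```python
# With n_cycles=3, cycle_len=1, cycle_mult=2 the cycles last 1, 2 and 4 epochs,
# i.e. 7 epochs in total -- the same formula used inside `fit_sgdr`.
n_cycles, cycle_len, cycle_mult = 3, 1, 2
n_epoch = cycle_len * (cycle_mult**n_cycles - 1) // (cycle_mult - 1)
assert n_epoch == 7
```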
::: {#cell-60 .cell 0='s' 1='l' 2='o' 3='w'}
learn = synth_learner()
with learn.no_logging(): learn.fit_sgdr(3, 1)
test_eq(learn.n_epoch, 7)
iters = [k * len(learn.dls.train) for k in [0,1,3,7]]
for i in range(3):
    n = iters[i+1]-iters[i]
    #The start of a cycle can be mixed up with the 0 of the previous cycle due to rounding errors, so we test at +1
    test_close(learn.recorder.lrs[iters[i]+1:iters[i+1]], [SchedCos(learn.lr, 0)(k/n) for k in range(1,n)])
learn.recorder.plot_sched()
:::
@patch
@delegates(Learner.fit_one_cycle)
def fine_tune(self:Learner, epochs, base_lr=2e-3, freeze_epochs=1, lr_mult=100,
              pct_start=0.3, div=5.0, **kwargs):
    "Fine tune with `Learner.freeze` for `freeze_epochs`, then with `Learner.unfreeze` for `epochs`, using discriminative LR."
    self.freeze()
    self.fit_one_cycle(freeze_epochs, slice(base_lr), pct_start=0.99, **kwargs)
    base_lr /= 2
    self.unfreeze()
    self.fit_one_cycle(epochs, slice(base_lr/lr_mult, base_lr), pct_start=pct_start, div=div, **kwargs)
learn.fine_tune(1)
epoch | train_loss | valid_loss | time |
---|---|---|---|
0 | 2.428970 | 1.740237 | 00:00 |
epoch | train_loss | valid_loss | time |
---|---|---|---|
0 | 2.019952 | 1.616970 | 00:00 |
Resume training from checkpoint
class InterruptCallback(Callback):
    def __init__(self, epoch):
        self._interupt_before = epoch
    def before_epoch(self):
        if self.epoch == self._interupt_before:
            raise CancelFitException
To enable resuming from a checkpoint, make sure to save the model and the optimizer state. This can be done with `SaveModelCallback` by setting `with_opt=True`. If training is interrupted, define `learn` with the same parameters as before, load the model from the checkpoint, and pass `start_epoch` to the `fit` call. Training will resume from `start_epoch` with a properly scheduled `lr`.
::: {#cell-66 .cell 0='s' 1='l' 2='o' 3='w'}
with tempfile.TemporaryDirectory() as d:
    learn1 = synth_learner(path=d, cbs=SaveModelCallback(with_opt=True, fname="ckpt"))
    learn1.fit_one_cycle(5, cbs=InterruptCallback(2))

    learn2 = synth_learner(path=d)
    learn2 = learn2.load("ckpt")
    learn2.fit_one_cycle(5, start_epoch=2)

    fig, axs = plt.subplots(1,2, sharey=True)
    axs[0].plot(learn1.recorder.lrs)
    axs[1].plot(learn2.recorder.lrs)
epoch | train_loss | valid_loss | time |
---|---|---|---|
0 | 18.930223 | 14.100439 | 00:00 |
1 | 17.092665 | 10.603369 | 00:00 |
Better model found at epoch 0 with valid_loss value: 14.100439071655273.
Better model found at epoch 1 with valid_loss value: 10.603368759155273.
epoch | train_loss | valid_loss | time |
---|---|---|---|
0 | 00:00 | ||
1 | 00:00 | ||
2 | 11.456764 | 10.057186 | 00:00 |
3 | 10.287196 | 8.694046 | 00:00 |
4 | 9.585465 | 8.422710 | 00:00 |
:::
LRFind -
@docs
class LRFinder(ParamScheduler):
    "Training with exponentially growing learning rate"
    def __init__(self, start_lr=1e-7, end_lr=10, num_it=100, stop_div=True):
        if num_it < 6: num_it = 6
        self.scheds = {'lr': [SchedExp(s, e) for (s,e) in zip(start_lr,end_lr)] if is_listy(start_lr)
                       else SchedExp(start_lr, end_lr)}
        self.num_it,self.stop_div = num_it,stop_div

    def before_fit(self):
        super().before_fit()
        path = self.path/self.model_dir
        path.mkdir(parents=True, exist_ok=True)
        self.tmp_d = tempfile.TemporaryDirectory(dir=path)
        self.tmp_p = Path(self.tmp_d.name).stem
        self.learn.save(f'{self.tmp_p}/_tmp')
        self.best_loss = float('inf')

    def before_batch(self): self._update_val(self.train_iter/self.num_it)

    def after_batch(self):
        super().after_batch()
        if self.smooth_loss < self.best_loss: self.best_loss = self.smooth_loss
        if self.smooth_loss > 4*self.best_loss and self.stop_div: raise CancelFitException()
        if self.train_iter >= self.num_it: raise CancelFitException()

    def before_validate(self): raise CancelValidException()

    def after_fit(self):
        self.learn.opt.zero_grad() # Needed before detaching the optimizer for future fits
        tmp_f = self.path/self.model_dir/self.tmp_p/'_tmp.pth'
        if tmp_f.exists():
            self.learn.load(f'{self.tmp_p}/_tmp', with_opt=True)
            self.tmp_d.cleanup()

    _docs = {"before_fit": "Initialize container for hyper-parameters and save the model",
             "before_batch": "Set the proper hyper-parameters in the optimizer",
             "after_batch": "Record hyper-parameters of this batch and potentially stop training",
             "after_fit": "Save the hyper-parameters in the recorder if there is one and load the original model",
             "before_validate": "Skip the validation part of training"}
::: {#cell-69 .cell 0=‘c’ 1=‘u’ 2=‘d’ 3=‘a’}
from fastai.vision.all import *
:::
::: {#cell-70 .cell 0=‘c’ 1=‘u’ 2=‘d’ 3=‘a’}
set_seed(99, True)
path = untar_data(URLs.PETS)/'images'

image_files = get_image_files(path)
if sys.platform == "win32" and IN_NOTEBOOK:
    image_files = random.choices(image_files, k=int(len(image_files)/8))
    print("Randomly select 1/8 files in NOTEBOOK on Windows to save time")

# pickle can't serialize a lambda function
def _label_func(x):
    return x[0].isupper()

dls = ImageDataLoaders.from_name_func(
    path, image_files, valid_pct=0.2,
    label_func=_label_func, item_tfms=Resize(224))

learn = vision_learner(dls, resnet18)
learn.fit(1)
learn.opt.state_dict()['state'][1]['grad_avg']
epoch | train_loss | valid_loss | time |
---|---|---|---|
0 | 0.086690 | 0.016682 | 00:33 |
tensor([-5.8191e-04, -2.2443e-03, 0.0000e+00, -1.2517e-03, 0.0000e+00,
-1.4744e-03, -3.6433e-04, 0.0000e+00, 9.3745e-03, 0.0000e+00,
5.1993e-03, -1.5093e-02, -4.0410e-03, 0.0000e+00, 7.1963e-03,
-6.6033e-03, -3.3354e-03, -2.9191e-03, -1.5054e-03, -1.3179e-03,
8.7333e-03, -1.1155e-02, -9.6656e-04, 1.6653e-02, 9.5839e-04,
8.4995e-03, -2.8187e-02, 3.1579e-03, -9.3051e-04, -2.3887e-03,
-7.3557e-04, -1.4501e-02, -6.2110e-03, 1.9949e-03, -7.0233e-03,
1.2792e-02, 0.0000e+00, 1.0687e-03, 0.0000e+00, -4.2413e-04,
2.9628e-03, 7.2686e-03, -9.7241e-03, -4.9941e-04, 1.7408e-02,
-9.2441e-03, -9.7731e-03, -9.9393e-03, 0.0000e+00, -2.1448e-03,
2.7660e-03, -3.1110e-03, 5.9454e-05, -1.4412e-03, -6.1454e-04,
-1.6537e-03, 1.7001e-02, 1.4041e-02, -6.2878e-03, 2.0800e-02,
-1.2900e-02, -1.2626e-02, -2.6591e-03, 3.9685e-03], device='cuda:0')
:::
::: {#cell-71 .cell 0='s' 1='l' 2='o' 3='w'}
with tempfile.TemporaryDirectory() as d:
    learn = synth_learner(path=Path(d))
    init_a,init_b = learn.model.a,learn.model.b
    with learn.no_logging(): learn.fit(20, cbs=LRFinder(num_it=100))
    assert len(learn.recorder.lrs) <= 100
    test_eq(len(learn.recorder.lrs), len(learn.recorder.losses))
    #Check that training stopped if it diverged
    if len(learn.recorder.lrs) < 100: assert learn.recorder.losses[-1] > 4 * min(learn.recorder.losses)
    #Test schedule
    test_eq(learn.recorder.lrs, [SchedExp(1e-7, 10)(i/100) for i in range_of(learn.recorder.lrs)])
    #No validation data
    test_eq([len(v) for v in learn.recorder.values], [1 for _ in range_of(learn.recorder.values)])
    #Model loaded back properly
    test_eq(learn.model.a, init_a)
    test_eq(learn.model.b, init_b)
    test_eq(learn.opt.state_dict()['state'], [{}, {}])
:::
show_doc(LRFinder.before_fit)
LRFinder.before_fit [source]

LRFinder.before_fit()

Initialize container for hyper-parameters and save the model

show_doc(LRFinder.before_batch)

LRFinder.before_batch [source]

LRFinder.before_batch()

Set the proper hyper-parameters in the optimizer

show_doc(LRFinder.after_batch)

LRFinder.after_batch [source]

LRFinder.after_batch()

Record hyper-parameters of this batch and potentially stop training
show_doc(LRFinder.before_validate)
Suggestion Methods
There are several methods for suggesting a learning rate automatically, and as we will see, these can be passed on to `lr_find`. Four methods are currently supported; however, if you want to write your own, it should look like a function that accepts the `lrs`, `losses`, and `num_it` returned by `LRFinder`. Your function should return an `x,y` coordinate that can be plotted, such as below:
def myfunc(lrs:list, losses:list, num_it:int) -> tuple(float, tuple(float,int)):
    ...
    return suggestion, (suggestion,loss_idx)
If more parameters need to be passed in, you should pass your `func` in as a partial and specify them yourself, for example:
def myfunc(lrs:list, losses:list, num_it:int, pct_reduction:float) -> tuple(float, tuple(float,int)):
    ...
    return suggestion, (suggestion,loss_idx)

f = partial(myfunc, pct_reduction=.2)
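As a concrete, purely illustrative example, a hypothetical suggestion function following this signature could look like:

```python
# Hypothetical helper, not part of fastai: pick the lr a fixed fraction of the way
# through the recorded curve and return it together with its (x, y) plot coordinate.
def fixed_pct(lrs:list, losses:list, num_it:int, pct:float=0.5):
    idx = int((len(lrs)-1) * pct)
    return float(lrs[idx]), (float(lrs[idx]), losses[idx])

f = partial(fixed_pct, pct=.3)   # could then be passed to `lr_find(suggest_funcs=(f,))`
```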
learn = synth_learner()
with learn.no_logging(): learn.fit(20, cbs=LRFinder(num_it=100))
lrs,losses = tensor(learn.recorder.lrs[100//10:-5]),tensor(learn.recorder.losses[100//10:-5])
def valley(lrs:list, losses:list, num_it:int):
    "Suggests a learning rate from the longest valley and returns its index"
    n = len(losses)
    max_start, max_end = 0,0

    # find the longest valley
    lds = [1]*n
    for i in range(1,n):
        for j in range(0,i):
            if (losses[i] < losses[j]) and (lds[i] < lds[j] + 1):
                lds[i] = lds[j] + 1
            if lds[max_end] < lds[i]:
                max_end = i
                max_start = max_end - lds[max_end]

    sections = (max_end - max_start) / 3
    idx = max_start + int(sections) + int(sections/2)

    return float(lrs[idx]), (float(lrs[idx]), losses[idx])
The `valley` algorithm was developed by ESRI and takes the steepest slope roughly 2/3 of the way through the longest valley in the LR plot; it is also the default for `Learner.lr_find`.
valley(lrs, losses, 100)
(0.007585775572806597, (0.007585775572806597, tensor(10.4701)))
def slide(lrs:list, losses:list, num_it:int, lr_diff:int=15, thresh:float=.005, adjust_value:float=1.):
"Suggests a learning rate following an interval slide rule and returns its index"
= to_np(losses)
losses = np.gradient(losses)
loss_grad
= -1
r_idx = r_idx - lr_diff
l_idx = lrs[l_idx]
local_min_lr while (l_idx >= -len(losses)) and (abs(loss_grad[r_idx] - loss_grad[l_idx]) > thresh):
= lrs[l_idx]
local_min_lr -= 1
r_idx -= 1
l_idx
= float(local_min_lr) * adjust_value
suggestion = np.interp(np.log10(suggestion), np.log10(lrs), losses)
idx return suggestion, (suggestion, idx)
The `slide` rule is an algorithm developed by Andrew Chang out of Novetta; it is detailed here.
slide(lrs, losses, 100)
(6.309573450380412e-07, (6.309573450380412e-07, 12.832133293151855))
def minimum(lrs:list, losses:list, num_it:int):
    "Suggests a learning rate one-tenth of the minimum before divergence and returns its index"
    lr_min = lrs[losses.argmin()].item()
    loss_idx = losses[min(range(len(lrs)), key=lambda i: abs(lrs[i]-lr_min))]
    return lr_min/10, (lr_min, loss_idx)
minimum(lrs, losses, 100)
(0.10964782238006592, (1.0964782238006592, tensor(7.5869)))
def steep(lrs:list, losses:list, num_it:int) -> (float, tuple):
"Suggests a learning rate when the slope is the steepest and returns its index"
= (losses[1:]-losses[:-1]) / (lrs[1:].log()-lrs[:-1].log())
grads = lrs[grads.argmin()].item()
lr_steep = losses[min(range(len(lrs)), key=lambda i: abs(lrs[i]-lr_steep))]
loss_idx return lr_steep, (lr_steep, loss_idx)
steep(lrs, losses, 100)
(1.9054607491852948e-06, (1.9054607491852948e-06, tensor(12.8940)))
@patch
def plot_lr_find(self:Recorder, skip_end=5, return_fig=True, suggestions=None, nms=None, **kwargs):
"Plot the result of an LR Finder test (won't work if you didn't do `learn.lr_find()` before)"
= self.lrs if skip_end==0 else self.lrs [:-skip_end]
lrs = self.losses if skip_end==0 else self.losses[:-skip_end]
losses = plt.subplots(1,1)
fig, ax
ax.plot(lrs, losses)"Loss")
ax.set_ylabel("Learning Rate")
ax.set_xlabel('log')
ax.set_xscale(if suggestions:
= plt.rcParams['axes.prop_cycle'].by_key()['color'][1:]
colors for (val, idx), nm, color in zip(suggestions, nms, colors):
'o', label=nm, c=color)
ax.plot(val, idx, ='best') ax.legend(loc
"SuggestionMethod", **{o.__name__.capitalize():o for o in [valley,slide,minimum,steep]},
mk_class(="All possible suggestion methods as convience attributes to get tab-completion and typo-proofing") doc
@patch
def lr_find(self:Learner, start_lr=1e-7, end_lr=10, num_it=100, stop_div=True, show_plot=True, suggest_funcs=(SuggestionMethod.Valley)):
"Launch a mock training to find a good learning rate and return suggestions based on `suggest_funcs` as a named tuple"
= num_it//len(self.dls.train) + 1
n_epoch =LRFinder(start_lr=start_lr, end_lr=end_lr, num_it=num_it, stop_div=stop_div)
cbwith self.no_logging(): self.fit(n_epoch, cbs=cb)
if suggest_funcs is not None:
= tensor(self.recorder.lrs[num_it//10:-5]), tensor(self.recorder.losses[num_it//10:-5])
lrs, losses = torch.nonzero(torch.isnan(losses.view(-1)))
nan_idxs if len(nan_idxs) > 0:
= min(nan_idxs)
drop_idx = lrs[:drop_idx]
lrs = losses[:drop_idx]
losses = [], []
_suggestions, nms for func in tuplify(suggest_funcs):
__name__ if not isinstance(func, partial) else func.func.__name__) # 处理部分内容
nms.append(func.
_suggestions.append(func(lrs, losses, num_it))
= collections.namedtuple('SuggestedLRs', nms)
SuggestedLRs = [], []
lrs, pnts for lr, pnt in _suggestions:
lrs.append(lr)
pnts.append(pnt)if show_plot: self.recorder.plot_lr_find(suggestions=pnts, nms=nms)
return SuggestedLRs(*lrs)
elif show_plot: self.recorder.plot_lr_find()
First introduced by Leslie N. Smith in Cyclical Learning Rates for Training Neural Networks, the LR Finder trains the model with exponentially growing learning rates from `start_lr` to `end_lr` for `num_it` iterations and stops in case of divergence (unless `stop_div=False`), then plots the losses vs the learning rates on a log scale.

A variety of learning rate suggestion algorithms can be passed into the function; by default the `valley` paradigm is used.
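As a hedged usage sketch (synthetic learner, assumed call), the suggestion methods can also be referenced through the `SuggestionMethod` namespace introduced above:

```python
# Minimal sketch: request two suggestion methods via the SuggestionMethod namespace;
# the result is a namedtuple whose fields are the underlying function names.
learn = synth_learner()
suggestions = learn.lr_find(suggest_funcs=(SuggestionMethod.Valley, SuggestionMethod.Slide))
print(suggestions.valley, suggestions.slide)
```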
::: {#cell-93 .cell 0='s' 1='l' 2='o' 3='w'}
with tempfile.TemporaryDirectory() as d:
    learn = synth_learner(path=Path(d))
    weights_pre_lr_find = L(learn.model.parameters())
    lr_min, lr_steep, lr_valley, lr_slide = learn.lr_find(suggest_funcs=(minimum, steep, valley, slide))
    weights_post_lr_find = L(learn.model.parameters())
test_eq(weights_pre_lr_find, weights_post_lr_find)
print(f"Minimum/10:\t{lr_min:.2e}\nSteepest point:\t{lr_steep:.2e}\nLongest valley:\t{lr_valley:.2e}\nSlide interval:\t{lr_slide:.2e}")
Minimum/10: 1.58e-01
Steepest point: 9.12e-03
Longest valley: 1.58e-02
Slide interval: 8.32e-02
:::
Export -
from nbdev import nbdev_export
nbdev_export()
Converted 00_torch_core.ipynb.
Converted 01_layers.ipynb.
Converted 01a_losses.ipynb.
Converted 02_data.load.ipynb.
Converted 03_data.core.ipynb.
Converted 04_data.external.ipynb.
Converted 05_data.transforms.ipynb.
Converted 06_data.block.ipynb.
Converted 07_vision.core.ipynb.
Converted 08_vision.data.ipynb.
Converted 09_vision.augment.ipynb.
Converted 09b_vision.utils.ipynb.
Converted 09c_vision.widgets.ipynb.
Converted 10_tutorial.pets.ipynb.
Converted 10b_tutorial.albumentations.ipynb.
Converted 11_vision.models.xresnet.ipynb.
Converted 12_optimizer.ipynb.
Converted 13_callback.core.ipynb.
Converted 13a_learner.ipynb.
Converted 13b_metrics.ipynb.
Converted 14_callback.schedule.ipynb.
Converted 14a_callback.data.ipynb.
Converted 15_callback.hook.ipynb.
Converted 15a_vision.models.unet.ipynb.
Converted 16_callback.progress.ipynb.
Converted 17_callback.tracker.ipynb.
Converted 18_callback.fp16.ipynb.
Converted 18a_callback.training.ipynb.
Converted 18b_callback.preds.ipynb.
Converted 19_callback.mixup.ipynb.
Converted 20_interpret.ipynb.
Converted 20a_distributed.ipynb.
Converted 20b_tutorial.distributed.ipynb.
Converted 21_vision.learner.ipynb.
Converted 22_tutorial.imagenette.ipynb.
Converted 23_tutorial.vision.ipynb.
Converted 24_tutorial.image_sequence.ipynb.
Converted 24_tutorial.siamese.ipynb.
Converted 24_vision.gan.ipynb.
Converted 30_text.core.ipynb.
Converted 31_text.data.ipynb.
Converted 32_text.models.awdlstm.ipynb.
Converted 33_text.models.core.ipynb.
Converted 34_callback.rnn.ipynb.
Converted 35_tutorial.wikitext.ipynb.
Converted 37_text.learner.ipynb.
Converted 38_tutorial.text.ipynb.
Converted 39_tutorial.transformers.ipynb.
Converted 40_tabular.core.ipynb.
Converted 41_tabular.data.ipynb.
Converted 42_tabular.model.ipynb.
Converted 43_tabular.learner.ipynb.
Converted 44_tutorial.tabular.ipynb.
Converted 45_collab.ipynb.
Converted 46_tutorial.collab.ipynb.
Converted 50_tutorial.datablock.ipynb.
Converted 60_medical.imaging.ipynb.
Converted 61_tutorial.medical_imaging.ipynb.
Converted 65_medical.text.ipynb.
Converted 70_callback.wandb.ipynb.
Converted 70a_callback.tensorboard.ipynb.
Converted 70b_callback.neptune.ipynb.
Converted 70c_callback.captum.ipynb.
Converted 70d_callback.comet.ipynb.
Converted 74_huggingface.ipynb.
Converted 97_test_utils.ipynb.
Converted 99_pytorch_doc.ipynb.
Converted dev-setup.ipynb.
Converted app_examples.ipynb.
Converted camvid.ipynb.
Converted distributed_app_examples.ipynb.
Converted migrating_catalyst.ipynb.
Converted migrating_ignite.ipynb.
Converted migrating_lightning.ipynb.
Converted migrating_pytorch.ipynb.
Converted migrating_pytorch_verbose.ipynb.
Converted ulmfit.ipynb.
Converted index.ipynb.
Converted quick_start.ipynb.
Converted tutorial.ipynb.