! [ -e /content ] && pip install -Uqq fastai # 在Colab上升级fastai
数据加载器
DataLoader
类
from __future__ import annotations
from fastai.torch_basics import *
from torch.utils.data.dataloader import _MultiProcessingDataLoaderIter,_SingleProcessDataLoaderIter,_DatasetKind
# PyTorch iterator classes, indexed by the boolean `num_workers==0`:
# index 0 (False) is the multi-process iterator, index 1 (True) the single-process one.
_loaders = (_MultiProcessingDataLoaderIter,_SingleProcessDataLoaderIter)
from nbdev.showdoc import *
bs = 4
# Sample data for the examples: the 26 lowercase ASCII letters
letters = list(string.ascii_lowercase)
DataLoader 辅助函数
fastai包含一个替代Pytorch的DataLoader,它在很大程度上与API兼容,并增加了许多有用的功能和灵活性。在我们查看这个类之前,有几个辅助函数需要定义。
def _wif(worker_id):
    "Worker init function: configure the `DataLoader` copy that lives in the current worker process"
    set_num_threads(1)
    info = get_worker_info()
    # `info.dataset` is the `_FakeLoader` handed to PyTorch; its `d` attribute is the wrapped `DataLoader`
    ds = info.dataset.d
    # Record the worker count and this worker's index on the `DataLoader` (read back by `DataLoader.sample`)
    ds.num_workers,ds.offs = info.num_workers,info.id
    set_seed(info.seed)
    # Finally run the user-overridable `wif` hook
    ds.wif()
class _FakeLoader:
    "Stand-in object passed to PyTorch's loader iterators; batches come from the wrapped `DataLoader` `d`"
    def _fn_noops(self, x=None, *args, **kwargs): return x

    # Class-level attributes; presumably the ones PyTorch's iterator classes look up on a loader
    _IterableDataset_len_called,_auto_collation,collate_fn,drop_last = None,False,_fn_noops,False
    _index_sampler,generator,prefetch_factor,_get_shared_seed = Inf.count,None,2,noop
    dataset_kind = _dataset_kind = _DatasetKind.Iterable

    def __init__(self, d, pin_memory, num_workers, timeout, persistent_workers,pin_memory_device):
        # This object is its own `dataset`; `_wif` is installed as the worker init function
        self.dataset,self.default,self.worker_init_fn,self.pin_memory_device = self,d,_wif,pin_memory_device
        store_attr('d,pin_memory,num_workers,timeout,persistent_workers,pin_memory_device')

    # Iterating yields the batches built by `d` from `d`'s own sampled indices
    def __iter__(self): return iter(self.d.create_batches(self.d.sample()))

    @property
    def multiprocessing_context(self): return (None,multiprocessing)[self.num_workers>0]

    @contextmanager
    def no_multiproc(self):
        "Temporarily set `num_workers` to 0, yielding `d`; the previous value is always restored"
        old_num_workers = self.num_workers
        try:
            self.num_workers = 0
            yield self.d
        finally: self.num_workers = old_num_workers
# Types handed straight to PyTorch's default collate/convert instead of being recursed into
_collate_types = (ndarray, Tensor, typing.Mapping, str)

def fa_collate(t):
    "A replacement for PyTorch `default_collate` which maintains types and handles `Sequence`s"
    # The first sample decides how the whole batch is treated
    b = t[0]
    return (default_collate(t) if isinstance(b, _collate_types)
            # Sequence samples: collate field by field and rebuild the sample's own container type
            else type(b)([fa_collate(s) for s in zip(*t)]) if isinstance(b, Sequence)
            else default_collate(t))
# e.g. x is an int, y is a tuple
t = [(1,(2,3)),(1,(2,3))]
test_eq(fa_collate(t), default_collate(t))
test_eq(L(fa_collate(t)).map(type), [Tensor,tuple])

t = [(1,(2,(3,4))),(1,(2,(3,4)))]
test_eq(fa_collate(t), default_collate(t))
test_eq(L(fa_collate(t)).map(type), [Tensor,tuple])
test_eq(L(fa_collate(t)[1]).map(type), [Tensor,tuple])
def fa_convert(t):
    "A replacement for PyTorch `default_convert` which maintains types and handles `Sequence`s"
    return (default_convert(t) if isinstance(t, _collate_types)
            # Sequences are converted element by element and rebuilt with their own container type
            else type(t)([fa_convert(s) for s in t]) if isinstance(t, Sequence)
            else default_convert(t))
t0 = array([1,2])
t = [t0,(t0,t0)]

test_eq(fa_convert(t), default_convert(t))
test_eq(L(fa_convert(t)).map(type), [Tensor,tuple])
class SkipItemException(Exception):
    "Raised to notify `DataLoader` to skip an item"
    pass
show_doc(SkipItemException, title_level=3)
def collate_error(e:Exception, batch):
    """Raises error when the batch could not collate, stating what items in the batch are different sizes and their types

    Must be called from inside an `except` block: on the first shape mismatch it rewrites `e.args`
    and re-raises the active exception with a bare `raise`. Returns `None` if no mismatch is found.
    Every `item[idx]` is expected to expose a `.shape` attribute.
    """
    err = f'Error when trying to collate the data into batches with fa_collate, at least two tensors in the batch are not the same size.\n\n'
    # We need to walk the entire batch to find where the mismatch is
    length = len(batch[0])
    for idx in range(length): # for each type in the batch
        for i, item in enumerate(batch):
            # The first item provides the reference shape and type name for this position
            if i == 0: shape_a, type_a = item[idx].shape, item[idx].__class__.__name__
            else:
                shape_b = item[idx].shape
                if shape_b != shape_a:
                    err += f'Mismatch found on axis {idx} of the batch and is of type `{type_a}`:\n\tItem at index 0 has shape: {shape_a}\n\tItem at index {i} has shape: {shape_b}\n\nPlease include a transform in `after_item` that ensures all data of type {type_a} is the same size'
                    e.args = [err]
                    raise
batch = [torch.rand(3, 375, 500), torch.rand(3, 375, 500), torch.rand(3, 500, 333)]
with ExceptionExpected(RuntimeError, "Mismatch found on axis 0 of the batch and is of type `Tensor`"):
    try:
        fa_collate(batch)
    except Exception as e:
        collate_error(e, batch)
数据加载器 -
@funcs_kwargs
class DataLoader(GetAttr):
    # Hook methods that by default return their first argument unchanged
    _noop_methods = 'wif before_iter after_item before_batch after_batch after_iter'.split()
    for o in _noop_methods: exec(f"def {o}(self, x=None, *args, **kwargs): return x")
    # Names handled by `funcs_kwargs`, i.e. replaceable via keyword arguments to `__init__`
    _methods = _noop_methods + 'create_batches create_item create_batch retain \
        get_idxs sample shuffle_fn do_batch create_batch'.split()
    # Attribute name used by `GetAttr` for delegating unknown attributes
    _default = 'dataset'
    def __init__(self, dataset=None, bs=None, num_workers=0, pin_memory=False, timeout=0, batch_size=None,
                 shuffle=False, drop_last=False, indexed=None, n=None, device=None, persistent_workers=False,
                 pin_memory_device='', **kwargs):
        if batch_size is not None: bs = batch_size # PyTorch compatibility
        assert not (bs is None and drop_last)
        # Guess indexability: has `__getitem__` and is not an `IterableDataset`
        if indexed is None: indexed = (hasattr(dataset,'__getitem__')
                                       and not isinstance(dataset, IterableDataset))
        if not indexed and shuffle: raise ValueError("Can only shuffle an indexed dataset (not an iterable one).")
        if n is None:
            # Datasets without a length leave `n` as None
            try: n = len(dataset)
            except TypeError: pass
        store_attr('dataset,bs,shuffle,drop_last,indexed,n,pin_memory,timeout,device')
        # `num_workers`/`offs` here are the values `sample` uses; `_wif` overwrites them inside workers
        self.rng,self.num_workers,self.offs = random.Random(random.randint(0,2**32-1)),1,0
        # Worker processes are disabled on Windows notebooks and on macOS
        if sys.platform == "win32" and IN_NOTEBOOK and num_workers > 0: num_workers = 0
        if sys.platform == "darwin" and num_workers > 0: num_workers = 0
        self.fake_l = _FakeLoader(self, pin_memory, num_workers, timeout, persistent_workers=persistent_workers,
                                  pin_memory_device=pin_memory_device)

    def __len__(self):
        if self.n is None: raise TypeError
        if self.bs is None: return self.n
        # Number of batches; a trailing partial batch counts unless `drop_last`
        return self.n//self.bs + (0 if self.drop_last or self.n%self.bs==0 else 1)

    def get_idxs(self):
        # Indexed datasets get 0,1,2,...; iterable ones get a stream of `None`
        idxs = Inf.count if self.indexed else Inf.nones
        if self.n is not None: idxs = list(itertools.islice(idxs, self.n))
        if self.shuffle: idxs = self.shuffle_fn(idxs)
        return idxs

    def sample(self):
        # Keep only the indices whose batch number maps to this worker (`offs`) out of `num_workers`
        return (b for i,b in enumerate(self.__idxs) if i//(self.bs or 1)%self.num_workers==self.offs)

    def __iter__(self):
        self.randomize()
        self.before_iter()
        self.__idxs=self.get_idxs() # called in context of main process (not workers/subprocesses)
        for b in _loaders[self.fake_l.num_workers==0](self.fake_l):
            # pin_memory causes tuples to be converted to lists, so convert them back to tuples
            if self.pin_memory and type(b) == list: b = tuple(b)
            if self.device is not None: b = to_device(b, self.device)
            yield self.after_batch(b)
        self.after_iter()
        if hasattr(self, 'it'): del(self.it)

    def create_batches(self, samps):
        if self.dataset is not None: self.it = iter(self.dataset)
        # `do_item` returns None for skipped items; drop those before chunking
        res = filter(lambda o:o is not None, map(self.do_item, samps))
        yield from map(self.do_batch, self.chunkify(res))

    def new(self, dataset=None, cls=None, **kwargs):
        if dataset is None: dataset = self.dataset
        if cls is None: cls = type(self)
        cur_kwargs = dict(dataset=dataset, num_workers=self.fake_l.num_workers, pin_memory=self.pin_memory, timeout=self.timeout,
                          bs=self.bs, shuffle=self.shuffle, drop_last=self.drop_last, indexed=self.indexed, device=self.device)
        # Carry over any `_methods` entry that is not a bound method (i.e. was replaced on this instance)
        for n in self._methods:
            o = getattr(self, n)
            if not isinstance(o, MethodType): cur_kwargs[n] = o
        return cls(**merge(cur_kwargs, kwargs))

    @property
    def device(self) -> torch.device|None:
        return self._device

    @device.setter
    def device(self, device:int|str|torch.device|None):
        self._device, *_ = torch._C._nn._parse_to(device=device)
        if hasattr(self, 'after_batch') and hasattr(self.after_batch, 'fs'):
            for tfm in self.after_batch.fs:
                # Check that tfm.to is callable as TabularPandas & transforms set tfm.to as an object
                if hasattr(tfm, 'to') and callable(tfm.to): tfm.to(device)
                else:
                    for a in L(getattr(tfm, 'parameters', None)):
                        if hasattr(getattr(tfm, a), 'to'): setattr(tfm, a, getattr(tfm, a).to(device))

    @property
    def prebatched(self): return self.bs is None
    def do_item(self, s):
        try: return self.after_item(self.create_item(s))
        except SkipItemException: return None
    def chunkify(self, b): return b if self.prebatched else chunked(b, self.bs, self.drop_last)
    def shuffle_fn(self, idxs): return self.rng.sample(idxs, len(idxs))
    def randomize(self): self.rng = random.Random(self.rng.randint(0,2**32-1))
    def retain(self, res, b): return retain_types(res, b[0] if is_listy(b) else b)
    def create_item(self, s):
        if self.indexed: return self.dataset[s or 0]
        elif s is None: return next(self.it)
        else: raise IndexError("Cannot index an iterable dataset numerically - must use `None`.")
    def create_batch(self, b):
        # Collate when batching here; only convert when the dataset already yields batches
        try: return (fa_collate,fa_convert)[self.prebatched](b)
        except Exception as e:
            if not self.prebatched: collate_error(e,b)
            raise
    def do_batch(self, b): return self.retain(self.create_batch(self.before_batch(b)), b)
    def to(self, device): self.device = device
    def one_batch(self):
        if self.n is not None and len(self)==0: raise ValueError(f'This DataLoader does not contain any batches')
        with self.fake_l.no_multiproc(): res = first(self)
        if hasattr(self, 'it'): delattr(self, 'it')
        return res
# Attach the class docstring and per-method docstrings to `DataLoader`
add_docs(DataLoader, "API compatible with PyTorch DataLoader, with a lot more callbacks and flexibility",
         get_idxs       = "Return a list of indices to reference the dataset. Calls `shuffle_fn` internally if `shuffle=True`.",
         sample         = "Same as `get_idxs` but returns a generator of indices to reference the dataset.",
         create_batches = "Takes output of `sample` as input, and returns batches of data. Does not apply `after_batch`.",
         new            = "Create a new `DataLoader` with given arguments keeping remaining arguments same as original `DataLoader`.",
         prebatched     = "Check if `bs` is None.",
         do_item        = "Combines `after_item` and `create_item` to get an item from dataset by providing index as input.",
         chunkify       = "Used by `create_batches` to turn generator of items (`b`) into batches.",
         shuffle_fn     = "Returns a random permutation of `idxs`.",
         randomize      = "Set's `DataLoader` random number generator state.",
         retain         = "Cast each item of `res` to type of matching item in `b` if its a superclass.",
         create_item    = "Subset of the dataset containing the index values of sample if exists, else next iterator.",
         create_batch   = "Collate a list of items into a batch.",
         do_batch       = "Combines `create_batch` and `before_batch` to get a batch of items. Input is a list of items to collate.",
         to             = "Sets `self.device=device`.",
         one_batch      = "Return one batch from `DataLoader`.",
         wif            = "See pytorch `worker_init_fn` for details.",
         before_iter    = "Called before `DataLoader` starts to read/iterate over the dataset.",
         after_item     = "Takes output of `create_item` as input and applies this function on it.",
         before_batch   = "It is called before collating a list of items into a batch. Input is a list of items.",
         after_batch    = "After collating mini-batch of items, the mini-batch is passed through this function.",
         after_iter     = "Called after `DataLoader` has fully read/iterated over the dataset.")
`DataLoader` 的参数:

* `dataset`:用于加载数据的数据集。可以是映射式数据集或迭代式数据集。
* `bs` (int):每个批次加载多少样本(如果提供了 `batch_size`,则 `batch_size` 会覆盖 `bs`)。如果 `bs=None`,则假定 `dataset.__getitem__` 返回一个批次。
* `num_workers` (int):用于数据加载的子进程数量。`0` 表示数据将在主进程中加载。
* `pin_memory` (bool):如果为 `True`,数据加载器将在返回张量之前将其复制到 CUDA 固定内存(pinned memory)中。
* `timeout` (float>0):从工作进程收集一个批次的超时值(以秒为单位)。
* `batch_size` (int):仅为 PyTorch 兼容性提供。请使用 `bs`。
* `shuffle` (bool):如果为 `True`,则每次完全读取/迭代数据加载器时数据都会被洗牌。
* `drop_last` (bool):如果为 `True`,则最后一个不完整的批次将被丢弃。
* `indexed` (bool):`DataLoader` 将猜测数据集是否可以索引(或是否可迭代),但您可以用此参数覆盖它。默认为 `True`。
* `n` (int):默认为 `len(dataset)`。如果使用迭代式数据集,可以用 `n` 指定大小。
* `device` (torch.device):默认为 `default_device()`,默认是 CUDA。您可以将设备指定为 `torch.device('cpu')`。
重写 `create_item` 并使用默认的无限采样器,即可得到一个长度未知的流(想要结束流时调用 `stop()`)。
class RandDL(DataLoader):
    # Each item is a random float; a draw of 0.95 or more ends the stream via `stop()`
    def create_item(self, s):
        r = random.random()
        return r if r<0.95 else stop()

L(RandDL())
(#9) [0.09071201211613367,0.03249811556595483,0.6517029228593939,0.8584412116263038,0.759838440232556,0.3725873327679504,0.1445316323722865,0.18876233969606782,0.25518635091544917]
L(RandDL(bs=4, drop_last=True)).map(len)
(#1) [4]
dl = RandDL(bs=4, num_workers=4, drop_last=True)
L(dl).map(len)
(#1) [4]
# `DataLoader.__init__` forces 0 workers on macOS (and on Windows notebooks), so expect 0 on those platforms
test_num_workers = 0 if sys.platform in ("win32","darwin") else 4
test_eq(dl.fake_l.num_workers, test_num_workers)
with dl.fake_l.no_multiproc():
    test_eq(dl.fake_l.num_workers, 0)
    L(dl).map(len)
# The worker count is restored once the context manager exits
test_eq(dl.fake_l.num_workers, test_num_workers)
def _rand_item(s):
    "Return a random float, or call `stop()` when the draw is 0.95 or more"
    r = random.random()
    return r if r<0.95 else stop()

# The same stream as a `create_item` override, passed as a keyword argument instead of subclassing
L(DataLoader(create_item=_rand_item))
(#2) [0.624781366539204,0.39823513973618685]
如果您不设置 `bs`,则假定 `dataset` 提供一个迭代器或一个返回批次的 `__getitem__`。
ds1 = DataLoader(letters)
test_eq(L(ds1), letters)
test_eq(len(ds1), 26)

test_shuffled(L(DataLoader(letters, shuffle=True)), letters)

ds1 = DataLoader(letters, indexed=False)
test_eq(L(ds1), letters)
test_eq(len(ds1), 26)

t2 = L(tensor([0,1,2]),tensor([3,4,5]))
ds2 = DataLoader(t2)
test_eq_type(L(ds2), t2)

t3 = L(array([0,1,2], dtype=np.int64),array([3,4,5], dtype=np.int64))
ds3 = DataLoader(t3)
test_eq_type(L(ds3), t3.map(tensor))

# `create_batch=noop` leaves the arrays unconverted; `after_iter` sets a flag on `t3` that is checked below
ds4 = DataLoader(t3, create_batch=noop, after_iter=lambda: setattr(t3, 'f', 1))
test_eq_type(L(ds4), t3)
test_eq(t3.f, 1)
如果您设置了 `bs`,则假定 `dataset` 提供一个迭代器或一个 `__getitem__` 方法,返回批次中的单个项。
def twoepochs(d):
    "Iterate over `d` twice, joining each batch into a word and all words with single spaces"
    words = []
    for _ in range(2):
        for batch in d:
            words.append(''.join(list(batch)))
    return ' '.join(words)
ds1 = DataLoader(letters, bs=4, drop_last=True, num_workers=0)
test_eq(twoepochs(ds1), 'abcd efgh ijkl mnop qrst uvwx abcd efgh ijkl mnop qrst uvwx')

ds1 = DataLoader(letters,4,num_workers=2)
test_eq(twoepochs(ds1), 'abcd efgh ijkl mnop qrst uvwx yz abcd efgh ijkl mnop qrst uvwx yz')

ds1 = DataLoader(range(12), bs=4, num_workers=3)
test_eq_type(L(ds1), L(tensor([0,1,2,3]),tensor([4,5,6,7]),tensor([8,9,10,11])))

ds1 = DataLoader([str(i) for i in range(11)], bs=4, after_iter=lambda: setattr(t3, 'f', 2))
test_eq_type(L(ds1), L(['0','1','2','3'],['4','5','6','7'],['8','9','10']))
test_eq(t3.f, 2)

it = iter(DataLoader(map(noop,range(20)), bs=4, num_workers=1))
test_eq_type([next(it) for _ in range(3)], [tensor([0,1,2,3]),tensor([4,5,6,7]),tensor([8,9,10,11])])
可迭代的数据加载器需要特定的测试。
class DummyIterableDataset(IterableDataset):
    "Iterable-style dataset yielding the integers 0 through 10"
    def __iter__(self):
        yield from range(11)
ds1 = DataLoader(DummyIterableDataset(), bs=4)
# Check it yields fine, and check we can do multiple passes
for i in range(3):
    test_eq_type(L(ds1), L(tensor([0,1,2,3]),tensor([4,5,6,7]),tensor([8,9,10])))

# Check `drop_last` works fine (with multiple passes, since this will prematurely terminate the iterator)
ds1 = DataLoader(DummyIterableDataset(), bs=4, drop_last=True)
for i in range(3):
    test_eq_type(L(ds1), L(tensor([0,1,2,3]),tensor([4,5,6,7])))
class SleepyDL(list):
    "A `list` whose item access sleeps for a random time of up to 1/50 s"
    def __getitem__(self,i):
        time.sleep(random.random()/50)
        return super().__getitem__(i)
= SleepyDL(letters)
t
%time test_eq(DataLoader(t, num_workers=0), letters)
%time test_eq(DataLoader(t, num_workers=2), letters)
%time test_eq(DataLoader(t, num_workers=4), letters)
= DataLoader(t, shuffle=True, num_workers=1)
dl
test_shuffled(L(dl), letters)
test_shuffled(L(dl), L(dl)) L(dl)
CPU times: user 3.35 ms, sys: 890 µs, total: 4.24 ms
Wall time: 307 ms
CPU times: user 6.93 ms, sys: 860 µs, total: 7.79 ms
Wall time: 333 ms
CPU times: user 7.78 ms, sys: 722 µs, total: 8.51 ms
Wall time: 331 ms
(#26) ['l','h','f','r','z','s','u','x','m','p'...]
class SleepyQueue():
    "Simulate a queue with varying latency"
    def __init__(self, q): self.q=q
    def __iter__(self):
        while True:
            # Random delay of up to 1/100 s before each read
            time.sleep(random.random()/100)
            try: yield self.q.get_nowait()
            # An empty queue ends the iteration
            except queues.Empty: return
= Queue()
q for o in range(30): q.put(o)
= SleepyQueue(q)
it
if not (sys.platform == "win32" and IN_NOTEBOOK):
%time test_shuffled(L(DataLoader(it, num_workers=4)), L(range(30)))
--------------------------------------------------------------------------- AssertionError Traceback (most recent call last) File <timed eval>:1 File ~/git/fastcore/fastcore/test.py:73, in test_shuffled(a, b) 71 def test_shuffled(a,b): 72 "`test` that `a` and `b` are shuffled versions of the same sequence of items" ---> 73 test_ne(a, b) 74 test_eq(Counter(a), Counter(b)) File ~/git/fastcore/fastcore/test.py:49, in test_ne(a, b) 47 def test_ne(a,b): 48 "`test` that `a!=b`" ---> 49 test(a,b,nequals,'!=') File ~/git/fastcore/fastcore/test.py:27, in test(a, b, cmp, cname) 25 "`assert` that `cmp(a,b)`; display inputs and `cname or cmp.__name__` if it fails" 26 if cname is None: cname=cmp.__name__ ---> 27 assert cmp(a,b),f"{cname}:\n{a}\n{b}" AssertionError: !=: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
class A(TensorBase): pass

# Batches should keep the tensor subclass, with and without worker processes
for nw in (0,2):
    t = A(tensor([1,2]))
    dl = DataLoader([t,t,t,t,t,t,t,t], bs=4, num_workers=nw)
    b = first(dl)
    test_eq(type(b), A)

    t = (A(tensor([1,2])),)
    dl = DataLoader([t,t,t,t,t,t,t,t], bs=4, num_workers=nw)
    b = first(dl)
    test_eq(type(b[0]), A)
# Demo: shuffled batches of up to 32 items drawn from 50 integers, loaded with 3 workers
list(DataLoader(list(range(50)),bs=32,shuffle=True,num_workers=3))
[tensor([42, 12, 44, 21, 8, 6, 3, 37, 33, 9, 27, 34, 18, 26, 1, 23, 11, 41,
15, 0, 49, 4, 38, 46, 48, 14, 40, 36, 17, 45, 30, 29]),
tensor([19, 10, 22, 13, 25, 32, 35, 5, 2, 20, 47, 39, 16, 28, 43, 7, 31, 24])]
class A(TensorBase): pass
t = A(tensor(1,2))

tdl = DataLoader([t,t,t,t,t,t,t,t], bs=4, num_workers=2, after_batch=to_device)
b = first(tdl)
test_eq(type(b), A)

# Unknown attributes are delegated to `dataset`
test_eq(tdl.pop(), tensor(1,2))
重写 `get_idxs`,使其在数据加载器被消耗完之前始终返回相同的索引。这是为了测试 `num_workers` > 1 时采样行为的一致性。
class AdamantDL(DataLoader):
    # Return one random index repeated `n` times, so a whole pass reads a single item
    def get_idxs(self):
        r=random.randint(0,self.n-1)
        return [r] * self.n

# Every batch from every worker must contain the same single value
test_eq(torch.cat(tuple(AdamantDL((list(range(50))),bs=16,num_workers=4))).unique().numel(),1)
导出 -
from nbdev import nbdev_export
nbdev_export()
# from subprocess import Popen, PIPE
# # test num_workers > 0 in scripts works when python process start method is spawn
# process = Popen(["python", "dltest.py"], stdout=PIPE)
# _, err = process.communicate(timeout=15)
# exit_code = process.wait()
# test_eq(exit_code, 0)