数据块



! [ -e /content ] && pip install -Uqq fastai  # upgrade fastai on Colab


from __future__ import annotations
from fastai.torch_basics import *
from fastai.data.core import *
from fastai.data.load import *
from fastai.data.external import *
from fastai.data.transforms import *


from nbdev.showdoc import *

高级 API 可以快速将您的数据加载到 DataLoaders 中。

TransformBlock -


class TransformBlock():
    "A basic wrapper that links default transforms for the data block API"
    def __init__(self,
        type_tfms:list=None, # One or more `Transform`s
        item_tfms:list=None, # `ItemTransform`s, applied on an item
        batch_tfms:list=None, # `Transform`s or `RandTransform`s, applied by batch
        dl_type:TfmdDL=None, # Task specific `TfmdDL`, defaults to `TfmdDL`
        dls_kwargs:dict=None, # Additional arguments to be passed to `DataLoaders`
    ):
        # Each group of transforms is normalised to an `L`
        self.type_tfms = L(type_tfms)
        # `ToTensor` is always combined with whatever item transforms were supplied
        self.item_tfms = ToTensor + L(item_tfms)
        self.batch_tfms = L(batch_tfms)
        self.dl_type = dl_type
        # A fresh dict is created when none is given
        self.dls_kwargs = dls_kwargs if dls_kwargs is not None else {}


def CategoryBlock(
    vocab:MutableSequence|pd.Series=None, # List of unique class names
    sort:bool=True, # Sort the classes alphabetically
    add_na:bool=False, # Add `#na#` to `vocab`
):
    "`TransformBlock` for single-label categorical targets"
    # All three options are forwarded unchanged to `Categorize`
    categorize = Categorize(vocab=vocab, sort=sort, add_na=add_na)
    return TransformBlock(type_tfms=categorize)


def MultiCategoryBlock(
    encoded:bool=False, # Whether the data comes in one-hot encoded format
    vocab:MutableSequence|pd.Series=None, # List of unique class names
    add_na:bool=False, # Add `#na#` to `vocab`
):
    "`TransformBlock` for multi-label categorical targets"
    if encoded:
        # Already-encoded targets: only `vocab` is forwarded, `add_na` is not used on this path
        type_tfms = EncodedMultiCategorize(vocab=vocab)
    else:
        # Raw labels: categorize first, then one-hot encode
        type_tfms = [MultiCategorize(vocab=vocab, add_na=add_na), OneHotEncode]
    return TransformBlock(type_tfms=type_tfms)


def RegressionBlock(
    n_out:int=None, # Number of output values
):
    "`TransformBlock` for float targets"
    # `n_out` is handed to `RegressionSetup` as its `c` argument
    setup = RegressionSetup(c=n_out)
    return TransformBlock(type_tfms=setup)

通用API


from inspect import isfunction,ismethod


def _merge_grouper(o):
    if isinstance(o, LambdaType): return id(o)
    elif isinstance(o, type): return o
    elif (isfunction(o) or ismethod(o)): return o.__qualname__
    return o.__class__


def _merge_tfms(*tfms):
    "Group the `tfms` in a single list, keeping the last entry of each `_merge_grouper` group, and instantiating"
    grouped = groupby(concat(*tfms), _merge_grouper)
    # Only the last member of every group survives
    survivors = L(members[-1] for members in grouped.values())
    return survivors.map(instantiate)

def _zip(x):
    "Wrap `x` in an `L` and return the result of its `zip` method"
    return L(x).zip()


from fastai.vision.core import *
from fastai.vision.data import *


tfms = _merge_tfms([Categorize, MultiCategorize, Categorize(['dog', 'cat'])], Categorize(['a', 'b']))
#If there are several instantiated versions, the last one is kept.
test_eq(len(tfms), 2)
test_eq(tfms[1].__class__, MultiCategorize)
test_eq(tfms[0].__class__, Categorize)
test_eq(tfms[0].vocab, ['a', 'b'])

tfms = _merge_tfms([PILImage.create, PILImage.show])
#Check methods are properly separated
test_eq(len(tfms), 2)
tfms = _merge_tfms([show_image, set_trace])
#Check functions are properly separated
test_eq(len(tfms), 2)

#Two distinct lambdas stay separate, while the same lambda given twice is merged
_f = lambda x: 0
test_eq(len(_merge_tfms([_f,lambda x: 1])), 2)
test_eq(len(_merge_tfms([_f,_f])), 1)


@docs
@funcs_kwargs
class DataBlock():
    "Generic container to quickly build `Datasets` and `DataLoaders`."
    # Hooks named in `_methods`; all default to `None` (presumably filled from keyword arguments by `funcs_kwargs` — confirm)
    get_x=get_items=splitter=get_y = None
    # Class-level defaults used when `blocks`/`dl_type` are not given
    blocks,dl_type = (TransformBlock,TransformBlock),TfmdDL
    _methods = 'get_items splitter get_y get_x'.split()
    _msg = "If you wanted to compose several transforms in your getter don't forget to wrap them in a `Pipeline`."
    def __init__(self, 
        blocks:list=None, # One or more `TransformBlock`s
        dl_type:TfmdDL=None, # Task specific `TfmdDL`, defaults to `block`'s dl_type or`TfmdDL`
        getters:list=None, # Getter functions applied to results of `get_items`
        n_inp:int=None, # Number of inputs
        item_tfms:list=None, # `ItemTransform`s, applied on an item 
        batch_tfms:list=None, # `Transform`s or `RandTransform`s, applied by batch
        **kwargs, 
    ):
        blocks = L(self.blocks if blocks is None else blocks)
        # Blocks given as callables (classes or factory functions) are called; other objects are kept as they are
        blocks = L(b() if callable(b) else b for b in blocks)
        self.type_tfms = blocks.attrgot('type_tfms', L())
        self.default_item_tfms  = _merge_tfms(*blocks.attrgot('item_tfms',  L()))
        self.default_batch_tfms = _merge_tfms(*blocks.attrgot('batch_tfms', L()))
        # The last block that defines a `dl_type` wins, unless one is passed explicitly
        for b in blocks:
            if getattr(b, 'dl_type', None) is not None: self.dl_type = b.dl_type
        if dl_type is not None: self.dl_type = dl_type
        # Rebind `dataloaders` on the instance through `delegates` with the chosen `dl_type.__init__`
        self.dataloaders = delegates(self.dl_type.__init__)(self.dataloaders)
        self.dls_kwargs = merge(*blocks.attrgot('dls_kwargs', {}))

        # By default every block except the last one is an input (with at least one input)
        self.n_inp = ifnone(n_inp, max(1, len(blocks)-1))
        # Copy into a fresh list: the slice assignments below must neither mutate the
        # caller's `getters` nor fail when `getters` is an immutable sequence such as a tuple
        self.getters = list(ifnone(getters, [noop]*len(self.type_tfms)))
        if self.get_x:
            if len(L(self.get_x)) != self.n_inp:
                raise ValueError(f'get_x contains {len(L(self.get_x))} functions, but must contain {self.n_inp} (one for each input)\n{self._msg}')
            self.getters[:self.n_inp] = L(self.get_x)
        if self.get_y:
            n_targs = len(self.getters) - self.n_inp
            if len(L(self.get_y)) != n_targs:
                raise ValueError(f'get_y contains {len(L(self.get_y))} functions, but must contain {n_targs} (one for each target)\n{self._msg}')
            self.getters[self.n_inp:] = L(self.get_y)

        # Any keyword argument still left at this point is rejected
        if kwargs: raise TypeError(f'invalid keyword arguments: {", ".join(kwargs.keys())}')
        self.new(item_tfms, batch_tfms)

    # For each block: its getter(s) (the functions of a `Pipeline` getter, or the getter itself) followed by the block's type transforms
    def _combine_type_tfms(self): return L([self.getters, self.type_tfms]).map_zip(
        lambda g,tt: (g.fs if isinstance(g, Pipeline) else L(g)) + tt)

    # NOTE: sets `item_tfms`/`batch_tfms` on `self` and returns `self`; no copy is made here
    def new(self, 
        item_tfms:list=None, # `ItemTransform`s, applied on an item
        batch_tfms:list=None, # `Transform`s or `RandTransform`s, applied by batch 
    ):
        self.item_tfms  = _merge_tfms(self.default_item_tfms,  item_tfms)
        self.batch_tfms = _merge_tfms(self.default_batch_tfms, batch_tfms)
        return self

    @classmethod
    def from_columns(cls, 
        blocks:list =None, # One or more `TransformBlock`s
        getters:list =None, # Getter functions applied to results of `get_items`
        get_items:callable=None, # A function to get items
        **kwargs,
    ):
        # Default getters pick position `i` of each item, one per block (two when `blocks` is not given)
        if getters is None: getters = L(ItemGetter(i) for i in range(2 if blocks is None else len(L(blocks))))
        # `_zip` is applied after the user's `get_items`, or on its own when none is given
        get_items = _zip if get_items is None else compose(get_items, _zip)
        return cls(blocks=blocks, getters=getters, get_items=get_items, **kwargs)

    def datasets(self, 
        source, # The data source
        verbose:bool=False, # Show verbose messages
    ) -> Datasets:
        self.source = source                     ; pv(f"Collecting items from {source}", verbose)
        # Without `get_items`, `source` itself is used as the items
        items = (self.get_items or noop)(source) ; pv(f"Found {len(items)} items", verbose)
        # Without a `splitter`, a `RandomSplitter()` is used
        splits = (self.splitter or RandomSplitter())(items)
        pv(f"{len(splits)} datasets of sizes {','.join([str(len(s)) for s in splits])}", verbose)
        return Datasets(items, tfms=self._combine_type_tfms(), splits=splits, dl_type=self.dl_type, n_inp=self.n_inp, verbose=verbose)

    def dataloaders(self, 
        source, # The data source
        path:str='.', # Data source and default `Learner` path 
        verbose:bool=False, # Show verbose messages
        **kwargs
    ) -> DataLoaders:
        dsets = self.datasets(source, verbose=verbose)
        # Call-time `kwargs` override the blocks' `dls_kwargs`; `verbose` always comes from the argument
        kwargs = {**self.dls_kwargs, **kwargs, 'verbose': verbose}
        return dsets.dataloaders(path=path, after_item=self.item_tfms, after_batch=self.batch_tfms, **kwargs)

    _docs = dict(new="Create a new `DataBlock` with other `item_tfms` and `batch_tfms`",
                 datasets="Create a `Datasets` object from `source`",
                 dataloaders="Create a `DataLoaders` object from `source`")

要构建一个 DataBlock，您需要向库提供四样内容：输入和标签的类型，以及至少两个函数：get_items 和 splitter。您可能还需要提供 get_x 和 get_y，或者一个更通用的 getters 列表，其中的函数会应用于 get_items 的结果。

splitter 是一个可调用对象，当传入 items 时，它会返回一个元组，包含表示训练数据和验证数据索引的可迭代对象。

一旦提供了这些，您将自动获得 Datasets 或 DataLoaders：

# Render the documentation of `DataBlock.datasets`
show_doc(DataBlock.datasets)

`DataBlock.datasets`[source]

DataBlock.datasets(source, verbose:bool=False)

Create a Datasets object from source

	Type	Default	Details
`source`			The data source
`verbose`	`bool`	`False`	Show verbose messages


# An instance is documented here because `DataBlock.__init__` rebinds `dataloaders` through `delegates` on each instance
dblock = DataBlock()
show_doc(dblock.dataloaders, name="DataBlock.dataloaders")

`DataBlock.dataloaders`[source]

DataBlock.dataloaders(source, path:str='.', verbose:bool=False, bs=64, shuffle=False, num_workers=None, do_setup=True, pin_memory=False, timeout=0, batch_size=None, drop_last=False, indexed=None, n=None, device=None, persistent_workers=False, wif=None, before_iter=None, after_item=None, before_batch=None, after_batch=None, after_iter=None, create_batches=None, create_item=None, create_batch=None, retain=None, get_idxs=None, sample=None, shuffle_fn=None, do_batch=None)

Create a DataLoaders object from source

	Type	Default	Details
`source`			The data source
`path`	`str`	`.`	Data source and default `Learner` path
`verbose`	`bool`	`False`	Show verbose messages
		Valid Keyword Arguments
`bs`	`int`	`64`	Argument passed to `TfmdDL.__init__`
`shuffle`	`bool`	`False`	Argument passed to `TfmdDL.__init__`
`num_workers`	`NoneType`	`None`	Argument passed to `TfmdDL.__init__`
`do_setup`	`bool`	`True`	Argument passed to `TfmdDL.__init__`
`pin_memory`	`bool`	`False`	Argument passed to `TfmdDL.__init__`
`timeout`	`int`	`0`	Argument passed to `TfmdDL.__init__`
`batch_size`	`NoneType`	`None`	Argument passed to `TfmdDL.__init__`
`drop_last`	`bool`	`False`	Argument passed to `TfmdDL.__init__`
`indexed`	`NoneType`	`None`	Argument passed to `TfmdDL.__init__`
`n`	`NoneType`	`None`	Argument passed to `TfmdDL.__init__`
`device`	`NoneType`	`None`	Argument passed to `TfmdDL.__init__`
`persistent_workers`	`bool`	`False`	Argument passed to `TfmdDL.__init__`
`wif`	`NoneType`	`None`	Argument passed to `TfmdDL.__init__`
`before_iter`	`NoneType`	`None`	Argument passed to `TfmdDL.__init__`
`after_item`	`NoneType`	`None`	Argument passed to `TfmdDL.__init__`
`before_batch`	`NoneType`	`None`	Argument passed to `TfmdDL.__init__`
`after_batch`	`NoneType`	`None`	Argument passed to `TfmdDL.__init__`
`after_iter`	`NoneType`	`None`	Argument passed to `TfmdDL.__init__`
`create_batches`	`NoneType`	`None`	Argument passed to `TfmdDL.__init__`
`create_item`	`NoneType`	`None`	Argument passed to `TfmdDL.__init__`
`create_batch`	`NoneType`	`None`	Argument passed to `TfmdDL.__init__`
`retain`	`NoneType`	`None`	Argument passed to `TfmdDL.__init__`
`get_idxs`	`NoneType`	`None`	Argument passed to `TfmdDL.__init__`
`sample`	`NoneType`	`None`	Argument passed to `TfmdDL.__init__`
`shuffle_fn`	`NoneType`	`None`	Argument passed to `TfmdDL.__init__`
`do_batch`	`NoneType`	`None`	Argument passed to `TfmdDL.__init__`
Returns	`DataLoaders`

您可以通过传递函数来创建 DataBlock：

# Inputs are `PILImageBW` images and targets single-label categories; items come from
# `get_image_files`, are split by `GrandparentSplitter()` and labelled with `parent_label`
mnist = DataBlock(blocks = (ImageBlock(cls=PILImageBW),CategoryBlock),
                  get_items = get_image_files,
                  splitter = GrandparentSplitter(),
                  get_y = parent_label)

每种类型都有默认的变换，将会被应用：

在基础层面：从基本元素（如文件名）创建元组中的各项（通常是输入和目标）
在数据集的项目层面
在批处理层面

它们分别称为类型变换、项目变换、批处理变换。在MNIST的情况下，类型变换是创建PILImageBW的方法（用于输入）和Categorize变换（用于目标），项目变换是ToTensor，默认的批处理变换是IntToFloatTensor。您可以通过在DataBlock.datasets或DataBlock.dataloaders中传递它们来添加任何其他变换。

# Default transforms collected from the two blocks
test_eq(mnist.type_tfms[0], [PILImageBW.create])
test_eq(mnist.type_tfms[1].map(type), [Categorize])
test_eq(mnist.default_item_tfms.map(type), [ToTensor])
test_eq(mnist.default_batch_tfms.map(type), [IntToFloatTensor])

# Build the datasets from MNIST_TINY and inspect the first training sample
dsets = mnist.datasets(untar_data(URLs.MNIST_TINY))
test_eq(dsets.vocab, ['3', '7'])
x,y = dsets.train[0]
test_eq(x.size,(28,28))
show_at(dsets.train, 0, cmap='Greys', figsize=(2,2));

# Unknown keyword arguments must be rejected
test_fail(lambda: DataBlock(wrong_kwarg=42, wrong_kwarg2='foo'))

我们可以将任意数量的块传递给 DataBlock，然后通过更改 n_inp 来定义输入块和目标块。例如，定义 n_inp=2 将把传递的前两个块视为输入，其余块视为目标。

# Three blocks and no explicit `n_inp`: the first two are inputs, the last one is the target
mnist = DataBlock((ImageBlock, ImageBlock, CategoryBlock), get_items=get_image_files, splitter=GrandparentSplitter(),
                   get_y=parent_label)
dsets = mnist.datasets(untar_data(URLs.MNIST_TINY))
test_eq(mnist.n_inp, 2)
test_eq(len(dsets.train[0]), 3)

# Two `get_y` functions for a single target must fail. `contains` (not `msg`) is what `test_fail`
# matches against the raised error, so the expected text is really checked; the wording follows
# the `ValueError` raised by `DataBlock.__init__` ("one for each target")
test_fail(lambda: DataBlock((ImageBlock, ImageBlock, CategoryBlock), get_items=get_image_files, splitter=GrandparentSplitter(),
                  get_y=[parent_label, noop],
                  n_inp=2), contains='get_y contains 2 functions, but must contain 1 (one for each target)')

# With `n_inp=1` the last two blocks are targets, so two `get_y` entries are required (one may be a `Pipeline`)
mnist = DataBlock((ImageBlock, ImageBlock, CategoryBlock), get_items=get_image_files, splitter=GrandparentSplitter(),
                  n_inp=1,
                  get_y=[noop, Pipeline([noop, parent_label])])
dsets = mnist.datasets(untar_data(URLs.MNIST_TINY))
test_eq(len(dsets.train[0]), 3)

调试


def _short_repr(x):
    if isinstance(x, tuple): return f'({", ".join([_short_repr(y) for y in x])})'
    if isinstance(x, list): return f'[{", ".join([_short_repr(y) for y in x])}]'
    if not isinstance(x, Tensor): return str(x)
    if x.numel() <= 20 and x.ndim <=1: return str(x)
    return f'{x.__class__.__name__} of size {"x".join([str(d) for d in x.shape])}'


# A 2-d tensor is summarised by class name and size
test_eq(_short_repr(TensorImage(torch.randn(40,56))), 'TensorImage of size 40x56')
# A short 1-d tensor keeps its full `str`
test_eq(_short_repr(TensorCategory([1,2,3])), 'TensorCategory([1, 2, 3])')
# Tuples are rendered member by member
test_eq(_short_repr((TensorImage(torch.randn(40,56)), TensorImage(torch.randn(32,20)))),
        '(TensorImage of size 40x56, TensorImage of size 32x20)')


def _apply_pipeline(p, x):
    "Run `x` through each transform in `p.fs`, printing every intermediate result, and return the final value"
    print(f"  {p}\n    starting from\n      {_short_repr(x)}")
    for tfm in p.fs:
        tfm_name = tfm.name
        try:
            x = tfm(x)
            # Steps named "noop" are executed but not reported
            if tfm_name != "noop":
                print(f"    applying {tfm_name} gives\n      {_short_repr(x)}")
        except Exception as e:
            # Name the failing step before letting the error propagate
            print(f"    applying {tfm_name} failed.")
            raise e
    return x


from fastai.data.load import _collate_types

def _find_fail_collate(s):
    "Return a message describing why the samples `s` cannot be collated, or `None` when no culprit is found"
    # NOTE(review): `L(*s)` passes the samples as separate arguments; with a single sample this
    # presumably yields that sample's members instead of a one-sample list — confirm against fastcore `L`
    s = L(*s)
    # First look for a member of the first sample whose type is not in `_collate_types`
    for x in s[0]:
        if not isinstance(x, _collate_types): return f"{type(x).__name__} is not collatable"
    # Then try to collate the samples position by position and report the first position that fails
    for i in range_of(s[0]):
        # `except Exception` (not a bare `except`) so `KeyboardInterrupt`/`SystemExit` still propagate
        try: _ = default_collate(s.itemgot(i))
        except Exception:
            shapes = [getattr(o[i], 'shape', None) for o in s]
            return f"Could not collate the {i}-th members of your tuples because got the following shapes\n{','.join([str(s) for s in shapes])}"


@patch
def summary(self:DataBlock,
    source, # The data source
    bs:int=4, # The batch size
    show_batch:bool=False, # Call `show_batch` after the summary
    **kwargs, # Additional keyword arguments to `show_batch`
):
    "Steps through the transform pipeline for one batch, and optionally calls `show_batch(**kwargs)` on the transient `Dataloaders`."
    def _has_tfms(pipe):
        # True when `pipe` holds at least one transform whose name is not 'noop'
        return any(f.name != 'noop' for f in pipe.fs)

    # Step 1: type transforms, traced on the first training item
    print("Setting-up type transforms pipelines")
    dsets = self.datasets(source, verbose=True)
    print("\nBuilding one sample")
    for tl in dsets.train.tls:
        _apply_pipeline(tl.tfms, get_first(dsets.train.items))
    print(f"\nFinal sample: {dsets.train[0]}\n\n")

    # Step 2: item transforms, traced on the first sample and applied silently to the others
    dls = self.dataloaders(source, bs=bs, verbose=True)
    train_dl = dls.train
    print("\nBuilding one batch")
    if _has_tfms(train_dl.after_item):
        print("Applying item_tfms to the first sample:")
        samples = [_apply_pipeline(train_dl.after_item, dsets.train[0])]
        print(f"\nAdding the next {bs-1} samples")
        samples += [train_dl.after_item(dsets.train[i]) for i in range(1, bs)]
    else:
        print("No item_tfms to apply")
        samples = [train_dl.after_item(dsets.train[i]) for i in range(bs)]

    # Step 3: transforms applied to the list of samples before collation
    if _has_tfms(train_dl.before_batch):
        print("\nApplying before_batch to the list of samples")
        samples = _apply_pipeline(train_dl.before_batch, samples)
    else:
        print("\nNo before_batch transform to apply")

    # Step 4: collation, with a diagnosis printed before the error is re-raised
    print("\nCollating items in a batch")
    try:
        batch = train_dl.create_batch(samples)
        batch = retain_types(batch, samples[0] if is_listy(samples) else samples)
    except Exception as e:
        print("Error! It's not possible to collate your items in a batch")
        reason = _find_fail_collate(samples)
        if reason is None:
            print("Make sure all parts of your samples are tensors of the same size")
        else:
            print(reason)
        raise e

    # Step 5: batch transforms, applied after moving the batch to the loaders' device
    if _has_tfms(train_dl.after_batch):
        print("\nApplying batch_tfms to the batch built")
        batch = to_device(batch, dls.device)
        batch = _apply_pipeline(train_dl.after_batch, batch)
    else:
        print("\nNo batch_tfms to apply")

    if show_batch:
        dls.show_batch(**kwargs)

# Render the documentation of `DataBlock.summary`
show_doc(DataBlock.summary)

`DataBlock.summary`[source]

DataBlock.summary(source, bs:int=4, show_batch:bool=False, **kwargs)

Steps through the transform pipeline for one batch, and optionally calls show_batch(**kwargs) on the transient Dataloaders.

	Type	Default	Details
`source`			The data source
`bs`	`int`	`4`	The batch size
`show_batch`	`bool`	`False`	Call `show_batch` after the summary
`kwargs`			No Content

除了逐步遍历转换过程，summary() 提供了一个快捷方式 dls.show_batch(...)，用于查看数据。例如：

pets.summary(path/"images", bs=8, show_batch=True, unique=True,...)

是以下代码的快捷方式：

pets.summary(path/"images", bs=8)
dls = pets.dataloaders(path/"images", bs=8)
dls.show_batch(unique=True,...)  # 查看不同的转换效果在同一图像上的影响。

导出 -


# Run the nbdev export; the "Converted ..." lines that follow are presumably its recorded output
from nbdev import nbdev_export
nbdev_export()

Converted 00_torch_core.ipynb.
Converted 01_layers.ipynb.
Converted 01a_losses.ipynb.
Converted 02_data.load.ipynb.
Converted 03_data.core.ipynb.
Converted 04_data.external.ipynb.
Converted 05_data.transforms.ipynb.
Converted 06_data.block.ipynb.
Converted 07_vision.core.ipynb.
Converted 08_vision.data.ipynb.
Converted 09_vision.augment.ipynb.
Converted 09b_vision.utils.ipynb.
Converted 09c_vision.widgets.ipynb.
Converted 10_tutorial.pets.ipynb.
Converted 10b_tutorial.albumentations.ipynb.
Converted 11_vision.models.xresnet.ipynb.
Converted 12_optimizer.ipynb.
Converted 13_callback.core.ipynb.
Converted 13a_learner.ipynb.
Converted 13b_metrics.ipynb.
Converted 14_callback.schedule.ipynb.
Converted 14a_callback.data.ipynb.
Converted 15_callback.hook.ipynb.
Converted 15a_vision.models.unet.ipynb.
Converted 16_callback.progress.ipynb.
Converted 17_callback.tracker.ipynb.
Converted 18_callback.fp16.ipynb.
Converted 18a_callback.training.ipynb.
Converted 18b_callback.preds.ipynb.
Converted 19_callback.mixup.ipynb.
Converted 20_interpret.ipynb.
Converted 20a_distributed.ipynb.
Converted 21_vision.learner.ipynb.
Converted 22_tutorial.imagenette.ipynb.
Converted 23_tutorial.vision.ipynb.
Converted 24_tutorial.image_sequence.ipynb.
Converted 24_tutorial.siamese.ipynb.
Converted 24_vision.gan.ipynb.
Converted 30_text.core.ipynb.
Converted 31_text.data.ipynb.
Converted 32_text.models.awdlstm.ipynb.
Converted 33_text.models.core.ipynb.
Converted 34_callback.rnn.ipynb.
Converted 35_tutorial.wikitext.ipynb.
Converted 37_text.learner.ipynb.
Converted 38_tutorial.text.ipynb.
Converted 39_tutorial.transformers.ipynb.
Converted 40_tabular.core.ipynb.
Converted 41_tabular.data.ipynb.
Converted 42_tabular.model.ipynb.
Converted 43_tabular.learner.ipynb.
Converted 44_tutorial.tabular.ipynb.
Converted 45_collab.ipynb.
Converted 46_tutorial.collab.ipynb.
Converted 50_tutorial.datablock.ipynb.
Converted 60_medical.imaging.ipynb.
Converted 61_tutorial.medical_imaging.ipynb.
Converted 65_medical.text.ipynb.
Converted 70_callback.wandb.ipynb.
Converted 71_callback.tensorboard.ipynb.
Converted 72_callback.neptune.ipynb.
Converted 73_callback.captum.ipynb.
Converted 74_callback.azureml.ipynb.
Converted 97_test_utils.ipynb.
Converted 99_pytorch_doc.ipynb.
Converted dev-setup.ipynb.
Converted app_examples.ipynb.
Converted camvid.ipynb.
Converted migrating_catalyst.ipynb.
Converted migrating_ignite.ipynb.
Converted migrating_lightning.ipynb.
Converted migrating_pytorch.ipynb.
Converted migrating_pytorch_verbose.ipynb.
Converted ulmfit.ipynb.
Converted index.ipynb.
Converted index_original.ipynb.
Converted quick_start.ipynb.
Converted tutorial.ipynb.

TransformBlock -

通用API

DataBlock.datasets[source]

DataBlock.dataloaders[source]

调试

DataBlock.summary[source]

导出 -

`DataBlock.datasets`[source]

`DataBlock.dataloaders`[source]

`DataBlock.summary`[source]