视觉数据

! [ -e /content ] && pip install -Uqq fastai  # 在Colab上升级fastai
from __future__ import annotations
from fastai.torch_basics import *
from fastai.data.all import *
from fastai.vision.core import *
import types
from nbdev.showdoc import *
# from fastai.vision.augment import *

用于在视觉应用程序中获取 DataLoaders 的辅助函数以及更高级的 ImageDataLoaders

本模块中定义的主要类是 ImageDataLoaders 和 SegmentationDataLoaders,所以你可能想直接查看它们的定义。它们提供的工厂方法是快速准备训练数据的好方法,更多示例请参见视觉教程。

辅助函数

@delegates(subplots)
def get_grid(
    n:int, # Number of axes in the returned grid
    nrows:int=None, # Number of rows in the returned grid, defaulting to `int(math.sqrt(n))`
    ncols:int=None, # Number of columns in the returned grid, defaulting to `ceil(n/nrows)`
    figsize:tuple=None, # Width, height in inches of the returned figure
    double:bool=False, # Whether to double the number of columns and `n`
    title:str=None, # If passed, title set to the figure
    return_fig:bool=False, # Whether to return the figure created by `subplots`
    flatten:bool=True, # Whether to flatten the matplot axes such that they can be iterated over with a single loop
    **kwargs,
) -> (plt.Figure, plt.Axes): # Returns just `axs` by default, and (`fig`, `axs`) if `return_fig` is set to True
    "Return a grid of `n` axes, `nrows` by `ncols`"
    if nrows:
        ncols = ncols or int(np.ceil(n/nrows))
    elif ncols:
        nrows = nrows or int(np.ceil(n/ncols))
    else:
        # Clamp to one row so that `n == 0` does not divide by zero on the next line
        nrows = max(1, int(math.sqrt(n)))
        ncols = int(np.ceil(n/nrows))
    # With `n == 0` the computed counts can be 0; always request at least a 1x1 grid
    nrows,ncols = max(1, nrows),max(1, ncols)
    if double: ncols*=2 ; n*=2
    fig,axs = subplots(nrows, ncols, figsize=figsize, **kwargs)
    if flatten:
        flat_axs = list(axs.flatten())
        # The grid may hold more cells than requested: switch the surplus axes off and drop them
        for ax in flat_axs[n:]: ax.set_axis_off()
        axs = flat_axs[:n]
    if title is not None: fig.suptitle(title, weight='bold', size=14)
    return (fig,axs) if return_fig else axs

这个函数被 show_batch 和 show_results 的类型分派版本用于视觉应用。默认的 figsize 为 (cols*imsize, rows*imsize+0.6)。imsize 会传递给 subplots;suptitle、sharex、sharey、squeeze、subplot_kw 和 gridspec_kw 都会传递给 plt.subplots。如果 return_fig 为 True,则返回 fig, axs,否则仅返回 axs。

def clip_remove_empty(
    bbox:TensorBBox, # Coordinates of bounding boxes
    label:TensorMultiCategory # Labels of the bounding boxes
):
    "Clip bounding boxes with image border and remove empty boxes along with corresponding labels"
    bbox = torch.clamp(bbox, -1, 1)
    # The last axis is read as (x1, y1, x2, y2): index 2 minus 0 and index 3 minus 1 give the extents
    width  = bbox[...,2] - bbox[...,0]
    height = bbox[...,3] - bbox[...,1]
    # Test each extent on its own: a signed area `width*height` is positive for a box that is
    # inverted on both axes, which would wrongly keep it
    empty = (width <= 0.) | (height <= 0.)
    # The mask is re-wrapped as `TensorBase` before indexing `label`, exactly as before
    return (bbox[~empty], label[TensorBase(~empty)])

这是用于 bb_pad 的。

# Five boxes: box 0 sticks out of [-1, 1], boxes 1 and 3 lie inside, box 2 has x2 < x1,
# and box 4 collapses to zero width once clamped to [-1, 1]
bb = TensorBBox([[-2,-0.5,0.5,1.5], [-0.5,-0.5,0.5,0.5], [1,0.5,0.5,0.75], [-0.5,-0.5,0.5,0.5], [-2, -0.5, -1.5, 0.5]])
bb,lbl = clip_remove_empty(bb, TensorMultiCategory([1,2,3,2,5]))
# Boxes 0, 1 and 3 survive (box 0 clipped), and only their labels are kept
test_eq(bb, TensorBBox([[-1,-0.5,0.5,1.], [-0.5,-0.5,0.5,0.5], [-0.5,-0.5,0.5,0.5]]))
test_eq(lbl, TensorMultiCategory([1,2,2]))
def bb_pad(
    samples:list, # List of 3-tuples like (image, bounding_boxes, labels)
    pad_idx=0 # Label that will be used to pad each list of labels
):
    "Clip and filter the boxes of every sample, then pad boxes and labels to the longest label list using `pad_idx`."
    cleaned = [(sample[0], *clip_remove_empty(*sample[1:])) for sample in samples]
    longest = max([len(lbl) for _,_,lbl in cleaned])
    def _pad(img, bbox, lbl):
        # Boxes are padded with all-zero rows, labels with `pad_idx`
        box_fill = bbox.new_zeros(longest - bbox.shape[0], 4)
        lbl_fill = lbl.new_zeros(longest - lbl.shape[0]) + pad_idx
        return img, torch.cat([bbox, box_fill]), torch.cat([lbl, lbl_fill])
    return [_pad(*sample) for sample in cleaned]

这在 BBoxBlock 中使用。

# Two samples: the first has four boxes (the third has x2 < x1), the second has two
img1,img2 = TensorImage(torch.randn(16,16,3)),TensorImage(torch.randn(16,16,3))
bb1 = tensor([[-2,-0.5,0.5,1.5], [-0.5,-0.5,0.5,0.5], [1,0.5,0.5,0.75], [-0.5,-0.5,0.5,0.5]])
lbl1 = tensor([1, 2, 3, 2])
bb2 = tensor([[-0.5,-0.5,0.5,0.5], [-0.5,-0.5,0.5,0.5]])
lbl2 = tensor([2, 2])
samples = [(img1, bb1, lbl1), (img2, bb2, lbl2)]
res = bb_pad(samples)
# NOTE(review): `non_empty` is not used by any assertion below
non_empty = tensor([True,True,False,True])
# First sample: three boxes remain after clipping/filtering, so no padding is needed
test_eq(res[0][0], img1)
test_eq(res[0][1], tensor([[-1,-0.5,0.5,1.], [-0.5,-0.5,0.5,0.5], [-0.5,-0.5,0.5,0.5]]))
test_eq(res[0][2], tensor([1,2,2]))
# Second sample: padded up to three entries with a zero box and label `pad_idx` (0)
test_eq(res[1][0], img2)
test_eq(res[1][1], tensor([[-0.5,-0.5,0.5,0.5], [-0.5,-0.5,0.5,0.5], [0,0,0,0]]))
test_eq(res[1][2], tensor([2,2,0]))      

显示方法 -

@typedispatch
def show_batch(x:TensorImage, y, samples, ctxs=None, max_n=10, nrows=None, ncols=None, figsize=None, **kwargs):
    "Show `samples` whose input is an image, building a grid of axes when `ctxs` is not supplied."
    if ctxs is None:
        n_shown = min(len(samples), max_n)
        ctxs = get_grid(n_shown, nrows=nrows, ncols=ncols, figsize=figsize)
    # Drawing itself is handed to the implementation registered under `object`
    return show_batch[object](x, y, samples, ctxs=ctxs, max_n=max_n, **kwargs)
@typedispatch
def show_batch(x:TensorImage, y:TensorImage, samples, ctxs=None, max_n=10, nrows=None, ncols=None, figsize=None, **kwargs):
    "Show image-to-image `samples`: element 0 of each sample on the even axes, element 1 on the odd axes."
    if ctxs is None:
        n_shown = min(len(samples), max_n)
        ctxs = get_grid(n_shown, nrows=nrows, ncols=ncols, figsize=figsize, double=True)
    for pos in (0, 1):
        items = samples.itemgot(pos)
        axes = ctxs[pos::2]
        # `range(max_n)` only caps how many pairs get drawn
        ctxs[pos::2] = [item.show(ctx=ax, **kwargs) for item,ax,_ in zip(items, axes, range(max_n))]
    return ctxs

用于视觉的 TransformBlock

这些是视觉应用为数据块API提供的模块。

def ImageBlock(cls:PILBase=PILImage):
    "Build the `TransformBlock` for images created through `cls.create`"
    block = TransformBlock(type_tfms=cls.create, batch_tfms=IntToFloatTensor)
    return block
def MaskBlock(
    codes:list=None # Vocab labels for segmentation masks
):
    "Build the `TransformBlock` for segmentation masks, optionally attaching `codes`"
    add_codes = AddMaskCodes(codes=codes)
    return TransformBlock(type_tfms=PILMask.create, item_tfms=add_codes, batch_tfms=IntToFloatTensor)
# Unlike the factories above, these two are ready-made `TransformBlock` instances.
# Both use `PointScaler` as item transform.
PointBlock = TransformBlock(type_tfms=TensorPoint.create, item_tfms=PointScaler)
# `bb_pad` is passed as the `before_batch` dataloader kwarg for bounding boxes.
BBoxBlock = TransformBlock(type_tfms=TensorBBox.create, item_tfms=PointScaler, dls_kwargs = {'before_batch': bb_pad})

# Instances have no `def` docstring of their own, so `__doc__` is assigned by hand
PointBlock.__doc__ = "A `TransformBlock` for points in an image"
BBoxBlock.__doc__  = "A `TransformBlock` for bounding boxes in an image"
show_doc(PointBlock, name='PointBlock')

PointBlock

A TransformBlock for points in an image

show_doc(BBoxBlock, name='BBoxBlock')

BBoxBlock

A TransformBlock for bounding boxes in an image

def BBoxLblBlock(
    vocab:list=None, # Vocab labels for bounding boxes
    add_na:bool=True # Add NaN as a background class
):
    "Build the `TransformBlock` for labeled bounding boxes, optionally restricted to `vocab`"
    categorize = MultiCategorize(vocab=vocab, add_na=add_na)
    return TransformBlock(type_tfms=categorize, item_tfms=BBoxLabeler)

如果 add_na 为 True,将为 NaN 添加一个新类别(这将代表背景类)。

图像数据加载器 -

class ImageDataLoaders(DataLoaders):
    "Basic wrapper around several `DataLoader`s with factory methods for computer vision problems"
    @classmethod
    @delegates(DataLoaders.from_dblock)
    def from_folder(cls, path, train='train', valid='valid', valid_pct=None, seed=None, vocab=None, item_tfms=None,
                    batch_tfms=None, img_cls=PILImage, **kwargs):
        "Create from imagenet style dataset in `path` with `train` and `valid` subfolders (or provide `valid_pct`)"
        # Splitter and file listing must switch on the same `is None` test; a truthiness test on
        # one of them would pair a random split with the folder-restricted listing for `valid_pct=0`.
        use_folders = valid_pct is None
        splitter = GrandparentSplitter(train_name=train, valid_name=valid) if use_folders else RandomSplitter(valid_pct, seed=seed)
        get_items = partial(get_image_files, folders=[train, valid]) if use_folders else get_image_files
        dblock = DataBlock(blocks=(ImageBlock(img_cls), CategoryBlock(vocab=vocab)),
                           get_items=get_items,
                           splitter=splitter,
                           get_y=parent_label,
                           item_tfms=item_tfms,
                           batch_tfms=batch_tfms)
        return cls.from_dblock(dblock, path, path=path, **kwargs)

    @classmethod
    @delegates(DataLoaders.from_dblock)
    def from_path_func(cls, path, fnames, label_func, valid_pct=0.2, seed=None, item_tfms=None, batch_tfms=None,
                       img_cls=PILImage, **kwargs):
        "Create from list of `fnames` in `path`s with `label_func`"
        # `label_func` is used as `get_y`, i.e. it is applied to each item of `fnames`
        dblock = DataBlock(blocks=(ImageBlock(img_cls), CategoryBlock),
                           splitter=RandomSplitter(valid_pct, seed=seed),
                           get_y=label_func,
                           item_tfms=item_tfms,
                           batch_tfms=batch_tfms)
        return cls.from_dblock(dblock, fnames, path=path, **kwargs)

    @classmethod
    def from_name_func(cls,
        path:str|Path, # Set the default path to a directory that a `Learner` can use to save files like models
        fnames:list, # A list of `os.Pathlike`'s to individual image files
        label_func:callable, # A function that receives a string (the file name) and outputs a label
        **kwargs
    ) -> DataLoaders:
        "Create from the name attrs of `fnames` in `path`s with `label_func`"
        if sys.platform == 'win32' and isinstance(label_func, types.LambdaType) and label_func.__name__ == '<lambda>':
            # Lambdas are rejected up front on Windows; see the pickle/multiprocessing write-up:
            # https://medium.com/@jwnx/multiprocessing-serialization-in-python-with-pickle-9844f6fa1812
            raise ValueError("label_func couldn't be lambda function on Windows")
        # Wrap `label_func` so that it receives the `name` attribute of each item
        f = using_attr(label_func, 'name')
        return cls.from_path_func(path, fnames, f, **kwargs)

    @classmethod
    def from_path_re(cls, path, fnames, pat, **kwargs):
        "Create from list of `fnames` in `path`s with re expression `pat`"
        return cls.from_path_func(path, fnames, RegexLabeller(pat), **kwargs)

    # No `@delegates(DataLoaders.from_dblock)` here: it consumed `**kwargs` in the signature, so the
    # later delegation to `from_name_func` was skipped and the documented signature lacked
    # `valid_pct`, `seed`, `item_tfms`, `batch_tfms` and `img_cls`. Runtime forwarding is unchanged.
    @classmethod
    def from_name_re(cls, path, fnames, pat, **kwargs):
        "Create from the name attrs of `fnames` in `path`s with re expression `pat`"
        return cls.from_name_func(path, fnames, RegexLabeller(pat), **kwargs)

    @classmethod
    @delegates(DataLoaders.from_dblock)
    def from_df(cls, df, path='.', valid_pct=0.2, seed=None, fn_col=0, folder=None, suff='', label_col=1, label_delim=None,
                y_block=None, valid_col=None, item_tfms=None, batch_tfms=None, img_cls=PILImage, **kwargs):
        "Create from `df` using `fn_col` and `label_col`"
        # Directory prefix (with trailing separator) handed to `ColReader` for the `fn_col` values
        pref = f'{Path(path) if folder is None else Path(path)/folder}{os.path.sep}'
        if y_block is None:
            # Several label columns or a label delimiter select the multi-label block
            is_multi = (is_listy(label_col) and len(label_col) > 1) or label_delim is not None
            y_block = MultiCategoryBlock if is_multi else CategoryBlock
        # An explicit validation column takes precedence over the random split
        splitter = RandomSplitter(valid_pct, seed=seed) if valid_col is None else ColSplitter(valid_col)
        dblock = DataBlock(blocks=(ImageBlock(img_cls), y_block),
                           get_x=ColReader(fn_col, pref=pref, suff=suff),
                           get_y=ColReader(label_col, label_delim=label_delim),
                           splitter=splitter,
                           item_tfms=item_tfms,
                           batch_tfms=batch_tfms)
        return cls.from_dblock(dblock, df, path=path, **kwargs)

    @classmethod
    def from_csv(cls, path, csv_fname='labels.csv', header='infer', delimiter=None, quoting=csv.QUOTE_MINIMAL, **kwargs):
        "Create from `path/csv_fname` using `fn_col` and `label_col`"
        df = pd.read_csv(Path(path)/csv_fname, header=header, delimiter=delimiter, quoting=quoting)
        return cls.from_df(df, path=path, **kwargs)

    @classmethod
    @delegates(DataLoaders.from_dblock)
    def from_lists(cls, path, fnames, labels, valid_pct=0.2, seed:int=None, y_block=None, item_tfms=None, batch_tfms=None,
                   img_cls=PILImage, **kwargs):
        "Create from list of `fnames` and `labels` in `path`"
        if y_block is None:
            # Infer the target block from the first label only: a list-like with more than one
            # element -> multi-category, a float -> regression, anything else -> single category
            y_block = MultiCategoryBlock if is_listy(labels[0]) and len(labels[0]) > 1 else (
                RegressionBlock if isinstance(labels[0], float) else CategoryBlock)
        dblock = DataBlock.from_columns(blocks=(ImageBlock(img_cls), y_block),
                           splitter=RandomSplitter(valid_pct, seed=seed),
                           item_tfms=item_tfms,
                           batch_tfms=batch_tfms)
        return cls.from_dblock(dblock, (fnames, labels), path=path, **kwargs)

# These factory methods only forward `**kwargs` to another factory method; `delegates` is applied
# here, once the class exists, so each `to=` target can be named as `ImageDataLoaders.<method>`.
ImageDataLoaders.from_csv = delegates(to=ImageDataLoaders.from_df)(ImageDataLoaders.from_csv)
ImageDataLoaders.from_name_func = delegates(to=ImageDataLoaders.from_path_func)(ImageDataLoaders.from_name_func)
ImageDataLoaders.from_path_re = delegates(to=ImageDataLoaders.from_path_func)(ImageDataLoaders.from_path_re)
ImageDataLoaders.from_name_re = delegates(to=ImageDataLoaders.from_name_func)(ImageDataLoaders.from_name_re)

这个类不应该直接使用,而应该优先考虑其中一个工厂方法。所有这些工厂方法接受以下作为参数:

  • item_tfms:在将项目分批之前应用于项目的一个或多个变换
  • batch_tfms:在批次形成后应用于批次的一个或多个变换
  • bs:批次大小
  • val_bs:用于验证 DataLoader 的批次大小(默认为 bs)
  • shuffle:是否对训练 DataLoader 进行洗牌
  • device:要使用的 PyTorch 设备(默认为 default_device())
show_doc(ImageDataLoaders.from_folder)

source

ImageDataLoaders.from_folder

 ImageDataLoaders.from_folder (path, train='train', valid='valid',
                               valid_pct=None, seed=None, vocab=None,
                               item_tfms=None, batch_tfms=None,
                               img_cls=<class
                               'fastai.vision.core.PILImage'>, bs:int=64,
                               val_bs:int=None, shuffle:bool=True,
                               device=None)

Create from imagenet style dataset in path with train and valid subfolders (or provide valid_pct)

Type Default Details
path str | Path . Path to put in DataLoaders
train str train
valid str valid
valid_pct NoneType None
seed NoneType None
vocab NoneType None
item_tfms NoneType None
batch_tfms NoneType None
img_cls BypassNewMeta PILImage
bs int 64 Size of batch
val_bs int None Size of batch for validation DataLoader
shuffle bool True Whether to shuffle data
device NoneType None Device to put DataLoaders

如果提供了 valid_pct,则会通过设置一定比例的数据用于验证集进行随机拆分(可以选择设置一个 seed)。如果传递了 vocab,则仅保留名称在 vocab 中的文件夹。

以下是加载 MNIST 子样本的示例:

path = untar_data(URLs.MNIST_TINY)
dls = ImageDataLoaders.from_folder(path, img_cls=PILImageBW)
x,y = dls.one_batch()
test_eq(x.shape, [64, 1, 28, 28])

传递 valid_pct 将忽略有效/训练文件夹并进行新的随机划分:

dls = ImageDataLoaders.from_folder(path, valid_pct=0.2)
dls.valid_ds.items[:3]
[Path('/home/jhoward/.fastai/data/mnist_tiny/train/7/9307.png'),
 Path('/home/jhoward/.fastai/data/mnist_tiny/train/3/8241.png'),
 Path('/home/jhoward/.fastai/data/mnist_tiny/valid/3/8924.png')]
show_doc(ImageDataLoaders.from_path_func)

source

ImageDataLoaders.from_path_func

 ImageDataLoaders.from_path_func (path, fnames, label_func, valid_pct=0.2,
                                  seed=None, item_tfms=None,
                                  batch_tfms=None, img_cls=<class
                                  'fastai.vision.core.PILImage'>,
                                  bs:int=64, val_bs:int=None,
                                  shuffle:bool=True, device=None)

Create from list of fnames in paths with label_func

Type Default Details
path str | Path . Path to put in DataLoaders
fnames
label_func
valid_pct float 0.2
seed NoneType None
item_tfms NoneType None
batch_tfms NoneType None
img_cls BypassNewMeta PILImage
bs int 64 Size of batch
val_bs int None Size of batch for validation DataLoader
shuffle bool True Whether to shuffle data
device NoneType None Device to put DataLoaders

验证集是一个随机的 子集,其比例由 valid_pct 决定,可以选择使用 seed 来确保结果的可重复性。

以下是如何在 MNIST 数据集上使用 label_func 创建与之前示例相同的 DataLoaders:

fnames = get_image_files(path)
def label_func(x): return x.parent.name
dls = ImageDataLoaders.from_path_func(path, fnames, label_func)

这是另一个关于宠物数据集的例子。这里的文件名都在一个“images”文件夹中,名称的格式为class_name_123.jpg。正确标记它们的一种方法是将最后一个_之后的所有内容丢弃:

show_doc(ImageDataLoaders.from_path_re)

source

ImageDataLoaders.from_path_re

 ImageDataLoaders.from_path_re (path, fnames, pat, valid_pct=0.2,
                                seed=None, item_tfms=None,
                                batch_tfms=None, img_cls=<class
                                'fastai.vision.core.PILImage'>, bs:int=64,
                                val_bs:int=None, shuffle:bool=True,
                                device=None)

Create from list of fnames in paths with re expression pat

Type Default Details
path str | Path . Path to put in DataLoaders
fnames
pat
valid_pct float 0.2
seed NoneType None
item_tfms NoneType None
batch_tfms NoneType None
img_cls BypassNewMeta PILImage
bs int 64 Size of batch
val_bs int None Size of batch for validation DataLoader
shuffle bool True Whether to shuffle data
device NoneType None Device to put DataLoaders

验证集是 valid_pct 的一个随机子集,可选地使用 seed 来实现可重复性。

以下是如何在 MNIST 数据集上创建与之前示例相同的 DataLoaders(在 Windows 上,您需要将开头的两个 / 更改为 \):

pat = r'/([^/]*)/\d+.png$'
dls = ImageDataLoaders.from_path_re(path, fnames, pat)
show_doc(ImageDataLoaders.from_name_func)

source

ImageDataLoaders.from_name_func

 ImageDataLoaders.from_name_func (path:str|Path, fnames:list,
                                  label_func:callable, valid_pct=0.2,
                                  seed=None, item_tfms=None,
                                  batch_tfms=None, img_cls=<class
                                  'fastai.vision.core.PILImage'>,
                                  bs:int=64, val_bs:int=None,
                                  shuffle:bool=True, device=None)

Create from the name attrs of fnames in paths with label_func

Type Default Details
path str | Path Set the default path to a directory that a Learner can use to save files like models
fnames list A list of os.Pathlike’s to individual image files
label_func callable A function that receives a string (the file name) and outputs a label
valid_pct float 0.2
seed NoneType None
item_tfms NoneType None
batch_tfms NoneType None
img_cls BypassNewMeta PILImage
bs int 64 Size of batch
val_bs int None Size of batch for validation DataLoader
shuffle bool True Whether to shuffle data
device NoneType None Device to put DataLoaders
Returns DataLoaders

验证集是一个随机子集,大小为 valid_pct,可以选择使用 seed 来确保可重复性。此方法的功能与 ImageDataLoaders.from_path_func 相同,只是 label_func 应用于每个文件名的名称,而不是完整路径。

show_doc(ImageDataLoaders.from_name_re)

source

ImageDataLoaders.from_name_re

 ImageDataLoaders.from_name_re (path, fnames, pat, bs:int=64,
                                val_bs:int=None, shuffle:bool=True,
                                device=None)

Create from the name attrs of fnames in paths with re expression pat

Type Default Details
path str | Path . Path to put in DataLoaders
fnames
pat
bs int 64 Size of batch
val_bs int None Size of batch for validation DataLoader
shuffle bool True Whether to shuffle data
device NoneType None Device to put DataLoaders

验证集是 valid_pct 的随机子集,可以选择使用 seed 来保证可重复性。该方法的功能与 ImageDataLoaders.from_path_re 相同,不同之处在于 pat 应用于每个文件名的名称,而不是完整路径。

show_doc(ImageDataLoaders.from_df)

source

ImageDataLoaders.from_df

 ImageDataLoaders.from_df (df, path='.', valid_pct=0.2, seed=None,
                           fn_col=0, folder=None, suff='', label_col=1,
                           label_delim=None, y_block=None, valid_col=None,
                           item_tfms=None, batch_tfms=None, img_cls=<class
                           'fastai.vision.core.PILImage'>, bs:int=64,
                           val_bs:int=None, shuffle:bool=True,
                           device=None)

Create from df using fn_col and label_col

Type Default Details
df
path str | Path . Path to put in DataLoaders
valid_pct float 0.2
seed NoneType None
fn_col int 0
folder NoneType None
suff str
label_col int 1
label_delim NoneType None
y_block NoneType None
valid_col NoneType None
item_tfms NoneType None
batch_tfms NoneType None
img_cls BypassNewMeta PILImage
bs int 64 Size of batch
val_bs int None Size of batch for validation DataLoader
shuffle bool True Whether to shuffle data
device NoneType None Device to put DataLoaders

验证集是valid_pct的随机子集,可选择性地使用seed创建以实现可重复性。或者,如果您的df中包含valid_col,请将其名称或索引提供给该参数(该列应对进入验证集的元素标记为True)。

如果df中的文件名不应直接连接到path,您可以向文件名添加额外的folder。如果它们不包含正确的扩展名,您可以添加suff。如果您的标签列在每一行中包含多个标签,您可以使用label_delim来提醒库您有一个多标签问题。

当库自动选择的任务错误时,应传入 y_block,此时您应提供 CategoryBlock、MultiCategoryBlock 或 RegressionBlock。对于更高级的用法,您应使用数据块 API。

之前的微型mnist示例也包含了一个数据框版本:

path = untar_data(URLs.MNIST_TINY)
df = pd.read_csv(path/'labels.csv')
df.head()
name label
0 train/3/7463.png 3
1 train/3/9829.png 3
2 train/3/7881.png 3
3 train/3/8065.png 3
4 train/3/7046.png 3

以下是如何使用 ImageDataLoaders.from_df 加载它:

dls = ImageDataLoaders.from_df(df, path)
/home/jhoward/git/fastai/fastai/data/transforms.py:212: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  o = r[c] if isinstance(c, int) or not c in getattr(r, '_fields', []) else getattr(r, c)

这是另一个多标签问题的示例:

path = untar_data(URLs.PASCAL_2007)
df = pd.read_csv(path/'train.csv')
df.head()
100.00% [1637801984/1637796771 03:22<00:00]
fname labels is_valid
0 000005.jpg chair True
1 000007.jpg car True
2 000009.jpg horse person True
3 000012.jpg car False
4 000016.jpg bicycle True
dls = ImageDataLoaders.from_df(df, path, folder='train', valid_col='is_valid')
/home/jhoward/git/fastai/fastai/data/transforms.py:212: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  o = r[c] if isinstance(c, int) or not c in getattr(r, '_fields', []) else getattr(r, c)

注意,您也可以将 2 传递给 valid_col(索引,起始为 0)。

show_doc(ImageDataLoaders.from_csv)

source

ImageDataLoaders.from_csv

 ImageDataLoaders.from_csv (path, csv_fname='labels.csv', header='infer',
                            delimiter=None, valid_pct=0.2, seed=None,
                            fn_col=0, folder=None, suff='', label_col=1,
                            label_delim=None, y_block=None,
                            valid_col=None, item_tfms=None,
                            batch_tfms=None, img_cls=<class
                            'fastai.vision.core.PILImage'>, bs:int=64,
                            val_bs:int=None, shuffle:bool=True,
                            device=None)

Create from path/csv_fname using fn_col and label_col

Type Default Details
path str | Path . Path to put in DataLoaders
csv_fname str labels.csv
header str infer
delimiter NoneType None
valid_pct float 0.2
seed NoneType None
fn_col int 0
folder NoneType None
suff str
label_col int 1
label_delim NoneType None
y_block NoneType None
valid_col NoneType None
item_tfms NoneType None
batch_tfms NoneType None
img_cls BypassNewMeta PILImage
bs int 64 Size of batch
val_bs int None Size of batch for validation DataLoader
shuffle bool True Whether to shuffle data
device NoneType None Device to put DataLoaders

在使用 header 和 delimiter 加载文件之后,其余与 ImageDataLoaders.from_df 相同。

以下是使用此方法加载与之前相同的数据集的方式:

dls = ImageDataLoaders.from_csv(path, 'train.csv', folder='train', valid_col='is_valid')
/home/jhoward/git/fastai/fastai/data/transforms.py:212: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  o = r[c] if isinstance(c, int) or not c in getattr(r, '_fields', []) else getattr(r, c)
show_doc(ImageDataLoaders.from_lists)

source

ImageDataLoaders.from_lists

 ImageDataLoaders.from_lists (path, fnames, labels, valid_pct=0.2,
                              seed:int=None, y_block=None, item_tfms=None,
                              batch_tfms=None, img_cls=<class
                              'fastai.vision.core.PILImage'>, bs:int=64,
                              val_bs:int=None, shuffle:bool=True,
                              device=None)

Create from list of fnames and labels in path

Type Default Details
path str | Path . Path to put in DataLoaders
fnames
labels
valid_pct float 0.2
seed int None
y_block NoneType None
item_tfms NoneType None
batch_tfms NoneType None
img_cls BypassNewMeta PILImage
bs int 64 Size of batch
val_bs int None Size of batch for validation DataLoader
shuffle bool True Whether to shuffle data
device NoneType None Device to put DataLoaders

验证集是一个随机子集,大小为valid_pct,可以选择使用seed进行可重复性创建。可以传入y_block来指定目标的类型。

path = untar_data(URLs.PETS)
fnames = get_image_files(path/"images")
labels = ['_'.join(x.name.split('_')[:-1]) for x in fnames]
dls = ImageDataLoaders.from_lists(path, fnames, labels)
Downloading a new version of this dataset...
89.67% [727859200/811706944 02:12<00:15]
class SegmentationDataLoaders(DataLoaders):
    "Basic wrapper around several `DataLoader`s with factory methods for segmentation problems"
    @classmethod
    @delegates(DataLoaders.from_dblock)
    def from_label_func(cls, path, fnames, label_func, valid_pct=0.2, seed=None, codes=None, item_tfms=None, batch_tfms=None,
                        img_cls=PILImage, **kwargs):
        "Create from list of `fnames` in `path`s with `label_func`."
        # Image inputs paired with mask targets; `label_func` serves as `get_y`
        blocks = (ImageBlock(img_cls), MaskBlock(codes=codes))
        splitter = RandomSplitter(valid_pct, seed=seed)
        dblock = DataBlock(blocks=blocks, splitter=splitter, get_y=label_func,
                           item_tfms=item_tfms, batch_tfms=batch_tfms)
        return cls.from_dblock(dblock, fnames, path=path, **kwargs)
show_doc(SegmentationDataLoaders.from_label_func)

验证集是按 valid_pct 比例随机抽取的子集,可以选择使用 seed 来保证结果可复现。codes 包含从索引到标签的映射。

path = untar_data(URLs.CAMVID_TINY)
fnames = get_image_files(path/'images')
def label_func(x): return path/'labels'/f'{x.stem}_P{x.suffix}'
codes = np.loadtxt(path/'codes.txt', dtype=str)
    
dls = SegmentationDataLoaders.from_label_func(path, fnames, label_func, codes=codes)

导出 -

from nbdev import nbdev_export
nbdev_export()