! [ -e /content ] && pip install -Uqq fastai # upgrade fastai on Colab
Data transforms
from __future__ import annotations
from fastai.torch_basics import *
from fastai.data.core import *
from fastai.data.load import *
from fastai.data.external import *
from sklearn.model_selection import train_test_split
import posixpath
from nbdev.showdoc import *
Functions for getting, splitting, and labeling data, as well as generic transforms.
Get, split, and label
For most data source creation we need functions to get a list of items, split them into train/valid sets, and label them. fastai provides functions to make each of these steps easy (especially when combined with fastai.data.blocks).
Get
First we'll look at functions that get a list of items (generally file names).
We'll use tiny MNIST (a subset of MNIST with just two classes, 7s and 3s) for our examples/tests throughout this page.
path = untar_data(URLs.MNIST_TINY)
(path/'train').ls()
(#2) [Path('/Users/jhoward/.fastai/data/mnist_tiny/train/7'),Path('/Users/jhoward/.fastai/data/mnist_tiny/train/3')]
def _get_files(p, fs, extensions=None):
    p = Path(p)
    res = [p/f for f in fs if not f.startswith('.')
           and ((not extensions) or f'.{f.split(".")[-1].lower()}' in extensions)]
    return res
def get_files(path, extensions=None, recurse=True, folders=None, followlinks=True):
    "Get all the files in `path` with optional `extensions`, optionally with `recurse`, only in `folders`, if specified."
    path = Path(path)
    folders = L(folders)
    extensions = setify(extensions)
    extensions = {e.lower() for e in extensions}
    if recurse:
        res = []
        for i,(p,d,f) in enumerate(os.walk(path, followlinks=followlinks)): # returns (dirpath, dirnames, filenames)
            if len(folders) !=0 and i==0: d[:] = [o for o in d if o in folders]
            else:                         d[:] = [o for o in d if not o.startswith('.')]
            if len(folders) !=0 and i==0 and '.' not in folders: continue
            res += _get_files(p, f, extensions)
    else:
        f = [o.name for o in os.scandir(path) if o.is_file()]
        res = _get_files(path, f, extensions)
    return L(res)
This is the most general way to grab a bunch of file names from disk. If you pass `extensions` (including the `.`) then returned file names are filtered by that list. Only those files directly in `path` are included, unless you pass `recurse`, in which case all child folders are also searched recursively. `folders` is an optional list of directories to limit the search to.
t3 = get_files(path/'train'/'3', extensions='.png', recurse=False)
t7 = get_files(path/'train'/'7', extensions='.png', recurse=False)
t  = get_files(path/'train', extensions='.png', recurse=True)
test_eq(len(t), len(t3)+len(t7))
test_eq(len(get_files(path/'train'/'3', extensions='.jpg', recurse=False)),0)
test_eq(len(t), len(get_files(path, extensions='.png', recurse=True, folders='train')))
t
(#709) [Path('/Users/jhoward/.fastai/data/mnist_tiny/train/7/9243.png'),Path('/Users/jhoward/.fastai/data/mnist_tiny/train/7/9519.png'),Path('/Users/jhoward/.fastai/data/mnist_tiny/train/7/7534.png'),Path('/Users/jhoward/.fastai/data/mnist_tiny/train/7/9082.png'),Path('/Users/jhoward/.fastai/data/mnist_tiny/train/7/8377.png'),Path('/Users/jhoward/.fastai/data/mnist_tiny/train/7/994.png'),Path('/Users/jhoward/.fastai/data/mnist_tiny/train/7/8559.png'),Path('/Users/jhoward/.fastai/data/mnist_tiny/train/7/8217.png'),Path('/Users/jhoward/.fastai/data/mnist_tiny/train/7/8571.png'),Path('/Users/jhoward/.fastai/data/mnist_tiny/train/7/8954.png')...]
test_eq(len(get_files(path/'train'/'3', recurse=False)),346)
test_eq(len(get_files(path, extensions='.png', recurse=True, folders=['train', 'test'])),729)
test_eq(len(get_files(path, extensions='.png', recurse=True, folders='train')),709)
test_eq(len(get_files(path, extensions='.png', recurse=True, folders='training')),0)
It's often useful to be able to create functions with customized behavior. `fastai.data` generally uses functions named as CamelCase verbs ending in `er` to create these functions. `FileGetter` is a simple example of such a function creator.
def FileGetter(suf='', extensions=None, recurse=True, folders=None):
    "Create `get_files` partial function that searches path suffix `suf`, only in `folders`, if specified, and passes along args"
    def _inner(o, extensions=extensions, recurse=recurse, folders=folders):
        return get_files(o/suf, extensions, recurse, folders)
    return _inner
fpng = FileGetter(extensions='.png', recurse=False)
test_eq(len(t7), len(fpng(path/'train'/'7')))
test_eq(len(t), len(fpng(path/'train', recurse=True)))
fpng_r = FileGetter(extensions='.png', recurse=True)
test_eq(len(t), len(fpng_r(path/'train')))
image_extensions = set(k for k,v in mimetypes.types_map.items() if v.startswith('image/'))
def get_image_files(path, recurse=True, folders=None):
    "Get image files in `path` recursively, only in `folders`, if specified."
    return get_files(path, extensions=image_extensions, recurse=recurse, folders=folders)
This is simply `get_files` called with a list of standard image extensions.
test_eq(len(t), len(get_image_files(path, recurse=True, folders='train')))
def ImageGetter(suf='', recurse=True, folders=None):
    "Create `get_image_files` partial that searches suffix `suf` and passes along `kwargs`, only in `folders`, if specified"
    def _inner(o, recurse=recurse, folders=folders): return get_image_files(o/suf, recurse, folders)
    return _inner
Same as `FileGetter`, but for image extensions.
test_eq(len(get_files(path/'train', extensions='.png', recurse=True, folders='3')),
        len(ImageGetter('train', recurse=True, folders='3')(path)))
def get_text_files(path, recurse=True, folders=None):
    "Get text files in `path` recursively, only in `folders`, if specified."
    return get_files(path, extensions=['.txt'], recurse=recurse, folders=folders)
class ItemGetter(ItemTransform):
    "Creates a proper transform that applies `itemgetter(i)` (even on a tuple)"
    _retain = False
    def __init__(self, i): self.i = i
    def encodes(self, x): return x[self.i]
test_eq(ItemGetter(1)((1,2,3)), 2)
test_eq(ItemGetter(1)(L(1,2,3)), 2)
test_eq(ItemGetter(1)([1,2,3]), 2)
test_eq(ItemGetter(1)(np.array([1,2,3])), 2)
class AttrGetter(ItemTransform):
    "Creates a proper transform that applies `attrgetter(nm)` (even on a tuple)"
    _retain = False
    def __init__(self, nm, default=None): store_attr()
    def encodes(self, x): return getattr(x, self.nm, self.default)
test_eq(AttrGetter('shape')(torch.randn([4,5])), [4,5])
test_eq(AttrGetter('shape', [0])([4,5]), [0])
Split
The next set of functions are used to split data into training and validation sets. They return two lists - a list of indices or masks for the training set, and one for the validation set.
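To make that contract concrete, here is a minimal hand-rolled splitter - a sketch added purely for illustration (the function is hypothetical, not part of fastai). Any callable that maps the items to a pair of index lists works as a splitter.

def every_fifth_splitter(items):
    "Hypothetical example splitter: every fifth item goes to the validation set"
    idxs = range_of(items)
    return L(i for i in idxs if i%5!=0), L(i for i in idxs if i%5==0)

test_eq(every_fifth_splitter(range(10)), ([1,2,3,4,6,7,8,9], [0,5]))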
def RandomSplitter(valid_pct=0.2, seed=None):
    "Create function that splits `items` between train/val with `valid_pct` randomly."
    def _inner(o):
        if seed is not None: torch.manual_seed(seed)
        rand_idx = L(list(torch.randperm(len(o)).numpy()))
        cut = int(valid_pct * len(o))
        return rand_idx[cut:],rand_idx[:cut]
    return _inner
def _test_splitter(f, items=None):
    "A basic set of conditions a splitter must pass"
    items = ifnone(items, range_of(30))
    trn,val = f(items)
    assert 0<len(trn)<len(items)
    assert all(o not in val for o in trn)
    test_eq(len(trn), len(items)-len(val))
    # test random seed consistency
    test_eq(f(items)[0], trn)
    return trn, val
_test_splitter(RandomSplitter(seed=42))
((#24) [10,18,16,23,28,26,20,7,21,22...], (#6) [12,0,6,25,8,15])
Use scikit-learn's `train_test_split`. This allows splitting items in a stratified fashion (uniformly according to the 'labels' distribution).
def TrainTestSplitter(test_size=0.2, random_state=None, stratify=None, train_size=None, shuffle=True):
    "Split `items` into random train and test subsets using sklearn train_test_split utility."
    def _inner(o, **kwargs):
        train,valid = train_test_split(range_of(o), test_size=test_size, random_state=random_state,
                                       stratify=stratify, train_size=train_size, shuffle=shuffle)
        return L(train), L(valid)
    return _inner
src = list(range(30))
labels = [0] * 20 + [1] * 10
test_size = 0.2

f = TrainTestSplitter(test_size=test_size, random_state=42, stratify=labels)
trn,val = _test_splitter(f, items=src)

# test labels distribution consistency
# there should be test_size% of zeroes and ones respectively in the validation set
test_eq(len([t for t in val if t < 20]) / 20, test_size)
test_eq(len([t for t in val if t > 20]) / 10, test_size)
def IndexSplitter(valid_idx):
    "Split `items` so that `val_idx` are in the validation set and the others in the training set"
    def _inner(o):
        train_idx = np.setdiff1d(np.array(range_of(o)), np.array(valid_idx))
        return L(train_idx, use_list=True), L(valid_idx, use_list=True)
    return _inner
items = 'a,b,c,d,e,f,g,h,i,j'.split(',') # to make it clear we split by index, not by item
splitter = IndexSplitter([3,7,9])
_test_splitter(splitter, items)
test_eq(splitter(items),[[0,1,2,4,5,6,8],[3,7,9]])
def EndSplitter(valid_pct=0.2, valid_last=True):
    "Create function that splits `items` between train/val with `valid_pct` at the end if `valid_last` else at the start. Useful for ordered data."
    assert 0<valid_pct<1, "valid_pct must be in (0,1)"
    def _inner(o):
        idxs = range_of(o)
        cut = int(valid_pct * len(o))
        return (idxs[:-cut], idxs[-cut:]) if valid_last else (idxs[cut:],idxs[:cut])
    return _inner
items = range_of(10)

splitter_last = EndSplitter(valid_last=True)
_test_splitter(splitter_last)
test_eq(splitter_last(items), ([0,1,2,3,4,5,6,7], [8,9]))

splitter_start = EndSplitter(valid_last=False)
_test_splitter(splitter_start)
test_eq(splitter_start(items), ([2,3,4,5,6,7,8,9], [0,1]))
def _grandparent_idxs(items, name):
    def _inner(items, name): return mask2idxs(Path(o).parent.parent.name == name for o in items)
    return [i for n in L(name) for i in _inner(items,n)]

def GrandparentSplitter(train_name='train', valid_name='valid'):
    "Split `items` from the grand parent folder names (`train_name` and `valid_name`)."
    def _inner(o):
        return _grandparent_idxs(o, train_name),_grandparent_idxs(o, valid_name)
    return _inner
fnames = [path/'train/3/9932.png', path/'valid/7/7189.png',
          path/'valid/7/7320.png', path/'train/7/9833.png',
          path/'train/3/7666.png', path/'valid/3/925.png',
          path/'train/7/724.png', path/'valid/3/93055.png']
splitter = GrandparentSplitter()
_test_splitter(splitter, items=fnames)
test_eq(splitter(fnames),[[0,3,4,6],[1,2,5,7]])
fnames2 = fnames + [path/'test/3/4256.png', path/'test/7/2345.png', path/'valid/7/6467.png']
splitter = GrandparentSplitter(train_name=('train', 'valid'), valid_name='test')
_test_splitter(splitter, items=fnames2)
test_eq(splitter(fnames2),[[0,3,4,6,1,2,5,7,10],[8,9]])
def FuncSplitter(func):
    "Split `items` by result of `func` (`True` for validation, `False` for training set)."
    def _inner(o):
        val_idx = mask2idxs(func(o_) for o_ in o)
        return IndexSplitter(val_idx)(o)
    return _inner
splitter = FuncSplitter(lambda o: Path(o).parent.parent.name == 'valid')
_test_splitter(splitter, fnames)
test_eq(splitter(fnames),[[0,3,4,6],[1,2,5,7]])
def MaskSplitter(mask):
    "Split `items` depending on the value of `mask`."
    def _inner(o): return IndexSplitter(mask2idxs(mask))(o)
    return _inner
items = list(range(6))
splitter = MaskSplitter([True,False,False,True,False,True])
_test_splitter(splitter, items)
test_eq(splitter(items),[[1,2,4],[0,3,5]])
def FileSplitter(fname):
    "Split `items` by providing file `fname` (contains names of valid items separated by newline)."
    valid = Path(fname).read_text().split('\n')
    def _func(x): return x.name in valid
    def _inner(o): return FuncSplitter(_func)(o)
    return _inner
with tempfile.TemporaryDirectory() as d:
    fname = Path(d)/'valid.txt'
    fname.write_text('\n'.join([Path(fnames[i]).name for i in [1,3,4]]))
    splitter = FileSplitter(fname)
    _test_splitter(splitter, fnames)
    test_eq(splitter(fnames),[[0,2,5,6,7],[1,3,4]])
def ColSplitter(col='is_valid', on=None):
    "Split `items` (supposed to be a dataframe) by value in `col`"
    def _inner(o):
        assert isinstance(o, pd.DataFrame), "ColSplitter only works when your items are a pandas DataFrame"
        c = o.iloc[:,col] if isinstance(col, int) else o[col]
        if on is None:     valid_idx = c.values.astype('bool')
        elif is_listy(on): valid_idx = c.isin(on)
        else:              valid_idx = c == on
        return IndexSplitter(mask2idxs(valid_idx))(o)
    return _inner
df = pd.DataFrame({'a': [0,1,2,3,4], 'b': [True,False,True,True,False]})
splits = ColSplitter('b')(df)
test_eq(splits, [[1,4], [0,2,3]])

# works with strings or index
splits = ColSplitter(1)(df)
test_eq(splits, [[1,4], [0,2,3]])

# does not get confused if the type of 'is_valid' is integer, but it is meant to be a yes/no
df = pd.DataFrame({'a': [0,1,2,3,4], 'is_valid': [1,0,1,1,0]})
splits_by_int = ColSplitter('is_valid')(df)
test_eq(splits_by_int, [[1,4], [0,2,3]])

# optionally pass a specific value to split on
df = pd.DataFrame({'a': [0,1,2,3,4,5], 'b': [1,2,3,1,2,3]})
splits_on_val = ColSplitter('b', 3)(df)
test_eq(splits_on_val, [[0,1,3,4], [2,5]])

# or multiple values
splits_on_val = ColSplitter('b', [2,3])(df)
test_eq(splits_on_val, [[0,3], [1,2,4,5]])
def RandomSubsetSplitter(train_sz, valid_sz, seed=None):
    "Take random subsets of `items` with `train_sz` and `valid_sz`"
    assert 0 < train_sz < 1
    assert 0 < valid_sz < 1
    assert train_sz + valid_sz <= 1.

    def _inner(o):
        if seed is not None: torch.manual_seed(seed)
        train_len,valid_len = int(len(o)*train_sz),int(len(o)*valid_sz)
        idxs = L(list(torch.randperm(len(o)).numpy()))
        return idxs[:train_len],idxs[train_len:train_len+valid_len]
    return _inner
items = list(range(100))
valid_idx = list(np.arange(70,100))
splitter = RandomSubsetSplitter(0.3, 0.1)
splits = RandomSubsetSplitter(0.3, 0.1)(items)
test_eq(len(splits[0]), 30)
test_eq(len(splits[1]), 10)
Label
The final set of functions is used to label a single item of data.
def parent_label(o):
    "Label `item` with the parent folder name."
    return Path(o).parent.name
Note that `parent_label` doesn't have anything to customize, so it doesn't return a function - you can just use it directly.
test_eq(parent_label(fnames[0]), '3')
test_eq(parent_label("fastai_dev/dev/data/mnist_tiny/train/3/9932.png"), '3')
[parent_label(o) for o in fnames]
['3', '7', '7', '7', '3', '3', '7', '3']
# test for MS Windows when os.path.sep is '\\' instead of '/'
test_eq(parent_label(os.path.join("fastai_dev","dev","data","mnist_tiny","train", "3", "9932.png")), '3')
class RegexLabeller():
    "Label `item` with regex `pat`."
    def __init__(self, pat, match=False):
        self.pat = re.compile(pat)
        self.matcher = self.pat.match if match else self.pat.search

    def __call__(self, o):
        o = str(o).replace(os.sep, posixpath.sep)
        res = self.matcher(o)
        assert res,f'Failed to find "{self.pat}" in "{o}"'
        return res.group(1)
`RegexLabeller` is a very flexible function since it handles any regex search of the stringified item. Pass `match=True` to use `re.match` (i.e. check only the start of the string), otherwise `re.search` is used (default). Note that the pattern must contain a capture group, since the label is taken from `res.group(1)`.
For instance, here's an example that replicates the previous `parent_label` results.
f = RegexLabeller(fr'{posixpath.sep}(\d){posixpath.sep}')
test_eq(f(fnames[0]), '3')
[f(o) for o in fnames]
['3', '7', '7', '7', '3', '3', '7', '3']
f = RegexLabeller(fr'{posixpath.sep}(\d){posixpath.sep}')
a1 = Path(fnames[0]).as_posix()
test_eq(f(a1), '3')
[f(o) for o in fnames]
['3', '7', '7', '7', '3', '3', '7', '3']
f = RegexLabeller(r'(\d*)', match=True)
test_eq(f(fnames[0].name), '9932')
class ColReader(DisplayedTransform):
    "Read `cols` in `row` with potential `pref` and `suff`"
    def __init__(self, cols, pref='', suff='', label_delim=None):
        store_attr()
        self.pref = str(pref) + os.path.sep if isinstance(pref, Path) else pref
        self.cols = L(cols)

    def _do_one(self, r, c):
        o = r[c] if isinstance(c, int) or not c in getattr(r, '_fields', []) else getattr(r, c)
        if len(self.pref)==0 and len(self.suff)==0 and self.label_delim is None: return o
        if self.label_delim is None: return f'{self.pref}{o}{self.suff}'
        else: return o.split(self.label_delim) if len(o)>0 else []

    def __call__(self, o, **kwargs):
        if len(self.cols) == 1: return self._do_one(o, self.cols[0])
        return L(self._do_one(o, c) for c in self.cols)
`cols` can be a list of column names or a list of indices (or a mix of both). If `label_delim` is passed, the result is split using it.
df = pd.DataFrame({'a': 'a b c d'.split(), 'b': ['1 2', '0', '', '1 2 3']})
f = ColReader('a', pref='0', suff='1')
test_eq([f(o) for o in df.itertuples()], '0a1 0b1 0c1 0d1'.split())

f = ColReader('b', label_delim=' ')
test_eq([f(o) for o in df.itertuples()], [['1', '2'], ['0'], [], ['1', '2', '3']])

df['a1'] = df['a']
f = ColReader(['a', 'a1'], pref='0', suff='1')
test_eq([f(o) for o in df.itertuples()], [L('0a1', '0a1'), L('0b1', '0b1'), L('0c1', '0c1'), L('0d1', '0d1')])
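One more illustration, added here as a sketch (the `df_files` frame and file names are hypothetical): when `pref` is a `Path`, `ColReader` appends `os.path.sep` to it, so the values read from the column become full file paths.

df_files = pd.DataFrame({'fname': ['9932', '7189']})
pref = Path('mnist_tiny/train/3')  # hypothetical image folder
f = ColReader('fname', pref=pref, suff='.png')
test_eq([f(o) for o in df_files.itertuples()],
        [str(pref) + os.path.sep + n + '.png' for n in ['9932','7189']])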
df = pd.DataFrame({'a': [L(0,1), L(2,3,4), L(5,6,7)]})
f = ColReader('a')
test_eq([f(o) for o in df.itertuples()], [L(0,1), L(2,3,4), L(5,6,7)])

df['name'] = df['a']
f = ColReader('name')
test_eq([f(df.iloc[0,:])], [L(0,1)])

df['mask'] = df['a']
f = ColReader('mask')
test_eq([f(o) for o in df.itertuples()], [L(0,1), L(2,3,4), L(5,6,7)])
test_eq([f(df.iloc[0,:])], [L(0,1)])
Categorize -
class CategoryMap(CollBase):
    "Collection of categories with the reverse mapping in `o2i`"
    def __init__(self, col, sort=True, add_na=False, strict=False):
        if hasattr(col, 'dtype') and isinstance(col.dtype, CategoricalDtype):
            items = L(col.cat.categories, use_list=True)
            # remove unused categories while keeping order
            if strict: items = L(o for o in items if o in col.unique())
        else:
            if not hasattr(col,'unique'): col = L(col, use_list=True)
            # `o==o` is the generalized definition of non-NaN used by pandas
            items = L(o for o in col.unique() if o==o)
            if sort: items = items.sorted()
        self.items = '#na#' + items if add_na else items
        self.o2i = defaultdict(int, self.items.val2idx()) if add_na else dict(self.items.val2idx())

    def map_objs(self,objs):
        "Map `objs` to IDs"
        return L(self.o2i[o] for o in objs)

    def map_ids(self,ids):
        "Map `ids` to objects in vocab"
        return L(self.items[o] for o in ids)

    def __eq__(self,b): return all_equal(b,self)
t = CategoryMap([4,2,3,4])
test_eq(t, [2,3,4])
test_eq(t.o2i, {2:0,3:1,4:2})
test_eq(t.map_objs([2,3]), [0,1])
test_eq(t.map_ids([0,1]), [2,3])
test_fail(lambda: t.o2i['unseen label'])
t = CategoryMap([4,2,3,4], add_na=True)
test_eq(t, ['#na#',2,3,4])
test_eq(t.o2i, {'#na#':0,2:1,3:2,4:3})
t = CategoryMap(pd.Series([4,2,3,4]), sort=False)
test_eq(t, [4,2,3])
test_eq(t.o2i, {4:0,2:1,3:2})
col = pd.Series(pd.Categorical(['M','H','L','M'], categories=['H','M','L'], ordered=True))
t = CategoryMap(col)
test_eq(t, ['H','M','L'])
test_eq(t.o2i, {'H':0,'M':1,'L':2})
col = pd.Series(pd.Categorical(['M','H','M'], categories=['H','M','L'], ordered=True))
t = CategoryMap(col, strict=True)
test_eq(t, ['H','M'])
test_eq(t.o2i, {'H':0,'M':1})
class Categorize(DisplayedTransform):
    "Reversible transform of category string to `vocab` id"
    loss_func,order=CrossEntropyLossFlat(),1
    def __init__(self, vocab=None, sort=True, add_na=False):
        if vocab is not None: vocab = CategoryMap(vocab, sort=sort, add_na=add_na)
        store_attr()

    def setups(self, dsets):
        if self.vocab is None and dsets is not None: self.vocab = CategoryMap(dsets, sort=self.sort, add_na=self.add_na)
        self.c = len(self.vocab)

    def encodes(self, o):
        try:
            return TensorCategory(self.vocab.o2i[o])
        except KeyError as e:
            raise KeyError(f"Label '{o}' was not included in the training dataset") from e
    def decodes(self, o): return Category(self.vocab[o])
class Category(str, ShowTitle): _show_args = {'label': 'category'}
cat = Categorize()
tds = Datasets(['cat', 'dog', 'cat'], tfms=[cat])
test_eq(cat.vocab, ['cat', 'dog'])
test_eq(cat('cat'), 0)
test_eq(cat.decode(1), 'dog')
test_stdout(lambda: show_at(tds,2), 'cat')
test_fail(lambda: cat('bird'))
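Since `setups` also stores the number of classes in `c` (which `get_c` relies on later), we can check it here (an added assertion):

test_eq(cat.c, 2)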
cat = Categorize(add_na=True)
tds = Datasets(['cat', 'dog', 'cat'], tfms=[cat])
test_eq(cat.vocab, ['#na#', 'cat', 'dog'])
test_eq(cat('cat'), 1)
test_eq(cat.decode(2), 'dog')
test_stdout(lambda: show_at(tds,2), 'cat')
cat = Categorize(vocab=['dog', 'cat'], sort=False, add_na=True)
tds = Datasets(['cat', 'dog', 'cat'], tfms=[cat])
test_eq(cat.vocab, ['#na#', 'dog', 'cat'])
test_eq(cat('dog'), 1)
test_eq(cat.decode(2), 'cat')
test_stdout(lambda: show_at(tds,2), 'cat')
MultiCategorize -
class MultiCategorize(Categorize):
    "Reversible transform of multi-category strings to `vocab` id"
    loss_func,order=BCEWithLogitsLossFlat(),1
    def __init__(self, vocab=None, add_na=False): super().__init__(vocab=vocab,add_na=add_na,sort=vocab==None)

    def setups(self, dsets):
        if not dsets: return
        if self.vocab is None:
            vals = set()
            for b in dsets: vals = vals.union(set(b))
            self.vocab = CategoryMap(list(vals), add_na=self.add_na)

    def encodes(self, o):
        if not all(elem in self.vocab.o2i.keys() for elem in o):
            diff = [elem for elem in o if elem not in self.vocab.o2i.keys()]
            diff_str = "', '".join(diff)
            raise KeyError(f"Labels '{diff_str}' were not included in the training dataset")
        return TensorMultiCategory([self.vocab.o2i[o_] for o_ in o])
    def decodes(self, o): return MultiCategory([self.vocab[o_] for o_ in o])
class MultiCategory(L):
    def show(self, ctx=None, sep=';', color='black', **kwargs):
        return show_title(sep.join(self.map(str)), ctx=ctx, color=color, **kwargs)
cat = MultiCategorize()
tds = Datasets([['b', 'c'], ['a'], ['a', 'c'], []], tfms=[cat])
test_eq(tds[3][0], TensorMultiCategory([]))
test_eq(cat.vocab, ['a', 'b', 'c'])
test_eq(cat(['a', 'c']), tensor([0,2]))
test_eq(cat([]), tensor([]))
test_eq(cat.decode([1]), ['b'])
test_eq(cat.decode([0,2]), ['a', 'c'])
test_stdout(lambda: show_at(tds,2), 'a;c')

# if vocab supplied, ensure it maintains its order (i.e., it doesn't sort)
cat = MultiCategorize(vocab=['z', 'y', 'x'])
test_eq(cat.vocab, ['z','y','x'])

test_fail(lambda: cat('bird'))
class OneHotEncode(DisplayedTransform):
    "One-hot encodes targets"
    order=2
    def __init__(self, c=None): store_attr()

    def setups(self, dsets):
        if self.c is None: self.c = len(L(getattr(dsets, 'vocab', None)))
        if not self.c: warn("Couldn't infer the number of classes, please pass a value for `c` at init")

    def encodes(self, o): return TensorMultiCategory(one_hot(o, self.c).float())
    def decodes(self, o): return one_hot_decode(o, None)
Works in conjunction with `MultiCategorize`, or on its own if you have one-hot encoded targets (pass a `vocab` for decoding and `do_encode=False` in that case).
_tfm = OneHotEncode(c=3)
test_eq(_tfm([0,2]), tensor([1.,0,1]))
test_eq(_tfm.decode(tensor([0,1,1])), [1,2])
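For reference, here's a quick added look at `one_hot_decode`, the helper that `OneHotEncode.decodes` relies on: it maps a one-hot vector back to indices, or to vocab items when a vocab is passed.

test_eq(one_hot_decode(tensor([1.,0.,1.]), None), [0,2])
test_eq(one_hot_decode(tensor([1.,0.,1.]), ['a','b','c']), ['a','c'])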
tds = Datasets([['b', 'c'], ['a'], ['a', 'c'], []], [[MultiCategorize(), OneHotEncode()]])
test_eq(tds[1], [tensor([1.,0,0])])
test_eq(tds[3], [tensor([0.,0,0])])
test_eq(tds.decode([tensor([False, True, True])]), [['b','c']])
test_eq(type(tds[1][0]), TensorMultiCategory)
test_stdout(lambda: show_at(tds,2), 'a;c')
# test with passing the vocab
tds = Datasets([['b', 'c'], ['a'], ['a', 'c'], []], [[MultiCategorize(vocab=['a', 'b', 'c']), OneHotEncode()]])
test_eq(tds[1], [tensor([1.,0,0])])
test_eq(tds[3], [tensor([0.,0,0])])
test_eq(tds.decode([tensor([False, True, True])]), [['b','c']])
test_eq(type(tds[1][0]), TensorMultiCategory)
test_stdout(lambda: show_at(tds,2), 'a;c')
class EncodedMultiCategorize(Categorize):
    "Transform of one-hot encoded multi-category that decodes with `vocab`"
    loss_func,order=BCEWithLogitsLossFlat(),1
    def __init__(self, vocab):
        super().__init__(vocab, sort=vocab==None)
        self.c = len(vocab)
    def encodes(self, o): return TensorMultiCategory(tensor(o).float())
    def decodes(self, o): return MultiCategory(one_hot_decode(o, self.vocab))
_tfm = EncodedMultiCategorize(vocab=['a', 'b', 'c'])
test_eq(_tfm([1,0,1]), tensor([1., 0., 1.]))
test_eq(type(_tfm([1,0,1])), TensorMultiCategory)
test_eq(_tfm.decode(tensor([False, True, True])), ['b','c'])

_tfm2 = EncodedMultiCategorize(vocab=['c', 'b', 'a'])
test_eq(_tfm2.vocab, ['c', 'b', 'a'])
class RegressionSetup(DisplayedTransform):
    "Transform that floatifies targets"
    loss_func=MSELossFlat()
    def __init__(self, c=None): store_attr()

    def encodes(self, o): return tensor(o).float()
    def decodes(self, o): return TitledFloat(o) if o.ndim==0 else TitledTuple(o_.item() for o_ in o)
    def setups(self, dsets):
        if self.c is not None: return
        try: self.c = len(dsets[0]) if hasattr(dsets[0], '__len__') else 1
        except: self.c = 0
_tfm = RegressionSetup()
dsets = Datasets([0, 1, 2], RegressionSetup)
test_eq(dsets.c, 1)
test_eq_type(dsets[0], (tensor(0.),))

dsets = Datasets([[0, 1, 2], [3,4,5]], RegressionSetup)
test_eq(dsets.c, 3)
test_eq_type(dsets[0], (tensor([0.,1.,2.]),))
def get_c(dls):
    "Number of outputs in `dls`: checks `dls.c`, then the item/batch transforms, then falls back to the vocab length"
    if getattr(dls, 'c', False): return dls.c
    if nested_attr(dls, 'train.after_item.c', False): return dls.train.after_item.c
    if nested_attr(dls, 'train.after_batch.c', False): return dls.train.after_batch.c
    vocab = getattr(dls, 'vocab', [])
    if len(vocab) > 0 and is_listy(vocab[-1]): vocab = vocab[-1]
    return len(vocab)
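A minimal sketch (added) of that fallback order, using hypothetical `SimpleNamespace` stand-ins rather than real `DataLoaders`:

from types import SimpleNamespace

dls1 = SimpleNamespace(c=10)                 # found directly as `dls.c`
dls2 = SimpleNamespace(c=0, vocab=['3','7']) # falsy `c`, falls through to `len(vocab)`
test_eq(get_c(dls1), 10)
test_eq(get_c(dls2), 2)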
End-to-end dataset example with MNIST
Let's show how to use those functions to grab the MNIST dataset in a `Datasets`. First we grab all the images.
path = untar_data(URLs.MNIST_TINY)
items = get_image_files(path)
Then we split between train and validation depending on the folder.
splitter = GrandparentSplitter()
splits = splitter(items)
train,valid = (items[i] for i in splits)
train[:3],valid[:3]
((#3) [Path('/Users/jhoward/.fastai/data/mnist_tiny/train/7/9243.png'),Path('/Users/jhoward/.fastai/data/mnist_tiny/train/7/9519.png'),Path('/Users/jhoward/.fastai/data/mnist_tiny/train/7/7534.png')],
(#3) [Path('/Users/jhoward/.fastai/data/mnist_tiny/valid/7/9294.png'),Path('/Users/jhoward/.fastai/data/mnist_tiny/valid/7/9257.png'),Path('/Users/jhoward/.fastai/data/mnist_tiny/valid/7/8175.png')])
Our inputs are images that we open and convert to tensors, and our targets are categories labeled from the parent directory.
from PIL import Image

def open_img(fn:Path): return Image.open(fn).copy()
def img2tensor(im:Image.Image): return TensorImage(array(im)[None])

tfms = [[open_img, img2tensor],
        [parent_label, Categorize()]]
train_ds = Datasets(train, tfms)
x,y = train_ds[3]
xd,yd = decode_at(train_ds,3)
test_eq(parent_label(train[3]),yd)
test_eq(array(Image.open(train[3])),xd[0].numpy())
ax = show_at(train_ds, 3, cmap="Greys", figsize=(1,1))
assert ax.title.get_text() in ('3','7')
test_fig_exists(ax)
ToTensor -
class ToTensor(Transform):
    "Convert item to appropriate tensor class"
    order = 5
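`ToTensor` itself only sets an `order`; `encodes` implementations for specific types are registered elsewhere (fastai's vision module does this for PIL images) via fastcore's type dispatch. A minimal sketch of that registration pattern, using a hypothetical `MyFloat` item type:

class MyFloat(float): pass  # hypothetical item type, for illustration only

@ToTensor
def encodes(self, o:MyFloat): return tensor(float(o))

test_eq(ToTensor()(MyFloat(1.5)), tensor(1.5))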
IntToFloatTensor -
class IntToFloatTensor(DisplayedTransform):
    "Transform image to float tensor, optionally dividing by 255 (e.g. for images)."
    order = 10 # Need to run after PIL transforms on the GPU
    def __init__(self, div=255., div_mask=1): store_attr()
    def encodes(self, o:TensorImage): return o.float().div_(self.div)
    def encodes(self, o:TensorMask ): return (o.long() / self.div_mask).long()
    def decodes(self, o:TensorImage): return ((o.clamp(0., 1.) * self.div).long()) if self.div else o
t = (TensorImage(tensor(1)),tensor(2).long(),TensorMask(tensor(3)))
tfm = IntToFloatTensor()
ft = tfm(t)
test_eq(ft, [1./255, 2, 3])
test_eq(type(ft[0]), TensorImage)
test_eq(type(ft[2]), TensorMask)
test_eq(ft[0].type(),'torch.FloatTensor')
test_eq(ft[1].type(),'torch.LongTensor')
test_eq(ft[2].type(),'torch.LongTensor')
Normalize -
def broadcast_vec(dim, ndim, *t, cuda=True):
    "Make a vector broadcastable over `dim` (out of `ndim` total) by prepending and appending unit axes"
    v = [1]*ndim
    v[dim] = -1
    f = to_device if cuda else noop
    return [f(tensor(o).view(*v)) for o in t]
@docs
class Normalize(DisplayedTransform):
    "Normalize/denorm batch of `TensorImage`"
    parameters,order = L('mean', 'std'),99
    def __init__(self, mean=None, std=None, axes=(0,2,3)): store_attr()

    @classmethod
    def from_stats(cls, mean, std, dim=1, ndim=4, cuda=True): return cls(*broadcast_vec(dim, ndim, mean, std, cuda=cuda))

    def setups(self, dl:DataLoader):
        if self.mean is None or self.std is None:
            x,*_ = dl.one_batch()
            self.mean,self.std = x.mean(self.axes, keepdim=True),x.std(self.axes, keepdim=True)+1e-7

    def encodes(self, x:TensorImage): return (x-self.mean) / self.std
    def decodes(self, x:TensorImage):
        f = to_cpu if x.device.type=='cpu' else noop
        return (x*f(self.std) + f(self.mean))

    _docs=dict(encodes="Normalize batch", decodes="Denormalize batch")
mean,std = [0.5]*3,[0.5]*3
mean,std = broadcast_vec(1, 4, mean, std)
batch_tfms = [IntToFloatTensor(), Normalize.from_stats(mean,std)]
tdl = TfmdDL(train_ds, after_batch=batch_tfms, bs=4, device=default_device())
x,y = tdl.one_batch()
xd,yd = tdl.decode((x,y))

assert x.type().endswith('.FloatTensor')
test_eq(xd.type(), 'torch.LongTensor')
test_eq(type(x), TensorImage)
test_eq(type(y), TensorCategory)
assert x.mean()<0.0
assert x.std()>0.3
assert 0<xd.float().mean()/255.<1
assert 0<xd.float().std()/255.<0.7
nrm = Normalize()
batch_tfms = [IntToFloatTensor(), nrm]
tdl = TfmdDL(train_ds, after_batch=batch_tfms, bs=4)
x,y = tdl.one_batch()
test_close(x.mean(), 0.0, 1e-4)
assert x.std()>0.9, x.std()
# Just for visuals
from fastai.vision.core import *

tdl.show_batch((x,y))

x,y = cast(x,Tensor),cast(y,Tensor) # Lose the tensors' types (to emulate predictions)
test_ne(type(x), TensorImage)
tdl.show_batch((x,y), figsize=(1,1)) # Check that the types are put back by the dl
Export -
from nbdev import nbdev_export
nbdev_export()