! [ -e /content ] && pip install -Uqq fastai # Upgrade fastai on Colab
Tabular core
from __future__ import annotations
from fastai.torch_basics import *
from fastai.data.all import *
from nbdev.showdoc import *
pd.set_option('mode.chained_assignment','raise')
Basic functions to preprocess tabular data before assembling it in a `DataLoaders`.
Initial preprocessing
def make_date(df, date_field):
    "Make sure `df[date_field]` is of the right date type."
    field_dtype = df[date_field].dtype
    if isinstance(field_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
        field_dtype = np.datetime64
    if not np.issubdtype(field_dtype, np.datetime64):
        df[date_field] = pd.to_datetime(df[date_field], infer_datetime_format=True)
df = pd.DataFrame({'date': ['2019-12-04', '2019-11-29', '2019-11-15', '2019-10-24']})
make_date(df, 'date')
test_eq(df['date'].dtype, np.dtype('datetime64[ns]'))
def add_datepart(df, field_name, prefix=None, drop=True, time=False):
    "Helper function that adds columns relevant to a date in the column `field_name` of `df`."
    make_date(df, field_name)
    field = df[field_name]
    prefix = ifnone(prefix, re.sub('[Dd]ate$', '', field_name))
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear', 'Is_month_end', 'Is_month_start',
            'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time: attr = attr + ['Hour', 'Minute', 'Second']
    # Pandas removed `dt.week` in v1.1.10
    week = field.dt.isocalendar().week.astype(field.dt.day.dtype) if hasattr(field.dt, 'isocalendar') else field.dt.week
    for n in attr: df[prefix + n] = getattr(field.dt, n.lower()) if n != 'Week' else week
    mask = ~field.isna()
    df[prefix + 'Elapsed'] = np.where(mask, field.values.astype(np.int64) // 10 ** 9, np.nan)
    if drop: df.drop(field_name, axis=1, inplace=True)
    return df
For example, if we have a series of dates, we can generate features such as `Year`, `Month`, `Day`, `Dayofweek`, `Is_month_start`, etc., as shown below:
df = pd.DataFrame({'date': ['2019-12-04', None, '2019-11-15', '2019-10-24']})
df = add_datepart(df, 'date')
df.head()
Year | Month | Week | Day | Dayofweek | Dayofyear | Is_month_end | Is_month_start | Is_quarter_end | Is_quarter_start | Is_year_end | Is_year_start | Elapsed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019.0 | 12.0 | 49.0 | 4.0 | 2.0 | 338.0 | False | False | False | False | False | False | 1.575418e+09 |
1 | NaN | NaN | NaN | NaN | NaN | NaN | False | False | False | False | False | False | NaN |
2 | 2019.0 | 11.0 | 46.0 | 15.0 | 4.0 | 319.0 | False | False | False | False | False | False | 1.573776e+09 |
3 | 2019.0 | 10.0 | 43.0 | 24.0 | 3.0 | 297.0 | False | False | False | False | False | False | 1.571875e+09 |
test_eq(df.columns, ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear', 'Is_month_end', 'Is_month_start',
                     'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start', 'Elapsed'])
test_eq(df[df.Elapsed.isna()].shape, (1, 13))

# Test that week dtype is consistent with other datepart fields
test_eq(df['Year'].dtype, df['Week'].dtype)

test_eq(pd.api.types.is_numeric_dtype(df['Elapsed']), True)
df = pd.DataFrame({'f1': [1.],'f2': [2.],'f3': [3.],'f4': [4.],'date':['2019-12-04']})
df = add_datepart(df, 'date')
df.head()
f1 | f2 | f3 | f4 | Year | Month | Week | Day | Dayofweek | Dayofyear | Is_month_end | Is_month_start | Is_quarter_end | Is_quarter_start | Is_year_end | Is_year_start | Elapsed | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 2.0 | 3.0 | 4.0 | 2019 | 12 | 49 | 4 | 2 | 338 | False | False | False | False | False | False | 1.575418e+09 |
# Test order of columns when date isn't in first position
test_eq(df.columns, ['f1', 'f2', 'f3', 'f4', 'Year', 'Month', 'Week', 'Day',
                     'Dayofweek', 'Dayofyear', 'Is_month_end', 'Is_month_start',
                     'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start', 'Elapsed'])

# Test that week dtype is consistent with other datepart fields
test_eq(df['Year'].dtype, df['Week'].dtype)
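`add_datepart` can also expand timestamps: passing `time=True` appends `Hour`, `Minute`, and `Second` columns on top of the date parts. A minimal sketch (the timestamp below is made up for illustration):

df = pd.DataFrame({'date': ['2019-12-04 10:30:05']})
df = add_datepart(df, 'date', time=True)
assert {'Hour', 'Minute', 'Second'}.issubset(df.columns)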
def _get_elapsed(df, field_names, date_field, base_field, prefix):
    for f in field_names:
        day1 = np.timedelta64(1, 'D')
        last_date,last_base,res = np.datetime64(),None,[]
        for b,v,d in zip(df[base_field].values, df[f].values, df[date_field].values):
            if last_base is None or b != last_base:
                last_date,last_base = np.datetime64(),b
            if v: last_date = d
            res.append(((d-last_date).astype('timedelta64[D]') / day1))
        df[prefix + f] = res
    return df
def add_elapsed_times(df, field_names, date_field, base_field):
    "Add in `df` for each event in `field_names` the elapsed time according to `date_field` grouped by `base_field`"
    field_names = list(L(field_names))
    # Make sure date_field is a date and base_field a boolean
    df[field_names] = df[field_names].astype('bool')
    make_date(df, date_field)

    work_df = df[field_names + [date_field, base_field]]
    work_df = work_df.sort_values([base_field, date_field])
    work_df = _get_elapsed(work_df, field_names, date_field, base_field, 'After')
    work_df = work_df.sort_values([base_field, date_field], ascending=[True, False])
    work_df = _get_elapsed(work_df, field_names, date_field, base_field, 'Before')

    for a in ['After' + f for f in field_names] + ['Before' + f for f in field_names]:
        work_df[a] = work_df[a].fillna(0).astype(int)

    for a,s in zip([True, False], ['_bw', '_fw']):
        work_df = work_df.set_index(date_field)
        tmp = (work_df[[base_field] + field_names].sort_index(ascending=a)
               .groupby(base_field).rolling(7, min_periods=1).sum())
        if base_field in tmp: tmp.drop(base_field, axis=1, inplace=True)
        tmp.reset_index(inplace=True)
        work_df.reset_index(inplace=True)
        work_df = work_df.merge(tmp, 'left', [date_field, base_field], suffixes=['', s])
    work_df.drop(field_names, axis=1, inplace=True)
    return df.merge(work_df, 'left', [date_field, base_field])
df = pd.DataFrame({'date': ['2019-12-04', '2019-11-29', '2019-11-15', '2019-10-24'],
                   'event': [False, True, False, True], 'base': [1,1,2,2]})
df = add_elapsed_times(df, ['event'], 'date', 'base')
df.head()
date | event | base | Afterevent | Beforeevent | event_bw | event_fw | |
---|---|---|---|---|---|---|---|
0 | 2019-12-04 | False | 1 | 5 | 0 | 1.0 | 0.0 |
1 | 2019-11-29 | True | 1 | 0 | 0 | 1.0 | 1.0 |
2 | 2019-11-15 | False | 2 | 22 | 0 | 1.0 | 0.0 |
3 | 2019-10-24 | True | 2 | 0 | 0 | 1.0 | 1.0 |
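To read the output above: `Afterevent` is the number of days since the most recent `event` within the same `base` group, `Beforeevent` the number of days until the next one, and `event_bw`/`event_fw` are 7-day rolling sums of the event flag in the backward and forward directions. A quick hand-check of row 0, as a sketch:

# Within base 1, the last event before 2019-12-04 was on 2019-11-29,
# 5 days earlier -- matching Afterevent == 5 in row 0
delta = pd.to_datetime('2019-12-04') - pd.to_datetime('2019-11-29')
assert delta.days == 5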
def cont_cat_split(df, max_card=20, dep_var=None):
    "Helper function that returns column names of cont and cat variables from given `df`."
    cont_names, cat_names = [], []
    for label in df:
        if label in L(dep_var): continue
        if ((pd.api.types.is_integer_dtype(df[label].dtype) and
             df[label].unique().shape[0] > max_card) or
             pd.api.types.is_float_dtype(df[label].dtype)):
            cont_names.append(label)
        else: cat_names.append(label)
    return cont_names, cat_names
This function determines whether a column is continuous or categorical based on the cardinality of its values. If that cardinality exceeds the `max_card` parameter (or the column is a `float` dtype), it will be added to `cont_names`; otherwise it goes in `cat_names`. An example below:
# Example with simple numpy types
df = pd.DataFrame({'cat1': [1, 2, 3, 4], 'cont1': [1., 2., 3., 2.], 'cat2': ['a', 'b', 'b', 'a'],
                   'i8': pd.Series([1, 2, 3, 4], dtype='int8'),
                   'u8': pd.Series([1, 2, 3, 4], dtype='uint8'),
                   'f16': pd.Series([1, 2, 3, 4], dtype='float16'),
                   'y1': [1, 0, 1, 0], 'y2': [2, 1, 1, 0]})

cont_names, cat_names = cont_cat_split(df)
print(f'cont_names: {cont_names}\ncat_names: {cat_names}')

cont_names: ['cont1', 'f16']
cat_names: ['cat1', 'cat2', 'i8', 'u8', 'y1', 'y2']
# Test all columns
cont, cat = cont_cat_split(df)
test_eq((cont, cat), (['cont1', 'f16'], ['cat1', 'cat2', 'i8', 'u8', 'y1', 'y2']))
# Test exclusion of dependent variable
cont, cat = cont_cat_split(df, dep_var='y1')
test_eq((cont, cat), (['cont1', 'f16'], ['cat1', 'cat2', 'i8', 'u8', 'y2']))
# Test exclusion with multi-label dependent variables
cont, cat = cont_cat_split(df, dep_var=['y1', 'y2'])
test_eq((cont, cat), (['cont1', 'f16'], ['cat1', 'cat2', 'i8', 'u8']))
# Test maximal cardinality bound for int variables
cont, cat = cont_cat_split(df, max_card=3)
test_eq((cont, cat), (['cat1', 'cont1', 'i8', 'u8', 'f16'], ['cat2', 'y1', 'y2']))
cont, cat = cont_cat_split(df, max_card=2)
test_eq((cont, cat), (['cat1', 'cont1', 'i8', 'u8', 'f16', 'y2'], ['cat2', 'y1']))
cont, cat = cont_cat_split(df, max_card=1)
test_eq((cont, cat), (['cat1', 'cont1', 'i8', 'u8', 'f16', 'y1', 'y2'], ['cat2']))
# Example with pandas types and generated columns
df = pd.DataFrame({'cat1': pd.Series(['l','xs','xl','s'], dtype='category'),
                   'ui32': pd.Series([1, 2, 3, 4], dtype='UInt32'),
                   'i64': pd.Series([1, 2, 3, 4], dtype='Int64'),
                   'f16': pd.Series([1, 2, 3, 4], dtype='Float64'),
                   'd1_date': ['2021-02-09', None, '2020-05-12', '2020-08-14'],
                  })
df = add_datepart(df, 'd1_date', drop=False)
df['cat1'] = df['cat1'].cat.set_categories(['xl','l','m','s','xs'], ordered=True)
cont_names, cat_names = cont_cat_split(df, max_card=0)
print(f'cont_names: {cont_names}\ncat_names: {cat_names}')
cont_names: ['ui32', 'i64', 'f16', 'd1_Year', 'd1_Month', 'd1_Week', 'd1_Day', 'd1_Dayofweek', 'd1_Dayofyear', 'd1_Elapsed']
cat_names: ['cat1', 'd1_date', 'd1_Is_month_end', 'd1_Is_month_start', 'd1_Is_quarter_end', 'd1_Is_quarter_start', 'd1_Is_year_end', 'd1_Is_year_start']
cont, cat = cont_cat_split(df, max_card=0)

test_eq((cont, cat), (
    ['ui32', 'i64', 'f16', 'd1_Year', 'd1_Month', 'd1_Week', 'd1_Day', 'd1_Dayofweek', 'd1_Dayofyear', 'd1_Elapsed'],
    ['cat1', 'd1_date', 'd1_Is_month_end', 'd1_Is_month_start', 'd1_Is_quarter_end', 'd1_Is_quarter_start', 'd1_Is_year_end', 'd1_Is_year_start']
))
def df_shrink_dtypes(df, skip=[], obj2cat=True, int2uint=False):
    "Return any possible smaller data types for DataFrame columns. Allows `object`->`category`, `int`->`uint`, and exclusion."
    # 1: Build column filter and typemap
    excl_types, skip = {'category','datetime64[ns]','bool'}, set(skip)

    typemap = {'int'   : [(np.dtype(x), np.iinfo(x).min, np.iinfo(x).max) for x in (np.int8, np.int16, np.int32, np.int64)],
               'uint'  : [(np.dtype(x), np.iinfo(x).min, np.iinfo(x).max) for x in (np.uint8, np.uint16, np.uint32, np.uint64)],
               'float' : [(np.dtype(x), np.finfo(x).min, np.finfo(x).max) for x in (np.float32, np.float64, np.longdouble)]
              }
    if obj2cat: typemap['object'] = 'category'  # User wants to categorify dtype('Object'), which may not always save space
    else: excl_types.add('object')

    new_dtypes = {}
    exclude = lambda dt: dt[1].name not in excl_types and dt[0] not in skip

    for c, old_t in filter(exclude, df.dtypes.items()):
        t = next((v for k,v in typemap.items() if old_t.name.startswith(k)), None)

        if isinstance(t, list):  # Find the smallest type that fits
            if int2uint and t==typemap['int'] and df[c].min() >= 0: t=typemap['uint']
            new_t = next((r[0] for r in t if r[1]<=df[c].min() and r[2]>=df[c].max()), None)
            if new_t and new_t == old_t: new_t = None
        else: new_t = t if isinstance(t, str) else None

        if new_t: new_dtypes[c] = new_t
    return new_dtypes
show_doc(df_shrink_dtypes, title_level=3)

df_shrink_dtypes [source]

df_shrink_dtypes(df, skip=[], obj2cat=True, int2uint=False)

Return any possible smaller data types for DataFrame columns. Allows `object`->`category`, `int`->`uint`, and exclusion.
For example, let's create a sample `DataFrame` with `int`, `float`, `bool`, and `object` datatypes:
df = pd.DataFrame({'i': [-100, 0, 100], 'f': [-100.0, 0.0, 100.0], 'e': [True, False, True],
                   'date':['2019-12-04','2019-11-29','2019-11-15',]})
df.dtypes
i int64
f float64
e bool
date object
dtype: object
We can call `df_shrink_dtypes` to find the smallest possible datatypes that can support the data:
dt = df_shrink_dtypes(df)
dt
{'i': dtype('int8'), 'f': dtype('float32'), 'date': 'category'}
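Note that `df_shrink_dtypes` only computes the mapping; nothing is cast yet. The returned dict can be handed straight to `DataFrame.astype`, which is exactly what `df_shrink` (defined below) does. A sketch, with a hypothetical `df_small`:

# Apply the suggested dtypes without mutating the original frame
df_small = df.astype(dt)
assert df_small['i'].dtype == 'int8'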
test_eq(df['i'].dtype, 'int64')
test_eq(dt['i'], 'int8')

test_eq(df['f'].dtype, 'float64')
test_eq(dt['f'], 'float32')

# Default ignores 'object' and 'boolean' columns
test_eq(df['date'].dtype, 'object')
test_eq(dt['date'], 'category')

# Test categorifying 'object' type
dt2 = df_shrink_dtypes(df, obj2cat=False)
test_eq('date' not in dt2, True)
def df_shrink(df, skip=[], obj2cat=True, int2uint=False):
    "Reduce DataFrame memory usage, by casting to smaller types returned by `df_shrink_dtypes()`."
    dt = df_shrink_dtypes(df, skip, obj2cat=obj2cat, int2uint=int2uint)
    return df.astype(dt)
show_doc(df_shrink, title_level=3)

df_shrink [source]

df_shrink(df, skip=[], obj2cat=True, int2uint=False)

Reduce DataFrame memory usage, by casting to smaller types returned by `df_shrink_dtypes()`.
`df_shrink(df)` attempts to reduce a DataFrame's memory usage by casting its numerical columns to the smallest possible datatypes. In addition:

- `boolean`, `category`, and `datetime64[ns]` dtype columns are ignored.
- 'object' type columns are converted to categoricals, which can save a lot of memory on large datasets. This can be disabled with `obj2cat=False`.
- With `int2uint=True`, `int` types are converted to `uint` types if all data in the column is >= 0.
- Columns can be excluded by name using `skip=['col1','col2']`.

To get the new column data types without actually casting the DataFrame, use `df_shrink_dtypes()` with all the same parameters as `df_shrink()`.
df = pd.DataFrame({'i': [-100, 0, 100], 'f': [-100.0, 0.0, 100.0], 'u':[0, 10, 254],
                   'date':['2019-12-04','2019-11-29','2019-11-15']})
df2 = df_shrink(df, skip=['date'])
Let's compare the two:
df.dtypes
i int64
f float64
u int64
date object
dtype: object
df2.dtypes
i int8
f float32
u int16
date object
dtype: object
We can see that the datatypes changed; going further, we can compare their relative memory usage:
print(f'Initial Dataframe: {df.memory_usage().sum()} bytes')
print(f'Reduced Dataframe: {df2.memory_usage().sum()} bytes')
Initial Dataframe: 224 bytes
Reduced Dataframe: 173 bytes
test_eq(df['i'].dtype=='int64' and df2['i'].dtype=='int8', True)
test_eq(df['f'].dtype=='float64' and df2['f'].dtype=='float32', True)
test_eq(df['u'].dtype=='int64' and df2['u'].dtype=='int16', True)
test_eq(df2['date'].dtype, 'object')

test_eq(df2.memory_usage().sum() < df.memory_usage().sum(), True)
# Test int => uint (when the column's min value >= 0)
df3 = df_shrink(df, int2uint=True)
test_eq(df3['u'].dtype, 'uint8')  # int64 becomes uint8 rather than int16
# Test excluding columns
df4 = df_shrink(df, skip=['i','u'])
test_eq(df['i'].dtype, df4['i'].dtype)
test_eq(df4['u'].dtype, 'int64')
Here is another example, using the `ADULT_SAMPLE` dataset:

path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
new_df = df_shrink(df, int2uint=True)
print(f'Initial Dataframe: {df.memory_usage().sum() / 1000000} megabytes')
print(f'Reduced Dataframe: {new_df.memory_usage().sum() / 1000000} megabytes')
Initial Dataframe: 3.907448 megabytes
Reduced Dataframe: 0.818329 megabytes
We reduced the overall memory usage by 79%!
Tabular -
class _TabIloc:
    "Get/set rows by iloc and cols by name"
    def __init__(self,to): self.to = to
    def __getitem__(self, idxs):
        df = self.to.items
        if isinstance(idxs,tuple):
            rows,cols = idxs
            cols = df.columns.isin(cols) if is_listy(cols) else df.columns.get_loc(cols)
        else: rows,cols = idxs,slice(None)
        return self.to.new(df.iloc[rows, cols])
class Tabular(CollBase, GetAttr, FilteredBase):
    "A `DataFrame` wrapper that knows which cols are cont/cat/y, and returns rows in `__getitem__`"
    _default,with_cont='procs',True
    def __init__(self, df, procs=None, cat_names=None, cont_names=None, y_names=None, y_block=None, splits=None,
                 do_setup=True, device=None, inplace=False, reduce_memory=True):
        if inplace and splits is not None and pd.options.mode.chained_assignment is not None:
            warn("Using inplace with splits will trigger a pandas error. Set `pd.options.mode.chained_assignment=None` to avoid it.")
        if not inplace: df = df.copy()
        if reduce_memory: df = df_shrink(df)
        if splits is not None: df = df.iloc[sum(splits, [])]
        self.dataloaders = delegates(self._dl_type.__init__)(self.dataloaders)
        super().__init__(df)

        self.y_names,self.device = L(y_names),device
        if y_block is None and self.y_names:
            # Make ys categorical if they're not numeric
            ys = df[self.y_names]
            if len(ys.select_dtypes(include='number').columns)!=len(ys.columns): y_block = CategoryBlock()
            else: y_block = RegressionBlock()
        if y_block is not None and do_setup:
            if callable(y_block): y_block = y_block()
            procs = L(procs) + y_block.type_tfms
        self.cat_names,self.cont_names,self.procs = L(cat_names),L(cont_names),Pipeline(procs)
        self.split = len(df) if splits is None else len(splits[0])
        if do_setup: self.setup()

    def new(self, df, inplace=False):
        return type(self)(df, do_setup=False, reduce_memory=False, y_block=TransformBlock(), inplace=inplace,
                          **attrdict(self, 'procs','cat_names','cont_names','y_names', 'device'))

    def subset(self, i): return self.new(self.items[slice(0,self.split) if i==0 else slice(self.split,len(self))])
    def copy(self): self.items = self.items.copy(); return self
    def decode(self): return self.procs.decode(self)
    def decode_row(self, row): return self.new(pd.DataFrame(row).T).decode().items.iloc[0]
    def show(self, max_n=10, **kwargs): display_df(self.new(self.all_cols[:max_n]).decode().items)
    def setup(self): self.procs.setup(self)
    def process(self): self.procs(self)
    def loc(self): return self.items.loc
    def iloc(self): return _TabIloc(self)
    def targ(self): return self.items[self.y_names]
    def x_names (self): return self.cat_names + self.cont_names
    def n_subsets(self): return 2
    def y(self): return self[self.y_names[0]]
    def new_empty(self): return self.new(pd.DataFrame({}, columns=self.items.columns))
    def to_device(self, d=None):
        self.device = d
        return self

    def all_col_names (self):
        ys = [n for n in self.y_names if n in self.items.columns]
        return self.x_names + self.y_names if len(ys) == len(self.y_names) else self.x_names

properties(Tabular,'loc','iloc','targ','all_col_names','n_subsets','x_names','y')
- `df`: A `DataFrame` of your data
- `cat_names`: Your categorical `x` variables
- `cont_names`: Your continuous `x` variables
- `y_names`: Your dependent `y` variables
  - Note: mixed y's such as regression and classification are not currently supported, however multiple regression or classification outputs are
- `y_block`: How to sub-categorize the type of `y_names` (`CategoryBlock` or `RegressionBlock`)
- `splits`: How to split your data
- `do_setup`: A parameter for if `Tabular` will run the data through the `procs` upon initialization
- `device`: `cuda` or `cpu`
- `inplace`: If `True`, `Tabular` will not keep a separate copy of your original `DataFrame` in memory. You should ensure `pd.options.mode.chained_assignment` is `None` before setting this
- `reduce_memory`: `fastai` will attempt to reduce the overall memory usage of the input `DataFrame` with `df_shrink`
class TabularPandas(Tabular):
    "A `Tabular` object with transforms"
    def transform(self, cols, f, all_col=True):
        if not all_col: cols = [c for c in cols if c in self.items.columns]
        if len(cols) > 0: self[cols] = self[cols].transform(f)
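Putting the parameters above together, here is a minimal sketch of building a `TabularPandas` by hand (without procs, which are defined further down; the `ADULT_SAMPLE` integration example later on this page shows the full workflow):

df = pd.DataFrame({'a': ['x','y','x','y'], 'b': [1.,2.,3.,4.]})
to = TabularPandas(df, cat_names='a', cont_names='b', splits=[[0,1,2],[3]])
len(to.train), len(to.valid)   # (3, 1)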
def _add_prop(cls, nm):
    @property
    def f(o): return o[list(getattr(o,nm+'_names'))]
    @f.setter
    def fset(o, v): o[getattr(o,nm+'_names')] = v
    setattr(cls, nm+'s', f)
    setattr(cls, nm+'s', fset)

_add_prop(Tabular, 'cat')
_add_prop(Tabular, 'cont')
_add_prop(Tabular, 'y')
_add_prop(Tabular, 'x')
_add_prop(Tabular, 'all_col')
df = pd.DataFrame({'a':[0,1,2,0,2], 'b':[0,0,0,0,1]})
to = TabularPandas(df, cat_names='a')
t = pickle.loads(pickle.dumps(to))
test_eq(t.items, to.items)
test_eq(to.all_cols, to[['a']])
import gc

def _count_objs(o):
    "Counts number of instances of class `o`"
    objs = gc.get_objects()
    return len([x for x in objs if isinstance(x, o)])

df = pd.DataFrame({'a':[0,1,2,0,2], 'b':[0,0,0,0,1]})
df_b = pd.DataFrame({'a':[1,2,0,0,2], 'b':[1,0,3,0,1]})

to = TabularPandas(df, cat_names='a', inplace=True)

_init_count = _count_objs(pd.DataFrame)
to_new = to.new(df_b, inplace=True)
test_eq(_init_count, _count_objs(pd.DataFrame))
class TabularProc(InplaceTransform):
    "Base class to write a non-lazy tabular processor for dataframes"
    def setup(self, items=None, train_setup=False): # TODO: properly deal with train_setup
        super().setup(getattr(items,'train',items), train_setup=False)
        # Procs are called as soon as data is available
        return self(items.items if isinstance(items,Datasets) else items)

    @property
    def name(self): return f"{super().name} -- {getattr(self,'__stored_args__',{})}"
These transforms are applied as soon as the data is available, rather than as the data is called from the `DataLoader`.
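As a sketch of the contract, a custom proc only needs `setups` (to compute any state from the training set) and `encodes`/`decodes`; since `TabularProc` is an `InplaceTransform`, these mutate the `Tabular` object in place. The `Log1p` proc below is hypothetical, not part of fastai:

class Log1p(TabularProc):
    "A hypothetical proc that log-transforms the continuous columns"
    def setups(self, to): pass                          # no state to compute
    def encodes(self, to): to.conts = np.log1p(to.conts)
    def decodes(self, to): to.conts = np.expm1(to.conts)

df = pd.DataFrame({'a': [0., 1., 2.]})
to = TabularPandas(df, Log1p(), cont_names='a')
test_close(to['a'].values, np.log1p([0., 1., 2.]))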
def _apply_cats(voc, add, c):
    if not (hasattr(c, 'dtype') and isinstance(c.dtype, CategoricalDtype)):
        return pd.Categorical(c, categories=voc[c.name][add:]).codes+add
    return c.cat.codes+add  # if `c` is already categorical, reuse its codes; otherwise map through the vocab above

def _decode_cats(voc, c): return c.map(dict(enumerate(voc[c.name].items)))
class Categorify(TabularProc):
    "Transform the categorical variables to something similar to `pd.Categorical`"
    order = 1
    def setups(self, to):
        store_attr(classes={n:CategoryMap(to.iloc[:,n].items, add_na=(n in to.cat_names)) for n in to.cat_names}, but='to')

    def encodes(self, to): to.transform(to.cat_names, partial(_apply_cats, self.classes, 1))
    def decodes(self, to): to.transform(to.cat_names, partial(_decode_cats, self.classes))
    def __getitem__(self,k): return self.classes[k]
@Categorize
def setups(self, to:Tabular):
    if len(to.y_names) > 0:
        if self.vocab is None:
            self.vocab = CategoryMap(getattr(to, 'train', to).iloc[:,to.y_names[0]].items, strict=True)
        else:
            self.vocab = CategoryMap(self.vocab, sort=False, add_na=self.add_na)
        self.c = len(self.vocab)
    return self(to)
@Categorize
def encodes(self, to:Tabular):
    to.transform(to.y_names, partial(_apply_cats, {n: self.vocab for n in to.y_names}, 0), all_col=False)
    return to
@Categorize
def decodes(self, to:Tabular):
    to.transform(to.y_names, partial(_decode_cats, {n: self.vocab for n in to.y_names}), all_col=False)
    return to
show_doc(Categorify, title_level=3)

class Categorify [source]

Categorify(enc=None, dec=None, split_idx=None, order=None) :: TabularProc

Transform the categorical variables to something similar to `pd.Categorical`
While you will not see a visual change in the `DataFrame`, the classes are stored in `to.procs.categorify`, as we can see below on a dummy `DataFrame`:
df = pd.DataFrame({'a':[0,1,2,0,2]})
to = TabularPandas(df, Categorify, 'a')
to.show()
a | |
---|---|
0 | 0 |
1 | 1 |
2 | 2 |
3 | 0 |
4 | 2 |
Each column's unique values are stored in a dictionary of `column:[values]`:
cat = to.procs.categorify
cat.classes
{'a': ['#na#', 0, 1, 2]}
def test_series(a,b): return test_eq(list(a), b)
test_series(cat['a'], ['#na#',0,1,2])
test_series(to['a'], [1,2,3,1,3])
df1 = pd.DataFrame({'a':[1,0,3,-1,2]})
to1 = to.new(df1)
to1.process()
# Values that weren't in the training df are sent to 0 (na)
test_series(to1['a'], [2,1,0,0,3])
to2 = cat.decode(to1)
test_series(to2['a'], [1,0,'#na#','#na#',2])
# Test with splits
cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,2]})
to = TabularPandas(df, cat, 'a', splits=[[0,1,2],[3,4]])
test_series(cat['a'], ['#na#',0,1,2])
test_series(to['a'], [1,2,3,0,3])
df = pd.DataFrame({'a':pd.Categorical(['M','H','L','M'], categories=['H','M','L'], ordered=True)})
to = TabularPandas(df, Categorify, 'a')
cat = to.procs.categorify
test_series(cat['a'], ['#na#','H','M','L'])
test_series(to.items.a, [2,1,3,2])
to2 = cat.decode(to)
test_series(to2['a'], ['M','H','L','M'])
# Test with targets
cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,2], 'b': ['a', 'b', 'a', 'b', 'b']})
to = TabularPandas(df, cat, 'a', splits=[[0,1,2],[3,4]], y_names='b')
test_series(to.vocab, ['a', 'b'])
test_series(to['b'], [0,1,0,1,1])
to2 = to.procs.decode(to)
test_series(to2['b'], ['a', 'b', 'a', 'b', 'b'])
# Test with targets and train
cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,2], 'b': ['a', 'b', 'a', 'c', 'b']})
to = TabularPandas(df, cat, 'a', splits=[[0,1,2],[3,4]], y_names='b')
test_series(to.vocab, ['a', 'b'])
# Test to ensure no copies of the dataframe are stored
cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,4]})
to = TabularPandas(df, cat, cont_names='a', splits=[[0,1,2],[3,4]])
test_eq(hasattr(to.categorify, 'to'), False)
@Normalize
def setups(self, to:Tabular):
    store_attr(but='to', means=dict(getattr(to, 'train', to).conts.mean()),
               stds=dict(getattr(to, 'train', to).conts.std(ddof=0)+1e-7))
    return self(to)

@Normalize
def encodes(self, to:Tabular):
    to.conts = (to.conts-self.means) / self.stds
    return to

@Normalize
def decodes(self, to:Tabular):
    to.conts = (to.conts*self.stds) + self.means
    return to
norm = Normalize()
df = pd.DataFrame({'a':[0,1,2,3,4]})
to = TabularPandas(df, norm, cont_names='a')
x = np.array([0,1,2,3,4])
m,s = x.mean(),x.std()
test_eq(norm.means['a'], m)
test_close(norm.stds['a'], s)
test_close(to['a'].values, (x-m)/s)
df1 = pd.DataFrame({'a':[5,6,7]})
to1 = to.new(df1)
to1.process()
test_close(to1['a'].values, (np.array([5,6,7])-m)/s)
to2 = norm.decode(to1)
test_close(to2['a'].values, [5,6,7])
norm = Normalize()
df = pd.DataFrame({'a':[0,1,2,3,4]})
to = TabularPandas(df, norm, cont_names='a', splits=[[0,1,2],[3,4]])
x = np.array([0,1,2])
m,s = x.mean(),x.std()
test_eq(norm.means['a'], m)
test_close(norm.stds['a'], s)
test_close(to['a'].values, (np.array([0,1,2,3,4])-m)/s)
norm = Normalize()
df = pd.DataFrame({'a':[0,1,2,3,4]})
to = TabularPandas(df, norm, cont_names='a', splits=[[0,1,2],[3,4]])
test_eq(hasattr(to.procs.normalize, 'to'), False)
class FillStrategy:
    "Namespace containing the various filling strategies."
    def median  (c,fill): return c.median()
    def constant(c,fill): return fill
    def mode    (c,fill): return c.dropna().value_counts().idxmax()
Currently, filling with the `median`, a `constant`, and the `mode` are supported.
class FillMissing(TabularProc):
    "Fill the missing values in continuous columns."
    def __init__(self, fill_strategy=FillStrategy.median, add_col=True, fill_vals=None):
        if fill_vals is None: fill_vals = defaultdict(int)
        store_attr()

    def setups(self, to):
        missing = pd.isnull(to.conts).any()
        store_attr(but='to', na_dict={n:self.fill_strategy(to[n], self.fill_vals[n])
                                      for n in missing[missing].keys()})
        self.fill_strategy = self.fill_strategy.__name__

    def encodes(self, to):
        missing = pd.isnull(to.conts)
        for n in missing.any()[missing.any()].keys():
            assert n in self.na_dict, f"nan values in `{n}` but not in setup training set"
        for n in self.na_dict.keys():
            to[n].fillna(self.na_dict[n], inplace=True)
            if self.add_col:
                to.loc[:,n+'_na'] = missing[n]
                if n+'_na' not in to.cat_names: to.cat_names.append(n+'_na')
show_doc(FillMissing, title_level=3)

class FillMissing [source]

FillMissing(fill_strategy=median, add_col=True, fill_vals=None) :: TabularProc

Fill the missing values in continuous columns.
fill1,fill2,fill3 = (FillMissing(fill_strategy=s)
                     for s in [FillStrategy.median, FillStrategy.constant, FillStrategy.mode])
df = pd.DataFrame({'a':[0,1,np.nan,1,2,3,4]})
df1 = df.copy(); df2 = df.copy()
tos = (TabularPandas(df, fill1, cont_names='a'),
       TabularPandas(df1, fill2, cont_names='a'),
       TabularPandas(df2, fill3, cont_names='a'))
test_eq(fill1.na_dict, {'a': 1.5})
test_eq(fill2.na_dict, {'a': 0})
test_eq(fill3.na_dict, {'a': 1.0})

for t in tos: test_eq(t.cat_names, ['a_na'])

for to_,v in zip(tos, [1.5, 0., 1.]):
    test_eq(to_['a'].values, np.array([0, 1, v, 1, 2, 3, 4]))
    test_eq(to_['a_na'].values, np.array([0, 0, 1, 0, 0, 0, 0]))
fill = FillMissing()
df = pd.DataFrame({'a':[0,1,np.nan,1,2,3,4], 'b': [0,1,2,3,4,5,6]})
to = TabularPandas(df, fill, cont_names=['a', 'b'])
test_eq(fill.na_dict, {'a': 1.5})
test_eq(to.cat_names, ['a_na'])
test_eq(to['a'].values, np.array([0, 1, 1.5, 1, 2, 3, 4]))
test_eq(to['a_na'].values, np.array([0, 0, 1, 0, 0, 0, 0]))
test_eq(to['b'].values, np.array([0,1,2,3,4,5,6]))
fill = FillMissing()
df = pd.DataFrame({'a':[0,1,np.nan,1,2,3,4], 'b': [0,1,2,3,4,5,6]})
to = TabularPandas(df, fill, cont_names=['a', 'b'])
test_eq(hasattr(to.procs.fill_missing, 'to'), False)
TabularPandas Pipelines -
procs = [Normalize, Categorify, FillMissing, noop]
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4]})
to = TabularPandas(df, procs, cat_names='a', cont_names='b')

# Test setup and apply on df_main
test_series(to.cat_names, ['a', 'b_na'])
test_series(to['a'], [1,2,3,2,2,3,1])
test_series(to['b_na'], [1,1,2,1,1,1,1])
x = np.array([0,1,1.5,1,2,3,4])
m,s = x.mean(),x.std()
test_close(to['b'].values, (x-m)/s)
test_eq(to.classes, {'a': ['#na#',0,1,2], 'b_na': ['#na#',False,True]})
# Test apply on y_names
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4], 'c': ['b','a','b','a','a','b','a']})
to = TabularPandas(df, procs, 'a', 'b', y_names='c')

test_series(to.cat_names, ['a', 'b_na'])
test_series(to['a'], [1,2,3,2,2,3,1])
test_series(to['b_na'], [1,1,2,1,1,1,1])
test_series(to['c'], [1,0,1,0,0,1,0])
x = np.array([0,1,1.5,1,2,3,4])
m,s = x.mean(),x.std()
test_close(to['b'].values, (x-m)/s)
test_eq(to.classes, {'a': ['#na#',0,1,2], 'b_na': ['#na#',False,True]})
test_eq(to.vocab, ['a','b'])
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4], 'c': ['b','a','b','a','a','b','a']})
to = TabularPandas(df, procs, 'a', 'b', y_names='c')

test_series(to.cat_names, ['a', 'b_na'])
test_series(to['a'], [1,2,3,2,2,3,1])
test_eq(df.a.dtype, np.int64 if sys.platform == "win32" else int)
test_series(to['b_na'], [1,1,2,1,1,1,1])
test_series(to['c'], [1,0,1,0,0,1,0])
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,np.nan,1,1,2,3,4], 'c': ['b','a','b','a','a','b','a']})
to = TabularPandas(df, procs, cat_names='a', cont_names='b', y_names='c', splits=[[0,1,4,6], [2,3,5]])

test_series(to.cat_names, ['a', 'b_na'])
test_series(to['a'], [1,2,2,1,0,2,0])
test_eq(df.a.dtype, np.int64 if sys.platform == "win32" else int)
test_series(to['b_na'], [1,2,1,1,1,1,1])
test_series(to['c'], [1,0,0,0,1,0,1])
def _maybe_expand(o): return o[:,None] if o.ndim==1 else o

class ReadTabBatch(ItemTransform):
    "Transform `TabularPandas` values into a `Tensor` with the ability to decode"
    def __init__(self, to): self.to = to.new_empty()

    def encodes(self, to):
        if not to.with_cont: res = (tensor(to.cats).long(),)
        else: res = (tensor(to.cats).long(),tensor(to.conts).float())
        ys = [n for n in to.y_names if n in to.items.columns]
        if len(ys) == len(to.y_names): res = res + (tensor(to.targ),)
        if to.device is not None: res = to_device(res, to.device)
        return res

    def decodes(self, o):
        o = [_maybe_expand(o_) for o_ in to_np(o) if o_.size != 0]
        vals = np.concatenate(o, axis=1)
        try: df = pd.DataFrame(vals, columns=self.to.all_col_names)
        except: df = pd.DataFrame(vals, columns=self.to.x_names)
        to = self.to.new(df)
        return to
@typedispatch
def show_batch(x: Tabular, y, its, max_n=10, ctxs=None):
    x.show()
@delegates()
class TabDataLoader(TfmdDL):
    "A transformed `DataLoader` for Tabular data"
    def __init__(self, dataset, bs=16, shuffle=False, after_batch=None, num_workers=0, **kwargs):
        if after_batch is None: after_batch = L(TransformBlock().batch_tfms)+ReadTabBatch(dataset)
        super().__init__(dataset, bs=bs, shuffle=shuffle, after_batch=after_batch, num_workers=num_workers, **kwargs)

    def create_item(self, s):  return self.dataset.iloc[s or 0]
    def create_batch(self, b): return self.dataset.iloc[b]
    def do_item(self, s):      return 0 if s is None else s

TabularPandas._dl_type = TabDataLoader
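Note that `create_batch` pulls a whole batch from the underlying `DataFrame` with a single `iloc` call (via `_TabIloc`), so there is no per-item collation step. A quick sketch, reusing the processed `to` from the pipeline tests above:

# One vectorized `iloc` fetches the whole batch at once
dl = TabDataLoader(to.train, bs=2)
cats, conts, targ = dl.one_batch()
test_eq(cats.shape[0], 2)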
@delegates()
class TabWeightedDL(TabDataLoader):
    "A transformed `DataLoader` for Tabular Weighted data"
    def __init__(self, dataset, bs=16, wgts=None, shuffle=False, after_batch=None, num_workers=0, **kwargs):
        wgts = np.array([1.]*len(dataset) if wgts is None else wgts)
        self.wgts = wgts / wgts.sum()
        super().__init__(dataset, bs=bs, shuffle=shuffle, after_batch=after_batch, num_workers=num_workers, **kwargs)
        self.idxs = self.get_idxs()

    def get_idxs(self):
        if self.n == 0: return []
        if not self.shuffle: return super().get_idxs()
        return list(np.random.choice(self.n, self.n, p=self.wgts))

TabularPandas._dl_type = TabWeightedDL
Integration example

For a more in-depth explanation, see the tabular tutorial.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_test.drop('salary', axis=1, inplace=True)
df_main.head()
age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | salary | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 49 | Private | 101320 | Assoc-acdm | 12.0 | Married-civ-spouse | NaN | Wife | White | Female | 0 | 1902 | 40 | United-States | >=50k |
1 | 44 | Private | 236746 | Masters | 14.0 | Divorced | Exec-managerial | Not-in-family | White | Male | 10520 | 0 | 45 | United-States | >=50k |
2 | 38 | Private | 96185 | HS-grad | NaN | Divorced | NaN | Unmarried | Black | Female | 0 | 0 | 32 | United-States | <50k |
3 | 38 | Self-emp-inc | 112847 | Prof-school | 15.0 | Married-civ-spouse | Prof-specialty | Husband | Asian-Pac-Islander | Male | 0 | 0 | 40 | United-States | >=50k |
4 | 42 | Self-emp-not-inc | 82297 | 7th-8th | NaN | Married-civ-spouse | Other-service | Wife | Black | Female | 0 | 0 | 50 | United-States | <50k |
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))

to = TabularPandas(df_main, procs, cat_names, cont_names, y_names="salary", splits=splits)
dls = to.dataloaders()
dls.valid.show_batch()
workclass | education | marital-status | occupation | relationship | race | education-num_na | age | fnlwgt | education-num | salary | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Self-emp-not-inc | Prof-school | Divorced | Prof-specialty | Not-in-family | White | False | 65.000000 | 316093.005287 | 15.0 | <50k |
1 | Private | Bachelors | Married-civ-spouse | Exec-managerial | Husband | White | False | 69.999999 | 280306.998091 | 13.0 | <50k |
2 | Federal-gov | Some-college | Married-civ-spouse | Adm-clerical | Husband | Black | False | 34.000000 | 199933.999862 | 10.0 | >=50k |
3 | Private | HS-grad | Never-married | Handlers-cleaners | Unmarried | White | False | 24.000001 | 300584.002430 | 9.0 | <50k |
4 | Private | Assoc-voc | Never-married | Other-service | Not-in-family | White | False | 34.000000 | 220630.999335 | 11.0 | <50k |
5 | Private | Bachelors | Divorced | Prof-specialty | Unmarried | White | False | 45.000000 | 289230.003178 | 13.0 | >=50k |
6 | ? | Some-college | Never-married | ? | Own-child | White | False | 26.000000 | 208993.999494 | 10.0 | <50k |
7 | Private | Some-college | Divorced | Adm-clerical | Not-in-family | White | False | 43.000000 | 174574.999446 | 10.0 | <50k |
8 | Self-emp-not-inc | Assoc-voc | Married-civ-spouse | Other-service | Husband | White | False | 63.000000 | 420628.997361 | 11.0 | <50k |
9 | State-gov | Some-college | Married-civ-spouse | Adm-clerical | Husband | Black | False | 25.000000 | 257064.003065 | 10.0 | <50k |
to.show()
workclass | education | marital-status | occupation | relationship | race | education-num_na | age | fnlwgt | education-num | salary | |
---|---|---|---|---|---|---|---|---|---|---|---|
5516 | Private | HS-grad | Divorced | Exec-managerial | Unmarried | White | False | 49.0 | 140121.0 | 9.0 | <50k |
7184 | Self-emp-inc | Some-college | Never-married | Exec-managerial | Not-in-family | White | False | 70.0 | 207938.0 | 10.0 | <50k |
2336 | Private | Some-college | Never-married | Priv-house-serv | Own-child | White | False | 23.0 | 50953.0 | 10.0 | <50k |
4342 | Private | Assoc-voc | Married-civ-spouse | Machine-op-inspct | Husband | White | False | 46.0 | 27802.0 | 11.0 | <50k |
8474 | Self-emp-not-inc | Assoc-acdm | Married-civ-spouse | Craft-repair | Husband | White | False | 47.0 | 107231.0 | 12.0 | <50k |
5948 | Local-gov | HS-grad | Married-civ-spouse | Transport-moving | Husband | White | False | 40.0 | 55363.0 | 9.0 | <50k |
5342 | Local-gov | HS-grad | Married-civ-spouse | Craft-repair | Husband | White | False | 46.0 | 36228.0 | 9.0 | <50k |
9005 | Private | Bachelors | Married-civ-spouse | Adm-clerical | Husband | White | False | 38.0 | 297449.0 | 13.0 | >=50k |
1189 | Private | Assoc-voc | Divorced | Sales | Not-in-family | Amer-Indian-Eskimo | False | 31.0 | 87950.0 | 11.0 | <50k |
8784 | Private | Assoc-voc | Divorced | Prof-specialty | Own-child | Black | False | 35.0 | 491000.0 | 11.0 | <50k |
We can decode any set of transformed data by calling `to.decode_row` with the raw data:
row = to.items.iloc[0]
to.decode_row(row)
age 49.0
workclass Private
fnlwgt 140121.0
education HS-grad
education-num 9.0
marital-status Divorced
occupation Exec-managerial
relationship Unmarried
race White
sex Male
capital-gain 0
capital-loss 0
hours-per-week 50
native-country United-States
salary <50k
education-num_na False
Name: 5516, dtype: object
We can create new test datasets based on the training data with `to.new()`.

Since machine learning models can't magically understand categories they were never trained on, the data should reflect this. If there are different missing values in your test data, you should address this before training.
to_tst = to.new(df_test)
to_tst.process()
to_tst.items.head()
age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | education-num_na | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
10000 | 0.465031 | 5 | 1.319553 | 10 | 1.176677 | 3 | 2 | 1 | 2 | Male | 0 | 0 | 40 | Philippines | 1 |
10001 | -0.926675 | 5 | 1.233650 | 12 | -0.420035 | 3 | 15 | 1 | 4 | Male | 0 | 0 | 40 | United-States | 1 |
10002 | 1.051012 | 5 | 0.145161 | 2 | -1.218391 | 1 | 9 | 2 | 5 | Female | 0 | 0 | 37 | United-States | 1 |
10003 | 0.538279 | 5 | -0.282370 | 12 | -0.420035 | 7 | 2 | 5 | 5 | Female | 0 | 0 | 43 | United-States | 1 |
10004 | 0.758022 | 6 | 1.420768 | 9 | 0.378321 | 3 | 5 | 1 | 5 | Male | 0 | 0 | 60 | United-States | 1 |
We can then convert it to a `DataLoader`:
tst_dl = dls.valid.new(to_tst)
tst_dl.show_batch()
workclass | education | marital-status | occupation | relationship | race | education-num_na | age | fnlwgt | education-num | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Private | Bachelors | Married-civ-spouse | Adm-clerical | Husband | Asian-Pac-Islander | False | 45.000000 | 338105.005817 | 13.0 |
1 | Private | HS-grad | Married-civ-spouse | Transport-moving | Husband | Other | False | 26.000000 | 328663.002806 | 9.0 |
2 | Private | 11th | Divorced | Other-service | Not-in-family | White | False | 52.999999 | 209022.000317 | 7.0 |
3 | Private | HS-grad | Widowed | Adm-clerical | Unmarried | White | False | 46.000000 | 162029.998917 | 9.0 |
4 | Self-emp-inc | Assoc-voc | Married-civ-spouse | Exec-managerial | Husband | White | False | 49.000000 | 349230.006300 | 11.0 |
5 | Local-gov | Some-college | Married-civ-spouse | Exec-managerial | Husband | White | False | 34.000000 | 124827.002059 | 10.0 |
6 | Self-emp-inc | Some-college | Married-civ-spouse | Sales | Husband | White | False | 52.999999 | 290640.002462 | 10.0 |
7 | Private | Some-college | Never-married | Sales | Own-child | White | False | 19.000000 | 106272.998239 | 10.0 |
8 | Private | Some-college | Married-civ-spouse | Protective-serv | Husband | Black | False | 71.999999 | 53684.001668 | 10.0 |
9 | Private | Some-college | Never-married | Sales | Own-child | White | False | 20.000000 | 505980.010609 | 10.0 |
# Create a TabWeightedDL
train_ds = to.train
weights = np.random.random(len(train_ds))
train_dl = TabWeightedDL(train_ds, wgts=weights, bs=64, shuffle=True)

train_dl.show_batch()
workclass | education | marital-status | occupation | relationship | race | education-num_na | age | fnlwgt | education-num | salary | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Local-gov | Masters | Never-married | Prof-specialty | Not-in-family | White | False | 31.000000 | 204469.999932 | 14.0 | <50k |
1 | Self-emp-not-inc | HS-grad | Divorced | Farming-fishing | Not-in-family | White | False | 32.000000 | 34572.002104 | 9.0 | <50k |
2 | ? | Some-college | Widowed | ? | Not-in-family | White | False | 64.000000 | 34099.998990 | 10.0 | <50k |
3 | Private | Some-college | Divorced | Exec-managerial | Not-in-family | White | False | 32.000000 | 251242.999189 | 10.0 | >=50k |
4 | Federal-gov | HS-grad | Married-civ-spouse | Exec-managerial | Husband | White | False | 55.000001 | 176903.999313 | 9.0 | <50k |
5 | Private | 11th | Married-civ-spouse | Transport-moving | Husband | White | False | 50.000000 | 192203.000000 | 7.0 | <50k |
6 | Private | 10th | Never-married | Farming-fishing | Own-child | Black | False | 36.000000 | 181720.999704 | 6.0 | <50k |
7 | Local-gov | Masters | Divorced | Prof-specialty | Not-in-family | Amer-Indian-Eskimo | False | 50.000000 | 220640.001490 | 14.0 | >=50k |
8 | Private | HS-grad | Married-civ-spouse | Adm-clerical | Wife | White | False | 36.000000 | 189381.999993 | 9.0 | >=50k |
9 | Private | Masters | Divorced | Prof-specialty | Unmarried | White | False | 42.000000 | 265697.997341 | 14.0 | <50k |
batch = next(iter(train_dl))

x, y = batch[0].shape

test_eq(x, 64)
test_eq(y, 7)

assert hasattr(train_dl, 'wgts'), "Weights attribute missing in DataLoader"
TabDataLoader's create_item method
df = pd.DataFrame([{'age': 35}])
to = TabularPandas(df)
dls = to.dataloaders()
print(dls.create_item(0))
# test_eq(dls.create_item(0).items.to_dict(), {'age': 0.5330614747286777, 'workclass': 5, 'fnlwgt': -0.26305443080666174, 'education': 10, 'education-num': 1.169790230219763, 'marital-status': 1, 'occupation': 13, 'relationship': 5, 'race': 3, 'sex': ' Female', 'capital-gain': 0, 'capital-loss': 0, 'hours-per-week': 35, 'native-country': 'United-States', 'salary': 1, 'education-num_na': 1})
age 35
Name: 0, dtype: int8
Other target types

Multi-label categories

One-hot encoded labels
def _mock_multi_label(df):
    sal,sex,white = [],[],[]
    for row in df.itertuples():
        sal.append(row.salary == '>=50k')
        sex.append(row.sex == ' Male')
        white.append(row.race == ' White')
    df['salary'] = np.array(sal)
    df['male'] = np.array(sex)
    df['white'] = np.array(white)
    return df
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main = _mock_multi_label(df_main)
df_main.head()
age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | salary | male | white | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 49 | Private | 101320 | Assoc-acdm | 12.0 | Married-civ-spouse | NaN | Wife | White | Female | 0 | 1902 | 40 | United-States | True | False | True |
1 | 44 | Private | 236746 | Masters | 14.0 | Divorced | Exec-managerial | Not-in-family | White | Male | 10520 | 0 | 45 | United-States | True | True | True |
2 | 38 | Private | 96185 | HS-grad | NaN | Divorced | NaN | Unmarried | Black | Female | 0 | 0 | 32 | United-States | False | False | False |
3 | 38 | Self-emp-inc | 112847 | Prof-school | 15.0 | Married-civ-spouse | Prof-specialty | Husband | Asian-Pac-Islander | Male | 0 | 0 | 40 | United-States | True | True | False |
4 | 42 | Self-emp-not-inc | 82297 | 7th-8th | NaN | Married-civ-spouse | Other-service | Wife | Black | Female | 0 | 0 | 50 | United-States | False | False | False |
@EncodedMultiCategorize
def setups(self, to:Tabular):
    self.c = len(self.vocab)
    return self(to)

@EncodedMultiCategorize
def encodes(self, to:Tabular): return to

@EncodedMultiCategorize
def decodes(self, to:Tabular):
    to.transform(to.y_names, lambda c: c==1)
    return to
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))
y_names=["salary", "male", "white"]
%time to = TabularPandas(df_main, procs, cat_names, cont_names, y_names=y_names, y_block=MultiCategoryBlock(encoded=True, vocab=y_names), splits=splits)
CPU times: user 66 ms, sys: 0 ns, total: 66 ms
Wall time: 65.3 ms
dls = to.dataloaders()
dls.valid.show_batch()
workclass | education | marital-status | occupation | relationship | race | education-num_na | age | fnlwgt | education-num | salary | male | white | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Private | HS-grad | Divorced | Exec-managerial | Unmarried | White | False | 47.000000 | 164423.000013 | 9.0 | False | False | True |
1 | Private | Some-college | Married-civ-spouse | Transport-moving | Husband | White | False | 74.999999 | 239037.999499 | 10.0 | False | True | True |
2 | Private | HS-grad | Married-civ-spouse | Sales | Wife | White | False | 45.000000 | 228570.000761 | 9.0 | False | False | True |
3 | Self-emp-not-inc | HS-grad | Married-civ-spouse | Exec-managerial | Husband | Asian-Pac-Islander | False | 45.000000 | 285574.998753 | 9.0 | False | True | False |
4 | Private | Some-college | Never-married | Adm-clerical | Own-child | White | False | 21.999999 | 184812.999966 | 10.0 | False | True | True |
5 | Private | 10th | Married-civ-spouse | Transport-moving | Husband | White | False | 67.000001 | 274450.998865 | 6.0 | False | True | True |
6 | Private | HS-grad | Divorced | Exec-managerial | Unmarried | White | False | 53.999999 | 192862.000000 | 9.0 | False | False | True |
7 | Federal-gov | Some-college | Divorced | Tech-support | Unmarried | Amer-Indian-Eskimo | False | 37.000000 | 33486.997455 | 10.0 | False | False | False |
8 | Private | HS-grad | Never-married | Machine-op-inspct | Other-relative | White | False | 30.000000 | 219318.000010 | 9.0 | False | False | True |
9 | Self-emp-not-inc | Bachelors | Married-civ-spouse | Sales | Husband | White | False | 44.000000 | 167279.999960 | 13.0 | False | True | True |
Not one-hot encoded
def _mock_multi_label(df):
    targ = []
    for row in df.itertuples():
        labels = []
        if row.salary == '>=50k': labels.append('>50k')
        if row.sex == ' Male': labels.append('male')
        if row.race == ' White': labels.append('white')
        targ.append(' '.join(labels))
    df['target'] = np.array(targ)
    return df
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main = _mock_multi_label(df_main)
df_main.head()
age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | salary | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 49 | Private | 101320 | Assoc-acdm | 12.0 | Married-civ-spouse | NaN | Wife | White | Female | 0 | 1902 | 40 | United-States | >=50k | >50k white |
1 | 44 | Private | 236746 | Masters | 14.0 | Divorced | Exec-managerial | Not-in-family | White | Male | 10520 | 0 | 45 | United-States | >=50k | >50k male white |
2 | 38 | Private | 96185 | HS-grad | NaN | Divorced | NaN | Unmarried | Black | Female | 0 | 0 | 32 | United-States | <50k | |
3 | 38 | Self-emp-inc | 112847 | Prof-school | 15.0 | Married-civ-spouse | Prof-specialty | Husband | Asian-Pac-Islander | Male | 0 | 0 | 40 | United-States | >=50k | >50k male |
4 | 42 | Self-emp-not-inc | 82297 | 7th-8th | NaN | Married-civ-spouse | Other-service | Wife | Black | Female | 0 | 0 | 50 | United-States | <50k |
@MultiCategorize
def encodes(self, to:Tabular):
    #to.transform(to.y_names, partial(_apply_cats, {n: self.vocab for n in to.y_names}, 0))
    return to

@MultiCategorize
def decodes(self, to:Tabular):
    #to.transform(to.y_names, partial(_decode_cats, {n: self.vocab for n in to.y_names}))
    return to
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))
%time to = TabularPandas(df_main, procs, cat_names, cont_names, y_names="target", y_block=MultiCategoryBlock(), splits=splits)
CPU times: user 68.6 ms, sys: 0 ns, total: 68.6 ms
Wall time: 67.9 ms
to.procs[2].vocab
['-', '_', 'a', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y']
Regression
@RegressionSetup
def setups(self, to:Tabular):
    if self.c is not None: return
    self.c = len(to.y_names)
    return to

@RegressionSetup
def encodes(self, to:Tabular): return to

@RegressionSetup
def decodes(self, to:Tabular): return to
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main = _mock_multi_label(df_main)
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))
%time to = TabularPandas(df_main, procs, cat_names, cont_names, y_names='age', splits=splits)
CPU times: user 70.7 ms, sys: 290 µs, total: 71 ms
Wall time: 70.3 ms
to.procs[-1].means
{'fnlwgt': 192085.701, 'education-num': 10.059124946594238}
dls = to.dataloaders()
dls.valid.show_batch()
workclass | education | marital-status | occupation | relationship | race | education-num_na | fnlwgt | education-num | age | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Private | 12th | Never-married | Adm-clerical | Other-relative | Black | False | 503454.004078 | 8.0 | 47.0 |
1 | Federal-gov | Bachelors | Married-civ-spouse | Exec-managerial | Husband | White | False | 586656.993690 | 13.0 | 49.0 |
2 | Self-emp-not-inc | Assoc-voc | Married-civ-spouse | Farming-fishing | Husband | White | False | 164607.001243 | 11.0 | 29.0 |
3 | Private | HS-grad | Never-married | Adm-clerical | Not-in-family | Black | False | 155508.999873 | 9.0 | 48.0 |
4 | Private | 11th | Never-married | Other-service | Own-child | White | False | 318189.998679 | 7.0 | 18.0 |
5 | Private | HS-grad | Never-married | Adm-clerical | Other-relative | White | False | 140219.001104 | 9.0 | 47.0 |
6 | Private | Masters | Divorced | #na# | Unmarried | White | True | 235683.001562 | 10.0 | 47.0 |
7 | Private | Bachelors | Married-civ-spouse | Craft-repair | Husband | White | False | 187321.999825 | 13.0 | 43.0 |
8 | Private | Bachelors | Married-civ-spouse | Prof-specialty | Husband | White | False | 104196.002410 | 13.0 | 40.0 |
9 | Private | Some-college | Separated | Priv-house-serv | Other-relative | White | False | 184302.999784 | 10.0 | 25.0 |
Not currently used - for multi-modal
class TensorTabular(fastuple):
    def get_ctxs(self, max_n=10, **kwargs):
        n_samples = min(self[0].shape[0], max_n)
        df = pd.DataFrame(index = range(n_samples))
        return [df.iloc[i] for i in range(n_samples)]

    def display(self, ctxs): display_df(pd.DataFrame(ctxs))
class TabularLine(pd.Series):
    "A line of a dataframe that knows how to show itself"
    def show(self, ctx=None, **kwargs): return self if ctx is None else ctx.append(self)
class ReadTabLine(ItemTransform):
    def __init__(self, proc): self.proc = proc

    def encodes(self, row):
        cats,conts = (o.map(row.__getitem__) for o in (self.proc.cat_names,self.proc.cont_names))
        return TensorTabular(tensor(cats).long(),tensor(conts).float())

    def decodes(self, o):
        to = TabularPandas(o, self.proc.cat_names, self.proc.cont_names, self.proc.y_names)
        to = self.proc.decode(to)
        return TabularLine(pd.Series({c: v for v,c in zip(to.items[0]+to.items[1], self.proc.cat_names+self.proc.cont_names)}))
class ReadTabTarget(ItemTransform):
    def __init__(self, proc): self.proc = proc
    def encodes(self, row): return row[self.proc.y_names].astype(np.int64)
    def decodes(self, o): return Category(self.proc.classes[self.proc.y_names][o])
# tds = TfmdDS(to.items, tfms=[[ReadTabLine(proc)], ReadTabTarget(proc)])
# enc = tds[1]
# test_eq(enc[0][0], tensor([2,1]))
# test_close(enc[0][1], tensor([-0.628828]))
# test_eq(enc[1], 1)
# dec = tds.decode(enc)
# assert isinstance(dec[0], TabularLine)
# test_close(dec[0], pd.Series({'a': 1, 'b_na': False, 'b': 1}))
# test_eq(dec[1], 'a')
# test_stdout(lambda: print(show_at(tds, 1)), """a 1
# b_na False
# b 1
# category a
# dtype: object""")
Export -
from nbdev import nbdev_export
nbdev_export()
Converted 00_torch_core.ipynb.
Converted 01_layers.ipynb.
Converted 01a_losses.ipynb.
Converted 02_data.load.ipynb.
Converted 03_data.core.ipynb.
Converted 04_data.external.ipynb.
Converted 05_data.transforms.ipynb.
Converted 06_data.block.ipynb.
Converted 07_vision.core.ipynb.
Converted 08_vision.data.ipynb.
Converted 09_vision.augment.ipynb.
Converted 09b_vision.utils.ipynb.
Converted 09c_vision.widgets.ipynb.
Converted 10_tutorial.pets.ipynb.
Converted 10b_tutorial.albumentations.ipynb.
Converted 11_vision.models.xresnet.ipynb.
Converted 12_optimizer.ipynb.
Converted 13_callback.core.ipynb.
Converted 13a_learner.ipynb.
Converted 13b_metrics.ipynb.
Converted 14_callback.schedule.ipynb.
Converted 14a_callback.data.ipynb.
Converted 15_callback.hook.ipynb.
Converted 15a_vision.models.unet.ipynb.
Converted 16_callback.progress.ipynb.
Converted 17_callback.tracker.ipynb.
Converted 18_callback.fp16.ipynb.
Converted 18a_callback.training.ipynb.
Converted 18b_callback.preds.ipynb.
Converted 19_callback.mixup.ipynb.
Converted 20_interpret.ipynb.
Converted 20a_distributed.ipynb.
Converted 21_vision.learner.ipynb.
Converted 22_tutorial.imagenette.ipynb.
Converted 23_tutorial.vision.ipynb.
Converted 24_tutorial.image_sequence.ipynb.
Converted 24_tutorial.siamese.ipynb.
Converted 24_vision.gan.ipynb.
Converted 30_text.core.ipynb.
Converted 31_text.data.ipynb.
Converted 32_text.models.awdlstm.ipynb.
Converted 33_text.models.core.ipynb.
Converted 34_callback.rnn.ipynb.
Converted 35_tutorial.wikitext.ipynb.
Converted 37_text.learner.ipynb.
Converted 38_tutorial.text.ipynb.
Converted 39_tutorial.transformers.ipynb.
Converted 40_tabular.core.ipynb.
Converted 41_tabular.data.ipynb.
Converted 42_tabular.model.ipynb.
Converted 43_tabular.learner.ipynb.
Converted 44_tutorial.tabular.ipynb.
Converted 45_collab.ipynb.
Converted 46_tutorial.collab.ipynb.
Converted 50_tutorial.datablock.ipynb.
Converted 60_medical.imaging.ipynb.
Converted 61_tutorial.medical_imaging.ipynb.
Converted 65_medical.text.ipynb.
Converted 70_callback.wandb.ipynb.
Converted 71_callback.tensorboard.ipynb.
Converted 72_callback.neptune.ipynb.
Converted 73_callback.captum.ipynb.
Converted 74_callback.azureml.ipynb.
Converted 97_test_utils.ipynb.
Converted 99_pytorch_doc.ipynb.
Converted dev-setup.ipynb.
Converted index.ipynb.
Converted quick_start.ipynb.
Converted tutorial.ipynb.