表格核心



! [ -e /content ] && pip install -Uqq fastai  # 在Colab上升级fastai


from __future__ import annotations
from fastai.torch_basics import *
from fastai.data.all import *


from nbdev.showdoc import *


pd.set_option('mode.chained_assignment','raise')

在将表格数据组装到 DataLoaders 之前用来预处理表格数据的基本功能。

初始预处理


def make_date(df, date_field):
    "Make sure `df[date_field]` is of the right date type (converted in place with `pd.to_datetime` if needed)."
    # `is_datetime64_any_dtype` accepts both naive and tz-aware datetime columns and, unlike
    # `np.issubdtype`, does not raise `TypeError` on pandas extension dtypes such as `string`.
    if pd.api.types.is_datetime64_any_dtype(df[date_field]): return
    # `infer_datetime_format` is intentionally not passed: it is deprecated in pandas 2.x, where
    # format inference is already the default behaviour.
    df[date_field] = pd.to_datetime(df[date_field])

# String dates are converted in place to a datetime column
df = pd.DataFrame({'date': ['2019-12-04', '2019-11-29', '2019-11-15', '2019-10-24']})
make_date(df, 'date')
test_eq(df['date'].dtype, np.dtype('datetime64[ns]'))


def add_datepart(df, field_name, prefix=None, drop=True, time=False):
    "Helper function that adds columns relevant to a date in the column `field_name` of `df` (modifies and returns `df`)."
    make_date(df, field_name)
    field = df[field_name]
    # Default prefix: the column name with a trailing 'date'/'Date' stripped
    prefix = ifnone(prefix, re.sub('[Dd]ate$', '', field_name))
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear', 'Is_month_end', 'Is_month_start',
            'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time: attr = attr + ['Hour', 'Minute', 'Second']
    # `Series.dt.week` was deprecated in favour of `dt.isocalendar().week`; only fall back to it on old pandas.
    # The week is cast to the dtype of `dt.day` so that all date parts share the same dtype.
    week = field.dt.isocalendar().week.astype(field.dt.day.dtype) if hasattr(field.dt, 'isocalendar') else field.dt.week
    for n in attr: df[prefix + n] = getattr(field.dt, n.lower()) if n != 'Week' else week
    mask = ~field.isna()
    # Seconds since the epoch. Casting to second resolution first keeps this correct whatever the
    # column's datetime unit is (ns, us, ms or s); missing dates become NaN through `mask`.
    elapsed = field.values.astype('datetime64[s]').astype(np.int64)
    df[prefix + 'Elapsed'] = np.where(mask,elapsed,np.nan)
    if drop: df.drop(field_name, axis=1, inplace=True)
    return df

例如，给定一系列日期，我们可以生成年份、月份、日期、星期几、是否为月初等特征，如下所示：

df = pd.DataFrame({'date': ['2019-12-04', None, '2019-11-15', '2019-10-24']})
df = add_datepart(df, 'date')
df.head()

	Year	Month	Week	Day	Dayofweek	Dayofyear	Is_month_end	Is_month_start	Is_quarter_end	Is_quarter_start	Is_year_end	Is_year_start	Elapsed
0	2019.0	12.0	49.0	4.0	2.0	338.0	False	False	False	False	False	False	1.575418e+09
1	NaN	NaN	NaN	NaN	NaN	NaN	False	False	False	False	False	False	NaN
2	2019.0	11.0	46.0	15.0	4.0	319.0	False	False	False	False	False	False	1.573776e+09
3	2019.0	10.0	43.0	24.0	3.0	297.0	False	False	False	False	False	False	1.571875e+09


test_eq(df.columns, ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear', 'Is_month_end', 'Is_month_start',
            'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start', 'Elapsed'])
test_eq(df[df.Elapsed.isna()].shape,(1, 13))

# 测试该周的数据类型是否与其他日期部分字段一致
test_eq(df['Year'].dtype, df['Week'].dtype)

test_eq(pd.api.types.is_numeric_dtype(df['Elapsed']), True)


df = pd.DataFrame({'f1': [1.],'f2': [2.],'f3': [3.],'f4': [4.],'date':['2019-12-04']})
df = add_datepart(df, 'date')
df.head()

	f1	f2	f3	f4	Year	Month	Week	Day	Dayofweek	Dayofyear	Is_month_end	Is_month_start	Is_quarter_end	Is_quarter_start	Is_year_end	Is_year_start	Elapsed
0	1.0	2.0	3.0	4.0	2019	12	49	4	2	338	False	False	False	False	False	False	1.575418e+09


# Test Order of columns when date isn't in first position
test_eq(df.columns, ['f1', 'f2', 'f3', 'f4', 'Year', 'Month', 'Week', 'Day',
            'Dayofweek', 'Dayofyear', 'Is_month_end', 'Is_month_start',
            'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start', 'Elapsed'])

# Test that week dtype is consistent with other datepart fields
test_eq(df['Year'].dtype, df['Week'].dtype)


def _get_elapsed(df,field_names, date_field, base_field, prefix):
    "For each column in `field_names`, add `prefix + name` to `df`: days between a row's date and the last flagged row seen in the same run of `base_field`."
    one_day = np.timedelta64(1, 'D')
    for name in field_names:
        ref_date, cur_base, elapsed = np.datetime64(), None, []
        rows = zip(df[base_field].values, df[name].values, df[date_field].values)
        for base, flag, date in rows:
            # Entering a new group: reset the reference date to NaT, so rows seen before the
            # group's first flagged row produce NaN
            if cur_base is None or base != cur_base:
                ref_date, cur_base = np.datetime64(), base
            if flag: ref_date = date
            delta = (date - ref_date).astype('timedelta64[D]')
            elapsed.append(delta / one_day)
        df[prefix + name] = elapsed
    return df


def add_elapsed_times(df, field_names, date_field, base_field):
    "Add in `df` for each event in `field_names` the elapsed time according to `date_field` grouped by `base_field`"
    field_names = list(L(field_names))
    # Make sure the event columns in `field_names` are booleans and `date_field` is a date
    # (both conversions are applied to the caller's `df`)
    df[field_names] = df[field_names].astype('bool')
    make_date(df, date_field)

    # Work on the relevant columns only. Sorting ascending by date yields the 'After' columns,
    # re-sorting with descending dates yields the 'Before' columns.
    work_df = df[field_names + [date_field, base_field]]
    work_df = work_df.sort_values([base_field, date_field])
    work_df = _get_elapsed(work_df, field_names, date_field, base_field, 'After')
    work_df = work_df.sort_values([base_field, date_field], ascending=[True, False])
    work_df = _get_elapsed(work_df, field_names, date_field, base_field, 'Before')

    # Rows with no reference event hold NaN; replace with 0 so the columns can be cast to int
    for a in ['After' + f for f in field_names] + ['Before' + f for f in field_names]:
        work_df[a] = work_df[a].fillna(0).astype(int)

    # Per-group rolling sums of the event flags over a window of 7 rows, once with the dates
    # ascending (suffix '_bw') and once descending (suffix '_fw')
    for a,s in zip([True, False], ['_bw', '_fw']):
        work_df = work_df.set_index(date_field)
        tmp = (work_df[[base_field] + field_names].sort_index(ascending=a)
                      .groupby(base_field).rolling(7, min_periods=1).sum())
        # Depending on the pandas version the grouping column may also appear in the result
        if base_field in tmp: tmp.drop(base_field, axis=1,inplace=True)
        tmp.reset_index(inplace=True)
        work_df.reset_index(inplace=True)
        work_df = work_df.merge(tmp, 'left', [date_field, base_field], suffixes=['', s])
    # The raw event columns already exist in `df`; drop them before merging the new features back
    work_df.drop(field_names, axis=1, inplace=True)
    return df.merge(work_df, 'left', [date_field, base_field])

df = pd.DataFrame({'date': ['2019-12-04', '2019-11-29', '2019-11-15', '2019-10-24'],
                   'event': [False, True, False, True], 'base': [1,1,2,2]})
df = add_elapsed_times(df, ['event'], 'date', 'base')
df.head()

	date	event	base	Afterevent	event_bw	event_fw
0	2019-12-04	False	1	5	1.0	0.0
1	2019-11-29	True	1	0	1.0	1.0
2	2019-11-15	False	2	22	1.0	0.0
3	2019-10-24	True	2	0	1.0	1.0


def cont_cat_split(df, max_card=20, dep_var=None):
    "Split the column names of `df` (minus `dep_var`) into `(cont_names, cat_names)`."
    is_int, is_float = pd.api.types.is_integer_dtype, pd.api.types.is_float_dtype
    excluded = L(dep_var)
    cont_names, cat_names = [], []
    for label in df:
        if label in excluded: continue
        dtype = df[label].dtype
        # Floats are always continuous; integers only when they take more than `max_card`
        # distinct values. Everything else is treated as categorical.
        is_cont = (is_int(dtype) and df[label].unique().shape[0] > max_card) or is_float(dtype)
        target = cont_names if is_cont else cat_names
        target.append(label)
    return cont_names, cat_names

这个函数根据列值的基数来判断该列是连续的还是分类的。如果某个整数列的基数超过 max_card 参数，或者该列是 float 数据类型，它将被添加到 cont_names，否则添加到 cat_names。下面是一个示例：

# 使用简单numpy类型的示例
df = pd.DataFrame({'cat1': [1, 2, 3, 4], 'cont1': [1., 2., 3., 2.], 'cat2': ['a', 'b', 'b', 'a'],
                   'i8': pd.Series([1, 2, 3, 4], dtype='int8'),
                   'u8': pd.Series([1, 2, 3, 4], dtype='uint8'),
                   'f16': pd.Series([1, 2, 3, 4], dtype='float16'),
                   'y1': [1, 0, 1, 0], 'y2': [2, 1, 1, 0]})
cont_names, cat_names = cont_cat_split(df)


print(f'cont_names: {cont_names}\ncat_names: {cat_names}`')

cont_names: ['cont1', 'f16']
cat_names: ['cat1', 'cat2', 'i8', 'u8', 'y1', 'y2']`


# Test all columns
cont, cat = cont_cat_split(df)
test_eq((cont, cat), (['cont1', 'f16'], ['cat1', 'cat2', 'i8', 'u8', 'y1', 'y2']))

# Test excluding the dependent variable
cont, cat = cont_cat_split(df, dep_var='y1')
test_eq((cont, cat), (['cont1', 'f16'], ['cat1', 'cat2', 'i8', 'u8', 'y2']))

# Test excluding multi-label dependent variables
cont, cat = cont_cat_split(df, dep_var=['y1', 'y2'])
test_eq((cont, cat), (['cont1', 'f16'], ['cat1', 'cat2', 'i8', 'u8']))

# Test the maximal cardinality bound for integer variables
cont, cat = cont_cat_split(df, max_card=3)
test_eq((cont, cat), (['cat1', 'cont1', 'i8', 'u8', 'f16'], ['cat2', 'y1', 'y2']))
cont, cat = cont_cat_split(df, max_card=2)
test_eq((cont, cat), (['cat1', 'cont1', 'i8', 'u8', 'f16', 'y2'], ['cat2', 'y1']))
cont, cat = cont_cat_split(df, max_card=1)
test_eq((cont, cat), (['cat1', 'cont1', 'i8', 'u8', 'f16', 'y1', 'y2'], ['cat2']))

# 带有 pandas 类型和生成列的示例
df = pd.DataFrame({'cat1': pd.Series(['l','xs','xl','s'], dtype='category'),
                    'ui32': pd.Series([1, 2, 3, 4], dtype='UInt32'),
                    'i64': pd.Series([1, 2, 3, 4], dtype='Int64'),
                    'f16': pd.Series([1, 2, 3, 4], dtype='Float64'),
                    'd1_date': ['2021-02-09', None, '2020-05-12', '2020-08-14'],
                    })
df = add_datepart(df, 'd1_date', drop=False)
df['cat1'] = df['cat1'].cat.set_categories(['xl','l','m','s','xs'], ordered=True)
cont_names, cat_names = cont_cat_split(df, max_card=0)

/home/jhoward/miniconda3/lib/python3.8/site-packages/pandas/core/arrays/categorical.py:2630: FutureWarning: The `inplace` parameter in pandas.Categorical.set_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.
  res = method(*args, **kwargs)


print(f'cont_names: {cont_names}\ncat_names: {cat_names}')

cont_names: ['ui32', 'i64', 'f16', 'd1_Year', 'd1_Month', 'd1_Week', 'd1_Day', 'd1_Dayofweek', 'd1_Dayofyear', 'd1_Elapsed']
cat_names: ['cat1', 'd1_date', 'd1_Is_month_end', 'd1_Is_month_start', 'd1_Is_quarter_end', 'd1_Is_quarter_start', 'd1_Is_year_end', 'd1_Is_year_start']


cont, cat = cont_cat_split(df, max_card=0)
test_eq((cont, cat), (
    ['ui32', 'i64', 'f16', 'd1_Year', 'd1_Month', 'd1_Week', 'd1_Day', 'd1_Dayofweek', 'd1_Dayofyear', 'd1_Elapsed'],
    ['cat1', 'd1_date', 'd1_Is_month_end', 'd1_Is_month_start', 'd1_Is_quarter_end', 'd1_Is_quarter_start', 'd1_Is_year_end', 'd1_Is_year_start']
    ))


def df_shrink_dtypes(df, skip=[], obj2cat=True, int2uint=False):
    "Return any possible smaller data types for DataFrame columns. Allows `object`->`category`, `int`->`uint`, and exclusion."
    # Returns a `{column: new dtype}` dict holding only the columns whose dtype would change.
    # `skip` is only read (turned into a set), so the shared list default is never mutated.

    # 1: Build column filter and typemap
    excl_types, skip = {'category','datetime64[ns]','bool'}, set(skip)

    typemap = {'int'   : [(np.dtype(x), np.iinfo(x).min, np.iinfo(x).max) for x in (np.int8, np.int16, np.int32, np.int64)],
               'uint'  : [(np.dtype(x), np.iinfo(x).min, np.iinfo(x).max) for x in (np.uint8, np.uint16, np.uint32, np.uint64)],
               'float' : [(np.dtype(x), np.finfo(x).min, np.finfo(x).max) for x in (np.float32, np.float64, np.longdouble)]
              }
    if obj2cat: typemap['object'] = 'category'  # User wants to categorify dtype('Object'), which may not always save space
    else:       excl_types.add('object')

    new_dtypes = {}
    is_candidate = lambda dt: dt[1].name not in excl_types and dt[0] not in skip

    for c, old_t in filter(is_candidate, df.dtypes.items()):
        t = next((v for k,v in typemap.items() if old_t.name.startswith(k)), None)

        # Only plain numpy dtypes are resized; extension dtypes whose name happens to start with
        # 'int'/'float' (e.g. interval) are left alone instead of failing on the range comparison.
        if isinstance(t, list) and isinstance(old_t, np.dtype): # Find the smallest type that fits
            c_min, c_max = df[c].min(), df[c].max()
            if int2uint and t==typemap['int'] and c_min >= 0: t=typemap['uint']
            # Never pick a wider type than the current one (a float16 column must not become float32)
            new_t = next((r[0] for r in t
                          if r[1]<=c_min and r[2]>=c_max and r[0].itemsize<=old_t.itemsize), None)
            if new_t is not None and new_t == old_t: new_t = None
        else: new_t = t if isinstance(t, str) else None

        # Explicit `None` check: do not rely on the truthiness of dtype objects
        if new_t is not None: new_dtypes[c] = new_t
    return new_dtypes

show_doc(df_shrink_dtypes, title_level=3)

`df_shrink_dtypes`[source]

df_shrink_dtypes(df, skip=[], obj2cat=True, int2uint=False)

Return any possible smaller data types for DataFrame columns. Allows object->category, int->uint, and exclusion.

例如，我们将创建一个包含 int，float，bool 和 object 数据类型的示例 DataFrame：

df = pd.DataFrame({'i': [-100, 0, 100], 'f': [-100.0, 0.0, 100.0], 'e': [True, False, True],
                   'date':['2019-12-04','2019-11-29','2019-11-15',]})
df.dtypes

i         int64
f       float64
e          bool
date     object
dtype: object

我们可以调用 df_shrink_dtypes 来找到可以支持数据的最小可能数据类型：

dt = df_shrink_dtypes(df)
dt

{'i': dtype('int8'), 'f': dtype('float32'), 'date': 'category'}


# The original frame is untouched; only the proposed dtypes are returned
test_eq(df['i'].dtype, 'int64')
test_eq(dt['i'], 'int8')

test_eq(df['f'].dtype, 'float64')
test_eq(dt['f'], 'float32')

# By default 'object' columns are proposed as 'category' ('bool' columns are ignored)
test_eq(df['date'].dtype, 'object')
test_eq(dt['date'], 'category')

# Test that `obj2cat=False` leaves 'object' columns out of the result
dt2 = df_shrink_dtypes(df, obj2cat=False)
test_eq('date' not in dt2, True)


def df_shrink(df, skip=[], obj2cat=True, int2uint=False):
    "Return `df` cast to the smaller dtypes proposed by `df_shrink_dtypes()`, reducing its memory usage."
    # Columns without a proposed dtype are absent from the mapping and keep their current dtype
    return df.astype(df_shrink_dtypes(df, skip, obj2cat=obj2cat, int2uint=int2uint))

show_doc(df_shrink, title_level=3)

`df_shrink`[source]

df_shrink(df, skip=[], obj2cat=True, int2uint=False)

Reduce DataFrame memory usage, by casting to smaller types returned by df_shrink_dtypes().

df_shrink(df) 尝试通过将数值列调整为最小的数据类型来减少 DataFrame 的内存使用。此外：

boolean、category、datetime64[ns] 数据类型的列将被忽略。
‘object’ 类型的列将被转化为分类类型，这可以在大型数据集中节省大量内存。可以通过 obj2cat=False 来禁用此功能。
int2uint=True，如果列中的所有数据都大于或等于 0，则将 int 类型转换为 uint 类型。
可以通过 skip=['col1','col2'] 按名称排除某些列。

要获取新的列数据类型而不实际转换 DataFrame，可以使用 df_shrink_dtypes()，并使用与 df_shrink() 相同的所有参数。

df = pd.DataFrame({'i': [-100, 0, 100], 'f': [-100.0, 0.0, 100.0], 'u':[0, 10,254],
                  'date':['2019-12-04','2019-11-29','2019-11-15']})
df2 = df_shrink(df, skip=['date'])

让我们比较一下这两者：

df.dtypes

i         int64
f       float64
u         int64
date     object
dtype: object

df2.dtypes

i          int8
f       float32
u         int16
date     object
dtype: object

我们可以看到数据类型发生了变化，更进一步，我们可以查看它们相对的内存使用情况：


print(f'Initial Dataframe: {df.memory_usage().sum()} bytes')
print(f'Reduced Dataframe: {df2.memory_usage().sum()} bytes')

Initial Dataframe: 224 bytes
Reduced Dataframe: 173 bytes


# Each numeric column was narrowed; the skipped 'date' column kept its dtype
test_eq(df['i'].dtype=='int64' and df2['i'].dtype=='int8', True)
test_eq(df['f'].dtype=='float64' and df2['f'].dtype=='float32', True)
test_eq(df['u'].dtype=='int64' and df2['u'].dtype=='int16', True)
test_eq(df2['date'].dtype, 'object')

test_eq(df2.memory_usage().sum() < df.memory_usage().sum(), True)

# Test int => uint (when the column's min value is >= 0)
df3 = df_shrink(df, int2uint=True)
test_eq(df3['u'].dtype, 'uint8')  # int64 is cast to uint8 instead of int16

# Test excluding columns
df4 = df_shrink(df, skip=['i','u'])
test_eq(df['i'].dtype, df4['i'].dtype)
test_eq(df4['u'].dtype, 'int64')

这是另一个使用 ADULT_SAMPLE 数据集的示例：

path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
new_df = df_shrink(df, int2uint=True)


print(f'Initial Dataframe: {df.memory_usage().sum() / 1000000} megabytes')
print(f'Reduced Dataframe: {new_df.memory_usage().sum() / 1000000} megabytes')

Initial Dataframe: 3.907448 megabytes
Reduced Dataframe: 0.818329 megabytes

我们减少了整体内存使用量的79%!

表格 -


class _TabIloc:
    "Index rows by position and columns by name; the selection is wrapped with `to.new`"
    def __init__(self,to): self.to = to
    def __getitem__(self, idxs):
        frame = self.to.items
        # Rows only: keep every column
        if not isinstance(idxs,tuple): return self.to.new(frame.iloc[idxs, slice(None)])
        rows,cols = idxs
        # `iloc` is positional, so column names are translated first: a boolean mask for a
        # collection of names, whatever `get_loc` returns for a single name
        if is_listy(cols): col_sel = frame.columns.isin(cols)
        else: col_sel = frame.columns.get_loc(cols)
        return self.to.new(frame.iloc[rows, col_sel])


class Tabular(CollBase, GetAttr, FilteredBase):
    "A `DataFrame` wrapper that knows which cols are cont/cat/y, and returns rows in `__getitem__`"
    # NOTE(review): `_default` is presumably the attribute `GetAttr` forwards unknown lookups to
    # (the examples in this file access `to.vocab` / `to.classes`) — confirm against fastcore
    _default,with_cont='procs',True
    def __init__(self, df, procs=None, cat_names=None, cont_names=None, y_names=None, y_block=None, splits=None,
                 do_setup=True, device=None, inplace=False, reduce_memory=True):
        if inplace and splits is not None and pd.options.mode.chained_assignment is not None:
            warn("Using inplace with splits will trigger a pandas error. Set `pd.options.mode.chained_assignment=None` to avoid it.")
        # Work on a copy unless the caller explicitly asked to reuse its DataFrame
        if not inplace: df = df.copy()
        if reduce_memory: df = df_shrink(df)
        # Reorder rows so that the rows of `splits[0]` come first, followed by the other splits;
        # `self.split` (set below) then marks the boundary used by `subset`
        if splits is not None: df = df.iloc[sum(splits, [])]
        self.dataloaders = delegates(self._dl_type.__init__)(self.dataloaders)
        super().__init__(df)

        self.y_names,self.device = L(y_names),device
        if y_block is None and self.y_names:
            # Make ys categorical if they're not numeric
            ys = df[self.y_names]
            if len(ys.select_dtypes(include='number').columns)!=len(ys.columns): y_block = CategoryBlock()
            else: y_block = RegressionBlock()
        if y_block is not None and do_setup:
            # `y_block` may be given as a class/factory; its type transforms are appended to `procs`
            if callable(y_block): y_block = y_block()
            procs = L(procs) + y_block.type_tfms
        self.cat_names,self.cont_names,self.procs = L(cat_names),L(cont_names),Pipeline(procs)
        # Number of rows in the first split (all rows when there are no splits)
        self.split = len(df) if splits is None else len(splits[0])
        if do_setup: self.setup()

    # Wrap another DataFrame with the same procs / column names / device, without re-running
    # setup or memory reduction
    def new(self, df, inplace=False):
        return type(self)(df, do_setup=False, reduce_memory=False, y_block=TransformBlock(), inplace=inplace,
                          **attrdict(self, 'procs','cat_names','cont_names','y_names', 'device'))

    # Subset 0 is the first `self.split` rows, any other index selects the remaining rows
    def subset(self, i): return self.new(self.items[slice(0,self.split) if i==0 else slice(self.split,len(self))])
    # Replaces `self.items` with a copy and returns `self` (not a new `Tabular`)
    def copy(self): self.items = self.items.copy(); return self
    def decode(self): return self.procs.decode(self)
    # Decode a single row by wrapping it in a one-row DataFrame
    def decode_row(self, row): return self.new(pd.DataFrame(row).T).decode().items.iloc[0]
    # Display the first `max_n` rows, decoded
    def show(self, max_n=10, **kwargs): display_df(self.new(self.all_cols[:max_n]).decode().items)
    def setup(self): self.procs.setup(self)
    def process(self): self.procs(self)
    # The methods below are turned into properties by the `properties(...)` call after the class
    def loc(self): return self.items.loc
    def iloc(self): return _TabIloc(self)
    def targ(self): return self.items[self.y_names]
    def x_names (self): return self.cat_names + self.cont_names
    def n_subsets(self): return 2
    def y(self): return self[self.y_names[0]]
    # Same wrapper around a DataFrame that has the same columns but no rows
    def new_empty(self): return self.new(pd.DataFrame({}, columns=self.items.columns))
    # Only records the device on the object and returns `self`
    def to_device(self, d=None):
        self.device = d
        return self

    # x column names, followed by the y names only when every y column is present in `self.items`
    def all_col_names (self):
        ys = [n for n in self.y_names if n in self.items.columns]
        return self.x_names + self.y_names if len(ys) == len(self.y_names) else self.x_names

properties(Tabular,'loc','iloc','targ','all_col_names','n_subsets','x_names','y')

df: 你的数据的 DataFrame
cat_names: 你的类别 x 变量
cont_names: 你的连续 x 变量
y_names: 你的因变量 y
- 注意：目前不支持混合的 y，如回归和分类，但是支持多个回归或分类输出
y_block: 指定 y_names 的类型，即如何处理因变量（CategoryBlock 或 RegressionBlock）
splits: 如何拆分你的数据
do_setup: 一个参数，用于指示 Tabular 在初始化时是否将数据通过 procs 处理
device: cuda 或 cpu
inplace: 如果为 True，Tabular 将不会在内存中保留你原始 DataFrame 的单独副本。你应在设置之前确保 pd.options.mode.chained_assignment 为 None
reduce_memory: fastai 将尝试对输入的 DataFrame 使用 df_shrink，以减少整体内存使用量


class TabularPandas(Tabular):
    "A `Tabular` object with transforms"
    def transform(self, cols, f, all_col=True):
        "Replace `cols` with the result of `.transform(f)`; with `all_col=False`, columns missing from `self.items` are skipped."
        present = cols if all_col else [name for name in cols if name in self.items.columns]
        if len(present) == 0: return
        self[present] = self[present].transform(f)


def _add_prop(cls, nm):
    "Attach a read/write property `{nm}s` to `cls`, indexing the object with the names stored in `{nm}_names`."
    names_attr, prop_name = nm+'_names', nm+'s'
    # The getter indexes with a plain list of the names; the setter uses the stored names as they are
    def _get(o): return o[list(getattr(o, names_attr))]
    def _set(o, v): o[getattr(o, names_attr)] = v
    setattr(cls, prop_name, property(_get, _set))

_add_prop(Tabular, 'cat')
_add_prop(Tabular, 'cont')
_add_prop(Tabular, 'y')
_add_prop(Tabular, 'x')
_add_prop(Tabular, 'all_col')


df = pd.DataFrame({'a':[0,1,2,0,2], 'b':[0,0,0,0,1]})
to = TabularPandas(df, cat_names='a')
t = pickle.loads(pickle.dumps(to))
test_eq(t.items,to.items)
test_eq(to.all_cols,to[['a']])


import gc


def _count_objs(o):
    "Count the live, garbage-collector-tracked instances of class `o`"
    # Previously the argument was ignored and `pd.DataFrame` was always counted; honour `o` instead
    return sum(1 for x in gc.get_objects() if isinstance(x, o))

# Check that `new(..., inplace=True)` does not create an additional DataFrame object
df = pd.DataFrame({'a':[0,1,2,0,2], 'b':[0,0,0,0,1]})
df_b = pd.DataFrame({'a':[1,2,0,0,2], 'b':[1,0,3,0,1]})

to = TabularPandas(df, cat_names='a', inplace=True)

# Count the DataFrames alive before and after wrapping `df_b`; the counts must match
_init_count = _count_objs(pd.DataFrame)
to_new = to.new(df_b, inplace=True)
test_eq(_init_count, _count_objs(pd.DataFrame))

/home/jhoward/miniconda3/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:151: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead
  warnings.warn("torch.distributed.reduce_op is deprecated, please use "


class TabularProc(InplaceTransform):
    "Base class to write a non-lazy tabular processor for dataframes"
    def setup(self, items=None, train_setup=False): #TODO: properly deal with train_setup
        # State is computed from the training subset when `items` exposes one
        train_items = getattr(items, 'train', items)
        super().setup(train_items, train_setup=False)
        # Procs are applied as soon as data is available
        data = items.items if isinstance(items,Datasets) else items
        return self(data)

    @property
    def name(self):
        stored = getattr(self, '__stored_args__', {})
        return f"{super().name} -- {stored}"

这些转换将在数据可用时立即应用，而不是在从 DataLoader 调用数据时应用。


def _apply_cats (voc, add, c):
    "Encode `c` as integer category codes shifted by `add`"
    already_cat = hasattr(c, 'dtype') and isinstance(c.dtype, CategoricalDtype)
    # A categorical column is encoded with its own codes
    if already_cat: return c.cat.codes+add
    # Anything else is encoded against the stored vocabulary for `c.name`, skipping its first
    # `add` entries; values outside the vocabulary get pandas' code -1, i.e. `add-1` after the shift
    return pd.Categorical(c, categories=voc[c.name][add:]).codes+add
def _decode_cats(voc, c):
    "Map the integer codes in `c` back to the items of the vocabulary stored under `c.name`"
    idx2item = dict(enumerate(voc[c.name].items))
    return c.map(idx2item)


class Categorify(TabularProc):
    "Transform the categorical variables to something similar to `pd.Categorical`"
    order = 1
    def setups(self, to):
        # Build one `CategoryMap` per categorical column and keep them in `self.classes`
        vocabs = {}
        for n in to.cat_names:
            vocabs[n] = CategoryMap(to.iloc[:,n].items, add_na=(n in to.cat_names))
        store_attr(classes=vocabs, but='to')

    def encodes(self, to):
        # Codes are shifted by 1 relative to the vocabulary entries after its first one
        encoder = partial(_apply_cats, self.classes, 1)
        to.transform(to.cat_names, encoder)
    def decodes(self, to):
        decoder = partial(_decode_cats, self.classes)
        to.transform(to.cat_names, decoder)
    def __getitem__(self,k): return self.classes[k]


@Categorize
def setups(self, to:Tabular):
    "Build (or normalise) the target vocabulary from the first y column, then encode `to`"
    if len(to.y_names) > 0:
        if self.vocab is not None:
            # A vocabulary was supplied: keep its order
            self.vocab = CategoryMap(self.vocab, sort=False, add_na=self.add_na)
        else:
            # Learn the vocabulary from the training subset when `to` exposes one
            source = getattr(to, 'train', to)
            self.vocab = CategoryMap(source.iloc[:,to.y_names[0]].items, strict=True)
        self.c = len(self.vocab)
    return self(to)

@Categorize
def encodes(self, to:Tabular):
    "Replace the y columns present in `to` with their integer codes in `self.vocab`"
    vocab_by_col = {n: self.vocab for n in to.y_names}
    encoder = partial(_apply_cats, vocab_by_col, 0)
    to.transform(to.y_names, encoder, all_col=False)
    return to

@Categorize
def decodes(self, to:Tabular):
    "Map the integer codes of the y columns present in `to` back to the items of `self.vocab`"
    vocab_by_col = {n: self.vocab for n in to.y_names}
    decoder = partial(_decode_cats, vocab_by_col)
    to.transform(to.y_names, decoder, all_col=False)
    return to

show_doc(Categorify, title_level=3)

`class` `Categorify`[source]

Categorify(enc=None, dec=None, split_idx=None, order=None) :: TabularProc

Transform the categorical variables to something similar to pd.Categorical

虽然在 DataFrame 中从视觉上看不会看到变化，但类别是存储在 to.procs.categorify 中的，正如下方的一个虚拟 DataFrame 所示：

df = pd.DataFrame({'a':[0,1,2,0,2]})
to = TabularPandas(df, Categorify, 'a')
to.show()

	a
0	0
1	1
2	2
3	0
4	2

每列的唯一值存储在一个字典中，格式为 column:[values]：

cat = to.procs.categorify
cat.classes

{'a': ['#na#', 0, 1, 2]}


def test_series(a,b):
    "Check that iterating over `a` yields exactly the items of `b`"
    as_list = list(a)
    return test_eq(as_list, b)
test_series(cat['a'], ['#na#',0,1,2])
test_series(to['a'], [1,2,3,1,3])


df1 = pd.DataFrame({'a':[1,0,3,-1,2]})
to1 = to.new(df1)
to1.process()
#Values that weren't in the training df are sent to 0 (na)
test_series(to1['a'], [2,1,0,0,3])
to2 = cat.decode(to1)
test_series(to2['a'], [1,0,'#na#','#na#',2])


#Test with splits: the vocabulary comes from the training rows only, so the value 3 (validation only) is encoded as 0
cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,2]})
to = TabularPandas(df, cat, 'a', splits=[[0,1,2],[3,4]])
test_series(cat['a'], ['#na#',0,1,2])
test_series(to['a'], [1,2,3,0,3])


df = pd.DataFrame({'a':pd.Categorical(['M','H','L','M'], categories=['H','M','L'], ordered=True)})
to = TabularPandas(df, Categorify, 'a')
cat = to.procs.categorify
test_series(cat['a'], ['#na#','H','M','L'])
test_series(to.items.a, [2,1,3,2])
to2 = cat.decode(to)
test_series(to2['a'], ['M','H','L','M'])


#测试目标
cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,2], 'b': ['a', 'b', 'a', 'b', 'b']})
to = TabularPandas(df, cat, 'a', splits=[[0,1,2],[3,4]], y_names='b')
test_series(to.vocab, ['a', 'b'])
test_series(to['b'], [0,1,0,1,1])
to2 = to.procs.decode(to)
test_series(to2['b'], ['a', 'b', 'a', 'b', 'b'])


cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,2], 'b': ['a', 'b', 'a', 'b', 'b']})
to = TabularPandas(df, cat, 'a', splits=[[0,1,2],[3,4]], y_names='b')
test_series(to.vocab, ['a', 'b'])
test_series(to['b'], [0,1,0,1,1])
to2 = to.procs.decode(to)
test_series(to2['b'], ['a', 'b', 'a', 'b', 'b'])


#Test with targets and train: 'c' only appears in the validation rows, so it is not part of the vocab
cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,2], 'b': ['a', 'b', 'a', 'c', 'b']})
to = TabularPandas(df, cat, 'a', splits=[[0,1,2],[3,4]], y_names='b')
test_series(to.vocab, ['a', 'b'])


#Test to ensure no copies of the dataframe are stored on the proc
cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,4]})
to = TabularPandas(df, cat, cont_names='a', splits=[[0,1,2],[3,4]])
test_eq(hasattr(to.categorify, 'to'), False)


@Normalize
def setups(self, to:Tabular):
    "Store per-column means and stds of the continuous columns, then normalise `to`"
    # Statistics come from the training subset when `to` exposes one
    train_conts = getattr(to, 'train', to).conts
    col_means = dict(train_conts.mean())
    # Population std (ddof=0) plus a small epsilon so the later division never hits zero
    col_stds = dict(train_conts.std(ddof=0)+1e-7)
    store_attr(but='to', means=col_means, stds=col_stds)
    return self(to)

@Normalize
def encodes(self, to:Tabular):
    "Standardise the continuous columns with the stored means and stds"
    centered = to.conts-self.means
    to.conts = centered / self.stds
    return to

@Normalize
def decodes(self, to:Tabular):
    "Undo the standardisation of the continuous columns"
    rescaled = to.conts*self.stds
    to.conts = rescaled + self.means
    return to


norm = Normalize()
df = pd.DataFrame({'a':[0,1,2,3,4]})
to = TabularPandas(df, norm, cont_names='a')
x = np.array([0,1,2,3,4])
m,s = x.mean(),x.std()
test_eq(norm.means['a'], m)
test_close(norm.stds['a'], s)
test_close(to['a'].values, (x-m)/s)


df1 = pd.DataFrame({'a':[5,6,7]})
to1 = to.new(df1)
to1.process()
test_close(to1['a'].values, (np.array([5,6,7])-m)/s)
to2 = norm.decode(to1)
test_close(to2['a'].values, [5,6,7])


norm = Normalize()
df = pd.DataFrame({'a':[0,1,2,3,4]})
to = TabularPandas(df, norm, cont_names='a', splits=[[0,1,2],[3,4]])
x = np.array([0,1,2])
m,s = x.mean(),x.std()
test_eq(norm.means['a'], m)
test_close(norm.stds['a'], s)
test_close(to['a'].values, (np.array([0,1,2,3,4])-m)/s)


norm = Normalize()
df = pd.DataFrame({'a':[0,1,2,3,4]})
to = TabularPandas(df, norm, cont_names='a', splits=[[0,1,2],[3,4]])
test_eq(hasattr(to.procs.normalize, 'to'), False)


class FillStrategy:
    "Namespace containing the various filling strategies."
    # Each strategy takes the column `c` and a user-supplied `fill` value and returns the
    # replacement for missing entries. They are plain functions, called through the class.
    def median  (c,fill):
        "Median of `c`; `fill` is ignored"
        return c.median()
    def constant(c,fill):
        "The supplied `fill` value; `c` is ignored"
        return fill
    def mode    (c,fill):
        "Most frequent non-missing value of `c`; `fill` is ignored"
        counts = c.dropna().value_counts()
        return counts.idxmax()

目前，支持使用中位数、常数和众数进行填充。


class FillMissing(TabularProc):
    "Fill the missing values in continuous columns."
    def __init__(self, fill_strategy=FillStrategy.median, add_col=True, fill_vals=None):
        # `fill_vals` maps column name -> constant handed to the strategy; unknown columns default to 0
        if fill_vals is None: fill_vals = defaultdict(int)
        store_attr()

    def setups(self, to):
        # Compute one fill value per continuous column that has missing entries in `to`
        missing = pd.isnull(to.conts).any()
        store_attr(but='to', na_dict={n:self.fill_strategy(to[n], self.fill_vals[n])
                            for n in missing[missing].keys()})
        # Keep only the strategy's name once the fill values are known
        self.fill_strategy = self.fill_strategy.__name__

    def encodes(self, to):
        missing = pd.isnull(to.conts)
        has_na = missing.any()
        for n in has_na[has_na].keys():
            assert n in self.na_dict, f"nan values in `{n}` but not in setup training set"
        for n in self.na_dict.keys():
            # Assign the filled column back instead of calling `fillna(..., inplace=True)` on the
            # selected column: under pandas Copy-on-Write the chained in-place call only updates
            # a temporary and leaves the DataFrame untouched.
            to[n] = to[n].fillna(self.na_dict[n])
            if self.add_col:
                # Record which rows were missing in a new `<col>_na` categorical column
                to.loc[:,n+'_na'] = missing[n]
                if n+'_na' not in to.cat_names: to.cat_names.append(n+'_na')

show_doc(FillMissing, title_level=3)

`class` `FillMissing`[source]

FillMissing(fill_strategy=median, add_col=True, fill_vals=None) :: TabularProc

Fill the missing values in continuous columns.


fill1,fill2,fill3 = (FillMissing(fill_strategy=s)
                     for s in [FillStrategy.median, FillStrategy.constant, FillStrategy.mode])
df = pd.DataFrame({'a':[0,1,np.nan,1,2,3,4]})
df1 = df.copy(); df2 = df.copy()
tos = (TabularPandas(df, fill1, cont_names='a'),
       TabularPandas(df1, fill2, cont_names='a'),
       TabularPandas(df2, fill3, cont_names='a'))
test_eq(fill1.na_dict, {'a': 1.5})
test_eq(fill2.na_dict, {'a': 0})
test_eq(fill3.na_dict, {'a': 1.0})

for t in tos: test_eq(t.cat_names, ['a_na'])

for to_,v in zip(tos, [1.5, 0., 1.]):
    test_eq(to_['a'].values, np.array([0, 1, v, 1, 2, 3, 4]))
    test_eq(to_['a_na'].values, np.array([0, 0, 1, 0, 0, 0, 0]))


fill = FillMissing()
df = pd.DataFrame({'a':[0,1,np.nan,1,2,3,4], 'b': [0,1,2,3,4,5,6]})
to = TabularPandas(df, fill, cont_names=['a', 'b'])
test_eq(fill.na_dict, {'a': 1.5})
test_eq(to.cat_names, ['a_na'])
test_eq(to['a'].values, np.array([0, 1, 1.5, 1, 2, 3, 4]))
test_eq(to['a_na'].values, np.array([0, 0, 1, 0, 0, 0, 0]))
test_eq(to['b'].values, np.array([0,1,2,3,4,5,6]))


fill = FillMissing()
df = pd.DataFrame({'a':[0,1,np.nan,1,2,3,4], 'b': [0,1,2,3,4,5,6]})
to = TabularPandas(df, fill, cont_names=['a', 'b'])
test_eq(hasattr(to.procs.fill_missing, 'to'), False)

TabularPandas 管道 -


procs = [Normalize, Categorify, FillMissing, noop]
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4]})
to = TabularPandas(df, procs, cat_names='a', cont_names='b')

#Test setup and apply on df_main
test_series(to.cat_names, ['a', 'b_na'])
test_series(to['a'], [1,2,3,2,2,3,1])
test_series(to['b_na'], [1,1,2,1,1,1,1])
x = np.array([0,1,1.5,1,2,3,4])
m,s = x.mean(),x.std()
test_close(to['b'].values, (x-m)/s)
test_eq(to.classes, {'a': ['#na#',0,1,2], 'b_na': ['#na#',False,True]})


#Test apply on y_names
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4], 'c': ['b','a','b','a','a','b','a']})
to = TabularPandas(df, procs, 'a', 'b', y_names='c')

test_series(to.cat_names, ['a', 'b_na'])
test_series(to['a'], [1,2,3,2,2,3,1])
test_series(to['b_na'], [1,1,2,1,1,1,1])
test_series(to['c'], [1,0,1,0,0,1,0])
x = np.array([0,1,1.5,1,2,3,4])
m,s = x.mean(),x.std()
test_close(to['b'].values, (x-m)/s)
test_eq(to.classes, {'a': ['#na#',0,1,2], 'b_na': ['#na#',False,True]})
test_eq(to.vocab, ['a','b'])


df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4], 'c': ['b','a','b','a','a','b','a']})
to = TabularPandas(df, procs, 'a', 'b', y_names='c')

test_series(to.cat_names, ['a', 'b_na'])
test_series(to['a'], [1,2,3,2,2,3,1])
test_eq(df.a.dtype, np.int64 if sys.platform == "win32" else int)
test_series(to['b_na'], [1,1,2,1,1,1,1])
test_series(to['c'], [1,0,1,0,0,1,0])


df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,np.nan,1,1,2,3,4], 'c': ['b','a','b','a','a','b','a']})
to = TabularPandas(df, procs, cat_names='a', cont_names='b', y_names='c', splits=[[0,1,4,6], [2,3,5]])

test_series(to.cat_names, ['a', 'b_na'])
test_series(to['a'], [1,2,2,1,0,2,0])
test_eq(df.a.dtype, np.int64 if sys.platform == "win32" else int)
test_series(to['b_na'], [1,2,1,1,1,1,1])
test_series(to['c'], [1,0,0,0,1,0,1])


def _maybe_expand(o):
    "Return `o` with a trailing axis added when it is 1-dimensional, otherwise `o` itself"
    if o.ndim != 1: return o
    return o[:,None]


class ReadTabBatch(ItemTransform):
    "Transform `TabularPandas` values into a `Tensor` with the ability to decode"
    # Only an empty copy is kept: it carries the column names and procs needed for decoding
    def __init__(self, to): self.to = to.new_empty()

    def encodes(self, to):
        # Batch layout: (cats,) or (cats, conts), plus the targets when every y column is present
        if not to.with_cont: res = (tensor(to.cats).long(),)
        else: res = (tensor(to.cats).long(),tensor(to.conts).float())
        ys = [n for n in to.y_names if n in to.items.columns]
        if len(ys) == len(to.y_names): res = res + (tensor(to.targ),)
        if to.device is not None: res = to_device(res, to.device)
        return res

    def decodes(self, o):
        # Drop empty parts, make every part 2-d and put them side by side as columns
        o = [_maybe_expand(o_) for o_ in to_np(o) if o_.size != 0]
        vals = np.concatenate(o, axis=1)
        # The batch may or may not contain the targets: try x+y column names first and fall back
        # to x only. `except Exception` (not a bare `except`) keeps the fallback without swallowing
        # `KeyboardInterrupt` / `SystemExit`.
        try: df = pd.DataFrame(vals, columns=self.to.all_col_names)
        except Exception: df = pd.DataFrame(vals, columns=self.to.x_names)
        to = self.to.new(df)
        return to


@typedispatch
def show_batch(x: Tabular, y, its, max_n=10, ctxs=None):
    "Show a batch of tabular data by delegating to `x.show()`"
    # NOTE(review): `y`, `its`, `max_n` and `ctxs` are accepted but unused; `x.show()` runs with
    # its own defaults, so `max_n` is not forwarded — confirm whether that is intended
    x.show()


@delegates()
class TabDataLoader(TfmdDL):
    "A transformed `DataLoader` for Tabular data"
    def __init__(self, dataset, bs=16, shuffle=False, after_batch=None, num_workers=0, **kwargs):
        # Default batch pipeline: the stock `TransformBlock` batch transforms followed by
        # `ReadTabBatch`, which turns the rows into tensors
        if after_batch is None:
            default_tfms = L(TransformBlock().batch_tfms)
            after_batch = default_tfms+ReadTabBatch(dataset)
        super().__init__(dataset, bs=bs, shuffle=shuffle, after_batch=after_batch, num_workers=num_workers, **kwargs)

    # Items and batches are fetched straight from the dataset by position; a falsy index maps to row 0
    def create_item(self, s):  return self.dataset.iloc[s or 0]
    def create_batch(self, b): return self.dataset.iloc[b]
    def do_item(self, s):
        if s is None: return 0
        return s

TabularPandas._dl_type = TabDataLoader


@delegates()
class TabWeightedDL(TabDataLoader):
    "A transformed `DataLoader` for Tabular Weighted data"
    def __init__(self, dataset, bs=16, wgts=None, shuffle=False, after_batch=None, num_workers=0, **kwargs):
        # Without explicit weights every row gets the same weight; weights are normalised to sum to 1
        if wgts is None: wgts = [1.]*len(dataset)
        wgts = np.array(wgts)
        self.wgts = wgts / wgts.sum()
        super().__init__(dataset, bs=bs, shuffle=shuffle, after_batch=after_batch, num_workers=num_workers, **kwargs)
        self.idxs = self.get_idxs()

    def get_idxs(self):
        if self.n == 0: return []
        # When shuffling, draw `n` indices with replacement according to the weights
        if self.shuffle: return list(np.random.choice(self.n, self.n, p=self.wgts))
        return super().get_idxs()

TabularPandas._dl_type = TabWeightedDL

集成示例

有关更深入的解释，请参见表格教程

# Resolve the ADULT_SAMPLE dataset location with `untar_data` and load `adult.csv` from it.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
# The first 10,000 rows become the main frame and the remainder the test frame (both are copies of `df`).
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
# The test frame is left without the `salary` column.
df_test.drop('salary', axis=1, inplace=True)
df_main.head()

	age	workclass	fnlwgt	education	education-num	marital-status	occupation	relationship	race	sex	capital-gain	capital-loss	hours-per-week	native-country	salary
0	49	Private	101320	Assoc-acdm	12.0	Married-civ-spouse	NaN	Wife	White	Female	0	1902	40	United-States	>=50k
1	44	Private	236746	Masters	14.0	Divorced	Exec-managerial	Not-in-family	White	Male	10520	0	45	United-States	>=50k
2	38	Private	96185	HS-grad	NaN	Divorced	NaN	Unmarried	Black	Female	0	0	32	United-States	<50k
3	38	Self-emp-inc	112847	Prof-school	15.0	Married-civ-spouse	Prof-specialty	Husband	Asian-Pac-Islander	Male	0	0	40	United-States	>=50k
4	42	Self-emp-not-inc	82297	7th-8th	NaN	Married-civ-spouse	Other-service	Wife	Black	Female	0	0	50	United-States	<50k

# Column names handled as categorical and as continuous, and the preprocessing steps to apply.
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
# Split the row positions of `df_main` with `RandomSplitter`.
splits = RandomSplitter()(range_of(df_main))

# `salary` is the single target column.
to = TabularPandas(df_main, procs, cat_names, cont_names, y_names="salary", splits=splits)

# Build the loaders and display a batch from the validation one.
dls = to.dataloaders()
dls.valid.show_batch()

	workclass	education	marital-status	occupation	relationship	race	education-num_na	age	fnlwgt	education-num	salary
0	Self-emp-not-inc	Prof-school	Divorced	Prof-specialty	Not-in-family	White	False	65.000000	316093.005287	15.0	<50k
1	Private	Bachelors	Married-civ-spouse	Exec-managerial	Husband	White	False	69.999999	280306.998091	13.0	<50k
2	Federal-gov	Some-college	Married-civ-spouse	Adm-clerical	Husband	Black	False	34.000000	199933.999862	10.0	>=50k
3	Private	HS-grad	Never-married	Handlers-cleaners	Unmarried	White	False	24.000001	300584.002430	9.0	<50k
4	Private	Assoc-voc	Never-married	Other-service	Not-in-family	White	False	34.000000	220630.999335	11.0	<50k
5	Private	Bachelors	Divorced	Prof-specialty	Unmarried	White	False	45.000000	289230.003178	13.0	>=50k
6	?	Some-college	Never-married	?	Own-child	White	False	26.000000	208993.999494	10.0	<50k
7	Private	Some-college	Divorced	Adm-clerical	Not-in-family	White	False	43.000000	174574.999446	10.0	<50k
8	Self-emp-not-inc	Assoc-voc	Married-civ-spouse	Other-service	Husband	White	False	63.000000	420628.997361	11.0	<50k
9	State-gov	Some-college	Married-civ-spouse	Adm-clerical	Husband	Black	False	25.000000	257064.003065	10.0	<50k

to.show()

	workclass	education	marital-status	occupation	relationship	race	education-num_na	age	fnlwgt	education-num	salary
5516	Private	HS-grad	Divorced	Exec-managerial	Unmarried	White	False	49.0	140121.0	9.0	<50k
7184	Self-emp-inc	Some-college	Never-married	Exec-managerial	Not-in-family	White	False	70.0	207938.0	10.0	<50k
2336	Private	Some-college	Never-married	Priv-house-serv	Own-child	White	False	23.0	50953.0	10.0	<50k
4342	Private	Assoc-voc	Married-civ-spouse	Machine-op-inspct	Husband	White	False	46.0	27802.0	11.0	<50k
8474	Self-emp-not-inc	Assoc-acdm	Married-civ-spouse	Craft-repair	Husband	White	False	47.0	107231.0	12.0	<50k
5948	Local-gov	HS-grad	Married-civ-spouse	Transport-moving	Husband	White	False	40.0	55363.0	9.0	<50k
5342	Local-gov	HS-grad	Married-civ-spouse	Craft-repair	Husband	White	False	46.0	36228.0	9.0	<50k
9005	Private	Bachelors	Married-civ-spouse	Adm-clerical	Husband	White	False	38.0	297449.0	13.0	>=50k
1189	Private	Assoc-voc	Divorced	Sales	Not-in-family	Amer-Indian-Eskimo	False	31.0	87950.0	11.0	<50k
8784	Private	Assoc-voc	Divorced	Prof-specialty	Own-child	Black	False	35.0	491000.0	11.0	<50k

我们可以通过调用 to.decode_row 并传入我们的原始数据来解码任何一组转换后的数据：

row = to.items.iloc[0]
to.decode_row(row)

age                             49.0
workclass                    Private
fnlwgt                      140121.0
education                    HS-grad
education-num                    9.0
marital-status              Divorced
occupation           Exec-managerial
relationship               Unmarried
race                           White
sex                             Male
capital-gain                       0
capital-loss                       0
hours-per-week                    50
native-country         United-States
salary                          <50k
education-num_na               False
Name: 5516, dtype: object

我们可以基于训练数据使用 to.new() 创建新的测试数据集。

Note

由于机器学习模型无法神奇地理解它从未训练过的类别，因此数据应该反映这一点。如果您的测试数据中存在不同的缺失值，您应该在训练之前解决这个问题。

# Create a new object for the test frame from `to`, then run `process()` on it.
to_tst = to.new(df_test)
to_tst.process()
to_tst.items.head()

	age	workclass	fnlwgt	education	education-num	marital-status	occupation	relationship	race	sex	hours-per-week	native-country	education-num_na
10000	0.465031	5	1.319553	10	1.176677	3	2	1	2	Male	40	Philippines	1
10001	-0.926675	5	1.233650	12	-0.420035	3	15	1	4	Male	40	United-States	1
10002	1.051012	5	0.145161	2	-1.218391	1	9	2	5	Female	37	United-States	1
10003	0.538279	5	-0.282370	12	-0.420035	7	2	5	5	Female	43	United-States	1
10004	0.758022	6	1.420768	9	0.378321	3	5	1	5	Male	60	United-States	1

我们可以将其转换为 DataLoader：

# Derive a loader for the processed test data from the validation loader and display a batch.
tst_dl = dls.valid.new(to_tst)
tst_dl.show_batch()

	workclass	education	marital-status	occupation	relationship	race	education-num_na	age	fnlwgt	education-num
0	Private	Bachelors	Married-civ-spouse	Adm-clerical	Husband	Asian-Pac-Islander	False	45.000000	338105.005817	13.0
1	Private	HS-grad	Married-civ-spouse	Transport-moving	Husband	Other	False	26.000000	328663.002806	9.0
2	Private	11th	Divorced	Other-service	Not-in-family	White	False	52.999999	209022.000317	7.0
3	Private	HS-grad	Widowed	Adm-clerical	Unmarried	White	False	46.000000	162029.998917	9.0
4	Self-emp-inc	Assoc-voc	Married-civ-spouse	Exec-managerial	Husband	White	False	49.000000	349230.006300	11.0
5	Local-gov	Some-college	Married-civ-spouse	Exec-managerial	Husband	White	False	34.000000	124827.002059	10.0
6	Self-emp-inc	Some-college	Married-civ-spouse	Sales	Husband	White	False	52.999999	290640.002462	10.0
7	Private	Some-college	Never-married	Sales	Own-child	White	False	19.000000	106272.998239	10.0
8	Private	Some-college	Married-civ-spouse	Protective-serv	Husband	Black	False	71.999999	53684.001668	10.0
9	Private	Some-college	Never-married	Sales	Own-child	White	False	20.000000	505980.010609	10.0

# Create a TabWeightedDL
train_ds = to.train
# One random weight per training row; `TabWeightedDL` rescales them so that they sum to 1.
weights = np.random.random(len(train_ds))
train_dl = TabWeightedDL(train_ds, wgts=weights, bs=64, shuffle=True)

train_dl.show_batch()

	workclass	education	marital-status	occupation	relationship	race	education-num_na	age	fnlwgt	education-num	salary
0	Local-gov	Masters	Never-married	Prof-specialty	Not-in-family	White	False	31.000000	204469.999932	14.0	<50k
1	Self-emp-not-inc	HS-grad	Divorced	Farming-fishing	Not-in-family	White	False	32.000000	34572.002104	9.0	<50k
2	?	Some-college	Widowed	?	Not-in-family	White	False	64.000000	34099.998990	10.0	<50k
3	Private	Some-college	Divorced	Exec-managerial	Not-in-family	White	False	32.000000	251242.999189	10.0	>=50k
4	Federal-gov	HS-grad	Married-civ-spouse	Exec-managerial	Husband	White	False	55.000001	176903.999313	9.0	<50k
5	Private	11th	Married-civ-spouse	Transport-moving	Husband	White	False	50.000000	192203.000000	7.0	<50k
6	Private	10th	Never-married	Farming-fishing	Own-child	Black	False	36.000000	181720.999704	6.0	<50k
7	Local-gov	Masters	Divorced	Prof-specialty	Not-in-family	Amer-Indian-Eskimo	False	50.000000	220640.001490	14.0	>=50k
8	Private	HS-grad	Married-civ-spouse	Adm-clerical	Wife	White	False	36.000000	189381.999993	9.0	>=50k
9	Private	Masters	Divorced	Prof-specialty	Unmarried	White	False	42.000000	265697.997341	14.0	<50k


batch = next(iter(train_dl))

# `x` and `y` are the two dimensions of the first tensor in the batch, not inputs/targets.
x, y = batch[0].shape

# 64 is the `bs` given to the loader; 7 is the number of categorical columns in the batch shown above.
test_eq(x, 64)
test_eq(y, 7)

# The loader must expose the normalised weights it was built with.
assert hasattr(train_dl, 'wgts'), "Weights attribute missing in DataLoader"

TabDataLoader的create_item方法

# Minimal case: a one-row, one-column frame, with no procs and no categorical/continuous names given.
df = pd.DataFrame([{'age': 35}])
to = TabularPandas(df)
dls = to.dataloaders()
# NOTE(review): `create_item` is looked up on `dls` itself — presumably forwarded to the training loader; confirm.
print(dls.create_item(0))
# test_eq(dls.create_item(0).items.to_dict(), {'age': 0.5330614747286777, 'workclass': 5, 'fnlwgt': -0.26305443080666174, 'education': 10, 'education-num': 1.169790230219763, 'marital-status': 1, 'occupation': 13, 'relationship': 5, 'race': 3, 'sex': ' Female', 'capital-gain': 0, 'capital-loss': 0, 'hours-per-week': 35, 'native-country': 'United-States', 'salary': 1, 'education-num_na': 1})

age    35
Name: 0, dtype: int8

其他目标类型

多标签类别

独热编码标签

def _mock_multi_label(df):
    "Overwrite `salary` and add `male`/`white` with per-row booleans; `df` is modified in place and returned."
    rows = list(df.itertuples())
    high_salary = [r.salary == '>=50k' for r in rows]
    is_male     = [r.sex == ' Male' for r in rows]
    is_white    = [r.race == ' White' for r in rows]
    # All three flag lists are computed before `salary` is overwritten.
    df['salary'] = np.array(high_salary)
    df['male']   = np.array(is_male)
    df['white']  = np.array(is_white)
    return df

# Reload the ADULT_SAMPLE csv and split it at row 10,000 into main and test copies.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
# Replace `salary` with a boolean and add the boolean `male` and `white` columns.
df_main = _mock_multi_label(df_main)

df_main.head()

	age	workclass	fnlwgt	education	education-num	marital-status	occupation	relationship	race	sex	capital-gain	capital-loss	hours-per-week	native-country	salary	male	white
0	49	Private	101320	Assoc-acdm	12.0	Married-civ-spouse	NaN	Wife	White	Female	0	1902	40	United-States	True	False	True
1	44	Private	236746	Masters	14.0	Divorced	Exec-managerial	Not-in-family	White	Male	10520	0	45	United-States	True	True	True
2	38	Private	96185	HS-grad	NaN	Divorced	NaN	Unmarried	Black	Female	0	0	32	United-States	False	False	False
3	38	Self-emp-inc	112847	Prof-school	15.0	Married-civ-spouse	Prof-specialty	Husband	Asian-Pac-Islander	Male	0	0	40	United-States	True	True	False
4	42	Self-emp-not-inc	82297	7th-8th	NaN	Married-civ-spouse	Other-service	Wife	Black	Female	0	0	50	United-States	False	False	False


@EncodedMultiCategorize
def setups(self, to:Tabular):
    "Store the vocab size in `self.c`, then return the result of applying the transform to `to`."
    self.c = len(self.vocab)
    return self(to)

@EncodedMultiCategorize
def encodes(self, to:Tabular):
    "Targets are passed through unchanged."
    return to

@EncodedMultiCategorize
def decodes(self, to:Tabular):
    "Turn each target column into booleans (`value == 1`) and return `to`."
    to.transform(to.y_names, lambda col: col==1)
    return to

# Same feature columns and preprocessing steps as in the single-label example.
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))
# Three boolean target columns; the same list is also passed as the block's vocab below.
y_names=["salary", "male", "white"]

%time to = TabularPandas(df_main, procs, cat_names, cont_names, y_names=y_names, y_block=MultiCategoryBlock(encoded=True, vocab=y_names), splits=splits)

CPU times: user 66 ms, sys: 0 ns, total: 66 ms
Wall time: 65.3 ms

dls = to.dataloaders()
dls.valid.show_batch()

	workclass	education	marital-status	occupation	relationship	race	education-num_na	age	fnlwgt	education-num	salary	male	white
0	Private	HS-grad	Divorced	Exec-managerial	Unmarried	White	False	47.000000	164423.000013	9.0	False	False	True
1	Private	Some-college	Married-civ-spouse	Transport-moving	Husband	White	False	74.999999	239037.999499	10.0	False	True	True
2	Private	HS-grad	Married-civ-spouse	Sales	Wife	White	False	45.000000	228570.000761	9.0	False	False	True
3	Self-emp-not-inc	HS-grad	Married-civ-spouse	Exec-managerial	Husband	Asian-Pac-Islander	False	45.000000	285574.998753	9.0	False	True	False
4	Private	Some-college	Never-married	Adm-clerical	Own-child	White	False	21.999999	184812.999966	10.0	False	True	True
5	Private	10th	Married-civ-spouse	Transport-moving	Husband	White	False	67.000001	274450.998865	6.0	False	True	True
6	Private	HS-grad	Divorced	Exec-managerial	Unmarried	White	False	53.999999	192862.000000	9.0	False	False	True
7	Federal-gov	Some-college	Divorced	Tech-support	Unmarried	Amer-Indian-Eskimo	False	37.000000	33486.997455	10.0	False	False	False
8	Private	HS-grad	Never-married	Machine-op-inspct	Other-relative	White	False	30.000000	219318.000010	9.0	False	False	True
9	Self-emp-not-inc	Bachelors	Married-civ-spouse	Sales	Husband	White	False	44.000000	167279.999960	13.0	False	True	True

非独热编码

def _mock_multi_label(df):
    "Add a `target` column with the space-separated labels ('>50k', 'male', 'white') that apply to each row; returns `df`."
    def _labels(row):
        # (condition, label) pairs, kept in the order the labels must appear in the joined string.
        checks = ((row.salary == '>=50k', '>50k'),
                  (row.sex == ' Male', 'male'),
                  (row.race == ' White', 'white'))
        return ' '.join(label for hit, label in checks if hit)
    df['target'] = np.array([_labels(row) for row in df.itertuples()])
    return df

# Reload the ADULT_SAMPLE csv and split it at row 10,000 into main and test copies.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
# Add the space-separated `target` label column.
df_main = _mock_multi_label(df_main)

df_main.head()

	age	workclass	fnlwgt	education	education-num	marital-status	occupation	relationship	race	sex	capital-gain	capital-loss	hours-per-week	native-country	salary	target
0	49	Private	101320	Assoc-acdm	12.0	Married-civ-spouse	NaN	Wife	White	Female	0	1902	40	United-States	>=50k	>50k white
1	44	Private	236746	Masters	14.0	Divorced	Exec-managerial	Not-in-family	White	Male	10520	0	45	United-States	>=50k	>50k male white
2	38	Private	96185	HS-grad	NaN	Divorced	NaN	Unmarried	Black	Female	0	0	32	United-States	<50k
3	38	Self-emp-inc	112847	Prof-school	15.0	Married-civ-spouse	Prof-specialty	Husband	Asian-Pac-Islander	Male	0	0	40	United-States	>=50k	>50k male
4	42	Self-emp-not-inc	82297	7th-8th	NaN	Married-civ-spouse	Other-service	Wife	Black	Female	0	0	50	United-States	<50k

@MultiCategorize
def encodes(self, to:Tabular):
    "Return `to` unchanged; the vocab-based encoding below is currently disabled."
    #to.transform(to.y_names, partial(_apply_cats, {n: self.vocab for n in to.y_names}, 0))
    return to

@MultiCategorize
def decodes(self, to:Tabular):
    "Return `to` unchanged; the vocab-based decoding below is currently disabled."
    #to.transform(to.y_names, partial(_decode_cats, {n: self.vocab for n in to.y_names}))
    return to

# Same feature columns and preprocessing steps as in the previous examples.
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))

%time to = TabularPandas(df_main, procs, cat_names, cont_names, y_names="target", y_block=MultiCategoryBlock(), splits=splits)

CPU times: user 68.6 ms, sys: 0 ns, total: 68.6 ms
Wall time: 67.9 ms

to.procs[2].vocab

['-', '_', 'a', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y']

回归


@RegressionSetup
def setups(self, to:Tabular):
    "Set `self.c` to the number of target columns, unless `c` has already been set."
    if self.c is None:
        self.c = len(to.y_names)
        return to
    # `c` was given up front: nothing to do, and nothing is returned.
    return None

@RegressionSetup
def encodes(self, to:Tabular):
    "Targets are passed through unchanged."
    return to

@RegressionSetup
def decodes(self, to:Tabular):
    "Targets are passed through unchanged."
    return to

# Reload the ADULT_SAMPLE csv and split it at row 10,000 into main and test copies.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
# NOTE(review): the label column(s) added by `_mock_multi_label` are not referenced by this section's `TabularPandas` call.
df_main = _mock_multi_label(df_main)

cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
# `age` is left out of the continuous features because it is the regression target in this section.
cont_names = ['fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))

%time to = TabularPandas(df_main, procs, cat_names, cont_names, y_names='age', splits=splits)

CPU times: user 70.7 ms, sys: 290 µs, total: 71 ms
Wall time: 70.3 ms

to.procs[-1].means

{'fnlwgt': 192085.701, 'education-num': 10.059124946594238}

dls = to.dataloaders()
dls.valid.show_batch()

	workclass	education	marital-status	occupation	relationship	race	education-num_na	fnlwgt	education-num	age
0	Private	12th	Never-married	Adm-clerical	Other-relative	Black	False	503454.004078	8.0	47.0
1	Federal-gov	Bachelors	Married-civ-spouse	Exec-managerial	Husband	White	False	586656.993690	13.0	49.0
2	Self-emp-not-inc	Assoc-voc	Married-civ-spouse	Farming-fishing	Husband	White	False	164607.001243	11.0	29.0
3	Private	HS-grad	Never-married	Adm-clerical	Not-in-family	Black	False	155508.999873	9.0	48.0
4	Private	11th	Never-married	Other-service	Own-child	White	False	318189.998679	7.0	18.0
5	Private	HS-grad	Never-married	Adm-clerical	Other-relative	White	False	140219.001104	9.0	47.0
6	Private	Masters	Divorced	#na#	Unmarried	White	True	235683.001562	10.0	47.0
7	Private	Bachelors	Married-civ-spouse	Craft-repair	Husband	White	False	187321.999825	13.0	43.0
8	Private	Bachelors	Married-civ-spouse	Prof-specialty	Husband	White	False	104196.002410	13.0	40.0
9	Private	Some-college	Separated	Priv-house-serv	Other-relative	White	False	184302.999784	10.0	25.0

目前未使用 - 用于多模态

class TensorTabular(fastuple):
    def get_ctxs(self, max_n=10, **kwargs):
        "Return one column-less row per sample of the first element, for at most `max_n` samples."
        count = min(self[0].shape[0], max_n)
        blank = pd.DataFrame(index = range(count))
        return [blank.iloc[pos] for pos in range(count)]

    def display(self, ctxs):
        "Assemble `ctxs` into a DataFrame and hand it to `display_df`."
        frame = pd.DataFrame(ctxs)
        display_df(frame)

class TabularLine(pd.Series):
    "A line of a dataframe that knows how to show itself"
    def show(self, ctx=None, **kwargs):
        "Return `self` when no `ctx` is given; otherwise return the result of `ctx.append(self)`."
        if ctx is None: return self
        return ctx.append(self)

class ReadTabLine(ItemTransform):
    # Row-level counterpart of the batch reader; `proc` supplies `cat_names`, `cont_names`, `y_names` and `decode`.
    def __init__(self, proc): self.proc = proc

    def encodes(self, row):
        # Look up the row's values for the categorical and for the continuous column names
        # (relies on both name collections exposing a `.map` method).
        cats,conts = (o.map(row.__getitem__) for o in (self.proc.cat_names,self.proc.cont_names))
        return TensorTabular(tensor(cats).long(),tensor(conts).float())

    def decodes(self, o):
        # NOTE(review): arguments are positional here, while other cells call
        # `TabularPandas(df, procs, cat_names, cont_names, ...)`; `cat_names` may therefore land in the
        # `procs` slot — confirm before reviving this unused code.
        to = TabularPandas(o, self.proc.cat_names, self.proc.cont_names, self.proc.y_names)
        to = self.proc.decode(to)
        # Pair the decoded values with the categorical + continuous column names, in that order.
        return TabularLine(pd.Series({c: v for v,c in zip(to.items[0]+to.items[1], self.proc.cat_names+self.proc.cont_names)}))

class ReadTabTarget(ItemTransform):
    def __init__(self, proc):
        self.proc = proc

    def encodes(self, row):
        "Return the `y_names` entry of `row`, cast to `np.int64`."
        targ = row[self.proc.y_names]
        return targ.astype(np.int64)

    def decodes(self, o):
        "Look `o` up in the classes stored for `y_names` and wrap the result in a `Category`."
        vocab = self.proc.classes[self.proc.y_names]
        return Category(vocab[o])

# tds = TfmdDS(to.items, tfms=[[ReadTabLine(proc)], ReadTabTarget(proc)])
# enc = tds[1]
# test_eq(enc[0][0], tensor([2,1]))
# test_close(enc[0][1], tensor([-0.628828]))
# test_eq(enc[1], 1)

# dec = tds.decode(enc)
# assert isinstance(dec[0], TabularLine)
# test_close(dec[0], pd.Series({'a': 1, 'b_na': False, 'b': 1}))
# test_eq(dec[1], 'a')

# test_stdout(lambda: print(show_at(tds, 1)), """a               1
# b_na        False
# b               1
# category        a
# dtype: object""")

导出 -


from nbdev import nbdev_export
nbdev_export()

Converted 00_torch_core.ipynb.
Converted 01_layers.ipynb.
Converted 01a_losses.ipynb.
Converted 02_data.load.ipynb.
Converted 03_data.core.ipynb.
Converted 04_data.external.ipynb.
Converted 05_data.transforms.ipynb.
Converted 06_data.block.ipynb.
Converted 07_vision.core.ipynb.
Converted 08_vision.data.ipynb.
Converted 09_vision.augment.ipynb.
Converted 09b_vision.utils.ipynb.
Converted 09c_vision.widgets.ipynb.
Converted 10_tutorial.pets.ipynb.
Converted 10b_tutorial.albumentations.ipynb.
Converted 11_vision.models.xresnet.ipynb.
Converted 12_optimizer.ipynb.
Converted 13_callback.core.ipynb.
Converted 13a_learner.ipynb.
Converted 13b_metrics.ipynb.
Converted 14_callback.schedule.ipynb.
Converted 14a_callback.data.ipynb.
Converted 15_callback.hook.ipynb.
Converted 15a_vision.models.unet.ipynb.
Converted 16_callback.progress.ipynb.
Converted 17_callback.tracker.ipynb.
Converted 18_callback.fp16.ipynb.
Converted 18a_callback.training.ipynb.
Converted 18b_callback.preds.ipynb.
Converted 19_callback.mixup.ipynb.
Converted 20_interpret.ipynb.
Converted 20a_distributed.ipynb.
Converted 21_vision.learner.ipynb.
Converted 22_tutorial.imagenette.ipynb.
Converted 23_tutorial.vision.ipynb.
Converted 24_tutorial.image_sequence.ipynb.
Converted 24_tutorial.siamese.ipynb.
Converted 24_vision.gan.ipynb.
Converted 30_text.core.ipynb.
Converted 31_text.data.ipynb.
Converted 32_text.models.awdlstm.ipynb.
Converted 33_text.models.core.ipynb.
Converted 34_callback.rnn.ipynb.
Converted 35_tutorial.wikitext.ipynb.
Converted 37_text.learner.ipynb.
Converted 38_tutorial.text.ipynb.
Converted 39_tutorial.transformers.ipynb.
Converted 40_tabular.core.ipynb.
Converted 41_tabular.data.ipynb.
Converted 42_tabular.model.ipynb.
Converted 43_tabular.learner.ipynb.
Converted 44_tutorial.tabular.ipynb.
Converted 45_collab.ipynb.
Converted 46_tutorial.collab.ipynb.
Converted 50_tutorial.datablock.ipynb.
Converted 60_medical.imaging.ipynb.
Converted 61_tutorial.medical_imaging.ipynb.
Converted 65_medical.text.ipynb.
Converted 70_callback.wandb.ipynb.
Converted 71_callback.tensorboard.ipynb.
Converted 72_callback.neptune.ipynb.
Converted 73_callback.captum.ipynb.
Converted 74_callback.azureml.ipynb.
Converted 97_test_utils.ipynb.
Converted 99_pytorch_doc.ipynb.
Converted dev-setup.ipynb.
Converted index.ipynb.
Converted quick_start.ipynb.
Converted tutorial.ipynb.

初始预处理

df_shrink_dtypes[source]

df_shrink[source]

表格 -

class Categorify[source]

class FillMissing[source]

TabularPandas 管道 -

集成示例

其他目标类型

多标签类别

独热编码标签

非独热编码

回归

目前未使用 - 用于多模态

导出 -

`df_shrink_dtypes`[source]

`df_shrink`[source]

`class` `Categorify`[source]

`class` `FillMissing`[source]