Source code for dask_ml.compose._column_transformer
import warnings

import dask.array as da
import dask.dataframe as dd
import numpy as np
import pandas as pd
import sklearn.compose
from scipy import sparse
from sklearn.compose._column_transformer import _get_transformer_list


class ColumnTransformer(sklearn.compose.ColumnTransformer):
"""Applies transformers to columns of an array or pandas DataFrame.
EXPERIMENTAL: some behaviors may change between releases without
deprecation.
This estimator allows different columns or column subsets of the input
to be transformed separately and the results combined into a single
feature space.
This is useful for heterogeneous or columnar data, to combine several
feature extraction mechanisms or transformations into a single transformer.
Read more in the :ref:`User Guide <column_transformer>`.
.. versionadded:: 0.9.0
.. note::
This requires scikit-learn 0.20.0 or newer.
Parameters
----------
    transformers : list of tuples
        List of (name, transformer, column(s)) tuples specifying the
        transformer objects to be applied to subsets of the data.

        name : string
            Like in Pipeline and FeatureUnion, this allows the transformer
            and its parameters to be set using ``set_params`` and searched
            in grid search.
        transformer : estimator or {'passthrough', 'drop'}
            Estimator must support `fit` and `transform`. Special-cased
            strings 'drop' and 'passthrough' are accepted as well, to
            indicate to drop the columns or to pass them through
            untransformed, respectively.
        column(s) : string or int, array-like of string or int, slice, \
boolean mask array or callable
            Indexes the data on its second axis. Integers are interpreted
            as positional columns, while strings can reference DataFrame
            columns by name. A scalar string or int should be used where
            ``transformer`` expects X to be a 1d array-like (vector);
            otherwise a 2d array will be passed to the transformer.
            A callable is passed the input data `X` and can return any of
            the above.
    remainder : {'drop', 'passthrough'} or estimator, default 'drop'
        By default, only the specified columns in `transformers` are
        transformed and combined in the output; the non-specified columns
        are dropped.
        By specifying ``remainder='passthrough'``, all remaining columns
        that were not specified in `transformers` will be automatically
        passed through. This subset of columns is concatenated with the
        output of the transformers (see the second example below).
        By setting ``remainder`` to be an estimator, the remaining
        non-specified columns will use the ``remainder`` estimator. The
        estimator must support `fit` and `transform`.
    sparse_threshold : float, default = 0.3
        If the transformed output consists of a mix of sparse and dense
        data, it will be stacked as a sparse matrix if the density is
        lower than this value. Use ``sparse_threshold=0`` to always
        return dense.
        When the transformed output consists of all sparse or all dense
        data, the stacked result will be sparse or dense, respectively,
        and this keyword will be ignored.
    n_jobs : int or None, optional (default=1)
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend`
        context. ``-1`` means using all processors. See
        :term:`Glossary <n_jobs>` for more details.
    transformer_weights : dict, optional
        Multiplicative weights for features per transformer. The output
        of the transformer is multiplied by these weights. Keys are
        transformer names, values the weights.
    preserve_dataframe : bool, (default=True)
        Whether to preserve pandas DataFrames when concatenating
        the results.

        .. warning::

           The default behavior of keeping DataFrames differs from
           scikit-learn's current behavior. Set ``preserve_dataframe=False``
           if you need to ensure that the output matches scikit-learn's
           ColumnTransformer.
    Attributes
    ----------
    transformers_ : list
        The collection of fitted transformers as tuples of
        (name, fitted_transformer, column). `fitted_transformer` can be an
        estimator, 'drop', or 'passthrough'. If there are remaining
        columns, the final element is a tuple of the form
        ('remainder', transformer, remaining_columns) corresponding to the
        ``remainder`` parameter, and
        ``len(transformers_) == len(transformers) + 1``; otherwise
        ``len(transformers_) == len(transformers)``.
    named_transformers_ : Bunch object, a dictionary with attribute access
        Read-only attribute to access any transformer by given name.
        Keys are transformer names and values are the fitted transformer
        objects.
    sparse_output_ : boolean
        Boolean flag indicating whether the output of ``transform`` is a
        sparse matrix or a dense numpy array, which depends on the output
        of the individual transformers and the `sparse_threshold` keyword.
    Notes
    -----
    The order of the columns in the transformed feature matrix follows the
    order in which the columns are specified in the `transformers` list.
    Columns of the original feature matrix that are not specified are
    dropped from the resulting transformed feature matrix, unless passed
    through with ``remainder='passthrough'``, in which case those columns
    are appended to the right of the output of the transformers.

    See also
    --------
    dask_ml.compose.make_column_transformer : convenience function for
        combining the outputs of multiple transformer objects applied to
        column subsets of the original feature space.
    Examples
    --------
    >>> import numpy as np
    >>> from dask_ml.compose import ColumnTransformer
    >>> from sklearn.preprocessing import Normalizer
    >>> ct = ColumnTransformer(
    ...     [("norm1", Normalizer(norm='l1'), [0, 1]),
    ...      ("norm2", Normalizer(norm='l1'), slice(2, 4))])
    >>> X = np.array([[0., 1., 2., 2.],
    ...               [1., 1., 0., 1.]])
    >>> # Normalizer scales each row of X to unit norm. A separate scaling
    >>> # is applied to the first two and the last two elements of each
    >>> # row independently.
    >>> ct.fit_transform(X)  # doctest: +NORMALIZE_WHITESPACE
    array([[0. , 1. , 0.5, 0.5],
           [0.5, 0.5, 0. , 1. ]])
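
    A sketch of the dask-specific behavior (illustrative; it assumes
    :class:`dask_ml.preprocessing.StandardScaler` and is skipped under
    doctest): with a dask DataFrame input, ``remainder='passthrough'``,
    and the default ``preserve_dataframe=True``, the output of
    ``fit_transform`` is again a dask collection rather than a numpy
    array.

    >>> import pandas as pd
    >>> import dask.dataframe as dd
    >>> from dask_ml.preprocessing import StandardScaler
    >>> df = dd.from_pandas(
    ...     pd.DataFrame({"a": [1., 2., 3., 4.], "b": [0., 1., 0., 1.]}),
    ...     npartitions=2)
    >>> ct = ColumnTransformer(
    ...     [("scale", StandardScaler(), ["a"])],
    ...     remainder="passthrough")
    >>> ct.fit_transform(df)  # doctest: +SKIP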
"""
    def __init__(
self,
transformers,
remainder="drop",
sparse_threshold=0.3,
n_jobs=1,
transformer_weights=None,
preserve_dataframe=True,
):
self.preserve_dataframe = preserve_dataframe
super(ColumnTransformer, self).__init__(
transformers=transformers,
remainder=remainder,
sparse_threshold=sparse_threshold,
n_jobs=n_jobs,
transformer_weights=transformer_weights,
)

    def _hstack(self, Xs):
        """Stacks Xs horizontally.

        Supports lists of numpy arrays, scipy sparse matrices, dask
        arrays, and (dask) DataFrames / Series.
        """
        types = set(type(X) for X in Xs)
        if self.sparse_output_:
            return sparse.hstack(Xs).tocsr()
        elif dd.Series in types or dd.DataFrame in types:
            # dd.concat warns when concatenating frames whose divisions
            # may not be aligned; the column-wise result is still correct.
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", "Concatenating", UserWarning)
                return dd.concat(Xs, axis="columns")
        elif da.Array in types:
            # To allow compatibility with dask core 1.0.0, this inlines the
            # `else` branch of the definition of dask.array.hstack.
            # The `then` branch is omitted because _validate_output in
            # sklearn.compose.ColumnTransformer ensures ndim == 2, so the
            # check `all(x.ndim == 1 for x in Xs)` would always fail.
            #
            # Once dask.array.hstack supports allow_unknown_chunksizes,
            # change this to da.hstack(Xs, allow_unknown_chunksizes=True).
            return da.concatenate(Xs, axis=1, allow_unknown_chunksizes=True)
        elif self.preserve_dataframe and (pd.Series in types or pd.DataFrame in types):
            return pd.concat(Xs, axis="columns")
        else:
            return np.hstack(Xs)
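

# A minimal sketch (illustrative, not part of the module) of why
# ``allow_unknown_chunksizes`` is needed in ``_hstack`` above: converting
# a dask DataFrame to a dask array yields unknown row chunk sizes, which
# plain concatenation would otherwise reject.
#
#     >>> import pandas as pd
#     >>> import dask.dataframe as dd
#     >>> import dask.array as da
#     >>> x = dd.from_pandas(
#     ...     pd.DataFrame({"a": [1, 2, 3]}), npartitions=2
#     ... ).to_dask_array()
#     >>> x.chunks  # row chunk sizes are unknown (nan)
#     ((nan, nan), (1,))
#     >>> da.concatenate([x, x], axis=1, allow_unknown_chunksizes=True)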


def make_column_transformer(*transformers, **kwargs):
# This is identical to scikit-learn's. We're just using our
# ColumnTransformer instead.
n_jobs = kwargs.pop("n_jobs", 1)
remainder = kwargs.pop("remainder", "drop")
preserve_dataframe = kwargs.pop("preserve_dataframe", True)
if kwargs:
raise TypeError(
'Unknown keyword arguments: "{}"'.format(list(kwargs.keys())[0])
)
transformer_list = _get_transformer_list(transformers)
return ColumnTransformer(
transformer_list,
n_jobs=n_jobs,
remainder=remainder,
preserve_dataframe=preserve_dataframe,
)
make_column_transformer.__doc__ = getattr(
sklearn.compose.make_column_transformer, "__doc__"
)
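

# A hypothetical usage sketch (the column names "age" and "city" are
# illustrative), assuming the (transformer, columns) tuple order used by
# scikit-learn's ``make_column_transformer``:
#
#     >>> from dask_ml.compose import make_column_transformer
#     >>> from dask_ml.preprocessing import OneHotEncoder, StandardScaler
#     >>> ct = make_column_transformer(
#     ...     (StandardScaler(), ["age"]),
#     ...     (OneHotEncoder(), ["city"]),
#     ...     remainder="drop",
#     ... )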