Source code for dask_ml.compose._column_transformer

import warnings

import dask.array as da
import dask.dataframe as dd
import numpy as np
import pandas as pd
import sklearn.compose
from scipy import sparse
from sklearn.compose._column_transformer import _get_transformer_list


class ColumnTransformer(sklearn.compose.ColumnTransformer):
    """Applies transformers to columns of an array or pandas DataFrame.

    EXPERIMENTAL: some behaviors may change between releases without
    deprecation.

    This estimator allows different columns or column subsets of the input
    to be transformed separately, with the results combined into a single
    feature space. This is useful for heterogeneous or columnar data, to
    combine several feature extraction mechanisms or transformations into
    a single transformer.

    Read more in the :ref:`User Guide <column_transformer>`.

    .. versionadded:: 0.9.0

    .. note::

       This requires scikit-learn 0.20.0 or newer.

    Parameters
    ----------
    transformers : list of tuples
        List of (name, transformer, column(s)) tuples specifying the
        transformer objects to be applied to subsets of the data.

        name : string
            Like in Pipeline and FeatureUnion, this allows the transformer
            and its parameters to be set using ``set_params`` and searched
            in grid search.
        transformer : estimator or {'passthrough', 'drop'}
            Estimator must support `fit` and `transform`. Special-cased
            strings 'drop' and 'passthrough' are accepted as well, to
            indicate to drop the columns or to pass them through
            untransformed, respectively.
        column(s) : string or int, array-like of string or int, slice, \
            boolean mask array or callable
            Indexes the data on its second axis. Integers are interpreted
            as positional columns, while strings can reference DataFrame
            columns by name. A scalar string or int should be used where
            ``transformer`` expects X to be a 1d array-like (vector),
            otherwise a 2d array will be passed to the transformer.
            A callable is passed the input data `X` and can return any of
            the above.

    remainder : {'drop', 'passthrough'} or estimator, default 'drop'
        By default (``'drop'``), only the specified columns in
        `transformers` are transformed and combined in the output; the
        non-specified columns are dropped.
        By specifying ``remainder='passthrough'``, all remaining columns
        that were not specified in `transformers` will be automatically
        passed through. This subset of columns is concatenated with the
        output of the transformers.
        By setting ``remainder`` to be an estimator, the remaining
        non-specified columns will use the ``remainder`` estimator. The
        estimator must support `fit` and `transform`.

    sparse_threshold : float, default = 0.3
        If the transformed output consists of a mix of sparse and dense
        data, it will be stacked as a sparse matrix if the density is
        lower than this value. Use ``sparse_threshold=0`` to always return
        dense. When the transformed output consists of all sparse or all
        dense data, the stacked result will be sparse or dense,
        respectively, and this keyword will be ignored.

    n_jobs : int or None, optional (default=1)
        Number of jobs to run in parallel. ``None`` means 1 unless in a
        :obj:`joblib.parallel_backend` context. ``-1`` means using all
        processors. See :term:`Glossary <n_jobs>` for more details.

    transformer_weights : dict, optional
        Multiplicative weights for features per transformer. The output of
        the transformer is multiplied by these weights. Keys are
        transformer names, values the weights.

    preserve_dataframe : bool (default=True)
        Whether to preserve pandas DataFrames when concatenating the
        results.

        .. warning::

           The default behavior of keeping DataFrames differs from
           scikit-learn's current behavior. Set ``preserve_dataframe=False``
           if you need to ensure that the output matches scikit-learn's
           ColumnTransformer.

    Attributes
    ----------
    transformers_ : list
        The collection of fitted transformers as tuples of
        (name, fitted_transformer, column). `fitted_transformer` can be an
        estimator, 'drop', or 'passthrough'. If there are remaining
        columns, the final element is a tuple of the form
        ('remainder', transformer, remaining_columns) corresponding to the
        ``remainder`` parameter. In that case
        ``len(transformers_) == len(transformers) + 1``, otherwise
        ``len(transformers_) == len(transformers)``.

    named_transformers_ : Bunch object, a dictionary with attribute access
        Read-only attribute to access any transformer by given name.
        Keys are transformer names and values are the fitted transformer
        objects.

    sparse_output_ : boolean
        Boolean flag indicating whether the output of ``transform`` is a
        sparse matrix or a dense numpy array, which depends on the output
        of the individual transformers and the `sparse_threshold` keyword.

    Notes
    -----
    The order of the columns in the transformed feature matrix follows the
    order in which the columns are specified in the `transformers` list.
    Columns of the original feature matrix that are not specified are
    dropped from the resulting transformed feature matrix, unless specified
    in the ``remainder`` keyword. Those columns specified with
    ``remainder='passthrough'`` are added at the right to the output of the
    transformers.

    See also
    --------
    dask_ml.compose.make_column_transformer : convenience function for
        combining the outputs of multiple transformer objects applied to
        column subsets of the original feature space.

    Examples
    --------
    >>> import numpy as np
    >>> from dask_ml.compose import ColumnTransformer
    >>> from sklearn.preprocessing import Normalizer
    >>> ct = ColumnTransformer(
    ...     [("norm1", Normalizer(norm='l1'), [0, 1]),
    ...      ("norm2", Normalizer(norm='l1'), slice(2, 4))])
    >>> X = np.array([[0., 1., 2., 2.],
    ...               [1., 1., 0., 1.]])
    >>> # Normalizer scales each row of X to unit norm. A separate scaling
    >>> # is applied to the first two and the last two elements of each
    >>> # row independently.
    >>> ct.fit_transform(X)  # doctest: +NORMALIZE_WHITESPACE
    array([[0. , 1. , 0.5, 0.5],
           [0.5, 0.5, 0. , 1. ]])
    """

    def __init__(
        self,
        transformers,
        remainder="drop",
        sparse_threshold=0.3,
        n_jobs=1,
        transformer_weights=None,
        preserve_dataframe=True,
    ):
        self.preserve_dataframe = preserve_dataframe
        super(ColumnTransformer, self).__init__(
            transformers=transformers,
            remainder=remainder,
            sparse_threshold=sparse_threshold,
            n_jobs=n_jobs,
            transformer_weights=transformer_weights,
        )

    def _hstack(self, Xs):
        """Stacks the transformed blocks in ``Xs`` horizontally.

        Supported input types: lists of numpy arrays, sparse matrices, and
        DataFrames.
        """
        types = set(type(X) for X in Xs)

        if self.sparse_output_:
            return sparse.hstack(Xs).tocsr()
        elif dd.Series in types or dd.DataFrame in types:
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", "Concatenating", UserWarning)
                return dd.concat(Xs, axis="columns")
        elif da.Array in types:
            # To allow compatibility with dask core 1.0.0, this is the
            # `else` part of the definition of dask.array.hstack, inlined.
            # The `then` branch is removed because _validate_output in
            # sklearn.compose.ColumnTransformer ensures ndim == 2, so the
            # check `all(x.ndim == 1 for x in Xs)` would always fail.
            #
            # Once dask.array.hstack supports allow_unknown_chunksizes,
            # change this to da.hstack(Xs, allow_unknown_chunksizes=True).
            return da.concatenate(Xs, axis=1, allow_unknown_chunksizes=True)
        elif self.preserve_dataframe and (
            pd.Series in types or pd.DataFrame in types
        ):
            return pd.concat(Xs, axis="columns")
        else:
            return np.hstack(Xs)
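

# A minimal usage sketch, not part of the original module: applying this
# ColumnTransformer to a dask DataFrame. The data, column names, and
# npartitions below are illustrative assumptions, and exact behavior
# depends on the installed scikit-learn/dask versions.
# ``dask_ml.preprocessing.StandardScaler`` returns a dask DataFrame here,
# so with the default ``preserve_dataframe=True`` the ``_hstack`` method
# above takes the ``dd.concat`` branch and ``fit_transform`` returns a
# dask DataFrame rather than a dask array.
def _example_dask_dataframe_usage():
    from dask_ml.preprocessing import StandardScaler

    pdf = pd.DataFrame(
        {"a": [0.0, 1.0, 2.0, 3.0], "b": [1.0, 1.0, 0.0, 1.0]}
    )
    ddf = dd.from_pandas(pdf, npartitions=2)

    ct = ColumnTransformer([("scale", StandardScaler(), ["a", "b"])])
    result = ct.fit_transform(ddf)  # a dask DataFrame
    return result.compute()  # materialize as pandas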


def make_column_transformer(*transformers, **kwargs):
    # This is identical to scikit-learn's. We're just using our
    # ColumnTransformer instead.
    n_jobs = kwargs.pop("n_jobs", 1)
    remainder = kwargs.pop("remainder", "drop")
    preserve_dataframe = kwargs.pop("preserve_dataframe", True)
    if kwargs:
        raise TypeError(
            'Unknown keyword arguments: "{}"'.format(list(kwargs.keys())[0])
        )
    transformer_list = _get_transformer_list(transformers)
    return ColumnTransformer(
        transformer_list,
        n_jobs=n_jobs,
        remainder=remainder,
        preserve_dataframe=preserve_dataframe,
    )
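

# A hedged usage sketch, not part of the original module: the column names
# and estimators below are illustrative assumptions. Step names are
# generated automatically by ``_get_transformer_list`` from the estimator
# class names, and the tuple layout follows the installed scikit-learn's
# ``make_column_transformer`` ((transformer, columns) in scikit-learn 0.21
# and later).
def _example_make_column_transformer():
    from sklearn.preprocessing import Normalizer

    ct = make_column_transformer(
        (Normalizer(norm="l1"), ["a", "b"]),
        (Normalizer(norm="l2"), ["c"]),
        remainder="passthrough",
    )
    return ct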


make_column_transformer.__doc__ = getattr(
    sklearn.compose.make_column_transformer, "__doc__"
)
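

# A sketch of the ``preserve_dataframe`` behavior documented above; the
# data and transformer choice are illustrative assumptions. ``_hstack``
# only keeps pandas objects when the individual transformer outputs are
# themselves pandas objects (``dask_ml.preprocessing.StandardScaler``
# preserves the input container); transformers that return numpy arrays
# fall through to ``np.hstack`` regardless of ``preserve_dataframe``.
def _example_preserve_dataframe():
    from dask_ml.preprocessing import StandardScaler

    df = pd.DataFrame({"a": [0.0, 1.0], "b": [2.0, 3.0]})

    keep = ColumnTransformer([("scale", StandardScaler(), ["a", "b"])])
    out_df = keep.fit_transform(df)  # pandas DataFrame (default behavior)

    drop = ColumnTransformer(
        [("scale", StandardScaler(), ["a", "b"])], preserve_dataframe=False
    )
    out_arr = drop.fit_transform(df)  # numpy array, matching scikit-learn
    return out_df, out_arr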