from __future__ import division
from operator import getitem
from typing import Optional, Union
import dask.array as da
import dask.dataframe as dd
import numpy as np
import pandas as pd
import scipy.sparse
import sklearn.preprocessing
from sklearn.utils.validation import check_is_fitted
from .._typing import ArrayLike, SeriesType


class LabelEncoder(sklearn.preprocessing.LabelEncoder):
"""Encode labels with value between 0 and n_classes-1.
.. note::
This differs from the scikit-learn version for Categorical data.
When passed a categorical `y`, this implementation will use the
categorical information for the label encoding and transformation.
You will receive different answers when
1. Your categories are not monotonically increasing
2. You have unobserved categories
Specify ``use_categorical=False`` to recover the scikit-learn behavior.
Parameters
----------
use_categorical : bool, default True
Whether to use the categorical dtype information when `y` is a
dask or pandas Series with a categorical dtype.
Attributes
----------
classes_ : array of shape (n_class,)
Holds the label for each class.
dtype_ : Optional CategoricalDtype
For Categorical `y`, the dtype is stored here.
Examples
--------
`LabelEncoder` can be used to normalize labels.
>>> from dask_ml import preprocessing
>>> le = preprocessing.LabelEncoder()
>>> le.fit([1, 2, 2, 6])
LabelEncoder()
>>> le.classes_
array([1, 2, 6])
>>> le.transform([1, 1, 2, 6]) #doctest: +ELLIPSIS
array([0, 0, 1, 2]...)
>>> le.inverse_transform([0, 0, 1, 2])
array([1, 1, 2, 6])
It can also be used to transform non-numerical labels (as long as they are
hashable and comparable) to numerical labels.
>>> le = preprocessing.LabelEncoder()
>>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
LabelEncoder()
>>> list(le.classes_)
['amsterdam', 'paris', 'tokyo']
>>> le.transform(["tokyo", "tokyo", "paris"]) #doctest: +ELLIPSIS
array([2, 2, 1]...)
>>> list(le.inverse_transform([2, 2, 1]))
['tokyo', 'tokyo', 'paris']
When using Dask, we strongly recommend using a Categorical dask Series if
possible. This avoids a (potentially expensive) scan of the values and
enables a faster `transform` algorithm.
>>> import dask.dataframe as dd
>>> import pandas as pd
>>> data = dd.from_pandas(pd.Series(['a', 'a', 'b'], dtype='category'),
... npartitions=2)
>>> le.fit_transform(data)
dask.array<values, shape=(nan,), dtype=int8, chunksize=(nan,)>
>>> le.fit_transform(data).compute()
array([0, 0, 1], dtype=int8)
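
    The categorical fast path uses the dtype's category order rather than
    sorted order, which is where the differing answers described in the note
    above come from. A minimal sketch:

    >>> s = pd.Series(['b', 'a'], dtype=pd.CategoricalDtype(['b', 'a']))
    >>> preprocessing.LabelEncoder().fit(s).classes_.tolist()
    ['b', 'a']
    >>> le = preprocessing.LabelEncoder(use_categorical=False)
    >>> le.fit(s).classes_.tolist()
    ['a', 'b']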
"""

    def __init__(self, use_categorical: bool = True):
        self.use_categorical = use_categorical
        super(LabelEncoder, self).__init__()

    def _check_array(self, y: Union[ArrayLike, SeriesType]):
        """Validate and normalize ``y`` before fitting or transforming."""
        if isinstance(y, (dd.Series, pd.DataFrame)):
            y = y.squeeze()

            if y.ndim > 1:
                raise ValueError("Expected a 1-D array or Series.")

        if not self.use_categorical:
            # Discard any categorical information and treat ``y`` as an array.
            if isinstance(y, dd.Series):
                y = y.to_dask_array(lengths=True)
            elif isinstance(y, pd.Series):
                y = np.asarray(y)

        if isinstance(y, dd.Series):
            if isinstance(y.dtype, pd.CategoricalDtype):
                # TODO(dask-3784): just call y.cat.as_known()
                # https://github.com/dask/dask/issues/3784
                if not y.cat.known:
                    y = y.cat.as_known()
            else:
                # Non-categorical dask Series are handled as dask arrays.
                y = y.to_dask_array(lengths=True)
        return y

    def fit(self, y: Union[ArrayLike, SeriesType]) -> "LabelEncoder":
        y = self._check_array(y)

        if isinstance(y, da.Array):
            # Discover the uniques eagerly; ``classes_`` is always concrete.
            classes_ = _encode_dask_array(y)
            self.classes_ = classes_.compute()
            self.dtype_: Optional[pd.CategoricalDtype] = None
        elif _is_categorical(y):
            self.classes_ = _encode_categorical(y)
            self.dtype_ = y.dtype
        else:
            self.dtype_ = None
            return super(LabelEncoder, self).fit(y)

        return self

    def fit_transform(
        self, y: Union[ArrayLike, SeriesType]
    ) -> Union[ArrayLike, SeriesType]:
        y = self._check_array(y)

        if isinstance(y, da.Array):
            self.classes_, y = _encode_dask_array(y, encode=True)
            self.dtype_ = None
        elif _is_categorical(y):
            # Capture the categorical dtype *before* ``y`` is rebound to the
            # integer codes on the next line.
            self.dtype_ = y.dtype
            self.classes_, y = _encode_categorical(y, encode=True)
        else:
            return super(LabelEncoder, self).fit_transform(y)

        return y

    def transform(self, y: Union[ArrayLike, SeriesType]):
        check_is_fitted(self, "classes_")
        y = self._check_array(y)

        if isinstance(y, da.Array):
            return _encode_dask_array(y, self.classes_, encode=True)[1]
        elif isinstance(y, (pd.Series, dd.Series)):
            if self.dtype_ is not None:
                # Categorical fast path: the categories must match those seen
                # during ``fit``, so the codes can be used directly.
                assert y.dtype.categories.equals(self.dtype_.categories)
                return y.cat.codes.values
            return np.searchsorted(self.classes_, y)
        else:
            return np.searchsorted(self.classes_, y)

    def inverse_transform(self, y: Union[ArrayLike, SeriesType]):
        check_is_fitted(self, "classes_")
        y = self._check_array(y)

        if isinstance(y, da.Array):
            if getattr(self, "dtype_", None) is not None:
                # Categorical fit: rebuild a categorical Series from the codes.
                result = (
                    dd.from_dask_array(y)
                    .astype("category")
                    .cat.set_categories(np.arange(len(self.classes_)))
                    .cat.rename_categories(self.dtype_.categories)
                )
                if self.dtype_.ordered:
                    result = result.cat.as_ordered()
                return result
            else:
                # Index ``classes_`` blockwise with the integer codes.
                return da.map_blocks(
                    getitem,
                    self.classes_,
                    y,
                    dtype=self.classes_.dtype,
                    chunks=y.chunks,
                )
        else:
            y = np.asarray(y)
            if getattr(self, "dtype_", None) is not None:
                return pd.Series(
                    pd.Categorical.from_codes(
                        y,
                        categories=self.dtype_.categories,
                        ordered=self.dtype_.ordered,
                    )
                )
            return self.classes_[y]


def _encode_categorical(
    values: pd.Series, uniques: Optional[np.ndarray] = None, encode: bool = False
):
    # The dtype's category order (not sorted order) defines the encoding.
    new_uniques = np.asarray(values.cat.categories)

    if uniques is not None:
        diff = list(np.setdiff1d(uniques, new_uniques, assume_unique=True))
        if diff:
            raise ValueError("y contains previously unseen labels: {}".format(diff))

    uniques = new_uniques
    if encode:
        return uniques, values.cat.codes
    else:
        return uniques
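

# A small sketch of ``_encode_categorical`` on a pandas categorical, showing
# that category order drives the codes (illustrative only):
#
#   >>> s = pd.Series(["b", "a", "b"], dtype=pd.CategoricalDtype(["b", "a"]))
#   >>> uniques, codes = _encode_categorical(s, encode=True)
#   >>> list(uniques), codes.tolist()
#   (['b', 'a'], [0, 1, 0])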


def _check_and_search_block(arr, uniques, onehot_dtype=None, block_info=None):
    # Reject any values in this block that were not seen during ``fit``.
    diff = list(np.setdiff1d(arr, uniques, assume_unique=True))

    if diff:
        msg = "Block contains previously unseen values {}.\nBlock info:\n\n{}".format(
            diff, block_info
        )
        raise ValueError(msg)

    label_encoded = np.searchsorted(uniques, arr)
    if onehot_dtype:
        # Cast so the sparse blocks match the dtype declared by
        # ``_encode_dask_array``.
        return _construct(label_encoded, uniques).astype(onehot_dtype)
    else:
        return label_encoded


def _construct(x: np.ndarray, categories: np.ndarray) -> scipy.sparse.csr_matrix:
    """Make a sparse one-hot matrix from a label-encoded array.

    >>> _construct(np.array([0, 1, 0]), np.array([0, 1])).toarray()
    array([[1., 0.],
           [0., 1.],
           [1., 0.]])
    """
    data = np.ones(len(x))
    rows = np.arange(len(x))
    columns = x.ravel()
    return scipy.sparse.csr_matrix(
        (data, (rows, columns)), shape=(len(x), len(categories))
    )


def _encode_dask_array(
    values: da.Array,
    uniques: Optional[np.ndarray] = None,
    encode: bool = False,
    onehot_dtype: Optional[np.dtype] = None,
):
    """One-hot or label encode a dask array.

    Parameters
    ----------
    values : da.Array, shape [n_samples,]
    uniques : np.ndarray, shape [n_uniques,]
    encode : bool, default False
        Whether to encode the values (True) or just discover the uniques.
    onehot_dtype : np.dtype, optional
        Optional dtype for the resulting one-hot encoded array. This changes
        the shape, dtype, and underlying storage of the returned dask array.

        ======= ================= =========================
        thing   onehot_dtype=None onehot_dtype=onehot_dtype
        ======= ================= =========================
        shape   (n_samples,)      (n_samples, len(uniques))
        dtype   np.intp           onehot_dtype
        storage np.ndarray        scipy.sparse.csr_matrix
        ======= ================= =========================

    Returns
    -------
    uniques : ndarray
        The discovered uniques (when ``uniques`` is None) or the ``uniques``
        that were passed in.
    encoded : da.Array, optional
        The encoded values. Only returned when ``encode=True``.
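
    Examples
    --------
    A minimal sketch of label encoding, with the uniques discovered from the
    data:

    >>> import dask.array as da
    >>> import numpy as np
    >>> x = da.from_array(np.array([1, 1, 3]), chunks=2)
    >>> uniques, encoded = _encode_dask_array(x, encode=True)
    >>> uniques.compute(), encoded.compute()
    (array([1, 3]), array([0, 0, 1]))

    With known ``uniques`` and an ``onehot_dtype``, each block becomes a
    ``scipy.sparse.csr_matrix`` (a sketch; the blocks are cast to
    ``onehot_dtype`` in ``_check_and_search_block``):

    >>> _, onehot = _encode_dask_array(
    ...     x, uniques=np.array([1, 3]), encode=True,
    ...     onehot_dtype=np.dtype("uint8"),
    ... )
    >>> onehot.compute().toarray()
    array([[1, 0],
           [1, 0],
           [0, 1]], dtype=uint8)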
"""
    if uniques is None:
        if encode and onehot_dtype:
            raise ValueError(
                "Cannot use 'encode' and 'onehot_dtype' simultaneously."
            )
        if encode:
            uniques, encoded = da.unique(values, return_inverse=True)
            return uniques, encoded
        else:
            return da.unique(values)
if encode:
if onehot_dtype:
dtype = onehot_dtype
new_axis: Optional[int] = 1
chunks = values.chunks + (len(uniques),)
else:
dtype = np.dtype("int")
new_axis = None
chunks = values.chunks
return (
uniques,
values.map_blocks(
_check_and_search_block,
uniques,
onehot_dtype=onehot_dtype,
dtype=dtype,
new_axis=new_axis,
chunks=chunks,
),
)
else:
return uniques


def _encode(values, uniques=None, encode=False):
    # Dispatch to the categorical or dask-array implementation.
    if isinstance(values, (pd.Series, dd.Series)) and _is_categorical(values):
        return _encode_categorical(values, uniques=uniques, encode=encode)
    elif isinstance(values, da.Array):
        return _encode_dask_array(values, uniques=uniques, encode=encode)
    else:
        raise ValueError("Unknown type {}".format(type(values)))


def _is_categorical(y: Union[ArrayLike, SeriesType]) -> bool:
    return isinstance(y, (dd.Series, pd.Series)) and isinstance(
        y.dtype, pd.CategoricalDtype
    )
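

# Dispatch sketch for the private ``_encode`` helper (illustrative only):
#
#   >>> _encode(pd.Series(["a", "b"], dtype="category"))
#   array(['a', 'b'], dtype=object)
#   >>> _encode(da.from_array(np.array([2, 1, 2]), chunks=2)).compute()
#   array([1, 2])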