Source code for pypop7.benchmarks.data_science

"""https://jermwatt.github.io/machine_learning_refined/
"""
import requests

import numpy as np  # engine for numerical computing
from numpy import genfromtxt  # to read datasets

from pypop7.benchmarks.base_functions import BaseFunction


def cross_entropy_loss_lr(w, x, y):
    """Cross-entropy loss function of logistic regression (LR with binary labels/classes {0, 1}).

    .. note:: This loss function for binary (two-class) classification is always convex
       regardless of the used dataset. It is very often used in practice to perform LR.

    Parameters
    ----------
    w : ndarray
        input vector (weights), where `w[0]` is the bias term.
    x : ndarray
        features in the used train set.
    y : ndarray
        labels in the used train set.

    Returns
    -------
    loss/fitness value (`float`).

    References
    ----------
    https://jermwatt.github.io/machine_learning_refined/ (2020)

    https://openreview.net/forum?id=BJe-DsC5Fm (2019)
    """
    losses = []
    for features, label in zip(x, y):
        # sigmoid of the affine score (bias + linear term)
        prob = 1.0 / (1.0 + np.exp(-(w[0] + np.dot(features, w[1:]))))
        losses.append(-label * np.log(prob) - (1.0 - label) * np.log(1.0 - prob))
    return np.mean(losses)
class CrossEntropyLossLR(BaseFunction):
    """Class-style wrapper of :func:`cross_entropy_loss_lr`."""
    def __call__(self, w, x, y):
        """Evaluate the cross-entropy loss of LR on the given train set (`float`)."""
        return cross_entropy_loss_lr(w, x, y)
def cross_entropy_loss_l2(w, x, y):
    """Cross-entropy loss function with L2-regularization of logistic regression
       (LR with binary labels {0, 1}).

    Parameters
    ----------
    w : ndarray
        input vector (weights), where `w[0]` is the bias term.
    x : ndarray
        features in the used train set.
    y : ndarray
        labels in the used train set.

    Returns
    -------
    loss/fitness value (`float`).

    References
    ----------
    https://jermwatt.github.io/machine_learning_refined/ (2020)

    https://epubs.siam.org/doi/abs/10.1137/17M1154679?journalCode=sjope8 (2018)
    """
    # L2 penalty scaled by the number of train samples
    regularizer = np.sum(np.square(w)) / (2.0 * len(y))
    return cross_entropy_loss_lr(w, x, y) + regularizer
class CrossEntropyLossL2(BaseFunction):
    """Class-style wrapper of :func:`cross_entropy_loss_l2`."""
    def __call__(self, w, x, y):
        """Evaluate the L2-regularized cross-entropy loss of LR on the given train set (`float`)."""
        return cross_entropy_loss_l2(w, x, y)
def square_loss_lr(w, x, y):
    """Square loss function of logistic regression (LR with binary labels/classes {0, 1}).

    .. note:: This loss function for binary classification is generally non-convex
       (non-linear least squares).

    Parameters
    ----------
    w : ndarray
        input vector (weights), where `w[0]` is the bias term.
    x : ndarray
        features in the used train set.
    y : ndarray
        labels in the used train set.

    Returns
    -------
    loss/fitness value (`float`).

    References
    ----------
    https://jermwatt.github.io/machine_learning_refined/ (2020)

    https://openreview.net/forum?id=ryxz8CVYDH (2020)

    https://epubs.siam.org/doi/abs/10.1137/1.9781611976236.23 (2020)

    https://openreview.net/forum?id=BJe-DsC5Fm (2019)

    https://proceedings.neurips.cc/paper/2018/file/ba9a56ce0a9bfa26e8ed9e10b2cc8f46-Paper.pdf (2018)

    https://epubs.siam.org/doi/abs/10.1137/17M1154679?journalCode=sjope8 (2018)
    """
    losses = []
    for features, label in zip(x, y):
        # sigmoid prediction of the affine score
        prediction = 1.0 / (1.0 + np.exp(-(w[0] + np.dot(features, w[1:]))))
        losses.append(np.square(label - prediction))
    return np.mean(losses)
class SquareLossLR(BaseFunction):
    """Class-style wrapper of :func:`square_loss_lr`."""
    def __call__(self, w, x, y):
        """Evaluate the square loss of LR on the given train set (`float`)."""
        return square_loss_lr(w, x, y)
def logistic_loss_lr(w, x, y):
    """Logistic loss function of logistic regression (LR with binary labels/classes {-1, 1}).

    .. note:: AKA softmax cost (always convex regardless of the dataset used).

    Parameters
    ----------
    w : ndarray
        input vector (weights), where `w[0]` is the bias term.
    x : ndarray
        features in the used train set.
    y : ndarray
        labels in the used train set.

    Returns
    -------
    loss/fitness value (`float`).

    References
    ----------
    https://www.tandfonline.com/doi/full/10.1080/00031305.2021.2006781 (2021)

    https://jermwatt.github.io/machine_learning_refined/ (2020)
    """
    # signed margin of each sample: label * affine score
    margins = [label * (w[0] + np.dot(features, w[1:])) for features, label in zip(x, y)]
    return np.mean([np.log(1.0 + np.exp(-m)) for m in margins])
class LogisticLossLR(BaseFunction):
    """Class-style wrapper of :func:`logistic_loss_lr`."""
    def __call__(self, w, x, y):
        """Evaluate the logistic loss of LR on the given train set (`float`)."""
        return logistic_loss_lr(w, x, y)
def logistic_loss_l2(w, x, y):
    """Logistic loss function with L2-regularization of logistic regression
       (LR with binary labels/classes {-1, 1}).

    Parameters
    ----------
    w : ndarray
        input vector (weights), where `w[0]` is the bias term.
    x : ndarray
        features in the used train set.
    y : ndarray
        labels in the used train set.

    Returns
    -------
    loss/fitness value (`float`).

    References
    ----------
    https://epubs.siam.org/doi/abs/10.1137/17M1154679?journalCode=sjope8 (2018)
    """
    # L2 penalty scaled by the number of train samples
    regularizer = np.sum(np.square(w)) / (2.0 * len(y))
    return logistic_loss_lr(w, x, y) + regularizer
class LogisticLossL2(BaseFunction):
    """Class-style wrapper of :func:`logistic_loss_l2`."""
    def __call__(self, w, x, y):
        """Evaluate the L2-regularized logistic loss of LR on the given train set (`float`)."""
        return logistic_loss_l2(w, x, y)
def tanh_loss_lr(w, x, y):
    """Tanh loss function of logistic regression (LR with binary class labels {-1, 1}).

    .. note:: This loss function for binary classification is generally non-convex
       (non-linear least squares).

    Parameters
    ----------
    w : ndarray
        input vector (weights to be optimized), where `w[0]` is the bias term.
    x : ndarray
        features in the used training set.
    y : ndarray
        class labels in the used training set.

    Returns
    -------
    loss/fitness value to be minimized (`float`).

    References
    ----------
    https://github.com/jermwatt/machine_learning_refined/
    """
    losses = []
    for features, label in zip(x, y):
        # rescaled sigmoid mapping the affine score into (-1, 1)
        activation = 2.0 / (1.0 + np.exp(-(w[0] + np.dot(features, w[1:])))) - 1.0
        losses.append(np.square(activation - label))
    return np.mean(losses)
class TanhLossLR(BaseFunction):
    """Class-style wrapper of :func:`tanh_loss_lr`."""
    def __call__(self, w, x, y):
        """Evaluate the tanh loss of LR on the given training set (`float`)."""
        return tanh_loss_lr(w, x, y)
def hinge_loss_perceptron(w, x, y):
    """Hinge loss function of perceptron (with binary labels/classes {-1, 1}).

    .. note:: AKA *perceptron cost* or *rectified linear unit cost*. This cost function
       is always convex but has a single discontinuous derivative in each variable
       dimension. It always has a trivial solution at the origin, thus one may need to
       take care in practice in order to avoid finding it **accidentally**.

    Parameters
    ----------
    w : ndarray
        input vector (weights), where `w[0]` is the bias term.
    x : ndarray
        features in the used train set.
    y : ndarray
        labels in the used train set.

    Returns
    -------
    loss/fitness value (`float`).

    References
    ----------
    https://github.com/jermwatt/machine_learning_refined/
    """
    # only misclassified samples (negative margin) contribute a positive cost
    penalties = [np.maximum(0.0, -label * (w[0] + np.dot(features, w[1:])))
                 for features, label in zip(x, y)]
    return np.mean(penalties)
class HingeLossPerceptron(BaseFunction):
    """Class-style wrapper of :func:`hinge_loss_perceptron`."""
    def __call__(self, w, x, y):
        """Evaluate the hinge loss of perceptron on the given train set (`float`)."""
        return hinge_loss_perceptron(w, x, y)
def loss_margin_perceptron(w, x, y):
    """Loss function of margin perceptron (with binary labels/classes {-1, 1}).

    Parameters
    ----------
    w : ndarray
        input vector (weights), where `w[0]` is the bias term.
    x : ndarray
        features in the used train set.
    y : ndarray
        labels in the used train set.

    Returns
    -------
    loss/fitness value (`float`).

    References
    ----------
    https://github.com/jermwatt/machine_learning_refined/
    """
    # samples with margin below 1 are penalized linearly
    penalties = [np.maximum(0.0, 1.0 - label * (w[0] + np.dot(features, w[1:])))
                 for features, label in zip(x, y)]
    return np.mean(penalties)
class LossMarginPerceptron(BaseFunction):
    """Class-style wrapper of :func:`loss_margin_perceptron`."""
    def __call__(self, w, x, y):
        """Evaluate the margin-perceptron loss on the given train set (`float`)."""
        return loss_margin_perceptron(w, x, y)
def loss_svm(w, x, y):
    """Loss function of support vector machines (SVM with binary labels/classes {-1, 1}).

    Parameters
    ----------
    w : ndarray
        input vector (weights), where `w[0]` is the bias term.
    x : ndarray
        features in the used train set.
    y : ndarray
        labels in the used train set.

    Returns
    -------
    loss/fitness value (`float`).

    References
    ----------
    https://github.com/jermwatt/machine_learning_refined/
    """
    regularization_factor = 1e-3  # fixed weight of the L2 penalty
    hinge = [np.maximum(0.0, 1.0 - label * (w[0] + np.dot(features, w[1:])))
             for features, label in zip(x, y)]
    return np.mean(hinge) + regularization_factor * np.sum(np.square(w))
class LossSVM(BaseFunction):
    """Class-style wrapper of :func:`loss_svm`."""
    def __call__(self, w, x, y):
        """Evaluate the SVM loss on the given train set (`float`)."""
        return loss_svm(w, x, y)
def mpc2023_nonsmooth(w, x, y):
    """Nonsmooth function from MPC-2023.

    .. note:: Unlike the LR-style losses above, no bias term is used here:
       the full weight vector `w` is dotted with each feature row.

    Parameters
    ----------
    w : ndarray
        input vector (weights).
    x : ndarray
        features in the used train set.
    y : ndarray
        labels in the used train set.

    Returns
    -------
    loss/fitness value (`float`).

    References
    ----------
    https://link.springer.com/article/10.1007/s12532-023-00233-9 (2023)
    """
    residuals = [np.abs(np.dot(w, features) - label) for features, label in zip(x, y)]
    return np.mean(residuals)
class MPC2023Nonsmooth(BaseFunction):
    """Class-style wrapper of :func:`mpc2023_nonsmooth`."""
    def __call__(self, w, x, y):
        """Evaluate the MPC-2023 nonsmooth loss on the given train set (`float`)."""
        return mpc2023_nonsmooth(w, x, y)
def read_parkinson_disease_classification(is_10=True):
    """Read dataset Parkinson's Disease Classification.

    .. note::
       # Data: https://archive.ics.uci.edu/static/public/470/parkinson+s+disease+classification.zip
       # Instances: 756
       # Features: 753 (Delete `id` from 754 Features)
       # Class: 0/1
       # Missing Values: No

    Parameters
    ----------
    is_10 : bool
        if `True`, labels are kept as {0, 1}; otherwise label 0 is remapped to -1.

    Returns
    -------
    x : ndarray
        features of all instances.
    y : ndarray
        labels of all instances.

    Raises
    ------
    requests.HTTPError
        if the dataset file cannot be downloaded.

    References
    ----------
    Sakar,C., Serbes,Gorkem, Gunduz,Aysegul, Nizam,Hatice, and Sakar,Betul. (2018).
    Parkinson's Disease Classification. UCI Machine Learning Repository.
    https://doi.org/10.24432/C5MS4X
    """
    file = 'pd_speech_features.csv'
    file_path = 'https://github.com/Evolutionary-Intelligence/pypop/raw/main/docs/' + \
        'data-science/' + file
    # fail fast on network problems (timeout) and on HTTP errors
    # (raise_for_status) instead of silently saving an error page as the dataset
    r = requests.get(file_path, timeout=60)
    r.raise_for_status()
    with open(file, 'wb') as f:
        f.write(r.content)
    d = genfromtxt(file, delimiter=',')
    # skip the two header rows; drop the leading `id` column; split off the labels
    x, y = d[2:, 1:-1], d[2:, -1]
    if not is_10:  # remap labels {0, 1} -> {-1, 1}
        y[y == 0] = -1
    return x, y
def read_semeion_handwritten_digit(is_10=True):
    """Read dataset Semeion Handwritten Digit.

    .. note::
       # Data: https://archive.ics.uci.edu/static/public/178/semeion+handwritten+digit.zip
       # Instances: 1593
       # Features: 256 (For 266 Features: the last 10 columns are class labels for digit 0 - 9)
       # Class: 0/1
       # Missing Values: No

    Parameters
    ----------
    is_10 : bool
        if `True`, labels are kept as {0, 1}; otherwise label 0 is remapped to -1.

    Returns
    -------
    x : ndarray
        features of all instances.
    y : ndarray
        labels of all instances.

    Raises
    ------
    requests.HTTPError
        if the dataset file cannot be downloaded.

    References
    ----------
    Tactile,Srl, Massimo,Buscema, and Stefano,Terzi (1994). Semeion Handwritten Digit.
    UCI Machine Learning Repository. https://doi.org/10.24432/C5SC8V
    """
    file = 'semeion.data'
    file_path = 'https://github.com/Evolutionary-Intelligence/pypop/raw/main/docs/' + \
        'data-science/' + file
    # fail fast on network problems (timeout) and on HTTP errors
    # (raise_for_status) instead of silently saving an error page as the dataset
    r = requests.get(file_path, timeout=60)
    r.raise_for_status()
    with open(file, 'wb') as f:
        f.write(r.content)
    d = genfromtxt(file, delimiter=' ')
    x, y = d[:, :256], d[:, -1]  # only to classify digit 9
    if not is_10:  # remap labels {0, 1} -> {-1, 1}
        y[y == 0] = -1
    return x, y
def read_cnae9(is_10=True):
    """Read dataset CNAE-9.

    .. note::
       # Data: https://archive.ics.uci.edu/static/public/233/cnae+9.zip
       # Instances: 1080
       # Features: 856
       # Class: 0/1
       # Missing Values: No

    Parameters
    ----------
    is_10 : bool
        if `True`, labels become {0, 1}; otherwise {-1, 1} (1 marks original class 9).

    Returns
    -------
    x : ndarray
        features of all instances.
    y : ndarray
        binary labels of all instances (original class 9 vs. the rest).

    Raises
    ------
    requests.HTTPError
        if the dataset file cannot be downloaded.

    References
    ----------
    Ciarelli,Patrick and Oliveira,Elias. (2012). CNAE-9.
    UCI Machine Learning Repository. https://doi.org/10.24432/C51G7P.
    """
    file = 'CNAE-9.data'
    file_path = 'https://github.com/Evolutionary-Intelligence/pypop/raw/main/docs/' + \
        'data-science/' + file
    # fail fast on network problems (timeout) and on HTTP errors
    # (raise_for_status) instead of silently saving an error page as the dataset
    r = requests.get(file_path, timeout=60)
    r.raise_for_status()
    with open(file, 'wb') as f:
        f.write(r.content)
    d = genfromtxt(file, delimiter=',')
    # first column holds the original multi-class label (1..9)
    x, y = d[:, 1:], d[:, 0]
    # binarize: class 9 vs. all other classes
    index_not_9, index_9 = y != 9, y == 9
    if is_10:
        y[index_not_9], y[index_9] = 0, 1
    else:
        y[index_not_9], y[index_9] = -1, 1
    return x, y
def read_madelon(is_10=False):
    """Read dataset Madelon.

    .. note::
       # Data: https://archive.ics.uci.edu/static/public/171/madelon.zip
       # Instances: 2000
       # Features: 500
       # Class: -1/1
       # Missing Values: No

    Parameters
    ----------
    is_10 : bool
        if `True`, label -1 is remapped to 0 (labels become {0, 1}).

    Returns
    -------
    x : ndarray
        features of all train instances.
    y : ndarray
        labels of all train instances.

    Raises
    ------
    requests.HTTPError
        if a dataset file cannot be downloaded.

    References
    ----------
    Guyon,Isabelle. (2008). Madelon. UCI Machine Learning Repository.
    https://doi.org/10.24432/C5602H.
    """
    base_url = 'https://github.com/Evolutionary-Intelligence/pypop/raw/main/docs/' + \
        'data-science/'

    def _download(name):
        # fetch one raw data file, failing fast on network problems (timeout)
        # and on HTTP errors (raise_for_status) instead of silently saving
        # an error page as the dataset
        r = requests.get(base_url + name, timeout=60)
        r.raise_for_status()
        with open(name, 'wb') as f:
            f.write(r.content)

    _download('madelon_train.data')
    x = genfromtxt('madelon_train.data', delimiter=' ')
    _download('madelon_train.labels')
    y = genfromtxt('madelon_train.labels')
    if is_10:  # to convert all labels with -1 to 0
        y[y == -1] = 0
    return x, y
def read_qsar_androgen_receptor(is_10=False):
    """Read dataset QSAR androgen receptor.

    .. note::
       # Data: https://archive.ics.uci.edu/static/public/509/qsar+androgen+receptor.zip
       # Instances: 1687
       # Features: 1024
       # Class: -1/1
       # Missing Values: No

    Parameters
    ----------
    is_10 : bool
        if `True`, label -1 is remapped to 0 (labels become {0, 1}).

    Returns
    -------
    x : ndarray
        features of all instances.
    y : ndarray
        labels of all instances.

    Raises
    ------
    requests.HTTPError
        if the dataset file cannot be downloaded.

    References
    ----------
    QSAR androgen receptor. (2019). UCI Machine Learning Repository.
    https://doi.org/10.24432/C53317.
    """
    file = 'qsar_androgen_receptor.csv'
    file_path = 'https://github.com/Evolutionary-Intelligence/pypop/raw/main/docs/' + \
        'data-science/' + file
    # fail fast on network problems (timeout) and on HTTP errors
    # (raise_for_status) instead of silently saving an error page as the dataset
    r = requests.get(file_path, timeout=60)
    r.raise_for_status()
    with open(file, 'wb') as f:
        f.write(r.content)
    d = genfromtxt(file, delimiter=';')
    x, y = d[:, :-1], d[:, -1]
    if is_10:  # to convert all labels with -1 to 0
        y[y == -1] = 0
    return x, y