# Source code for feature_engine.datasets.titanic

import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline

from feature_engine.imputation import CategoricalImputer, MeanMedianImputer


# TODO: loading the dataset from the internet is not the best, we need to store it
def load_titanic(
    return_X_y_frame=False,
    predictors_only=False,
    handle_missing=False,
    cabin=None,
):
    """
    Return the well-known Titanic dataset, optionally cleaned up.

    Note that you need to have an internet connection for this function to
    work, as we are calling the dataset stored in
    `openML <https://www.openml.org/d/40945>`_ which can be downloaded from
    `here <https://www.openml.org/data/get_csv/16826755/phpMYEkMl>`_.

    Parameters
    ----------
    return_X_y_frame: bool, default=False
        If `True`, it returns a DataFrame (X) with the predictors and a Series
        (y) with the target variable. If `False`, it returns a single DataFrame
        with predictors and target.

    predictors_only: bool, default=False
        If `False`, it returns all the variables from the original Titanic
        Dataset. If `True`, it returns only relevant predictors (the columns
        "name", "ticket", "home.dest", "boat" and "body" are dropped).

    handle_missing: bool, default=False
        If `False`, it returns the original dataset with missing values. If
        `True`, missing data is replaced with the string "Missing" in
        categorical variables and the mean in numerical variables.

    cabin: str, default=None
        If `None`, it returns the variable cabin as in the original data. If
        'drop', it removes the variable from the data. If 'letter_only' it
        returns just the first letter of the cabin, without the number.

    Returns
    -------
    df: pandas.DataFrame
        Predictors and target in one frame, when `return_X_y_frame=False`.

    (X, y): tuple of (pandas.DataFrame, pandas.Series)
        Predictors and the "survived" target, when `return_X_y_frame=True`.

    Raises
    ------
    ValueError
        If `return_X_y_frame`, `predictors_only` or `handle_missing` is not a
        bool, or if `cabin` is not one of None, 'letter_only' or 'drop'.

    Examples
    --------
    >>> from feature_engine.datasets import load_titanic
    >>> data = load_titanic(predictors_only=True, cabin="drop")
    >>> print(data.head())
       pclass  survived     sex      age  sibsp  parch      fare embarked
    0       1         1  female  29.0000      0      0  211.3375        S
    1       1         1    male   0.9167      1      2  151.5500        S
    2       1         0  female   2.0000      1      2  151.5500        S
    3       1         0    male  30.0000      1      2  151.5500        S
    4       1         0  female  25.0000      1      2  151.5500        S
    """
    # Validate every argument up front so that a bad call fails before the
    # dataset is downloaded. The dict keeps the original checking order.
    boolean_flags = {
        "return_X_y_frame": return_X_y_frame,
        "predictors_only": predictors_only,
        "handle_missing": handle_missing,
    }
    for flag_name, flag_value in boolean_flags.items():
        if not isinstance(flag_value, bool):
            raise ValueError(
                f"{flag_name} takes only booleans True and False. "
                f"Got {flag_value} instead."
            )

    # The isinstance test runs first so that unhashable or otherwise odd
    # values never reach the membership comparison.
    if cabin is not None and (
        not isinstance(cabin, str) or cabin not in ("letter_only", "drop")
    ):
        raise ValueError(
            "the parameter 'cabin' takes only values None, 'letter_only' and "
            f"'drop'. Got {cabin} instead."
        )

    # load and prepare data
    df = pd.read_csv("https://www.openml.org/data/get_csv/16826755/phpMYEkMl")

    # "?" is treated as the missing-value marker of the raw file; turning it
    # into NaN is what allows age and fare to be cast to floats below.
    df = df.replace("?", np.nan)
    df["age"] = df["age"].astype("float64")
    df["fare"] = df["fare"].astype("float64")

    if predictors_only:
        df = df.drop(columns=["name", "ticket", "home.dest", "boat", "body"])

    if handle_missing:
        # Categorical imputation runs first, then mean imputation of the
        # numerical variables.
        pipeline = Pipeline(
            steps=[
                (
                    "categorical_imputer",
                    CategoricalImputer(imputation_method="missing"),
                ),
                ("mean_median_imputer", MeanMedianImputer(imputation_method="mean")),
            ]
        )
        df = pipeline.fit_transform(df)

    if cabin == "letter_only":
        # NOTE: astype(str) also stringifies missing entries, so a cabin that
        # is still NaN at this point becomes "nan" and is reported as "n".
        # This is long-standing behaviour and is kept as is on purpose.
        df["cabin"] = df["cabin"].astype(str).str[0]
    elif cabin == "drop":
        df = df.drop(columns=["cabin"])

    if return_X_y_frame:
        X = df.drop(columns="survived")
        y = df["survived"]
        return X, y

    return df