survivalist.preprocessing

  1# This program is free software: you can redistribute it and/or modify
  2# it under the terms of the GNU General Public License as published by
  3# the Free Software Foundation, either version 3 of the License, or
  4# (at your option) any later version.
  5#
  6# This program is distributed in the hope that it will be useful,
  7# but WITHOUT ANY WARRANTY; without even the implied warranty of
  8# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  9# GNU General Public License for more details.
 10#
 11# You should have received a copy of the GNU General Public License
 12# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 13from sklearn.base import BaseEstimator, TransformerMixin
 14from sklearn.utils.validation import _check_feature_names_in, check_is_fitted
 15
 16from .column import encode_categorical
 17
 18__all__ = ["OneHotEncoder"]
 19
 20
 21def check_columns_exist(actual, expected):
 22    missing_features = expected.difference(actual)
 23    if len(missing_features) != 0:
 24        raise ValueError(
 25            f"{len(missing_features)} features are missing from data: {missing_features.tolist()}")
 26
 27
 28class OneHotEncoder(BaseEstimator, TransformerMixin):
 29    """Encode categorical columns with `M` categories into `M-1` columns according
 30    to the one-hot scheme.
 31
 32    The order of non-categorical columns is preserved, encoded columns are inserted
 33    inplace of the original column.
 34
 35    Parameters
 36    ----------
 37    allow_drop : boolean, optional, default: True
 38        Whether to allow dropping categorical columns that only consist
 39        of a single category.
 40
 41    Attributes
 42    ----------
 43    feature_names_ : pandas.Index
 44        List of encoded columns.
 45
 46    categories_ : dict
 47        Categories of encoded columns.
 48
 49    encoded_columns_ : list
 50        Name of columns after encoding.
 51        Includes names of non-categorical columns.
 52
 53    n_features_in_ : int
 54        Number of features seen during ``fit``.
 55
 56    feature_names_in_ : ndarray of shape (`n_features_in_`,)
 57        Names of features seen during ``fit``. Defined only when `X`
 58        has feature names that are all strings.
 59    """
 60
 61    def __init__(self, *, allow_drop=True):
 62        self.allow_drop = allow_drop
 63
 64    def fit(self, X, y=None):  # pylint: disable=unused-argument
 65        """Retrieve categorical columns.
 66
 67        Parameters
 68        ----------
 69        X : pandas.DataFrame
 70            Data to encode.
 71        y :
 72            Ignored. For compatibility with Pipeline.
 73        Returns
 74        -------
 75        self : object
 76            Returns self
 77        """
 78        self.fit_transform(X)
 79        return self
 80
 81    def _encode(self, X, columns_to_encode):
 82        return encode_categorical(X, columns=columns_to_encode, allow_drop=self.allow_drop)
 83
 84    def fit_transform(self, X, y=None, **fit_params):  # pylint: disable=unused-argument
 85        """Convert categorical columns to numeric values.
 86
 87        Parameters
 88        ----------
 89        X : pandas.DataFrame
 90            Data to encode.
 91        y :
 92            Ignored. For compatibility with TransformerMixin.
 93        fit_params :
 94            Ignored. For compatibility with TransformerMixin.
 95
 96        Returns
 97        -------
 98        Xt : pandas.DataFrame
 99            Encoded data.
100        """
101        self._check_feature_names(X, reset=True)
102        self._check_n_features(X, reset=True)
103        columns_to_encode = X.select_dtypes(
104            include=["object", "category"]).columns
105        x_dummy = self._encode(X, columns_to_encode)
106
107        self.feature_names_ = columns_to_encode
108        self.categories_ = {k: X[k].cat.categories for k in columns_to_encode}
109        self.encoded_columns_ = x_dummy.columns
110        return x_dummy
111
112    def transform(self, X):
113        """Convert categorical columns to numeric values.
114
115        Parameters
116        ----------
117        X : pandas.DataFrame
118            Data to encode.
119
120        Returns
121        -------
122        Xt : pandas.DataFrame
123            Encoded data.
124        """
125        check_is_fitted(self, "encoded_columns_")
126        self._check_n_features(X, reset=False)
127        check_columns_exist(X.columns, self.feature_names_)
128
129        Xt = X.copy()
130        for col, cat in self.categories_.items():
131            Xt[col] = Xt[col].cat.set_categories(cat)
132
133        new_data = self._encode(Xt, self.feature_names_)
134        return new_data.loc[:, self.encoded_columns_]
135
136    def get_feature_names_out(self, input_features=None):
137        """Get output feature names for transformation.
138
139        Parameters
140        ----------
141        input_features : array-like of str or None, default=None
142            Input features.
143
144            - If `input_features` is `None`, then `feature_names_in_` is
145              used as feature names in.
146            - If `input_features` is an array-like, then `input_features` must
147              match `feature_names_in_` if `feature_names_in_` is defined.
148
149        Returns
150        -------
151        feature_names_out : ndarray of str objects
152            Transformed feature names.
153        """
154        check_is_fitted(self, "encoded_columns_")
155        input_features = _check_feature_names_in(self, input_features)
156
157        return self.encoded_columns_.values.copy()
class OneHotEncoder(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
 29class OneHotEncoder(BaseEstimator, TransformerMixin):
 30    """Encode categorical columns with `M` categories into `M-1` columns according
 31    to the one-hot scheme.
 32
 33    The order of non-categorical columns is preserved, encoded columns are inserted
 34    inplace of the original column.
 35
 36    Parameters
 37    ----------
 38    allow_drop : boolean, optional, default: True
 39        Whether to allow dropping categorical columns that only consist
 40        of a single category.
 41
 42    Attributes
 43    ----------
 44    feature_names_ : pandas.Index
 45        List of encoded columns.
 46
 47    categories_ : dict
 48        Categories of encoded columns.
 49
 50    encoded_columns_ : list
 51        Name of columns after encoding.
 52        Includes names of non-categorical columns.
 53
 54    n_features_in_ : int
 55        Number of features seen during ``fit``.
 56
 57    feature_names_in_ : ndarray of shape (`n_features_in_`,)
 58        Names of features seen during ``fit``. Defined only when `X`
 59        has feature names that are all strings.
 60    """
 61
 62    def __init__(self, *, allow_drop=True):
 63        self.allow_drop = allow_drop
 64
 65    def fit(self, X, y=None):  # pylint: disable=unused-argument
 66        """Retrieve categorical columns.
 67
 68        Parameters
 69        ----------
 70        X : pandas.DataFrame
 71            Data to encode.
 72        y :
 73            Ignored. For compatibility with Pipeline.
 74        Returns
 75        -------
 76        self : object
 77            Returns self
 78        """
 79        self.fit_transform(X)
 80        return self
 81
 82    def _encode(self, X, columns_to_encode):
 83        return encode_categorical(X, columns=columns_to_encode, allow_drop=self.allow_drop)
 84
 85    def fit_transform(self, X, y=None, **fit_params):  # pylint: disable=unused-argument
 86        """Convert categorical columns to numeric values.
 87
 88        Parameters
 89        ----------
 90        X : pandas.DataFrame
 91            Data to encode.
 92        y :
 93            Ignored. For compatibility with TransformerMixin.
 94        fit_params :
 95            Ignored. For compatibility with TransformerMixin.
 96
 97        Returns
 98        -------
 99        Xt : pandas.DataFrame
100            Encoded data.
101        """
102        self._check_feature_names(X, reset=True)
103        self._check_n_features(X, reset=True)
104        columns_to_encode = X.select_dtypes(
105            include=["object", "category"]).columns
106        x_dummy = self._encode(X, columns_to_encode)
107
108        self.feature_names_ = columns_to_encode
109        self.categories_ = {k: X[k].cat.categories for k in columns_to_encode}
110        self.encoded_columns_ = x_dummy.columns
111        return x_dummy
112
113    def transform(self, X):
114        """Convert categorical columns to numeric values.
115
116        Parameters
117        ----------
118        X : pandas.DataFrame
119            Data to encode.
120
121        Returns
122        -------
123        Xt : pandas.DataFrame
124            Encoded data.
125        """
126        check_is_fitted(self, "encoded_columns_")
127        self._check_n_features(X, reset=False)
128        check_columns_exist(X.columns, self.feature_names_)
129
130        Xt = X.copy()
131        for col, cat in self.categories_.items():
132            Xt[col] = Xt[col].cat.set_categories(cat)
133
134        new_data = self._encode(Xt, self.feature_names_)
135        return new_data.loc[:, self.encoded_columns_]
136
137    def get_feature_names_out(self, input_features=None):
138        """Get output feature names for transformation.
139
140        Parameters
141        ----------
142        input_features : array-like of str or None, default=None
143            Input features.
144
145            - If `input_features` is `None`, then `feature_names_in_` is
146              used as feature names in.
147            - If `input_features` is an array-like, then `input_features` must
148              match `feature_names_in_` if `feature_names_in_` is defined.
149
150        Returns
151        -------
152        feature_names_out : ndarray of str objects
153            Transformed feature names.
154        """
155        check_is_fitted(self, "encoded_columns_")
156        input_features = _check_feature_names_in(self, input_features)
157
158        return self.encoded_columns_.values.copy()

Encode categorical columns with M categories into M-1 columns according to the one-hot scheme.

The order of non-categorical columns is preserved; encoded columns are inserted in place of the original column.

Parameters

allow_drop : boolean, optional, default: True Whether to allow dropping categorical columns that only consist of a single category.

Attributes

feature_names_ : pandas.Index List of encoded columns.

categories_ : dict Categories of encoded columns.

encoded_columns_ : pandas.Index Names of columns after encoding. Includes names of non-categorical columns.

n_features_in_ : int Number of features seen during fit.

feature_names_in_ : ndarray of shape (n_features_in_,) Names of features seen during fit. Defined only when X has feature names that are all strings.

def fit(self, X, y=None):
65    def fit(self, X, y=None):  # pylint: disable=unused-argument
66        """Retrieve categorical columns.
67
68        Parameters
69        ----------
70        X : pandas.DataFrame
71            Data to encode.
72        y :
73            Ignored. For compatibility with Pipeline.
74        Returns
75        -------
76        self : object
77            Returns self
78        """
79        self.fit_transform(X)
80        return self

Retrieve categorical columns.

Parameters

X : pandas.DataFrame Data to encode. y : Ignored. For compatibility with Pipeline.

Returns

self : object Returns self