survivalist.preprocessing
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import _check_feature_names_in, check_is_fitted

from .column import encode_categorical

__all__ = ["OneHotEncoder"]


def check_columns_exist(actual, expected):
    """Raise if any of the `expected` column names is absent from `actual`.

    Parameters
    ----------
    actual : pandas.Index
        Column names that are present in the data.
    expected : pandas.Index
        Column names that must be present.

    Raises
    ------
    ValueError
        If at least one name in `expected` is not contained in `actual`.
    """
    missing_features = expected.difference(actual)
    if len(missing_features) != 0:
        raise ValueError(
            f"{len(missing_features)} features are missing from data: {missing_features.tolist()}")


class OneHotEncoder(BaseEstimator, TransformerMixin):
    """Encode categorical columns with `M` categories into `M-1` columns according
    to the one-hot scheme.

    The order of non-categorical columns is preserved, encoded columns are inserted
    in place of the original column.

    Parameters
    ----------
    allow_drop : boolean, optional, default: True
        Whether to allow dropping categorical columns that only consist
        of a single category.

    Attributes
    ----------
    feature_names_ : pandas.Index
        Names of the columns that are encoded, i.e. the columns of
        ``object`` or ``category`` dtype seen during ``fit``.

    categories_ : dict
        Maps the name of each encoded column to its categories.

    encoded_columns_ : pandas.Index
        Name of columns after encoding.
        Includes names of non-categorical columns.

    n_features_in_ : int
        Number of features seen during ``fit``.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during ``fit``. Defined only when `X`
        has feature names that are all strings.
    """

    def __init__(self, *, allow_drop=True):
        self.allow_drop = allow_drop

    def fit(self, X, y=None):  # pylint: disable=unused-argument
        """Retrieve categorical columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Data to encode.
        y :
            Ignored. For compatibility with Pipeline.

        Returns
        -------
        self : object
            Returns self
        """
        # Fitting and encoding share one code path; the encoded frame is discarded.
        self.fit_transform(X)
        return self

    def _encode(self, X, columns_to_encode):
        """Encode `columns_to_encode` of `X`, honouring ``allow_drop``."""
        return encode_categorical(X, columns=columns_to_encode, allow_drop=self.allow_drop)

    def fit_transform(self, X, y=None, **fit_params):  # pylint: disable=unused-argument
        """Convert categorical columns to numeric values.

        Parameters
        ----------
        X : pandas.DataFrame
            Data to encode.
        y :
            Ignored. For compatibility with TransformerMixin.
        fit_params :
            Ignored. For compatibility with TransformerMixin.

        Returns
        -------
        Xt : pandas.DataFrame
            Encoded data.
        """
        self._check_feature_names(X, reset=True)
        self._check_n_features(X, reset=True)
        columns_to_encode = X.select_dtypes(
            include=["object", "category"]).columns
        x_dummy = self._encode(X, columns_to_encode)

        self.feature_names_ = columns_to_encode
        # Columns of ``object`` dtype have no ``.cat`` accessor, so cast first.
        # The cast leaves columns that are already categorical unchanged.
        self.categories_ = {
            k: X[k].astype("category").cat.categories for k in columns_to_encode
        }
        self.encoded_columns_ = x_dummy.columns
        return x_dummy

    def transform(self, X):
        """Convert categorical columns to numeric values.

        Parameters
        ----------
        X : pandas.DataFrame
            Data to encode.

        Returns
        -------
        Xt : pandas.DataFrame
            Encoded data.

        Raises
        ------
        ValueError
            If a column that was encoded during ``fit`` is missing from `X`.
        """
        check_is_fitted(self, "encoded_columns_")
        self._check_n_features(X, reset=False)
        check_columns_exist(X.columns, self.feature_names_)

        # Work on a copy so the caller's data frame is not modified.
        Xt = X.copy()
        for col, cat in self.categories_.items():
            # Cast first so that ``object`` columns gain the ``.cat`` accessor,
            # then align the categories with those recorded during fit.
            Xt[col] = Xt[col].astype("category").cat.set_categories(cat)

        new_data = self._encode(Xt, self.feature_names_)
        # Return the columns in the order established during fit.
        return new_data.loc[:, self.encoded_columns_]

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        """
        check_is_fitted(self, "encoded_columns_")
        # Called for validation only; the output names were fixed during fit.
        _check_feature_names_in(self, input_features)

        return self.encoded_columns_.values.copy()
class OneHotEncoder(BaseEstimator, TransformerMixin):
    """Encode categorical columns with `M` categories into `M-1` columns according
    to the one-hot scheme.

    The order of non-categorical columns is preserved, encoded columns are inserted
    in place of the original column.

    Parameters
    ----------
    allow_drop : boolean, optional, default: True
        Whether to allow dropping categorical columns that only consist
        of a single category.

    Attributes
    ----------
    feature_names_ : pandas.Index
        Names of the columns that are encoded, i.e. the columns of
        ``object`` or ``category`` dtype seen during ``fit``.

    categories_ : dict
        Maps the name of each encoded column to its categories.

    encoded_columns_ : pandas.Index
        Name of columns after encoding.
        Includes names of non-categorical columns.

    n_features_in_ : int
        Number of features seen during ``fit``.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during ``fit``. Defined only when `X`
        has feature names that are all strings.
    """

    def __init__(self, *, allow_drop=True):
        self.allow_drop = allow_drop

    def fit(self, X, y=None):  # pylint: disable=unused-argument
        """Retrieve categorical columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Data to encode.
        y :
            Ignored. For compatibility with Pipeline.

        Returns
        -------
        self : object
            Returns self
        """
        # Fitting and encoding share one code path; the encoded frame is discarded.
        self.fit_transform(X)
        return self

    def _encode(self, X, columns_to_encode):
        """Encode `columns_to_encode` of `X`, honouring ``allow_drop``."""
        return encode_categorical(X, columns=columns_to_encode, allow_drop=self.allow_drop)

    def fit_transform(self, X, y=None, **fit_params):  # pylint: disable=unused-argument
        """Convert categorical columns to numeric values.

        Parameters
        ----------
        X : pandas.DataFrame
            Data to encode.
        y :
            Ignored. For compatibility with TransformerMixin.
        fit_params :
            Ignored. For compatibility with TransformerMixin.

        Returns
        -------
        Xt : pandas.DataFrame
            Encoded data.
        """
        self._check_feature_names(X, reset=True)
        self._check_n_features(X, reset=True)
        columns_to_encode = X.select_dtypes(
            include=["object", "category"]).columns
        x_dummy = self._encode(X, columns_to_encode)

        self.feature_names_ = columns_to_encode
        # Columns of ``object`` dtype have no ``.cat`` accessor, so cast first.
        # The cast leaves columns that are already categorical unchanged.
        self.categories_ = {
            k: X[k].astype("category").cat.categories for k in columns_to_encode
        }
        self.encoded_columns_ = x_dummy.columns
        return x_dummy

    def transform(self, X):
        """Convert categorical columns to numeric values.

        Parameters
        ----------
        X : pandas.DataFrame
            Data to encode.

        Returns
        -------
        Xt : pandas.DataFrame
            Encoded data.

        Raises
        ------
        ValueError
            If a column that was encoded during ``fit`` is missing from `X`.
        """
        check_is_fitted(self, "encoded_columns_")
        self._check_n_features(X, reset=False)
        check_columns_exist(X.columns, self.feature_names_)

        # Work on a copy so the caller's data frame is not modified.
        Xt = X.copy()
        for col, cat in self.categories_.items():
            # Cast first so that ``object`` columns gain the ``.cat`` accessor,
            # then align the categories with those recorded during fit.
            Xt[col] = Xt[col].astype("category").cat.set_categories(cat)

        new_data = self._encode(Xt, self.feature_names_)
        # Return the columns in the order established during fit.
        return new_data.loc[:, self.encoded_columns_]

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        """
        check_is_fitted(self, "encoded_columns_")
        # Called for validation only; the output names were fixed during fit.
        _check_feature_names_in(self, input_features)

        return self.encoded_columns_.values.copy()
Encode categorical columns with M categories into M-1 columns according
to the one-hot scheme.
The order of non-categorical columns is preserved; encoded columns are inserted in place of the original column.
Parameters
allow_drop : boolean, optional, default: True Whether to allow dropping categorical columns that only consist of a single category.
Attributes
feature_names_ : pandas.Index List of encoded columns.
categories_ : dict Categories of encoded columns.
encoded_columns_ : list Names of columns after encoding. Includes names of non-categorical columns.
n_features_in_ : int
Number of features seen during fit.
feature_names_in_ : ndarray of shape (n_features_in_,)
Names of features seen during fit. Defined only when X
has feature names that are all strings.
def fit(self, X, y=None):  # pylint: disable=unused-argument
    """Fit the encoder on `X`.

    Delegates to ``fit_transform`` and discards the encoded data it returns.

    Parameters
    ----------
    X : pandas.DataFrame
        Data to encode.
    y :
        Ignored. For compatibility with Pipeline.

    Returns
    -------
    self : object
        Returns self
    """
    # Only the fitted state is of interest here, not the encoded frame.
    _ = self.fit_transform(X)
    return self
Retrieve categorical columns.
Parameters
X : pandas.DataFrame Data to encode. y : Ignored. For compatibility with Pipeline.
Returns
self : object Returns self