survivalist.column
1# This program is free software: you can redistribute it and/or modify 2# it under the terms of the GNU General Public License as published by 3# the Free Software Foundation, either version 3 of the License, or 4# (at your option) any later version. 5# 6# This program is distributed in the hope that it will be useful, 7# but WITHOUT ANY WARRANTY; without even the implied warranty of 8# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 9# GNU General Public License for more details. 10# 11# You should have received a copy of the GNU General Public License 12# along with this program. If not, see <http://www.gnu.org/licenses/>. 13import logging 14 15import numpy as np 16import pandas as pd 17from pandas.api.types import CategoricalDtype, is_object_dtype 18 19__all__ = ["categorical_to_numeric", "encode_categorical", "standardize"] 20 21 22def _apply_along_column(array, func1d, **kwargs): 23 if isinstance(array, pd.DataFrame): 24 return array.apply(func1d, **kwargs) 25 return np.apply_along_axis(func1d, 0, array, **kwargs) 26 27 28def standardize_column(series_or_array, with_std=True): 29 d = series_or_array.dtype 30 if issubclass(d.type, np.number): 31 output = series_or_array.astype(float) 32 m = series_or_array.mean() 33 output -= m 34 35 if with_std: 36 s = series_or_array.std(ddof=1) 37 output /= s 38 39 return output 40 41 return series_or_array 42 43 44def standardize(table, with_std=True): 45 """ 46 Perform Z-Normalization on each numeric column of the given table. 47 48 If `table` is a pandas.DataFrame, only numeric columns are modified, 49 all other columns remain unchanged. If `table` is a numpy.ndarray, 50 it is only modified if it has numeric dtype, in which case the returned 51 array will have floating point dtype. 52 53 Parameters 54 ---------- 55 table : pandas.DataFrame or numpy.ndarray 56 Data to standardize. 57 58 with_std : bool, optional, default: True 59 If ``False`` data is only centered and not converted to unit variance. 
60 61 Returns 62 ------- 63 normalized : pandas.DataFrame 64 Table with numeric columns normalized. 65 Categorical columns in the input table remain unchanged. 66 """ 67 new_frame = _apply_along_column( 68 table, standardize_column, with_std=with_std) 69 70 return new_frame 71 72 73def _encode_categorical_series(series, allow_drop=True): 74 values = _get_dummies_1d(series, allow_drop=allow_drop) 75 if values is None: 76 return 77 78 enc, levels = values 79 if enc is None: 80 return pd.Series(index=series.index, name=series.name, dtype=series.dtype) 81 82 if not allow_drop and enc.shape[1] == 1: 83 return series 84 85 names = [] 86 for key in range(1, enc.shape[1]): 87 names.append(f"{series.name}={levels[key]}") 88 series = pd.DataFrame(enc[:, 1:], columns=names, index=series.index) 89 90 return series 91 92 93def encode_categorical(table, columns=None, **kwargs): 94 """ 95 Encode categorical columns with `M` categories into `M-1` columns according 96 to the one-hot scheme. 97 98 Parameters 99 ---------- 100 table : pandas.DataFrame 101 Table with categorical columns to encode. 102 103 columns : list-like, optional, default: None 104 Column names in the DataFrame to be encoded. 105 If `columns` is None then all the columns with 106 `object` or `category` dtype will be converted. 107 108 allow_drop : boolean, optional, default: True 109 Whether to allow dropping categorical columns that only consist 110 of a single category. 111 112 Returns 113 ------- 114 encoded : pandas.DataFrame 115 Table with categorical columns encoded as numeric. 116 Numeric columns in the input table remain unchanged. 
117 """ 118 if isinstance(table, pd.Series): 119 if not isinstance(table.dtype, CategoricalDtype) and not is_object_dtype(table.dtype): 120 raise TypeError( 121 f"series must be of categorical dtype, but was {table.dtype}") 122 return _encode_categorical_series(table, **kwargs) 123 124 def _is_categorical_or_object(series): 125 return isinstance(series.dtype, CategoricalDtype) or is_object_dtype(series.dtype) 126 127 if columns is None: 128 # for columns containing categories 129 columns_to_encode = {nam for nam, 130 s in table.items() if _is_categorical_or_object(s)} 131 else: 132 columns_to_encode = set(columns) 133 134 items = [] 135 for name, series in table.items(): 136 if name in columns_to_encode: 137 series = _encode_categorical_series(series, **kwargs) 138 if series is None: 139 continue 140 items.append(series) 141 142 # concat columns of tables 143 new_table = pd.concat(items, axis=1, copy=False) 144 return new_table 145 146 147def _get_dummies_1d(data, allow_drop=True): 148 # Series avoids inconsistent NaN handling 149 cat = pd.Categorical(data) 150 levels = cat.categories 151 number_of_cols = len(levels) 152 153 # if all NaN or only one level 154 if allow_drop and number_of_cols < 2: 155 logging.getLogger(__package__).warning( 156 f"dropped categorical variable {data.name!r}, because it has only {number_of_cols} values" 157 ) 158 return 159 if number_of_cols == 0: 160 return None, levels 161 162 dummy_mat = np.eye(number_of_cols).take(cat.codes, axis=0) 163 164 # reset NaN GH4446 165 dummy_mat[cat.codes == -1] = np.nan 166 167 return dummy_mat, levels 168 169 170def categorical_to_numeric(table): 171 """Encode categorical columns to numeric by converting each category to 172 an integer value. 173 174 Parameters 175 ---------- 176 table : pandas.DataFrame 177 Table with categorical columns to encode. 178 179 Returns 180 ------- 181 encoded : pandas.DataFrame 182 Table with categorical columns encoded as numeric. 
183 Numeric columns in the input table remain unchanged. 184 """ 185 186 def transform(column): 187 if isinstance(column.dtype, CategoricalDtype): 188 return column.cat.codes 189 if is_object_dtype(column.dtype): 190 try: 191 nc = column.astype(np.int64) 192 except ValueError: 193 classes = column.dropna().unique() 194 classes.sort(kind="mergesort") 195 nc = column.map(dict(zip(classes, range(classes.shape[0])))) 196 return nc 197 if column.dtype == bool: 198 return column.astype(np.int64) 199 200 return column 201 202 if isinstance(table, pd.Series): 203 return pd.Series(transform(table), name=table.name, index=table.index) 204 return table.apply(transform, axis=0, result_type="expand")
def categorical_to_numeric(table):
    """Encode categorical columns to numeric by converting each category to
    an integer value.

    Parameters
    ----------
    table : pandas.DataFrame or pandas.Series
        Table with categorical columns to encode.

    Returns
    -------
    encoded : pandas.DataFrame or pandas.Series
        Table with categorical columns encoded as numeric.
        Numeric columns in the input table remain unchanged.
    """

    def transform(column):
        if isinstance(column.dtype, CategoricalDtype):
            return column.cat.codes
        if is_object_dtype(column.dtype):
            try:
                nc = column.astype(np.int64)
            except (ValueError, TypeError):
                # ValueError: a value such as "a" is not an integer literal;
                # TypeError: a value such as None cannot be passed to int().
                # Either way fall back to label encoding in sorted order;
                # missing values are not part of the mapping and become NaN.
                classes = np.sort(column.dropna().unique(), kind="mergesort")
                nc = column.map(dict(zip(classes, range(classes.shape[0]))))
            return nc
        if column.dtype == bool:
            return column.astype(np.int64)

        return column

    if isinstance(table, pd.Series):
        return pd.Series(transform(table), name=table.name, index=table.index)
    return table.apply(transform, axis=0, result_type="expand")
Encode categorical columns to numeric by converting each category to an integer value.
Parameters
table : pandas.DataFrame Table with categorical columns to encode.
Returns
encoded : pandas.DataFrame Table with categorical columns encoded as numeric. Numeric columns in the input table remain unchanged.
def encode_categorical(table, columns=None, **kwargs):
    """
    Encode categorical columns with `M` categories into `M-1` columns according
    to the one-hot scheme.

    Parameters
    ----------
    table : pandas.DataFrame or pandas.Series
        Table with categorical columns to encode. A Series must have
        `object` or `category` dtype.

    columns : str or list-like, optional, default: None
        Column names in the DataFrame to be encoded. A single string is
        treated as one column name.
        If `columns` is None then all the columns with
        `object` or `category` dtype will be converted.

    allow_drop : boolean, optional, default: True
        Whether to allow dropping categorical columns that only consist
        of a single category.

    Returns
    -------
    encoded : pandas.DataFrame
        Table with categorical columns encoded as numeric.
        Numeric columns in the input table remain unchanged.
        If no column remains, an empty DataFrame sharing the index
        of `table` is returned.

    Raises
    ------
    TypeError
        If `table` is a Series that has neither `object` nor `category` dtype.
    """
    if isinstance(table, pd.Series):
        if not isinstance(table.dtype, CategoricalDtype) and not is_object_dtype(table.dtype):
            raise TypeError(
                f"series must be of categorical dtype, but was {table.dtype}")
        return _encode_categorical_series(table, **kwargs)

    def _is_categorical_or_object(series):
        return isinstance(series.dtype, CategoricalDtype) or is_object_dtype(series.dtype)

    if columns is None:
        # for columns containing categories
        columns_to_encode = {
            nam for nam, s in table.items() if _is_categorical_or_object(s)}
    elif isinstance(columns, str):
        # set("abc") would split the name into single characters
        columns_to_encode = {columns}
    else:
        columns_to_encode = set(columns)

    items = []
    for name, series in table.items():
        if name in columns_to_encode:
            series = _encode_categorical_series(series, **kwargs)
            if series is None:
                continue
        items.append(series)

    if not items:
        # pd.concat raises ValueError when given nothing to concatenate
        return pd.DataFrame(index=table.index)

    # concat columns of tables
    return pd.concat(items, axis=1, copy=False)
Encode categorical columns with M categories into M-1 columns according
to the one-hot scheme.
Parameters
table : pandas.DataFrame Table with categorical columns to encode.
columns : list-like, optional, default: None
Column names in the DataFrame to be encoded.
If columns is None then all the columns with
object or category dtype will be converted.
allow_drop : boolean, optional, default: True Whether to allow dropping categorical columns that only consist of a single category.
Returns
encoded : pandas.DataFrame Table with categorical columns encoded as numeric. Numeric columns in the input table remain unchanged.
def standardize(table, with_std=True):
    """
    Z-normalize every numeric column of `table`.

    The work is delegated column by column to `standardize_column` through
    `_apply_along_column`, so the result has the same container type as
    the input.

    Parameters
    ----------
    table : pandas.DataFrame or numpy.ndarray
        Data to standardize.

    with_std : bool, optional, default: True
        If ``False``, data is only centered and not converted to unit variance.

    Returns
    -------
    normalized : pandas.DataFrame or numpy.ndarray
        Table with numeric columns normalized.
        Categorical columns in the input table remain unchanged.
    """
    return _apply_along_column(table, standardize_column, with_std=with_std)
Perform Z-Normalization on each numeric column of the given table.
If table is a pandas.DataFrame, only numeric columns are modified,
all other columns remain unchanged. If table is a numpy.ndarray,
it is only modified if it has numeric dtype, in which case the returned
array will have floating point dtype.
Parameters
table : pandas.DataFrame or numpy.ndarray Data to standardize.
with_std : bool, optional, default: True
If False, data is only centered and not converted to unit variance.
Returns
normalized : pandas.DataFrame or numpy.ndarray Table with numeric columns normalized; the result has the same container type as the input. Categorical columns in the input table remain unchanged.