survivalist.column

  1# This program is free software: you can redistribute it and/or modify
  2# it under the terms of the GNU General Public License as published by
  3# the Free Software Foundation, either version 3 of the License, or
  4# (at your option) any later version.
  5#
  6# This program is distributed in the hope that it will be useful,
  7# but WITHOUT ANY WARRANTY; without even the implied warranty of
  8# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  9# GNU General Public License for more details.
 10#
 11# You should have received a copy of the GNU General Public License
 12# along with this program.  If not, see <http://www.gnu.org/licenses/>.
 13import logging
 14
 15import numpy as np
 16import pandas as pd
 17from pandas.api.types import CategoricalDtype, is_object_dtype
 18
 19__all__ = ["categorical_to_numeric", "encode_categorical", "standardize"]
 20
 21
 22def _apply_along_column(array, func1d, **kwargs):
 23    if isinstance(array, pd.DataFrame):
 24        return array.apply(func1d, **kwargs)
 25    return np.apply_along_axis(func1d, 0, array, **kwargs)
 26
 27
 28def standardize_column(series_or_array, with_std=True):
 29    d = series_or_array.dtype
 30    if issubclass(d.type, np.number):
 31        output = series_or_array.astype(float)
 32        m = series_or_array.mean()
 33        output -= m
 34
 35        if with_std:
 36            s = series_or_array.std(ddof=1)
 37            output /= s
 38
 39        return output
 40
 41    return series_or_array
 42
 43
 44def standardize(table, with_std=True):
 45    """
 46    Perform Z-Normalization on each numeric column of the given table.
 47
 48    If `table` is a pandas.DataFrame, only numeric columns are modified,
 49    all other columns remain unchanged. If `table` is a numpy.ndarray,
 50    it is only modified if it has numeric dtype, in which case the returned
 51    array will have floating point dtype.
 52
 53    Parameters
 54    ----------
 55    table : pandas.DataFrame or numpy.ndarray
 56        Data to standardize.
 57
 58    with_std : bool, optional, default: True
 59        If ``False`` data is only centered and not converted to unit variance.
 60
 61    Returns
 62    -------
 63    normalized : pandas.DataFrame
 64        Table with numeric columns normalized.
 65        Categorical columns in the input table remain unchanged.
 66    """
 67    new_frame = _apply_along_column(
 68        table, standardize_column, with_std=with_std)
 69
 70    return new_frame
 71
 72
 73def _encode_categorical_series(series, allow_drop=True):
 74    values = _get_dummies_1d(series, allow_drop=allow_drop)
 75    if values is None:
 76        return
 77
 78    enc, levels = values
 79    if enc is None:
 80        return pd.Series(index=series.index, name=series.name, dtype=series.dtype)
 81
 82    if not allow_drop and enc.shape[1] == 1:
 83        return series
 84
 85    names = []
 86    for key in range(1, enc.shape[1]):
 87        names.append(f"{series.name}={levels[key]}")
 88    series = pd.DataFrame(enc[:, 1:], columns=names, index=series.index)
 89
 90    return series
 91
 92
 93def encode_categorical(table, columns=None, **kwargs):
 94    """
 95    Encode categorical columns with `M` categories into `M-1` columns according
 96    to the one-hot scheme.
 97
 98    Parameters
 99    ----------
100    table : pandas.DataFrame
101        Table with categorical columns to encode.
102
103    columns : list-like, optional, default: None
104        Column names in the DataFrame to be encoded.
105        If `columns` is None then all the columns with
106        `object` or `category` dtype will be converted.
107
108    allow_drop : boolean, optional, default: True
109        Whether to allow dropping categorical columns that only consist
110        of a single category.
111
112    Returns
113    -------
114    encoded : pandas.DataFrame
115        Table with categorical columns encoded as numeric.
116        Numeric columns in the input table remain unchanged.
117    """
118    if isinstance(table, pd.Series):
119        if not isinstance(table.dtype, CategoricalDtype) and not is_object_dtype(table.dtype):
120            raise TypeError(
121                f"series must be of categorical dtype, but was {table.dtype}")
122        return _encode_categorical_series(table, **kwargs)
123
124    def _is_categorical_or_object(series):
125        return isinstance(series.dtype, CategoricalDtype) or is_object_dtype(series.dtype)
126
127    if columns is None:
128        # for columns containing categories
129        columns_to_encode = {nam for nam,
130                             s in table.items() if _is_categorical_or_object(s)}
131    else:
132        columns_to_encode = set(columns)
133
134    items = []
135    for name, series in table.items():
136        if name in columns_to_encode:
137            series = _encode_categorical_series(series, **kwargs)
138            if series is None:
139                continue
140        items.append(series)
141
142    # concat columns of tables
143    new_table = pd.concat(items, axis=1, copy=False)
144    return new_table
145
146
def _get_dummies_1d(data, allow_drop=True):
    """Build the full indicator matrix for one categorical variable.

    Returns ``None`` (after logging a warning) when ``allow_drop`` is set and
    fewer than two levels exist, ``(None, levels)`` when there are no levels,
    and otherwise ``(matrix, levels)`` where ``matrix`` has one row per
    observation and one column per level. Rows of missing observations are
    filled with NaN.
    """
    cat = pd.Categorical(data)
    levels = cat.categories
    number_of_cols = len(levels)

    # all values missing, or just a single level
    if number_of_cols < 2 and allow_drop:
        logger = logging.getLogger(__package__)
        logger.warning(
            f"dropped categorical variable {data.name!r}, because it has only {number_of_cols} values"
        )
        return None

    if not number_of_cols:
        return None, levels

    codes = cat.codes
    dummy_mat = np.eye(number_of_cols).take(codes, axis=0)

    # a code of -1 marks a missing value; blank out the corresponding rows
    dummy_mat[codes == -1] = np.nan

    return dummy_mat, levels
168
169
def categorical_to_numeric(table):
    """Encode categorical columns to numeric by converting each category to
    an integer value.

    Columns of `category` dtype are replaced by their category codes.
    Columns of `object` dtype are cast to 64-bit integers if possible;
    otherwise their distinct non-missing values are sorted and numbered
    from zero, with missing entries left as missing. Boolean columns are
    cast to 64-bit integers. All other columns are passed through.

    Parameters
    ----------
    table : pandas.DataFrame or pandas.Series
        Table with categorical columns to encode.

    Returns
    -------
    encoded : pandas.DataFrame or pandas.Series
        Table with categorical columns encoded as numeric.
        Numeric columns in the input table remain unchanged.
    """

    def transform(column):
        if isinstance(column.dtype, CategoricalDtype):
            return column.cat.codes
        if is_object_dtype(column.dtype):
            try:
                return column.astype(np.int64)
            except (ValueError, TypeError):
                # ValueError: entries such as non-numeric strings or NaN;
                # TypeError: entries such as None that int() rejects outright.
                classes = column.dropna().unique()
                # stable sort keeps the label-to-code assignment deterministic
                classes.sort(kind="mergesort")
                return column.map({label: code for code, label in enumerate(classes)})
        if column.dtype == bool:
            return column.astype(np.int64)

        return column

    if isinstance(table, pd.Series):
        return pd.Series(transform(table), name=table.name, index=table.index)
    return table.apply(transform, axis=0, result_type="expand")
def categorical_to_numeric(table):
def categorical_to_numeric(table):
    """Encode categorical columns to numeric by converting each category to
    an integer value.

    Columns of `category` dtype are replaced by their category codes.
    Columns of `object` dtype are cast to 64-bit integers if possible;
    otherwise their distinct non-missing values are sorted and numbered
    from zero, with missing entries left as missing. Boolean columns are
    cast to 64-bit integers. All other columns are passed through.

    Parameters
    ----------
    table : pandas.DataFrame or pandas.Series
        Table with categorical columns to encode.

    Returns
    -------
    encoded : pandas.DataFrame or pandas.Series
        Table with categorical columns encoded as numeric.
        Numeric columns in the input table remain unchanged.
    """

    def transform(column):
        if isinstance(column.dtype, CategoricalDtype):
            return column.cat.codes
        if is_object_dtype(column.dtype):
            try:
                return column.astype(np.int64)
            except (ValueError, TypeError):
                # ValueError: entries such as non-numeric strings or NaN;
                # TypeError: entries such as None that int() rejects outright.
                classes = column.dropna().unique()
                # stable sort keeps the label-to-code assignment deterministic
                classes.sort(kind="mergesort")
                return column.map({label: code for code, label in enumerate(classes)})
        if column.dtype == bool:
            return column.astype(np.int64)

        return column

    if isinstance(table, pd.Series):
        return pd.Series(transform(table), name=table.name, index=table.index)
    return table.apply(transform, axis=0, result_type="expand")

Encode categorical columns to numeric by converting each category to an integer value.

Parameters

table : pandas.DataFrame Table with categorical columns to encode.

Returns

encoded : pandas.DataFrame Table with categorical columns encoded as numeric. Numeric columns in the input table remain unchanged.

def encode_categorical(table, columns=None, **kwargs):
 94def encode_categorical(table, columns=None, **kwargs):
 95    """
 96    Encode categorical columns with `M` categories into `M-1` columns according
 97    to the one-hot scheme.
 98
 99    Parameters
100    ----------
101    table : pandas.DataFrame
102        Table with categorical columns to encode.
103
104    columns : list-like, optional, default: None
105        Column names in the DataFrame to be encoded.
106        If `columns` is None then all the columns with
107        `object` or `category` dtype will be converted.
108
109    allow_drop : boolean, optional, default: True
110        Whether to allow dropping categorical columns that only consist
111        of a single category.
112
113    Returns
114    -------
115    encoded : pandas.DataFrame
116        Table with categorical columns encoded as numeric.
117        Numeric columns in the input table remain unchanged.
118    """
119    if isinstance(table, pd.Series):
120        if not isinstance(table.dtype, CategoricalDtype) and not is_object_dtype(table.dtype):
121            raise TypeError(
122                f"series must be of categorical dtype, but was {table.dtype}")
123        return _encode_categorical_series(table, **kwargs)
124
125    def _is_categorical_or_object(series):
126        return isinstance(series.dtype, CategoricalDtype) or is_object_dtype(series.dtype)
127
128    if columns is None:
129        # for columns containing categories
130        columns_to_encode = {nam for nam,
131                             s in table.items() if _is_categorical_or_object(s)}
132    else:
133        columns_to_encode = set(columns)
134
135    items = []
136    for name, series in table.items():
137        if name in columns_to_encode:
138            series = _encode_categorical_series(series, **kwargs)
139            if series is None:
140                continue
141        items.append(series)
142
143    # concat columns of tables
144    new_table = pd.concat(items, axis=1, copy=False)
145    return new_table

Encode categorical columns with M categories into M-1 columns according to the one-hot scheme.

Parameters

table : pandas.DataFrame Table with categorical columns to encode.

columns : list-like, optional, default: None Column names in the DataFrame to be encoded. If columns is None then all the columns with object or category dtype will be converted.

allow_drop : boolean, optional, default: True Whether to allow dropping categorical columns that only consist of a single category.

Returns

encoded : pandas.DataFrame Table with categorical columns encoded as numeric. Numeric columns in the input table remain unchanged.

def standardize(table, with_std=True):
def standardize(table, with_std=True):
    """
    Z-normalize every numeric column of the given table.

    For a pandas.DataFrame, only numeric columns are changed and all
    other columns are passed through. For a numpy.ndarray, the data is
    changed only if its dtype is numeric, and the result then has a
    floating point dtype.

    Parameters
    ----------
    table : pandas.DataFrame or numpy.ndarray
        Data to standardize.

    with_std : bool, optional, default: True
        If ``False``, data is only centered and not converted to unit variance.

    Returns
    -------
    normalized : pandas.DataFrame or numpy.ndarray
        Table with numeric columns normalized.
        Categorical columns in the input table remain unchanged.
    """
    return _apply_along_column(table, standardize_column, with_std=with_std)

Perform Z-Normalization on each numeric column of the given table.

If table is a pandas.DataFrame, only numeric columns are modified, all other columns remain unchanged. If table is a numpy.ndarray, it is only modified if it has numeric dtype, in which case the returned array will have floating point dtype.

Parameters

table : pandas.DataFrame or numpy.ndarray Data to standardize.

with_std : bool, optional, default: True If False, data is only centered and not converted to unit variance.

Returns

normalized : pandas.DataFrame or numpy.ndarray Table with numeric columns normalized; the result has the same container type as the input. Categorical columns in the input table remain unchanged.