Source code for sklego.preprocessing.columncapper

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted


class ColumnCapper(TransformerMixin, BaseEstimator):
    """
    Caps the values of columns according to the given quantile thresholds.

    :type quantile_range: tuple or list, optional, default=(5.0, 95.0)
    :param quantile_range: The quantile ranges to perform the capping. Their values must be in the
        interval [0; 100].

    :type interpolation: str, optional, default='linear'
    :param interpolation: The interpolation method to compute the quantiles when the desired quantile
        lies between two data points `i` and `j`. The available values are:

        * ``'linear'``: `i + (j - i) * fraction`, where `fraction` is the fractional part of\
          the index surrounded by `i` and `j`.
        * ``'lower'``: `i`.
        * ``'higher'``: `j`.
        * ``'nearest'``: `i` or `j` whichever is nearest.
        * ``'midpoint'``: (`i` + `j`) / 2.

    :type discard_infs: bool, optional, default=False
    :param discard_infs: Whether to discard ``-np.inf`` and ``np.inf`` values or not. If ``False``,
        such values will be capped. If ``True``, they will be replaced by ``np.nan``.

        .. note::
            Setting ``discard_infs=True`` is important if the `inf` values are results of divisions
            by 0, which are interpreted by ``pandas`` as ``-np.inf`` or ``np.inf`` depending on the
            sign of the numerator.

    :type copy: bool, optional, default=True
    :param copy: If False, try to avoid a copy and do inplace capping instead. This is not guaranteed
        to always work inplace; e.g. if the data is not a NumPy array or scipy.sparse CSR matrix,
        a copy may still be returned.

    :raises: ``TypeError``, ``ValueError``

    :Example:

    >>> import pandas as pd
    >>> import numpy as np
    >>> from sklego.preprocessing import ColumnCapper
    >>> df = pd.DataFrame({'a':[2, 4.5, 7, 9], 'b':[11, 12, np.inf, 14]})
    >>> df
         a     b
    0  2.0  11.0
    1  4.5  12.0
    2  7.0   inf
    3  9.0  14.0
    >>> capper = ColumnCapper()
    >>> capper.fit_transform(df)
    array([[ 2.375, 11.1  ],
           [ 4.5  , 12.   ],
           [ 7.   , 13.8  ],
           [ 8.7  , 13.8  ]])
    >>> capper = ColumnCapper(discard_infs=True)  # Discarding infs
    >>> df[['a', 'b']] = capper.fit_transform(df)
    >>> df
           a     b
    0  2.375  11.1
    1  4.500  12.0
    2  7.000   NaN
    3  8.700  13.8
    """

    def __init__(
        self,
        quantile_range=(5.0, 95.0),
        interpolation="linear",
        discard_infs=False,
        copy=True,
    ):
        self._check_quantile_range(quantile_range)
        self._check_interpolation(interpolation)

        self.quantile_range = quantile_range
        self.interpolation = interpolation
        self.discard_infs = discard_infs
        self.copy = copy
    def fit(self, X, y=None):
        """
        Computes the quantiles for each column of ``X``.

        :type X: pandas.DataFrame or numpy.ndarray
        :param X: The column(s) from which the capping limit(s) will be computed.
        :param y: Ignored.

        :rtype: sklego.preprocessing.ColumnCapper
        :returns: The fitted object.

        :raises: ``ValueError`` if ``X`` contains non-numeric columns
        """
        X = check_array(
            X, copy=True, force_all_finite=False, dtype=FLOAT_DTYPES, estimator=self
        )

        # If X contains infs, we need to replace them by nans before computing quantiles
        np.putmask(X, (X == np.inf) | (X == -np.inf), np.nan)

        # There should be no column containing only nan cells at this point. If that's not the case,
        # it means that the user asked ColumnCapper to fit some column containing only nan or inf cells.
        nans_mask = np.isnan(X)
        invalid_columns_mask = (
            nans_mask.sum(axis=0) == X.shape[0]
        )  # Contains as many nans as rows
        if invalid_columns_mask.any():
            raise ValueError(
                "ColumnCapper cannot fit columns containing only inf/nan values"
            )

        q = [quantile_limit / 100 for quantile_limit in self.quantile_range]
        self.quantiles_ = np.nanquantile(
            a=X, q=q, axis=0, overwrite_input=True, interpolation=self.interpolation
        )

        # Saving the number of columns to ensure coherence between fit and transform inputs
        self.n_columns_ = X.shape[1]

        return self
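    # Illustrative sketch (not part of the library): after ``fit``, ``quantiles_`` holds one row
    # of lower limits and one row of upper limits, one column per input column. With the default
    # ``interpolation='linear'`` and an assumed ``quantile_range=(10, 90)``:
    #
    #   capper = ColumnCapper(quantile_range=(10, 90)).fit([[1.0], [2.0], [3.0], [4.0]])
    #   capper.quantiles_
    #   # array([[1.3],
    #   #        [3.7]])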
    def transform(self, X):
        """
        Performs the capping on the column(s) of ``X``.

        :type X: pandas.DataFrame or numpy.ndarray
        :param X: The column(s) for which the capping limit(s) will be applied.

        :rtype: numpy.ndarray
        :returns: ``X`` values with capped limits.

        :raises: ``ValueError`` if the number of columns from ``X`` differs from the
            number of columns when fitting
        """
        check_is_fitted(self, "quantiles_")
        X = check_array(
            X,
            copy=self.copy,
            force_all_finite=False,
            dtype=FLOAT_DTYPES,
            estimator=self,
        )

        if X.shape[1] != self.n_columns_:
            raise ValueError(
                "X must have the same number of columns in fit and transform"
            )

        if self.discard_infs:
            np.putmask(X, (X == np.inf) | (X == -np.inf), np.nan)

        # Actually capping
        X = np.minimum(X, self.quantiles_[1, :])
        X = np.maximum(X, self.quantiles_[0, :])

        return X
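    # Illustrative sketch (not part of the library): the limits learned in ``fit`` are re-applied
    # by ``transform``, so new values outside the fitted quantile range are clipped to it
    # (continuing the sketch above, the fitted limits are 1.3 and 3.7):
    #
    #   capper = ColumnCapper(quantile_range=(10, 90)).fit([[1.0], [2.0], [3.0], [4.0]])
    #   capper.transform([[0.0], [2.5], [10.0]])
    #   # array([[1.3],
    #   #        [2.5],
    #   #        [3.7]])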
    @staticmethod
    def _check_quantile_range(quantile_range):
        """
        Checks for the validity of quantile_range.
        """
        if not isinstance(quantile_range, tuple) and not isinstance(
            quantile_range, list
        ):
            raise TypeError("quantile_range must be a tuple or a list")
        if len(quantile_range) != 2:
            raise ValueError(
                "quantile_range must contain 2 elements: min_quantile and max_quantile"
            )

        min_quantile, max_quantile = quantile_range

        for quantile in min_quantile, max_quantile:
            if not isinstance(quantile, float) and not isinstance(quantile, int):
                raise TypeError("min_quantile and max_quantile must be numbers")
            if quantile < 0 or 100 < quantile:
                raise ValueError("min_quantile and max_quantile must be in [0; 100]")
        if min_quantile > max_quantile:
            raise ValueError("min_quantile must be less than or equal to max_quantile")

    @staticmethod
    def _check_interpolation(interpolation):
        """
        Checks for the validity of interpolation.
        """
        allowed_interpolations = ("linear", "lower", "higher", "midpoint", "nearest")
        if interpolation not in allowed_interpolations:
            raise ValueError(
                "Available interpolation methods: {}".format(
                    ", ".join(allowed_interpolations)
                )
            )
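    # Illustrative sketch (not part of the library) of how the checks above reject invalid
    # constructor arguments, since ``__init__`` runs both validators:
    #
    #   ColumnCapper(quantile_range=(90, 10))
    #   # ValueError: min_quantile must be less than or equal to max_quantile
    #   ColumnCapper(interpolation="cubic")
    #   # ValueError: Available interpolation methods: linear, lower, higher, midpoint, nearest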