Source code for sklego.meta.subjective_classifier

import numpy as np
from sklearn.base import (
    BaseEstimator,
    ClassifierMixin,
    MetaEstimatorMixin,
)
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import normalize
from sklearn.utils.validation import (
    check_is_fitted,
    check_X_y,
    check_array,
    FLOAT_DTYPES,
)


class SubjectiveClassifier(BaseEstimator, ClassifierMixin, MetaEstimatorMixin):
    """
    Corrects predictions of the inner classifier by taking into account a (subjective) prior
    distribution of the classes. This can be useful when there is a difference in class
    distribution between the training data set and the real world. Using the confusion matrix
    of the inner classifier and the prior, the posterior probability for a class, given the
    prediction of the inner classifier, can be computed. The background for this posterior
    estimation is given in `this article <https://lucdemortier.github.io/articles/16/PerformanceMetrics>`_.

    Based on the `evidence` attribute, this meta estimator's predictions are based on simple
    weighing of the inner estimator's `predict_proba()` results, the posterior probabilities
    based on the confusion matrix, or a combination of the two approaches.

    :param estimator: An sklearn-compatible classifier estimator
    :param prior: A dict of class->frequency representing the prior (a.k.a. subjective
        real-world) class distribution. The class frequencies should sum to 1.
    :param evidence: A string indicating which evidence should be used to correct the inner
        estimator's predictions. Should be one of 'predict_proba', 'confusion_matrix', or
        'both' (default). If 'predict_proba', the inner estimator's `predict_proba()` results
        are multiplied by the prior distribution. In case of 'confusion_matrix', the inner
        estimator's discrete predictions are converted to posterior probabilities using the
        prior and the inner estimator's confusion matrix (obtained from the train data used
        in `fit()`). In case of 'both' (default), the inner estimator's `predict_proba()`
        results are multiplied by the posterior probabilities.
    """

    def __init__(self, estimator, prior, evidence="both"):
        self.estimator = estimator
        self.prior = prior
        self.evidence = evidence

    def _likelihood(self, predicted_class, given_class, cfm):
        # P(predicted_class | given_class), estimated from the corresponding
        # confusion matrix row.
        return cfm[given_class, predicted_class] / cfm[given_class, :].sum()

    def _evidence(self, predicted_class, cfm):
        # P(predicted_class) = sum over all classes of
        # P(predicted_class | class) * prior(class)
        return sum(
            [
                self._likelihood(predicted_class, given_class, cfm)
                * self.prior[self.classes_[given_class]]
                for given_class in range(cfm.shape[0])
            ]
        )

    def _posterior(self, y, y_hat, cfm):
        # Bayes' rule: P(y | y_hat) = P(y_hat | y) * prior(y) / P(y_hat)
        y_hat_evidence = self._evidence(y_hat, cfm)
        return (
            (
                self._likelihood(y_hat, y, cfm)
                * self.prior[self.classes_[y]]
                / y_hat_evidence
            )
            if y_hat_evidence > 0
            else self.prior[self.classes_[y]]  # in case confusion matrix has all-zero column for y_hat
        )
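    # Worked sketch of the Bayes computation above, with hypothetical numbers: suppose
    # classes_ = ['ham', 'spam'], prior = {'ham': 0.9, 'spam': 0.1}, and the confusion
    # matrix of the inner estimator on the training data is
    #
    #     cfm = [[80, 20],    # true ham:  80 predicted ham, 20 predicted spam
    #            [10, 90]]    # true spam: 10 predicted ham, 90 predicted spam
    #
    # Then P(pred=spam | ham) = 20/100 = 0.2 and P(pred=spam | spam) = 90/100 = 0.9, so
    # the evidence P(pred=spam) = 0.2 * 0.9 + 0.9 * 0.1 = 0.27 and the posterior
    # P(spam | pred=spam) = 0.9 * 0.1 / 0.27 ≈ 0.33: under this prior, a 'spam'
    # prediction is still more likely to be ham.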
    def fit(self, X, y):
        """
        Fits the inner estimator based on the data.

        Raises a `ValueError` if the `y` vector contains classes that are not specified in
        the prior, or if the prior is not a valid probability distribution (i.e. does not
        sum to 1).

        :param X: array-like, shape=(n_samples, n_columns) training data.
        :param y: array-like, shape=(n_samples,) training targets.
        :return: Returns an instance of self.
        """
        if not isinstance(self.estimator, ClassifierMixin):
            raise ValueError(
                "Invalid inner estimator: the SubjectiveClassifier meta model only works on classification models"
            )

        if not np.isclose(sum(self.prior.values()), 1):
            raise ValueError(
                "Invalid prior: the prior probabilities of all classes should sum to 1"
            )

        valid_evidence_types = ["predict_proba", "confusion_matrix", "both"]
        if self.evidence not in valid_evidence_types:
            raise ValueError(
                f"Invalid evidence: the provided evidence should be one of {valid_evidence_types}"
            )

        X, y = check_X_y(X, y, estimator=self.estimator, dtype=FLOAT_DTYPES)
        if set(y) - set(self.prior.keys()):
            raise ValueError(
                f"Training data is inconsistent with prior: no prior defined for classes "
                f"{set(y) - set(self.prior.keys())}"
            )
        self.estimator.fit(X, y)
        cfm = confusion_matrix(y, self.estimator.predict(X))
        # posterior_matrix_[i, j] = P(true class i | predicted class j)
        self.posterior_matrix_ = np.array(
            [
                [self._posterior(y, y_hat, cfm) for y_hat in range(cfm.shape[0])]
                for y in range(cfm.shape[0])
            ]
        )
        return self
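    # Continuing the hypothetical ham/spam sketch above, fit() would store a
    # posterior_matrix_ of approximately
    #
    #     [[0.986, 0.667],    # P(ham | pred=ham),  P(ham | pred=spam)
    #      [0.014, 0.333]]    # P(spam | pred=ham), P(spam | pred=spam)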
    @staticmethod
    def _weighted_proba(weights, y_hat_probas):
        # Multiply element-wise, then re-normalize so each row sums to 1 again.
        return normalize(weights * y_hat_probas, norm="l1")

    @staticmethod
    def _to_discrete(y_hat_probas):
        # One-hot encode the most probable class of each sample.
        y_hat_discrete = np.zeros(y_hat_probas.shape)
        y_hat_discrete[
            np.arange(y_hat_probas.shape[0]), y_hat_probas.argmax(axis=1)
        ] = 1
        return y_hat_discrete
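    # For instance (hypothetical values): _to_discrete(np.array([[0.6, 0.4], [0.3, 0.7]]))
    # returns [[1., 0.], [0., 1.]], and _weighted_proba(np.array([0.9, 0.1]),
    # np.array([[0.5, 0.5]])) returns [[0.9, 0.1]], since [0.45, 0.05] re-normalizes to
    # sum to 1 under the l1 norm.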
    def predict_proba(self, X):
        """
        Returns probability distribution of the class, based on the provided data.

        :param X: array-like, shape=(n_samples, n_columns) the data to predict on.
        :return: array, shape=(n_samples, n_classes) the predicted class probabilities.
        """
        check_is_fitted(self, ["posterior_matrix_"])
        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
        y_hats = self.estimator.predict_proba(X)  # these are ignorant of the prior

        if self.evidence == "predict_proba":
            prior_weights = np.array([self.prior[klass] for klass in self.classes_])
            return self._weighted_proba(prior_weights, y_hats)
        else:
            posterior_probas = self._to_discrete(y_hats) @ self.posterior_matrix_.T
            return (
                self._weighted_proba(posterior_probas, y_hats)
                if self.evidence == "both"
                else posterior_probas
            )
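    # In the branch above, `_to_discrete(y_hats) @ posterior_matrix_.T` is effectively a
    # table lookup: a sample whose argmax prediction is class k picks out column k of
    # posterior_matrix_, i.e. the distribution P(true class | predicted class = k). With
    # evidence='both', that distribution is then multiplied element-wise with the inner
    # estimator's predict_proba() row and re-normalized.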
    def predict(self, X):
        """
        Returns predicted class, based on the provided data.

        :param X: array-like, shape=(n_samples, n_columns) the data to predict on.
        :return: array, shape=(n_samples,) the predicted classes.
        """
        check_is_fitted(self, ["posterior_matrix_"])
        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
        return self.classes_[self.predict_proba(X).argmax(axis=1)]
    @property
    def classes_(self):
        return self.estimator.classes_
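

# Minimal usage sketch, assuming scikit-learn's LogisticRegression as the inner
# estimator; the synthetic data and the 90/10 prior below are made up for illustration.
if __name__ == "__main__":
    from sklearn.linear_model import LogisticRegression

    rng = np.random.RandomState(42)
    X_demo = rng.normal(size=(200, 3))
    y_demo = (X_demo[:, 0] > 0).astype(int)  # class 1 iff first feature is positive

    clf = SubjectiveClassifier(LogisticRegression(), prior={0: 0.9, 1: 0.1})
    clf.fit(X_demo, y_demo)
    print(clf.predict_proba(X_demo[:3]))  # probabilities shifted towards class 0
    print(clf.predict(X_demo[:3]))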