# Source code for acton.kde_predictor

"""A predictor that uses KDE to classify instances."""

import numpy
import sklearn.base
import sklearn.neighbors
import sklearn.utils.multiclass
import sklearn.utils.validation


class KDEClassifier(sklearn.base.BaseEstimator, sklearn.base.ClassifierMixin):
    """A classifier using kernel density estimation to classify instances.

    A kernel density estimate is fit to each class. These estimates are used
    to score instances and the highest score class is used as the label for
    each instance.

    Parameters
    ----------
    bandwidth : float
        Bandwidth for the kernel density estimate.

    Attributes
    ----------
    classes_ : numpy.ndarray
        Unique class labels seen by ``fit``. Column ``i`` of the output of
        ``predict_proba`` corresponds to ``classes_[i]``.
    kdes_ : list of sklearn.neighbors.KernelDensity
        One fitted kernel density model per class, ordered like
        ``classes_``.
    """

    def __init__(self, bandwidth=1.0):
        """Stores the hyperparameters; no fitting happens here.

        Parameters
        ----------
        bandwidth : float
            Bandwidth for the kernel density estimate.
        """
        self.bandwidth = bandwidth

    def fit(self, X, y):
        """Fits kernel density models to the data.

        One kernel density model is fit per distinct label in ``y``, using
        only the rows of ``X`` that carry that label.

        Parameters
        ----------
        X : array_like, shape (n_samples, n_features)
            List of n_features-dimensional data points. Each row
            corresponds to a single data point.
        y : array-like, shape (n_samples,)
            Target vector relative to X.

        Returns
        -------
        KDEClassifier
            This estimator, fitted.
        """
        X, y = sklearn.utils.validation.check_X_y(X, y)

        self.classes_ = sklearn.utils.multiclass.unique_labels(y)

        # The bandwidth must be passed by keyword: the KernelDensity
        # constructor no longer accepts positional arguments.
        self.kdes_ = [
            sklearn.neighbors.KernelDensity(
                bandwidth=self.bandwidth).fit(X[y == label])
            for label in self.classes_]

        return self

    def predict(self, X):
        """Predicts class labels.

        Parameters
        ----------
        X : array_like, shape (n_samples, n_features)
            List of n_features-dimensional data points. Each row
            corresponds to a single data point.

        Returns
        -------
        numpy.ndarray, shape (n_samples,)
            For each row of ``X``, the entry of ``classes_`` with the highest
            predicted probability.
        """
        # predict_proba checks that the estimator is fitted and validates X,
        # so neither check is repeated here.
        probabilities = self.predict_proba(X)

        return self.classes_[probabilities.argmax(axis=1)]

    @staticmethod
    def _softmax(data, axis=0):
        """Computes the softmax of an array along an axis.

        Notes
        -----
        Adapted from https://gist.github.com/stober/1946926.

        Parameters
        ----------
        data : array_like
            Array of numbers.
        axis : int
            Axis to softmax along.

        Returns
        -------
        numpy.ndarray
            Array with the same shape as ``data`` whose entries along
            ``axis`` are non-negative and sum to 1.
        """
        data = numpy.asarray(data)
        # Subtracting the maximum along the axis leaves the result unchanged
        # but keeps the arguments of exp() at or below zero.
        shifted = data - numpy.max(data, axis=axis, keepdims=True)
        e_x = numpy.exp(shifted)
        return e_x / e_x.sum(axis=axis, keepdims=True)

    def predict_proba(self, X):
        """Predicts class probabilities.

        Each instance is scored under every per-class kernel density model
        and the scores are passed through a softmax across classes. Since
        the scores are log densities, the result is each class's density
        divided by the sum of the densities over all classes.

        Parameters
        ----------
        X : array_like, shape (n_samples, n_features)
            List of n_features-dimensional data points. Each row
            corresponds to a single data point.

        Returns
        -------
        numpy.ndarray, shape (n_samples, n_classes)
            Class probabilities; column ``i`` corresponds to ``classes_[i]``
            and every row sums to 1.
        """
        sklearn.utils.validation.check_is_fitted(self, ['kdes_', 'classes_'])
        X = sklearn.utils.validation.check_array(X)

        n_samples = X.shape[0]
        n_classes = len(self.classes_)

        # Column i holds the scores of every instance under the model fitted
        # to classes_[i].
        scores = numpy.zeros((n_samples, n_classes))
        for column, kde in enumerate(self.kdes_):
            scores[:, column] = kde.score_samples(X)

        scores = self._softmax(scores, axis=1)

        # Internal sanity checks on the normalised scores.
        assert scores.shape == (n_samples, n_classes)
        assert numpy.allclose(scores.sum(axis=1), numpy.ones((n_samples,)))

        return scores