# Source code for acton.predictors

"""Predictor classes."""

from abc import ABC, abstractmethod
import logging
from typing import Iterable, Sequence

import acton.database
import acton.kde_predictor
import GPy as gpy
import numpy
import sklearn.base
import sklearn.linear_model
import sklearn.model_selection
import sklearn.preprocessing


class Predictor(ABC):
    """Base class for predictors.

    Subclasses must implement :meth:`fit`, :meth:`predict` and
    :meth:`reference_predict`; the class cannot be instantiated otherwise.

    Attributes
    ----------
    prediction_type : str
        What kind of predictions this class generates, e.g. classification.
    """
    # Class-level default; may be overridden on a subclass or an instance.
    prediction_type = 'classification'

    @abstractmethod
    def fit(self, ids: Iterable[int]):
        """Fits the predictor to labelled data.

        Parameters
        ----------
        ids
            List of IDs of instances to train from.
        """

    @abstractmethod
    def predict(self, ids: Sequence[int]) -> (numpy.ndarray, numpy.ndarray):
        """Predicts labels of instances.

        Notes
        -----
            Unlike in scikit-learn, predictions are always real-valued.
            Predicted labels for a classification problem are represented by
            predicted probabilities of each class.

        Parameters
        ----------
        ids
            List of IDs of instances to predict labels for.

        Returns
        -------
        numpy.ndarray
            An N x T x C array of corresponding predictions.
        numpy.ndarray
            A N array of confidences (or None if not applicable).
        """

    @abstractmethod
    def reference_predict(
            self, ids: Sequence[int]) -> (numpy.ndarray, numpy.ndarray):
        """Predicts labels using the best possible method.

        Parameters
        ----------
        ids
            List of IDs of instances to predict labels for.

        Returns
        -------
        numpy.ndarray
            An N x 1 x C array of corresponding predictions.
        numpy.ndarray
            A N array of confidences (or None if not applicable).
        """


class _InstancePredictor(Predictor):
    """Wrapper for a scikit-learn instance.

    Attributes
    ----------
    _db : acton.database.Database
        Database storing features and labels.
    _instance : sklearn.base.BaseEstimator
        scikit-learn predictor instance.
    """

    def __init__(self, instance: sklearn.base.BaseEstimator,
                 db: acton.database.Database):
        """
        Arguments
        ---------
        instance
            scikit-learn predictor instance.
        db
            Database storing features and labels.
        """
        self._db = db
        self._instance = instance

    def fit(self, ids: Iterable[int]):
        """Fits the predictor to labelled data.

        Parameters
        ----------
        ids
            List of IDs of instances to train from.
        """
        features = self._db.read_features(ids)
        # NOTE(review): [0] is presumably the ID of the (single) labeller
        # whose labels are used -- confirm against the database API.
        labels = self._db.read_labels([0], ids)
        # The wrapped estimator is given a flat, one-dimensional target.
        self._instance.fit(features, labels.ravel())

    def predict(self, ids: Sequence[int]) -> (numpy.ndarray, None):
        """Predicts labels of instances.

        Notes
        -----
            Unlike in scikit-learn, predictions are always real-valued.
            Predicted labels for a classification problem are represented by
            predicted probabilities of each class.

            If the wrapped estimator exposes ``predict_proba`` it is used;
            otherwise the output of ``predict`` is used directly.

        Parameters
        ----------
        ids
            List of IDs of instances to predict labels for.

        Returns
        -------
        numpy.ndarray
            An N x 1 x C array of corresponding predictions.
        numpy.ndarray
            A N array of confidences (or None if not applicable).

        Raises
        ------
        NotImplementedError
            If the estimator has no ``predict_proba`` and its ``predict``
            output is not one-dimensional.
        """
        features = self._db.read_features(ids)

        # Probe for predict_proba instead of wrapping the call in a
        # try/except: an AttributeError raised *inside* the estimator is a
        # genuine error and must not silently trigger the fallback. getattr
        # with a default also covers estimators whose predict_proba is a
        # property that raises AttributeError when unavailable.
        predict_proba = getattr(self._instance, 'predict_proba', None)
        if predict_proba is not None:
            probs = predict_proba(features)
            return probs.reshape((probs.shape[0], 1, probs.shape[1])), None

        # No probabilities available: use the raw predictions.
        values = self._instance.predict(features)
        if len(values.shape) != 1:
            raise NotImplementedError(
                'Only one-dimensional predict() outputs are supported.')
        return values.reshape((values.shape[0], 1, 1)), None

    def reference_predict(self, ids: Sequence[int]) -> (numpy.ndarray, None):
        """Predicts labels using the best possible method.

        For a single wrapped estimator this is the same as :meth:`predict`.

        Parameters
        ----------
        ids
            List of IDs of instances to predict labels for.

        Returns
        -------
        numpy.ndarray
            An N x 1 x C array of corresponding predictions.
        numpy.ndarray
            A N array of confidences (or None if not applicable).
        """
        return self.predict(ids)


def from_instance(predictor: sklearn.base.BaseEstimator,
                  db: acton.database.Database, regression: bool = False
                  ) -> Predictor:
    """Converts a scikit-learn predictor instance into a Predictor instance.

    Arguments
    ---------
    predictor
        scikit-learn predictor.
    db
        Database storing features and labels.
    regression
        Whether this predictor does regression (as opposed to classification).

    Returns
    -------
    Predictor
        Predictor instance wrapping the scikit-learn predictor.
    """
    wrapped = _InstancePredictor(predictor, db)
    if regression:
        # Instance-level override; the class default stays 'classification'.
        wrapped.prediction_type = 'regression'
    return wrapped


def from_class(Predictor: type, regression: bool = False) -> type:
    """Converts a scikit-learn predictor class into a Predictor class.

    Notes
    -----
    Inside this function the name ``Predictor`` refers to the scikit-learn
    class passed in, not to the module-level ``Predictor`` base class.

    Arguments
    ---------
    Predictor
        scikit-learn predictor class.
    regression
        Whether this predictor does regression (as opposed to classification).

    Returns
    -------
    type
        Predictor class wrapping the scikit-learn class. Its constructor
        takes the database followed by keyword arguments, which are passed
        to the scikit-learn class.
    """
    class Predictor_(_InstancePredictor):

        def __init__(self, db, **kwargs):
            # Build the estimator from the keyword arguments and hand it
            # straight to the wrapper base class.
            super().__init__(instance=Predictor(**kwargs), db=db)

    if regression:
        Predictor_.prediction_type = 'regression'

    return Predictor_


class Committee(Predictor):
    """A predictor using a committee of other predictors.

    Attributes
    ----------
    n_classifiers : int
        Number of predictors in the committee.
    subset_size : float
        Fraction of known labels to take subsets of to train each committee
        member. Lower numbers increase variety.
    _db : acton.database.Database
        Database storing features and labels.
    _committee : List[Predictor]
        Underlying committee of predictors.
    _reference_predictor : Predictor
        Reference predictor trained on all known labels.
    """

    def __init__(self, Predictor: type, db: acton.database.Database,
                 n_classifiers: int=10, subset_size: float=0.6,
                 **kwargs: dict):
        """
        Parameters
        ----------
        Predictor
            Predictor class to use in the committee. It is instantiated as
            ``Predictor(db=db, **kwargs)`` once per committee member and once
            more for the reference predictor.
        db
            Database storing features and labels.
        n_classifiers
            Number of predictors in the committee.
        subset_size
            Fraction of known labels to take subsets of to train each
            committee member. Lower numbers increase variety.
        kwargs
            Keyword arguments passed to the underlying Predictor.
        """
        self.n_classifiers = n_classifiers
        self.subset_size = subset_size
        self._db = db
        self._committee = [Predictor(db=db, **kwargs)
                           for _ in range(n_classifiers)]
        self._reference_predictor = Predictor(db=db, **kwargs)

    def fit(self, ids: Iterable[int]):
        """Fits the predictor to labelled data.

        Each committee member is trained on its own stratified random subset
        of ``ids``; the reference predictor is trained on all of them.

        Parameters
        ----------
        ids
            List of IDs of instances to train from.
        """
        # ids is traversed several times below (label lookup, one split per
        # committee member, reference fit), so a one-shot iterable such as a
        # generator must be materialised first. Sized containers are left
        # untouched.
        if not hasattr(ids, '__len__'):
            ids = list(ids)

        # Get labels so we can stratify a split.
        labels = self._db.read_labels([0], ids)
        for classifier in self._committee:
            # Take a subset to introduce variety.
            try:
                subset, _ = sklearn.model_selection.train_test_split(
                    ids, train_size=self.subset_size, stratify=labels)
            except ValueError:
                # Too few labels to split: train on everything instead.
                subset = ids
            classifier.fit(subset)
        self._reference_predictor.fit(ids)

    def predict(self, ids: Sequence[int]) -> (numpy.ndarray, numpy.ndarray):
        """Predicts labels of instances.

        Notes
        -----
            Unlike in scikit-learn, predictions are always real-valued.
            Predicted labels for a classification problem are represented by
            predicted probabilities of each class.

        Parameters
        ----------
        ids
            List of IDs of instances to predict labels for.

        Returns
        -------
        numpy.ndarray
            An N x T x C array of corresponding predictions, where T is the
            number of committee members.
        numpy.ndarray
            A N array of confidences: the standard deviation across
            committee members, averaged over the last axis.
        """
        # Stack every member's predictions along the second axis.
        predictions = numpy.concatenate(
            [classifier.predict(ids)[0]
             for classifier in self._committee],
            axis=1)
        # Internal sanity check on the stacked shape.
        assert predictions.shape[:2] == (len(ids), len(self._committee))
        stdevs = predictions.std(axis=1).mean(axis=1)
        return predictions, stdevs

    def reference_predict(
            self, ids: Sequence[int]) -> (numpy.ndarray, numpy.ndarray):
        """Predicts labels using the best possible method.

        The predictions come from the reference predictor (trained on all
        known labels); the confidences are still derived from the committee.

        Parameters
        ----------
        ids
            List of IDs of instances to predict labels for.

        Returns
        -------
        numpy.ndarray
            An N x 1 x C array of corresponding predictions.
        numpy.ndarray
            A N array of confidences (or None if not applicable).
        """
        _, stdevs = self.predict(ids)
        return self._reference_predictor.predict(ids)[0], stdevs


def AveragePredictions(predictor: Predictor) -> Predictor:
    """Wrapper for a predictor that averages predicted probabilities.

    The predictor is modified in place: its ``predict`` attribute is replaced
    by a function that averages the original predictions over their second
    axis. The previous ``predict`` remains reachable as ``predict_``.

    Notes
    -----
    This effectively reduces the number of predictors to 1.

    Arguments
    ---------
    predictor
        Predictor to wrap.

    Returns
    -------
    Predictor
        Predictor with averaged predictions (the same object passed in).
    """
    # Capture the current predict in the closure. Looking it up through
    # predictor.predict_ at call time would recurse forever if the same
    # predictor were wrapped twice, because the second wrap overwrites
    # predict_ with the first wrapper.
    wrapped_predict = predictor.predict
    # Still exposed for callers that rely on the predict_ attribute.
    predictor.predict_ = wrapped_predict

    def predict(ids: Sequence[int]) -> (numpy.ndarray, numpy.ndarray):
        predictions, stdevs = wrapped_predict(ids)
        # Collapse the predictor axis, then restore it with length 1.
        predictions = predictions.mean(axis=1)
        return predictions.reshape(
            (predictions.shape[0], 1, predictions.shape[1])), stdevs

    predictor.predict = predict

    return predictor


class GPClassifier(Predictor):
    """Classifier using Gaussian processes.

    Only binary classification is supported.

    Attributes
    ----------
    max_iters : int
        Maximum optimisation iterations.
    label_encoder_ : sklearn.preprocessing.LabelEncoder
        Encodes labels as integers. Set by :meth:`fit`.
    model_ : gpy.models.GPClassification
        GP model. Set by :meth:`fit`.
    _db : acton.database.Database
        Database storing features and labels.
    """
    def __init__(self, db: acton.database.Database, max_iters: int=50000,
                 n_jobs: int=1):
        """
        Parameters
        ----------
        db
            Database.
        max_iters
            Maximum optimisation iterations.
        n_jobs
            Does nothing; here for compatibility with sklearn.
        """
        self._db = db
        self.max_iters = max_iters

    def fit(self, ids: Iterable[int]):
        """Fits the predictor to labelled data.

        Parameters
        ----------
        ids
            List of IDs of instances to train from.

        Raises
        ------
        ValueError
            If the labels contain more than two distinct classes.
        """
        features = self._db.read_features(ids)
        labels = self._db.read_labels([0], ids).ravel()
        self.label_encoder_ = sklearn.preprocessing.LabelEncoder()
        # Encode labels as integers and shape them into a column vector.
        labels = self.label_encoder_.fit_transform(labels).reshape((-1, 1))
        if len(self.label_encoder_.classes_) > 2:
            raise ValueError(
                'GPClassifier only supports binary classification.')
        self.model_ = gpy.models.GPClassification(features, labels)
        self.model_.optimize('bfgs', max_iters=self.max_iters)

    def predict(self, ids: Sequence[int]) -> (numpy.ndarray, numpy.ndarray):
        """Predicts labels of instances.

        Notes
        -----
            Unlike in scikit-learn, predictions are always real-valued.
            Predicted labels for a classification problem are represented by
            predicted probabilities of each class.

        Parameters
        ----------
        ids
            List of IDs of instances to predict labels for.

        Returns
        -------
        numpy.ndarray
            An N x 1 x 2 array of corresponding predictions.
        numpy.ndarray
            A N array of the variances reported by the model (or None if
            the model reports a NaN float instead).
        """
        features = self._db.read_features(ids)
        # NOTE(review): the first output is presumably the probability of
        # the class encoded as 1 -- confirm against the GPy documentation.
        p_predictions, variances = self.model_.predict(features)
        n_predictions = 1 - p_predictions
        predictions = numpy.concatenate([n_predictions, p_predictions], axis=1)

        # Lazy %-style arguments: the (possibly large) array is only
        # formatted when debug logging is actually enabled.
        logging.debug('Variance: %s', variances)
        if isinstance(variances, float) and numpy.isnan(variances):
            variances = None
        else:
            variances = variances.ravel()
            assert variances.shape == (len(ids),)
        assert predictions.shape[1] == 2
        return predictions.reshape((-1, 1, 2)), variances

    def reference_predict(
            self, ids: Sequence[int]) -> (numpy.ndarray, numpy.ndarray):
        """Predicts labels using the best possible method.

        For this classifier it is the same as :meth:`predict`.

        Parameters
        ----------
        ids
            List of IDs of instances to predict labels for.

        Returns
        -------
        numpy.ndarray
            An N x 1 x C array of corresponding predictions.
        numpy.ndarray
            A N array of confidences (or None if not applicable).
        """
        return self.predict(ids)


# Helper functions to generate predictor classes.


def _logistic_regression() -> type:
    """Builds a Predictor class wrapping scikit-learn's LogisticRegression."""
    estimator_class = sklearn.linear_model.LogisticRegression
    return from_class(estimator_class)


def _linear_regression() -> type:
    """Builds a regression Predictor class wrapping LinearRegression."""
    estimator_class = sklearn.linear_model.LinearRegression
    return from_class(estimator_class, regression=True)


def _logistic_regression_committee() -> type:
    """Builds a factory for committees of logistic regression predictors.

    Despite the ``type`` annotation, the returned object is a function: it
    takes the database plus any further Committee arguments and returns a
    Committee instance.
    """
    def make_committee(db, *args, **kwargs):
        # A fresh wrapper class is generated for every committee.
        member_class = _logistic_regression()
        return Committee(member_class, db, *args, **kwargs)

    return make_committee


def _kde() -> type:
    """Builds a Predictor class wrapping acton's KDEClassifier."""
    estimator_class = acton.kde_predictor.KDEClassifier
    return from_class(estimator_class)


# Registry mapping predictor names to predictor factories. Every value is
# called with the database (plus optional keyword arguments) to build a
# predictor; most are classes, but the committee entry is a plain function.
# The helper calls run once, at import time.
PREDICTORS = {
    'LogisticRegression': _logistic_regression(),
    'LogisticRegressionCommittee': _logistic_regression_committee(),
    'LinearRegression': _linear_regression(),
    'KDE': _kde(),
    'GPC': GPClassifier,
}