Source code for acton.predictors

"""Predictor classes."""

from abc import ABC, abstractmethod
import logging
from typing import Iterable, Sequence

import acton.database
import acton.kde_predictor
import GPy as gpy
import numpy
import sklearn.base
import sklearn.linear_model
import sklearn.model_selection
import sklearn.preprocessing


class Predictor(ABC):
    """Base class for predictors.

    Attributes
    ----------
    prediction_type : str
        What kind of predictions this class generates, e.g. classification.
    """

    prediction_type = 'classification'

    @abstractmethod
    def fit(self, ids: Iterable[int]):
        """Fits the predictor to labelled data.

        Parameters
        ----------
        ids
            List of IDs of instances to train from.
        """

    @abstractmethod
    def predict(self, ids: Sequence[int]) -> (numpy.ndarray, numpy.ndarray):
        """Predicts labels of instances.

        Notes
        -----
        Unlike in scikit-learn, predictions are always real-valued.
        Predicted labels for a classification problem are represented by
        predicted probabilities of each class.

        Parameters
        ----------
        ids
            List of IDs of instances to predict labels for.

        Returns
        -------
        numpy.ndarray
            An N x T x C array of corresponding predictions.
        numpy.ndarray
            An N array of confidences (or None if not applicable).
        """

    @abstractmethod
    def reference_predict(
            self, ids: Sequence[int]) -> (numpy.ndarray, numpy.ndarray):
        """Predicts labels using the best possible method.

        Parameters
        ----------
        ids
            List of IDs of instances to predict labels for.

        Returns
        -------
        numpy.ndarray
            An N x 1 x C array of corresponding predictions.
        numpy.ndarray
            An N array of confidences (or None if not applicable).
        """

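
# A concrete Predictor must implement fit, predict and reference_predict.
# Purely illustrative sketch (not part of the library): a subclass that
# ignores its training data and always predicts a uniform distribution
# over two classes.
# >>> class UniformPredictor(Predictor):
# ...     def __init__(self, db):
# ...         self._db = db
# ...     def fit(self, ids):
# ...         pass
# ...     def predict(self, ids):
# ...         return numpy.full((len(ids), 1, 2), 0.5), None
# ...     def reference_predict(self, ids):
# ...         return self.predict(ids)
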
class _InstancePredictor(Predictor):
    """Wrapper for a scikit-learn instance.

    Attributes
    ----------
    _db : acton.database.Database
        Database storing features and labels.
    _instance : sklearn.base.BaseEstimator
        scikit-learn predictor instance.
    """

    def __init__(self, instance: sklearn.base.BaseEstimator,
                 db: acton.database.Database):
        """
        Arguments
        ---------
        instance
            scikit-learn predictor instance.
        db
            Database storing features and labels.
        """
        self._db = db
        self._instance = instance

    def fit(self, ids: Iterable[int]):
        """Fits the predictor to labelled data.

        Parameters
        ----------
        ids
            List of IDs of instances to train from.
        """
        features = self._db.read_features(ids)
        labels = self._db.read_labels([0], ids)
        self._instance.fit(features, labels.ravel())

    def predict(self, ids: Sequence[int]) -> (numpy.ndarray, None):
        """Predicts labels of instances.

        Notes
        -----
        Unlike in scikit-learn, predictions are always real-valued.
        Predicted labels for a classification problem are represented by
        predicted probabilities of each class.

        Parameters
        ----------
        ids
            List of IDs of instances to predict labels for.

        Returns
        -------
        numpy.ndarray
            An N x 1 x C array of corresponding predictions.
        numpy.ndarray
            An N array of confidences (or None if not applicable).
        """
        features = self._db.read_features(ids)
        try:
            probs = self._instance.predict_proba(features)
            return probs.reshape((probs.shape[0], 1, probs.shape[1])), None
        except AttributeError:
            probs = self._instance.predict(features)
            if len(probs.shape) == 1:
                return probs.reshape((probs.shape[0], 1, 1)), None
            else:
                raise NotImplementedError()

    def reference_predict(self, ids: Sequence[int]) -> (numpy.ndarray, None):
        """Predicts labels using the best possible method.

        Parameters
        ----------
        ids
            List of IDs of instances to predict labels for.

        Returns
        -------
        numpy.ndarray
            An N x 1 x C array of corresponding predictions.
        numpy.ndarray
            An N array of confidences (or None if not applicable).
        """
        return self.predict(ids)

def from_instance(predictor: sklearn.base.BaseEstimator,
                  db: acton.database.Database,
                  regression: bool=False
                  ) -> Predictor:
    """Converts a scikit-learn predictor instance into a Predictor instance.

    Arguments
    ---------
    predictor
        scikit-learn predictor.
    db
        Database storing features and labels.
    regression
        Whether this predictor does regression (as opposed to
        classification).

    Returns
    -------
    Predictor
        Predictor instance wrapping the scikit-learn predictor.
    """
    ip = _InstancePredictor(predictor, db)
    if regression:
        ip.prediction_type = 'regression'
    return ip

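
# A minimal usage sketch (hypothetical names): `db` is assumed to be an
# open acton.database.Database and `train_ids`/`test_ids` are lists of
# known instance IDs.
# >>> predictor = from_instance(
# ...     sklearn.linear_model.LogisticRegression(), db)
# >>> predictor.fit(train_ids)
# >>> probs, _ = predictor.predict(test_ids)  # probs has shape (N, 1, C)
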
def from_class(Predictor: type, regression: bool=False) -> type:
    """Converts a scikit-learn predictor class into a Predictor class.

    Arguments
    ---------
    Predictor
        scikit-learn predictor class.
    regression
        Whether this predictor does regression (as opposed to
        classification).

    Returns
    -------
    type
        Predictor class wrapping the scikit-learn class.
    """
    class Predictor_(_InstancePredictor):
        def __init__(self, db, **kwargs):
            super().__init__(instance=None, db=db)
            self._instance = Predictor(**kwargs)

    if regression:
        Predictor_.prediction_type = 'regression'

    return Predictor_

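
# Usage sketch for from_class (same hypothetical `db` and ID lists as
# above); keyword arguments are forwarded to the scikit-learn constructor.
# >>> LogisticRegression_ = from_class(
# ...     sklearn.linear_model.LogisticRegression)
# >>> predictor = LogisticRegression_(db=db, C=100.0)
# >>> predictor.fit(train_ids)
# >>> probs, _ = predictor.predict(test_ids)
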
class Committee(Predictor):
    """A predictor using a committee of other predictors.

    Attributes
    ----------
    n_classifiers : int
        Number of classifiers in the committee.
    subset_size : float
        Percentage of known labels to take subsets of to train each
        classifier. Lower numbers increase variety.
    _db : acton.database.Database
        Database storing features and labels.
    _committee : List[Predictor]
        Underlying committee of predictors.
    _reference_predictor : Predictor
        Reference predictor trained on all known labels.
    """

    def __init__(self, Predictor: type, db: acton.database.Database,
                 n_classifiers: int=10, subset_size: float=0.6,
                 **kwargs: dict):
        """
        Parameters
        ----------
        Predictor
            Predictor class to use in the committee.
        db
            Database storing features and labels.
        n_classifiers
            Number of classifiers in the committee.
        subset_size
            Percentage of known labels to take subsets of to train each
            classifier. Lower numbers increase variety.
        kwargs
            Keyword arguments passed to the underlying Predictor.
        """
        self.n_classifiers = n_classifiers
        self.subset_size = subset_size
        self._db = db
        self._committee = [Predictor(db=db, **kwargs)
                           for _ in range(n_classifiers)]
        self._reference_predictor = Predictor(db=db, **kwargs)

    def fit(self, ids: Iterable[int]):
        """Fits the predictor to labelled data.

        Parameters
        ----------
        ids
            List of IDs of instances to train from.
        """
        # Get labels so we can stratify a split.
        labels = self._db.read_labels([0], ids)
        for classifier in self._committee:
            # Take a subset to introduce variety.
            try:
                subset, _ = sklearn.model_selection.train_test_split(
                    ids, train_size=self.subset_size, stratify=labels)
            except ValueError:
                # Too few labels.
                subset = ids
            classifier.fit(subset)
        self._reference_predictor.fit(ids)

    def predict(self, ids: Sequence[int]) -> (numpy.ndarray, numpy.ndarray):
        """Predicts labels of instances.

        Notes
        -----
        Unlike in scikit-learn, predictions are always real-valued.
        Predicted labels for a classification problem are represented by
        predicted probabilities of each class.

        Parameters
        ----------
        ids
            List of IDs of instances to predict labels for.

        Returns
        -------
        numpy.ndarray
            An N x T x C array of corresponding predictions.
        numpy.ndarray
            An N array of confidences (or None if not applicable).
        """
        predictions = numpy.concatenate(
            [classifier.predict(ids)[0] for classifier in self._committee],
            axis=1)
        assert predictions.shape[:2] == (len(ids), len(self._committee))
        stdevs = predictions.std(axis=1).mean(axis=1)
        return predictions, stdevs

    def reference_predict(
            self, ids: Sequence[int]) -> (numpy.ndarray, numpy.ndarray):
        """Predicts labels using the best possible method.

        Parameters
        ----------
        ids
            List of IDs of instances to predict labels for.

        Returns
        -------
        numpy.ndarray
            An N x 1 x C array of corresponding predictions.
        numpy.ndarray
            An N array of confidences (or None if not applicable).
        """
        _, stdevs = self.predict(ids)
        return self._reference_predictor.predict(ids)[0], stdevs

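
# Usage sketch (hypothetical `db` and ID lists): a committee of ten
# logistic regression classifiers, each trained on a stratified 60% subset
# of the known labels.
# >>> LogisticRegression_ = from_class(
# ...     sklearn.linear_model.LogisticRegression)
# >>> committee = Committee(LogisticRegression_, db, n_classifiers=10,
# ...                       subset_size=0.6)
# >>> committee.fit(train_ids)
# >>> predictions, stdevs = committee.predict(test_ids)
# >>> predictions.shape  # (N, n_classifiers, C)
# >>> stdevs.shape       # (N,)
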
def AveragePredictions(predictor: Predictor) -> Predictor:
    """Wrapper for a predictor that averages predicted probabilities.

    Notes
    -----
    This effectively reduces the number of predictors to 1.

    Arguments
    ---------
    predictor
        Predictor to wrap.

    Returns
    -------
    Predictor
        Predictor with averaged predictions.
    """
    predictor.predict_ = predictor.predict

    def predict(ids: Sequence[int]) -> (numpy.ndarray, numpy.ndarray):
        predictions, stdevs = predictor.predict_(ids)
        predictions = predictions.mean(axis=1)
        return predictions.reshape(
            (predictions.shape[0], 1, predictions.shape[1])), stdevs

    predictor.predict = predict

    return predictor

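
# Sketch of averaging committee output down to one prediction per instance
# (continues the hypothetical committee example above).
# >>> averaged = AveragePredictions(committee)
# >>> predictions, stdevs = averaged.predict(test_ids)
# >>> predictions.shape  # (N, 1, C)
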
class GPClassifier(Predictor):
    """Classifier using Gaussian processes.

    Attributes
    ----------
    max_iters : int
        Maximum optimisation iterations.
    label_encoder_ : sklearn.preprocessing.LabelEncoder
        Encodes labels as integers.
    model_ : gpy.models.GPClassification
        GP model.
    _db : acton.database.Database
        Database storing features and labels.
    """

    def __init__(self, db: acton.database.Database, max_iters: int=50000,
                 n_jobs: int=1):
        """
        Parameters
        ----------
        db
            Database.
        max_iters
            Maximum optimisation iterations.
        n_jobs
            Does nothing; here for compatibility with sklearn.
        """
        self._db = db
        self.max_iters = max_iters

    def fit(self, ids: Iterable[int]):
        """Fits the predictor to labelled data.

        Parameters
        ----------
        ids
            List of IDs of instances to train from.
        """
        features = self._db.read_features(ids)
        labels = self._db.read_labels([0], ids).ravel()
        self.label_encoder_ = sklearn.preprocessing.LabelEncoder()
        labels = self.label_encoder_.fit_transform(labels).reshape((-1, 1))
        if len(self.label_encoder_.classes_) > 2:
            raise ValueError(
                'GPClassifier only supports binary classification.')
        self.model_ = gpy.models.GPClassification(features, labels)
        self.model_.optimize('bfgs', max_iters=self.max_iters)

    def predict(self, ids: Sequence[int]) -> (numpy.ndarray, numpy.ndarray):
        """Predicts labels of instances.

        Notes
        -----
        Unlike in scikit-learn, predictions are always real-valued.
        Predicted labels for a classification problem are represented by
        predicted probabilities of each class.

        Parameters
        ----------
        ids
            List of IDs of instances to predict labels for.

        Returns
        -------
        numpy.ndarray
            An N x 1 x C array of corresponding predictions.
        numpy.ndarray
            An N array of confidences (or None if not applicable).
        """
        features = self._db.read_features(ids)
        p_predictions, variances = self.model_.predict(features)
        n_predictions = 1 - p_predictions
        predictions = numpy.concatenate([n_predictions, p_predictions],
                                        axis=1)
        logging.debug('Variance: {}'.format(variances))
        if isinstance(variances, float) and numpy.isnan(variances):
            variances = None
        else:
            variances = variances.ravel()
            assert variances.shape == (len(ids),)
        assert predictions.shape[1] == 2
        return predictions.reshape((-1, 1, 2)), variances

    def reference_predict(
            self, ids: Sequence[int]) -> (numpy.ndarray, numpy.ndarray):
        """Predicts labels using the best possible method.

        Parameters
        ----------
        ids
            List of IDs of instances to predict labels for.

        Returns
        -------
        numpy.ndarray
            An N x 1 x C array of corresponding predictions.
        numpy.ndarray
            An N array of confidences (or None if not applicable).
        """
        return self.predict(ids)

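
# Usage sketch (hypothetical `db` holding binary labels):
# >>> gpc = GPClassifier(db, max_iters=1000)
# >>> gpc.fit(train_ids)
# >>> predictions, variances = gpc.predict(test_ids)
# >>> predictions.shape  # (N, 1, 2)
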
# Helper functions to generate predictor classes.


def _logistic_regression() -> type:
    return from_class(sklearn.linear_model.LogisticRegression)


def _linear_regression() -> type:
    return from_class(sklearn.linear_model.LinearRegression, regression=True)


def _logistic_regression_committee() -> type:
    def make_committee(db, *args, **kwargs):
        return Committee(_logistic_regression(), db, *args, **kwargs)

    return make_committee


def _kde() -> type:
    return from_class(acton.kde_predictor.KDEClassifier)


PREDICTORS = {
    'LogisticRegression': _logistic_regression(),
    'LogisticRegressionCommittee': _logistic_regression_committee(),
    'LinearRegression': _linear_regression(),
    'KDE': _kde(),
    'GPC': GPClassifier,
}
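
# Usage sketch: looking a predictor up by name (hypothetical `db` and ID
# lists). Every entry in PREDICTORS is a callable taking a Database as its
# first argument; remaining keyword arguments are illustrative.
# >>> Committee_ = PREDICTORS['LogisticRegressionCommittee']
# >>> predictor = Committee_(db, n_classifiers=5)
# >>> predictor.fit(train_ids)
# >>> predictions, stdevs = predictor.predict(test_ids)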