# Source code for acton.recommenders

"""Recommender classes."""

from abc import ABC, abstractmethod
import logging
from typing import Sequence
import warnings

import acton.database
import numpy
import scipy.stats


def choose_mmr(features: numpy.ndarray, scores: numpy.ndarray, n: int,
               l: float=0.5) -> Sequence[int]:
    """Chooses n scores using maximal marginal relevance.

    Notes
    -----
    The highest score is chosen first. Every later choice maximises
    ``l * score + (1 - l) * d``, where ``d`` is the Euclidean distance from
    the candidate to its nearest already-chosen instance. If there are fewer
    scores to choose from than requested, all scores will be returned in
    order of preference. Candidates whose margin is NaN or -inf are never
    chosen.

    Parameters
    ----------
    features
        2D array of features. The ith row must correspond with the ith score.
    scores
        1D array of scores.
    n
        Number of scores to choose.
    l
        Lambda parameter for MMR. l = 1 gives a relevance-ranked list and l = 0
        gives a maximal diversity ranking.

    Returns
    -------
    Sequence[int]
        List of indices of scores chosen.

    Raises
    ------
    ValueError
        If n is negative.
    """
    if n < 0:
        raise ValueError('n must be a non-negative integer.')

    scores = numpy.asarray(scores, dtype=float)
    if n == 0 or scores.size == 0:
        return []

    features = numpy.asarray(features)
    selections = [int(scores.argmax())]
    is_selected = numpy.zeros(len(scores), dtype=bool)
    is_selected[selections[0]] = True

    # Distance from every instance to its nearest selected instance. Keeping
    # a running minimum avoids rebuilding a full distance matrix every epoch.
    nearest = numpy.full(len(scores), numpy.inf)

    # n // 10 is zero for n < 10, so clamp the logging interval to >= 1.
    log_every = max(1, n // 10)

    logging.debug('Running MMR.')
    while len(selections) < n:
        if len(selections) % log_every == 0:
            logging.debug('MMR epoch {}/{}.'.format(len(selections), n))

        # Compute distances for last selection.
        last = features[selections[-1]:selections[-1] + 1]
        last_dists = numpy.linalg.norm(features - last, axis=1)
        nearest = numpy.minimum(nearest, last_dists)

        # Relevance plus novelty. Distance is a dissimilarity, so the MMR
        # "minus max similarity" term becomes "plus min distance".
        margins = l * scores + (1 - l) * nearest
        margins[is_selected | numpy.isnan(margins)] = float('-inf')

        # argmax returns the first maximum, so ties go to the lowest index.
        next_best = int(margins.argmax())
        if margins[next_best] == float('-inf'):
            # Nothing selectable is left.
            break

        selections.append(next_best)
        is_selected[next_best] = True

    return selections


def choose_boltzmann(features: numpy.ndarray, scores: numpy.ndarray, n: int,
                     temperature: float=1.0) -> Sequence[int]:
    """Chooses n scores using a Boltzmann distribution.

    Notes
    -----
    Indices are sampled one at a time, without replacement, with probability
    proportional to ``exp(score / temperature)``. If there are fewer scores to
    choose from than requested, all scores will be returned. A temperature of
    exactly zero is treated as the greedy limit: the n highest scores are
    returned from highest to lowest.

    Parameters
    ----------
    features
        Array of features. Not used by this sampler.
    scores
        1D array of scores.
    n
        Number of scores to choose.
    temperature
        Temperature parameter for sampling. Higher temperatures give more
        diversity.

    Returns
    -------
    Sequence[int]
        List of indices of scores chosen.

    Raises
    ------
    ValueError
        If n is negative.
    """
    if n < 0:
        raise ValueError('n must be a non-negative integer.')

    scores = numpy.asarray(scores, dtype=float)
    if n == 0 or scores.size == 0:
        return []

    if temperature == 0:
        # Dividing by zero would turn every weight into NaN. The
        # zero-temperature limit of a Boltzmann distribution is greedy.
        order = numpy.argsort(-scores, kind='stable')
        return [int(i) for i in order[:n]]

    scaled = scores / temperature
    # Subtracting the maximum leaves the distribution unchanged but stops
    # exp() from overflowing on large scores.
    peak = scaled.max()
    if numpy.isfinite(peak):
        scaled = scaled - peak
    weights = numpy.exp(scaled)
    weights[numpy.isnan(weights)] = 0.0

    not_chosen = list(range(len(weights)))
    chosen = []
    while len(chosen) < n and not_chosen:
        pool = weights[not_chosen]
        total = pool.sum()
        if total > 0 and numpy.isfinite(total):
            r = numpy.random.uniform(high=total)
            # First position whose cumulative weight exceeds r.
            upto = int(numpy.searchsorted(numpy.cumsum(pool), r,
                                          side='right'))
            # Guard against rounding pushing r onto the final total.
            upto = min(upto, len(not_chosen) - 1)
        else:
            # No usable probability mass is left (e.g. every remaining score
            # is -inf), so fall back to a uniform choice.
            upto = int(numpy.random.randint(len(not_chosen)))
        chosen.append(not_chosen.pop(upto))

    return chosen


class Recommender(ABC):
    """Base class for recommenders.

    Concrete recommenders must implement :meth:`recommend`.
    """

    @abstractmethod
    def recommend(self, ids: Sequence[int],
                  predictions: numpy.ndarray,
                  n: int=1, diversity: float=0.5) -> Sequence[int]:
        """Recommends an instance to label.

        Parameters
        ----------
        ids
            Sequence of IDs in the unlabelled data pool.
        predictions
            N x T x C array of predictions.
        n
            Number of recommendations to make.
        diversity
            Recommendation diversity in [0, 1].

        Returns
        -------
        Sequence[int]
            IDs of the instances to label.
        """


class RandomRecommender(Recommender):
    """Recommends instances at random."""

    def __init__(self, db: acton.database.Database):
        """
        Parameters
        ----------
        db
            Features database.
        """
        self._db = db

    def recommend(self, ids: Sequence[int],
                  predictions: numpy.ndarray,
                  n: int=1, diversity: float=0.5) -> Sequence[int]:
        """Recommends an instance to label.

        Notes
        -----
        IDs are drawn uniformly at random without replacement, so no ID is
        recommended twice. If fewer than n IDs are available, all of them are
        returned in random order.

        Parameters
        ----------
        ids
            Sequence of IDs in the unlabelled data pool.
        predictions
            N x T x C array of predictions. Not used.
        n
            Number of recommendations to make.
        diversity
            Recommendation diversity in [0, 1]. Not used.

        Returns
        -------
        Sequence[int]
            IDs of the instances to label.

        Raises
        ------
        ValueError
            If n is negative.
        """
        if n < 0:
            raise ValueError('n must be a non-negative integer.')

        pool = list(ids)
        n_choices = min(n, len(pool))
        if n_choices == 0:
            return []

        # Sample positions rather than IDs so the returned elements are the
        # caller's own ID objects, as in the other recommenders.
        picks = numpy.random.choice(len(pool), size=n_choices, replace=False)
        return [pool[i] for i in picks]


class QBCRecommender(Recommender):
    """Recommends instances by committee disagreement."""

    def __init__(self, db: acton.database.Database):
        """
        Parameters
        ----------
        db
            Features database.
        """
        self._db = db

    def recommend(self, ids: Sequence[int],
                  predictions: numpy.ndarray,
                  n: int=1, diversity: float=0.5) -> Sequence[int]:
        """Recommends an instance to label.

        Notes
        -----
        Each of the T predictors votes for its highest-scoring class. An
        instance's agreement is the number of votes received by its most
        popular class, scaled by the largest such count in the pool.
        Disagreement is one minus agreement, and instances are sampled with
        choose_boltzmann using a temperature of twice the diversity.

        Parameters
        ----------
        ids
            Sequence of IDs in the unlabelled data pool.
        predictions
            N x T x C array of predictions. The ith row must correspond with the
            ith ID in the sequence.
        n
            Number of recommendations to make.
        diversity
            Recommendation diversity in [0, 1].

        Returns
        -------
        Sequence[int]
            IDs of the instances to label.
        """
        assert predictions.shape[1] > 2, "QBC must have > 2 predictors."
        assert len(ids) == predictions.shape[0]
        assert 0 <= diversity <= 1

        if len(ids) == 0:
            return []

        # One vote per committee member: N x T array of class labels.
        labels = predictions.argmax(axis=2)

        # Count the votes for every class (N x C). The plurality class is the
        # one with the most votes, so its count is the number of members that
        # agree with the plurality. Counting with numpy avoids depending on
        # the output shape of scipy.stats.mode, which differs across SciPy
        # versions.
        classes = numpy.arange(predictions.shape[2])
        votes = (labels[:, :, numpy.newaxis] == classes).sum(axis=1)
        n_agree = votes.max(axis=1)
        assert n_agree.shape == (len(ids),)

        # The plurality class always has at least one vote, so the maximum is
        # never zero here.
        p_agree = n_agree / n_agree.max()  # Agreement is now between 0 and 1.
        disagreement = 1 - p_agree

        indices = choose_boltzmann(self._db.read_features(ids), disagreement, n,
                                   temperature=diversity * 2)
        return [ids[i] for i in indices]


class UncertaintyRecommender(Recommender):
    """Recommends instances by confidence-based uncertainty sampling."""

    def __init__(self, db: acton.database.Database):
        """
        Parameters
        ----------
        db
            Features database.
        """
        self._db = db

    def recommend(self, ids: Sequence[int],
                  predictions: numpy.ndarray,
                  n: int=1, diversity: float=0.5) -> Sequence[int]:
        """Recommends an instance to label.

        Notes
        -----
        An instance is scored by one minus its largest predicted class
        value, so less confident predictions score higher. Instances are then
        sampled with choose_boltzmann using a temperature of twice the
        diversity.

        Parameters
        ----------
        ids
            Sequence of IDs in the unlabelled data pool.
        predictions
            N x 1 x C array of predictions. The ith row must correspond with the
            ith ID in the sequence.
        n
            Number of recommendations to make.
        diversity
            Recommendation diversity in [0, 1].

        Returns
        -------
        Sequence[int]
            IDs of the instances to label.

        Raises
        ------
        ValueError
            If predictions does not contain exactly one predictor.
        """
        if predictions.shape[1] != 1:
            raise ValueError('Uncertainty sampling must have one predictor')

        assert len(ids) == predictions.shape[0]

        # x* = argmax (1 - p(y^ | x)) where y^ = argmax p(y | x) (Settles 2009).
        most_confident = predictions.max(axis=2).ravel()
        proximities = 1 - most_confident
        assert proximities.shape == (len(ids),)

        indices = choose_boltzmann(self._db.read_features(ids), proximities, n,
                                   temperature=diversity * 2)
        return [ids[i] for i in indices]


class EntropyRecommender(Recommender):
    """Recommends instances by entropy-based uncertainty sampling."""

    def __init__(self, db: acton.database.Database):
        """
        Parameters
        ----------
        db
            Features database.
        """
        self._db = db

    def recommend(self, ids: Sequence[int],
                  predictions: numpy.ndarray,
                  n: int=1, diversity: float=0.5) -> Sequence[int]:
        """Recommends an instance to label.

        Notes
        -----
        An instance is scored by the entropy of its predicted class
        distribution, ``-sum_c p_c log(p_c)``, taking ``0 log(0)`` as 0.
        Instances whose entropy is NaN get a score of -inf. Instances are then
        sampled with choose_boltzmann using a temperature of twice the
        diversity.

        Parameters
        ----------
        ids
            Sequence of IDs in the unlabelled data pool.
        predictions
            N x 1 x C array of predictions. The ith row must correspond with the
            ith ID in the sequence.
        n
            Number of recommendations to make.
        diversity
            Recommendation diversity in [0, 1].

        Returns
        -------
        Sequence[int]
            IDs of the instances to label.

        Raises
        ------
        ValueError
            If predictions does not contain exactly one predictor.
        """
        if predictions.shape[1] != 1:
            raise ValueError('Uncertainty sampling must have one predictor')

        assert len(ids) == predictions.shape[0]

        # Drop the singleton predictor axis: N x C class distribution.
        probabilities = predictions[:, 0, :]

        # log(0) and log of negative values are expected here and handled
        # below, so silence the floating-point warnings they raise.
        with numpy.errstate(divide='ignore', invalid='ignore'):
            terms = -probabilities * numpy.log(probabilities)

        # 0 * log(0) evaluates to NaN; its limit, and the convention for
        # entropy, is 0.
        terms = numpy.where(probabilities == 0, 0.0, terms)

        # Entropy sums over the classes of each instance.
        proximities = terms.sum(axis=1)
        proximities[numpy.isnan(proximities)] = float('-inf')

        assert proximities.shape == (len(ids),)

        indices = choose_boltzmann(self._db.read_features(ids), proximities, n,
                                   temperature=diversity * 2)
        return [ids[i] for i in indices]


class MarginRecommender(Recommender):
    """Recommends instances by margin-based uncertainty sampling."""

    def __init__(self, db: acton.database.Database):
        """
        Parameters
        ----------
        db
            Features database.
        """
        self._db = db

    def recommend(self, ids: Sequence[int],
                  predictions: numpy.ndarray,
                  n: int=1, diversity: float=0.5) -> Sequence[int]:
        """Recommends an instance to label.

        Notes
        -----
        An instance is scored by one minus the gap between its largest and
        second-largest predicted class values, so smaller margins score
        higher. Instances are then sampled with choose_boltzmann using a
        temperature of twice the diversity.

        Parameters
        ----------
        ids
            Sequence of IDs in the unlabelled data pool.
        predictions
            N x 1 x C array of predictions. The ith row must correspond with the
            ith ID in the sequence.
        n
            Number of recommendations to make.
        diversity
            Recommendation diversity in [0, 1].

        Returns
        -------
        Sequence[int]
            IDs of the instances to label.

        Raises
        ------
        ValueError
            If predictions does not contain exactly one predictor, or contains
            fewer than two classes.
        """
        if predictions.shape[1] != 1:
            raise ValueError('Uncertainty sampling must have one predictor')

        if predictions.shape[2] < 2:
            # A margin needs a runner-up class; fail with a clear message
            # rather than an out-of-bounds error from numpy.partition.
            raise ValueError('Margin sampling must have at least two classes')

        assert len(ids) == predictions.shape[0]

        # x* = argmin p(y1^ | x) - p(y2^ | x) where yn^ = argmax p(yn | x)
        # (Settles 2009).
        # Partitioning at -2 puts the second-largest value at index -2 and the
        # largest at index -1 along the class axis.
        partitioned = numpy.partition(predictions, -2, axis=2)
        most_likely = partitioned[:, 0, -1]
        second_most_likely = partitioned[:, 0, -2]
        assert most_likely.shape == (len(ids),)
        scores = 1 - (most_likely - second_most_likely)

        indices = choose_boltzmann(self._db.read_features(ids), scores, n,
                                   temperature=diversity * 2)
        return [ids[i] for i in indices]


# Registry mapping recommender names to classes, so a recommender can be
# looked up safely from a string. Each class is registered under its own
# class name.
RECOMMENDERS = {
    recommender.__name__: recommender
    for recommender in (
        RandomRecommender,
        QBCRecommender,
        UncertaintyRecommender,
        EntropyRecommender,
        MarginRecommender,
    )
}
# The string 'None' falls back to random recommendations.
RECOMMENDERS['None'] = RandomRecommender