# Source code for acton.recommenders

"""Recommender classes."""

from abc import ABC, abstractmethod
import logging
from typing import Sequence
import warnings

import acton.database
import numpy
import scipy.stats


def choose_mmr(features: numpy.ndarray, scores: numpy.ndarray, n: int,
               l: float=0.5) -> Sequence[int]:
    """Chooses n scores using maximal marginal relevance.

    Notes
    -----
    Scores are chosen from highest to lowest. If there are less scores to
    choose from than requested, all scores will be returned in order of
    preference.

    The first selection is always the highest-scoring instance. Each later
    selection maximises ``l * score + (1 - l) * d``, where ``d`` is the
    Euclidean distance from the candidate to its nearest already-selected
    instance. Distance plays the role of negated similarity in the usual MMR
    formulation.

    Parameters
    ----------
    features
        N x D array of features. The ith row corresponds to the ith score.
    scores
        1D array of scores.
    n
        Number of scores to choose.
    l
        Lambda parameter for MMR. l = 1 gives a relevance-ranked list and
        l = 0 gives a maximal diversity ranking.

    Returns
    -------
    Sequence[int]
        List of indices of scores chosen.

    Raises
    ------
    ValueError
        If n is negative.
    """
    if n < 0:
        raise ValueError('n must be a non-negative integer.')

    features = numpy.asarray(features)
    scores = numpy.asarray(scores, dtype=float)
    n_scores = len(scores)

    if n == 0 or n_scores == 0:
        return []

    selections = [int(scores.argmax())]
    available = numpy.ones(n_scores, dtype=bool)
    available[selections[0]] = False

    # Distance from every instance to its nearest selected instance. Kept as
    # a running minimum so each iteration only measures against the most
    # recent selection.
    nearest = numpy.full(n_scores, numpy.inf)

    # Log roughly ten times over the run; never use a zero modulus (n < 10).
    log_every = max(1, n // 10)

    logging.debug('Running MMR.')
    while len(selections) < n and available.any():
        if len(selections) % log_every == 0:
            logging.debug('MMR epoch {}/{}.'.format(len(selections), n))

        last = features[selections[-1]]
        last_dists = numpy.linalg.norm(features - last, axis=1)
        numpy.minimum(nearest, last_dists, out=nearest)

        # Only unselected instances compete; argmax keeps the lowest index
        # on ties.
        candidates = numpy.flatnonzero(available)
        margins = l * scores[candidates] + (1 - l) * nearest[candidates]
        next_best = int(candidates[margins.argmax()])

        selections.append(next_best)
        available[next_best] = False

    return selections
def choose_boltzmann(features: numpy.ndarray, scores: numpy.ndarray, n: int,
                     temperature: float=1.0) -> Sequence[int]:
    """Chooses n scores using a Boltzmann distribution.

    Notes
    -----
    Indices are sampled one at a time, without replacement, with probability
    proportional to ``exp(score / temperature)``. If there are less scores to
    choose from than requested, all scores will be returned.

    A temperature of exactly 0 is treated as the greedy limit: the n highest
    scores are returned in descending order. Instances whose weight is zero
    (e.g. a score of -inf, or underflow) are only returned once every
    positively weighted instance has been chosen, in descending score order.

    Parameters
    ----------
    features
        Array of features. Unused; accepted so that the signature matches
        choose_mmr.
    scores
        1D array of scores.
    n
        Number of scores to choose.
    temperature
        Temperature parameter for sampling. Higher temperatures give more
        diversity.

    Returns
    -------
    Sequence[int]
        List of indices of scores chosen.

    Raises
    ------
    ValueError
        If n is negative.
    """
    if n < 0:
        raise ValueError('n must be a non-negative integer.')

    scores = numpy.asarray(scores, dtype=float)
    if n == 0 or scores.size == 0:
        return []

    # Deterministic preference order, used when sampling is degenerate.
    by_score = [int(i) for i in numpy.argsort(-scores, kind='stable')]

    if temperature == 0:
        return by_score[:n]

    with numpy.errstate(over='ignore', invalid='ignore', divide='ignore'):
        scaled = scores / temperature
        # Subtracting the maximum leaves the distribution unchanged but keeps
        # exp() from overflowing for large scores.
        weights = numpy.exp(scaled - scaled.max())
    weights[~numpy.isfinite(weights)] = 0.0

    not_chosen = list(range(len(weights)))
    chosen = []
    while len(chosen) < n and not_chosen:
        cumulative = numpy.cumsum(weights[not_chosen])
        total = cumulative[-1]
        if not total > 0:
            # Nothing left to sample from: fall back to descending score.
            remaining = set(not_chosen)
            ordered = [i for i in by_score if i in remaining]
            chosen.extend(ordered[:n - len(chosen)])
            break

        r = numpy.random.uniform(high=total)
        # First position whose cumulative weight exceeds r, clamped in case
        # of floating-point rounding at the upper edge.
        upto = int(numpy.searchsorted(cumulative, r, side='right'))
        upto = min(upto, len(not_chosen) - 1)
        chosen.append(not_chosen.pop(upto))

    return chosen
class Recommender(ABC):
    """Abstract interface implemented by every recommender.

    Subclasses must override recommend.
    """

    @abstractmethod
    def recommend(
            self,
            ids: Sequence[int],
            predictions: numpy.ndarray,
            n: int=1,
            diversity: float=0.5) -> Sequence[int]:
        """Picks instances from the unlabelled pool to be labelled.

        Parameters
        ----------
        ids
            IDs of the instances in the unlabelled data pool.
        predictions
            N x T x C array of predictions.
        n
            How many recommendations to make.
        diversity
            Recommendation diversity in [0, 1].

        Returns
        -------
        Sequence[int]
            IDs of the instances to label.
        """
class RandomRecommender(Recommender):
    """Recommends instances at random."""

    def __init__(self, db: acton.database.Database):
        """
        Parameters
        ----------
        db
            Features database.
        """
        self._db = db

    def recommend(self, ids: Sequence[int], predictions: numpy.ndarray,
                  n: int=1, diversity: float=0.5) -> Sequence[int]:
        """Recommends instances to label, uniformly at random.

        Notes
        -----
        Instances are drawn without replacement, so no ID is recommended
        twice. If fewer than n IDs are available, all of them are returned in
        random order.

        Parameters
        ----------
        ids
            Sequence of IDs in the unlabelled data pool.
        predictions
            N x T x C array of predictions. Unused.
        n
            Number of recommendations to make.
        diversity
            Recommendation diversity in [0, 1]. Unused.

        Returns
        -------
        Sequence[int]
            IDs of the instances to label.

        Raises
        ------
        ValueError
            If n is negative.
        """
        if n < 0:
            raise ValueError('n must be a non-negative integer.')

        pool = list(ids)
        n_chosen = min(n, len(pool))
        if n_chosen == 0:
            return []

        # Sample positions rather than values so the returned IDs keep their
        # original type, as the other recommenders do.
        picks = numpy.random.choice(len(pool), size=n_chosen, replace=False)
        return [pool[i] for i in picks]
class QBCRecommender(Recommender):
    """Recommends instances by committee disagreement."""

    def __init__(self, db: acton.database.Database):
        """
        Parameters
        ----------
        db
            Features database.
        """
        self._db = db

    def recommend(self, ids: Sequence[int], predictions: numpy.ndarray,
                  n: int=1, diversity: float=0.5) -> Sequence[int]:
        """Recommends an instance to label.

        Notes
        -----
        Each predictor votes for the class with its highest prediction. An
        instance's disagreement is one minus the (normalised) number of
        predictors that voted for the plurality class; instances are then
        sampled with choose_boltzmann at temperature ``2 * diversity``.

        Parameters
        ----------
        ids
            Sequence of IDs in the unlabelled data pool.
        predictions
            N x T x C array of predictions. The ith row must correspond with
            the ith ID in the sequence.
        n
            Number of recommendations to make.
        diversity
            Recommendation diversity in [0, 1].

        Returns
        -------
        Sequence[int]
            IDs of the instances to label.
        """
        assert predictions.shape[1] > 2, "QBC must have > 2 predictors."
        assert len(ids) == predictions.shape[0]
        assert 0 <= diversity <= 1

        if len(ids) == 0:
            return []

        labels = predictions.argmax(axis=2)

        # Depending on the SciPy version the mode comes back as (N, 1) or
        # (N,); normalise to a column so it broadcasts against labels.
        mode_result = scipy.stats.mode(labels, axis=1)
        plurality_labels = numpy.asarray(mode_result[0]).reshape(-1, 1)
        assert plurality_labels.shape == (predictions.shape[0], 1), \
            'plurality_labels has shape {}; expected {}'.format(
                plurality_labels.shape, (predictions.shape[0], 1))

        agree_with_plurality = labels == plurality_labels
        assert labels.shape == agree_with_plurality.shape

        # Count the votes for the plurality class (not the label values).
        n_agree = agree_with_plurality.sum(axis=1)
        p_agree = n_agree / n_agree.max()  # Agreement is now between 0 and 1.
        disagreement = 1 - p_agree

        indices = choose_boltzmann(self._db.read_features(ids), disagreement,
                                   n, temperature=diversity * 2)
        return [ids[i] for i in indices]
class UncertaintyRecommender(Recommender):
    """Confidence-based uncertainty sampling recommender.

    Favours instances whose most likely class has a low predicted value.
    """

    def __init__(self, db: acton.database.Database):
        """
        Parameters
        ----------
        db
            Database from which instance features are read.
        """
        self._db = db

    def recommend(
            self,
            ids: Sequence[int],
            predictions: numpy.ndarray,
            n: int=1,
            diversity: float=0.5) -> Sequence[int]:
        """Picks the instances the single predictor is least confident about.

        Notes
        -----
        The score of an instance is one minus its largest prediction over
        the class axis (Settles 2009). Instances are sampled with
        choose_boltzmann at temperature ``2 * diversity``.

        Parameters
        ----------
        ids
            IDs of the instances in the unlabelled data pool.
        predictions
            N x 1 x C array of predictions; row i belongs to the ith ID.
        n
            How many recommendations to make.
        diversity
            Recommendation diversity in [0, 1].

        Returns
        -------
        Sequence[int]
            IDs of the instances to label.

        Raises
        ------
        ValueError
            If predictions holds more or fewer than one predictor.
        """
        n_instances, n_predictors = predictions.shape[0], predictions.shape[1]
        if n_predictors != 1:
            raise ValueError('Uncertainty sampling must have one predictor')
        assert len(ids) == n_instances

        # x* = argmax (1 - p(y^ | x)) where y^ = argmax p(y | x).
        top_class_value = predictions.max(axis=2).ravel()
        uncertainties = 1 - top_class_value
        assert uncertainties.shape == (len(ids),)

        pool_features = self._db.read_features(ids)
        picked = choose_boltzmann(
            pool_features, uncertainties, n, temperature=diversity * 2)
        return [ids[index] for index in picked]
class EntropyRecommender(Recommender):
    """Recommends instances by entropy-based uncertainty sampling."""

    def __init__(self, db: acton.database.Database):
        """
        Parameters
        ----------
        db
            Features database.
        """
        self._db = db

    def recommend(self, ids: Sequence[int], predictions: numpy.ndarray,
                  n: int=1, diversity: float=0.5) -> Sequence[int]:
        """Recommends an instance to label.

        Notes
        -----
        The score of an instance is the entropy of its predicted class
        distribution, ``-sum_c p_c log p_c``, with ``0 log 0`` taken to be 0.
        Instances whose predictions contain NaN or negative values score
        -inf. Instances are sampled with choose_boltzmann at temperature
        ``2 * diversity``.

        Parameters
        ----------
        ids
            Sequence of IDs in the unlabelled data pool.
        predictions
            N x 1 x C array of predictions. The ith row must correspond with
            the ith ID in the sequence.
        n
            Number of recommendations to make.
        diversity
            Recommendation diversity in [0, 1].

        Returns
        -------
        Sequence[int]
            IDs of the instances to label.

        Raises
        ------
        ValueError
            If predictions does not hold exactly one predictor.
        """
        if predictions.shape[1] != 1:
            raise ValueError('Uncertainty sampling must have one predictor')

        assert len(ids) == predictions.shape[0]

        probabilities = numpy.asarray(predictions[:, 0, :], dtype=float)
        positive = probabilities > 0

        # Replace non-positive entries by 1 inside the log (log 1 = 0) and by
        # 0 outside it, so that zero probabilities contribute nothing instead
        # of producing NaN.
        log_terms = numpy.log(numpy.where(positive, probabilities, 1.0))
        entropies = -(numpy.where(positive, probabilities, 0.0)
                      * log_terms).sum(axis=1)

        # NaN or negative predictions are not a distribution; never prefer
        # such instances.
        invalid = (~(probabilities >= 0)).any(axis=1)
        entropies[invalid] = float('-inf')
        assert entropies.shape == (len(ids),)

        indices = choose_boltzmann(self._db.read_features(ids), entropies, n,
                                   temperature=diversity * 2)
        return [ids[i] for i in indices]
class MarginRecommender(Recommender):
    """Margin-based uncertainty sampling recommender.

    Favours instances where the two largest predictions are close together.
    """

    def __init__(self, db: acton.database.Database):
        """
        Parameters
        ----------
        db
            Database from which instance features are read.
        """
        self._db = db

    def recommend(
            self,
            ids: Sequence[int],
            predictions: numpy.ndarray,
            n: int=1,
            diversity: float=0.5) -> Sequence[int]:
        """Picks the instances with the smallest top-two margin.

        Notes
        -----
        The score of an instance is one minus the gap between its largest
        and second-largest prediction over the class axis (Settles 2009).
        Instances are sampled with choose_boltzmann at temperature
        ``2 * diversity``.

        Parameters
        ----------
        ids
            IDs of the instances in the unlabelled data pool.
        predictions
            N x 1 x C array of predictions; row i belongs to the ith ID.
        n
            How many recommendations to make.
        diversity
            Recommendation diversity in [0, 1].

        Returns
        -------
        Sequence[int]
            IDs of the instances to label.

        Raises
        ------
        ValueError
            If predictions holds more or fewer than one predictor.
        """
        n_instances, n_predictors = predictions.shape[0], predictions.shape[1]
        if n_predictors != 1:
            raise ValueError('Uncertainty sampling must have one predictor')
        assert len(ids) == n_instances

        # x* = argmin p(y1^ | x) - p(y2^ | x) where yn^ = argmax p(yn | x).
        # Partitioning at -2 puts the two largest values, in ascending
        # order, in the last two slots of the class axis.
        top_two = numpy.partition(predictions, -2, axis=2)[:, 0, -2:]
        runner_up = top_two[:, 0]
        best = top_two[:, 1]
        assert best.shape == (len(ids),)

        margin = best - runner_up
        scores = 1 - margin

        pool_features = self._db.read_features(ids)
        picked = choose_boltzmann(
            pool_features, scores, n, temperature=diversity * 2)
        return [ids[index] for index in picked]
# Registry for safe string-based access to recommender classes: each class is
# keyed by its own name, and the string 'None' falls back to random
# recommendation.
RECOMMENDERS = {
    recommender_class.__name__: recommender_class
    for recommender_class in (
        RandomRecommender,
        QBCRecommender,
        UncertaintyRecommender,
        EntropyRecommender,
        MarginRecommender,
    )
}
RECOMMENDERS['None'] = RandomRecommender