# Source code for bace.base

import numpy as np
from abc import ABCMeta, abstractmethod
import warnings
import six
from sklearn.exceptions import NotFittedError
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator
from sklearn.preprocessing import LabelBinarizer
from scipy.sparse import csr_matrix
from bace.utils import get_complement_matrix

# Warnings


class AlphaZeroWarning(Warning):
    """Warning category emitted when the smoothing parameter alpha is zero."""


class NotImplementedYet(Warning):
    """Warning category used to flag functionality that is not implemented yet."""


# Base Naive Bayes classifier class


class BaseNB(six.with_metaclass(ABCMeta, BaseEstimator)):
    """
    Abstract base class for Naive Bayes classifiers.

    Holds the bookkeeping shared by concrete classifiers (class counts,
    per-class feature counts, complement feature counts) and the public
    ``fit`` / ``partial_fit`` / ``predict_proba`` / ``accuracy_score`` API.

    Subclasses must implement ``_partial_fit``, ``predict`` and
    ``predict_log_proba``. They must also provide an ``alpha`` attribute,
    which is read by ``_prepare_X_y`` and ``_check_alpha_param``.

    NOTE(review): nothing in this base class ever sets ``is_fitted`` to
    True, so presumably a subclass' ``_partial_fit`` does it after updating
    the counts -- confirm against the concrete classifiers.
    """

    _estimator_type = "classifier"

    def __init__(self):
        self.is_fitted = False
        self.classes_ = None
        self.class_count_ = None

    # Properties

    @property
    def complement_class_count_(self):
        """
        Complement class count, i.e. for every class c the number of
        training samples that belong to any class other than c.

        Computed as ``class_count_`` multiplied by the matrix returned by
        ``get_complement_matrix`` (presumably ones everywhere except on the
        diagonal -- confirm in ``bace.utils``).
        """
        n_classes = len(self.class_count_)
        return self.class_count_.dot(get_complement_matrix(n_classes))

    @property
    def complement_class_log_proba_(self):
        """
        Complement class log probability, i.e. for every class c the log
        of the fraction of training samples that do not belong to c.
        """
        n_samples = np.float64(np.sum(self.class_count_))
        return np.log(self.complement_class_count_ / n_samples)

    @property
    def class_log_proba_(self):
        """
        Log of the empirical class frequencies (``class_count_`` divided by
        the total number of counted samples).
        """
        n_samples = np.float64(np.sum(self.class_count_))
        return np.log(self.class_count_ / n_samples)

    # Fitting model
    def fit(self, X, y):
        """
        Fit model to given training set, discarding any previous fit.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape (n_samples,)
            Target values.

        Returns
        -------
        self : Naive Bayes estimator object
            Returns self.
        """
        self._reset()
        self._partial_fit(X, y)
        return self

    def partial_fit(self, X, y, classes=None):
        """
        Incremental fit on a batch of samples.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.
        classes : array-like, shape = [n_classes], optional (default=None)
            List of all the classes that can possibly appear in the y vector.
            Must be provided at the first call to partial_fit, can be omitted
            in subsequent calls.

        Returns
        -------
        self : object
             Returns self.
        """
        # The call counts as "first" for as long as the model is unfitted.
        self._partial_fit(X, y, classes=classes,
                          first_partial_fit=not self.is_fitted)
        return self

    @abstractmethod
    def _partial_fit(self, X, y, classes=None, first_partial_fit=None):
        """
        Update the model with one batch of samples.

        Called by ``fit`` (with ``classes`` and ``first_partial_fit`` left
        at None) and by ``partial_fit`` (with ``first_partial_fit`` set to
        ``not self.is_fitted``).

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors.
        y : array-like, shape = [n_samples]
            Target values.
        classes : array-like, shape = [n_classes], optional (default=None)
            All the classes that can possibly appear in y.
        first_partial_fit : bool or None
            Truthy when this is the first incremental call.
        """

    @abstractmethod
    def predict(self, X):
        """
        Perform classification on an array of test vectors X.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Unseen samples vector

        Returns
        -------
        C : array, shape = [n_samples]
            Predicted target values for X
        """

    def _update_complement_features(self, X, y_one_hot):
        """
        Accumulate complement feature counts, i.e. for every class c the
        per-feature totals over the samples that do NOT belong to c.

        The result is stored in ``self.complement_features``: assigned on
        the first batch, added to on later ones.

        Parameters
        ----------
        X: numpy array (n_samples, n_features)
            Matrix of input samples
        y_one_hot: numpy array (n_samples, n_classes)
            Binary matrix encoding input
        """
        # FIXME: complement_features nomenclature is incoherent
        # (attribute has no trailing underscore, unlike `features_`).
        batch_counts = X.T @ np.logical_not(y_one_hot)
        if self.is_fitted:
            self.complement_features += batch_counts
        else:
            self.complement_features = batch_counts

    def _update_features(self, X, y_one_hot):
        """
        Accumulate per-class feature counts.

        The result is stored in ``self.features_``: assigned on the first
        batch, added to on later ones.

        Parameters
        ----------
        X: numpy array (n_samples, n_features)
            Matrix of input samples
        y_one_hot: numpy array (n_samples, n_classes)
            Binary matrix encoding input
        """
        batch_counts = X.T @ y_one_hot
        if self.is_fitted:
            self.features_ += batch_counts
        else:
            self.features_ = batch_counts

    @abstractmethod
    def predict_log_proba(self, X):
        """
        Return log-probability estimates for the test vector X.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        C : array-like, shape = [n_samples, n_classes]
            Returns the log-probability of the samples for each class in
            the model. The columns correspond to the classes in sorted
            order, as they appear in the attribute `classes_`.
        """

    def predict_proba(self, X):
        """
        Return probability estimates for the test vector X.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        C : array-like, shape = [n_samples, n_classes]
            Returns the probability of the samples for each class in
            the model. The columns correspond to the classes in sorted
            order, as they appear in the attribute `classes_`.
        """
        # TODO: Handle float exponent error
        return np.exp(self.predict_log_proba(X))

    # Scores

    def accuracy_score(self, X, y):
        """
        Return accuracy score of the fitted model on the given data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Test vectors, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like, shape = [n_samples]
            True target values.

        Returns
        -------
        accuracy_score: float
            Accuracy on the given test set

        Raises
        ------
        NotFittedError
            If the model has not been fitted yet.
        """
        self._check_is_fitted()
        # Inside a method body this name resolves to the module-level
        # sklearn.metrics.accuracy_score, not to this method.
        return accuracy_score(y, self.predict(X))

    def _prepare_X_y(self, X, y, first_partial_fit, classes):
        """
        Validate the class list, one-hot encode ``y`` and update the class
        bookkeeping (``alpha_sum_``, ``classes_``, ``class_count_``).

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors; returned unchanged.
        y : array-like, shape = [n_samples]
            Target values.
        first_partial_fit : bool or None
            Truthy on the first ``partial_fit`` call, where ``classes`` is
            mandatory.
        classes : array-like, shape = [n_classes], or None
            All the classes that can possibly appear in y.

        Returns
        -------
        X : the input X
        y_one_hot : numpy array, shape = [n_samples, n_classes]
            Binary indicator matrix with one column per entry of
            ``classes_`` (in the same, sorted order).

        Raises
        ------
        ValueError
            If ``classes`` is missing on the first ``partial_fit`` call.
        """
        # Explicit None/length checks: `classes` and `classes_` may be numpy
        # arrays, whose truth value is ambiguous and raises ValueError.
        classes_given = classes is not None and len(classes) > 0

        if first_partial_fit and not classes_given:
            raise ValueError("classes must be passed on the first call "
                             "to partial_fit.")

        if not self.is_fitted:
            self.alpha_sum_ = X.shape[1] * self.alpha

        if classes_given:
            self.classes_ = classes

        # Fit the binarizer on the full class list whenever it is known, so
        # that every batch yields the same columns even if some classes are
        # absent from this particular `y`.
        lb = LabelBinarizer()
        if self.classes_ is not None and len(self.classes_) > 0:
            lb.fit(self.classes_)
        else:
            lb.fit(y)
        y_one_hot = lb.transform(y)

        # LabelBinarizer encodes a two-class problem as a single column;
        # expand it to one column per class so counts are per class.
        if len(lb.classes_) == 2 and y_one_hot.shape[1] == 1:
            y_one_hot = np.hstack((1 - y_one_hot, y_one_hot))

        # Keep `classes_` aligned with the column order of `y_one_hot`.
        self.classes_ = lb.classes_

        # Accumulate across partial_fit batches, mirroring how the feature
        # counts are accumulated in `_update_features`.
        batch_class_count = np.sum(y_one_hot, axis=0)
        previous = self.class_count_
        if (self.is_fitted and previous is not None
                and np.shape(previous) == batch_class_count.shape):
            self.class_count_ = previous + batch_class_count
        else:
            self.class_count_ = batch_class_count

        return X, y_one_hot

    def _reset(self):
        """
        Reset object state so that the next fit starts from scratch.
        """
        # Without clearing `is_fitted`, a second `fit` would take the
        # accumulate (`+=`) branches and add to the previous fit's counts.
        self.is_fitted = False
        self.classes_ = None
        self.class_count_ = None
        # Historical attribute names, still cleared in case other code
        # reads them.
        self.class_counts_ = None
        self.complement_features_ = None
        self.complement_class_counts_ = None

    def _check_is_fitted(self):
        """
        Raise ``NotFittedError`` if the model has not been fitted yet.
        """
        if not self.is_fitted:
            raise NotFittedError(
                "This %s instance is not fitted yet. Call 'fit' or "
                "'partial_fit' before using this method."
                % type(self).__name__
            )

    def _check_alpha_param(self):
        """
        Emit an ``AlphaZeroWarning`` when ``self.alpha`` equals zero.
        """
        if self.alpha == 0.0:
            warnings.warn(
                'Alpha sholud not be zero. It may cause division by zero',
                AlphaZeroWarning,
            )

    def _not_implemented_yet(self, message):
        """
        Emit a ``NotImplementedYet`` warning carrying ``message``.
        """
        warnings.warn(NotImplementedYet(message))