Source code for bace.classifiers.cnb

# -*- coding: utf-8 -*-
# Author: Krzysztof Joachimiak 2016

import numpy as np
from bace.base import BaseNB
from bace.utils import inherit_docstring

# TODO: check weight normalization


[docs]@inherit_docstring
class ComplementNB(BaseNB):

    '''
    Complement Naive Bayes classifier

    References
    ----------
    Rennie J. D. M., Shih L., Teevan J., Karger D. R.  (2003).
    Tackling the Poor Assumptions of Naive Bayes Text Classifiers

    https://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf

    Parameters
    ----------
    alpha: float
        Smoothing parameter
    weight_normalized: bool, default False
        Enable Weight-normalized Complement Naive Bayes method.

    Attributes
    ----------
    alpha_sum_ : int
        Sum of alpha params
    classes_ : array, shape (n_classes,)
        Classes list
    class_count_ : array, shape (n_classes,)
        number of training samples observed in each class.

    Examples
    --------
    >>> from sklearn.datasets import fetch_20newsgroups
    >>> from sklearn.feature_extraction.text import CountVectorizer
    >>> from bace import ComplementNB
    Prepare data
    >>> vectorizer = CountVectorizer()
    >>> categories = ['alt.atheism', 'talk.religion.misc','comp.graphics', 'sci.space']
    Train set
    >>> newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True)
    >>> train_vectors = vectorizer.fit_transform(newsgroups_train.data)
    Test set
    >>> newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True)
    >>> test_vectors = vectorizer.transform(newsgroups_test.data)
    >>> clf = ComplementNB()
    >>> clf.fit(newsgroups_train, train_vectors).accuracy_score(newsgroups_test, test_vectors)
    '''

    def __init__(self, alpha=1.0, weight_normalized=False):
        super(ComplementNB, self).__init__()

        # Params
        self.alpha = alpha
        self._check_alpha_param()
        self.weight_normalized = weight_normalized

        # Computed attributes
        self.complement_features_ = None
        self.alpha_sum_ = None

[docs]    def predict(self, X):
        return self.classes_[np.argmax(self.predict_log_proba(X), axis=1)]

[docs]    def predict_log_proba(self, X):
        self._check_is_fitted()
        denominator = np.sum(self.complement_features, axis=0) + self.alpha_sum_
        features_weights = np.log((self.complement_features + self.alpha) / denominator)

        if self.weight_normalized:
            features_weights /= np.abs(features_weights).sum(axis=1, keepdims=True)

        features_doc_logprob = X @ features_weights
        return self.class_log_proba_ - features_doc_logprob
        #return (features_doc_logprob * - np.exp(-1)) + self.class_log_proba_

    # Fitting model
    def _partial_fit(self, X, y, classes=None, first_partial_fit=None):
        X, y_one_hot = self._prepare_X_y(X, y, first_partial_fit, classes)
        #self._class_log_prob()
        self._update_complement_features(X, y_one_hot)
        self.is_fitted = True