In [1]:
# Standard imports
from __future__ import print_function, division

# Third party
from pandas import read_csv, DataFrame
from sklearn import cross_validation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
import numpy as np

# My local imports
from basic_imports import (
    X_train, y_train, 
    X_cv, y_cv,
    X_test,
    vectorizer, training_data, test_data)

Local imports

The file "basic_imports.py" contains the script that imports the data.


In [2]:
# My local imports
from basic_imports import (
    X_train, y_train, 
    X_cv, y_cv,
    X_test,
    vectorizer, training_data, test_data)

In [3]:
# Define a function that takes a classifier and returns training data
def fit_chunks(clf, Xtrain, ytrain, Xcv, ycv, Nchunks=50, classes=(0, 1)):
    """
    Train the classifier incrementally in Nchunks, and return training data.

    Parameters
    ----------
    clf : estimator
        Must expose ``partial_fit(X, y, classes=...)`` and ``score(X, y)``.
    Xtrain, ytrain : sliceable containers
        Training samples and labels. Both are split with ``[i::Nchunks]``,
        and each label chunk must expose ``.size``.
    Xcv, ycv :
        Held-out data handed to ``clf.score`` after every chunk.
    Nchunks : int
        Number of interleaved chunks to split the training data into.
    classes : sequence
        All class labels, forwarded to ``partial_fit``. Defaults to the
        binary labels ``(0, 1)``.

    Returns
    -------
    clf :
        The same estimator object, after all ``partial_fit`` calls.
    Mlist : np.ndarray
        Cumulative number of training labels seen after each chunk.
    scores : np.ndarray
        ``clf.score(Xcv, ycv)`` recorded after each chunk.
    """
    # Use the function arguments (not notebook globals) so the result depends
    # only on what the caller passed in.
    # Interleaved split: chunk i holds every Nchunks-th sample starting at i.
    Xchunks = [Xtrain[i::Nchunks] for i in range(Nchunks)]
    ychunks = [ytrain[i::Nchunks] for i in range(Nchunks)]

    # Loop over the chunks and train the classifier
    M = 0
    Mlist = []
    scores = []

    for Xchunk, ychunk in zip(Xchunks, ychunks):

        # When Nchunks exceeds the number of samples the trailing chunks are
        # empty; there is nothing to learn from them, so skip them.
        if ychunk.size == 0:
            continue

        # Update measurement count
        M += ychunk.size
        Mlist.append(M)

        # Feed the next chunk to the model, then score it on the held-out set
        clf.partial_fit(Xchunk, ychunk, classes=list(classes))
        scores.append(clf.score(Xcv, ycv))

    return clf, np.array(Mlist), np.array(scores)

In [5]:
# Sweep the smoothing parameter alpha: for every value, train one Bernoulli
# and one Multinomial naive Bayes classifier chunk by chunk, keeping each
# fitted classifier together with its validation-score history.
alphas = np.linspace(.7, 1.0, 3)
bernoulli_clfs, bernoulli_score_lists = [], []
multinomial_clfs, multinomial_score_lists = [], []

for alpha in alphas:

    # Bernoulli naive Bayes (binarize=.1 is passed through to the estimator)
    bernoulli_clf, ms, scores = fit_chunks(
        BernoulliNB(binarize=.1, alpha=alpha),
        X_train, y_train,
        X_cv, y_cv)
    bernoulli_clfs.append(bernoulli_clf)
    bernoulli_score_lists.append(scores)

    # Multinomial naive Bayes with the same smoothing value
    multinomial_clf, ms, scores = fit_chunks(
        MultinomialNB(alpha=alpha),
        X_train, y_train,
        X_cv, y_cv)
    multinomial_clfs.append(multinomial_clf)
    multinomial_score_lists.append(scores)

In [13]:
# Best validation score reached by each model family, taken over every
# alpha value and every training chunk.
print(np.max(bernoulli_score_lists))
print(np.max(multinomial_score_lists))


0.849371428571
0.846514285714

In [ ]: