In [1]:
    
# Standard imports
from __future__ import print_function, division
# Third party
from pandas import read_csv, DataFrame
from sklearn import cross_validation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
import numpy as np
# My local imports
from basic_imports import (
    X_train, y_train, 
    X_cv, y_cv,
    X_test,
    vectorizer, training_data, test_data)
    
In [2]:
    
# My local imports
from basic_imports import (
    X_train, y_train, 
    X_cv, y_cv,
    X_test,
    vectorizer, training_data, test_data)
    
In [3]:
    
# Define a function that takes a classifier, trains it in chunks, and returns the training history
def fit_chunks(clf, Xtrain, ytrain, Xcv, ycv, Nchunks=50):
    """
    Incrementally train ``clf`` on interleaved chunks and record its progress.

    The training set is split into ``Nchunks`` strided slices
    (``Xtrain[i::Nchunks]`` / ``ytrain[i::Nchunks]``).  The classifier is
    updated with ``partial_fit`` on one slice at a time and scored on the
    cross-validation set after every update, which yields the data for a
    learning curve.

    Parameters
    ----------
    clf : object
        Estimator exposing ``partial_fit(X, y, classes=...)`` and
        ``score(X, y)``.  It is modified in place.
    Xtrain, ytrain : sliceable
        Training features and labels.  Both must support ``[i::Nchunks]``
        slicing, and label slices must expose ``.size``.  Labels are
        declared to the classifier as the two classes ``0`` and ``1``.
    Xcv, ycv : array-like
        Cross-validation features and labels passed to ``clf.score``.
    Nchunks : int, optional
        Number of strided chunks to split the training data into.

    Returns
    -------
    clf : object
        The same classifier instance, after all incremental updates.
    Mlist : numpy.ndarray
        Cumulative number of training samples seen after each chunk.
    scores : numpy.ndarray
        ``clf.score(Xcv, ycv)`` evaluated after each chunk.
    """
    # Break the training data into strided chunks.  Use the function's own
    # arguments here rather than notebook-level globals, so the caller's
    # data is actually what gets trained and scored on.
    Xchunks = [Xtrain[i::Nchunks] for i in range(Nchunks)]
    ychunks = [ytrain[i::Nchunks] for i in range(Nchunks)]

    # Running count of training samples and the per-chunk history
    M = 0
    Mlist = []
    scores = []

    for Xchunk, ychunk in zip(Xchunks, ychunks):
        # Update the cumulative sample count
        M += ychunk.size
        Mlist.append(M)
        # Feed the next chunk to the model; classes must be declared up
        # front because a single chunk may not contain every label.
        clf.partial_fit(Xchunk, ychunk, classes=[0, 1])
        scores.append(clf.score(Xcv, ycv))
    return clf, np.array(Mlist), np.array(scores)
    
In [5]:
    
# Sweep the smoothing parameter alpha and, for each value, train one
# Bernoulli and one Multinomial naive Bayes classifier in chunks.
alphas = np.linspace(.7, 1.0, 3)
bernoulli_clfs, bernoulli_score_lists = [], []
multinomial_clfs, multinomial_score_lists = [], []

for alpha in alphas:

    # Bernoulli naive Bayes: features are binarized at 0.1 before fitting
    bernoulli_clf = BernoulliNB(binarize=.1, alpha=alpha)
    bernoulli_clf, ms, scores = fit_chunks(
        bernoulli_clf, X_train, y_train, X_cv, y_cv)
    bernoulli_clfs.append(bernoulli_clf)
    bernoulli_score_lists.append(scores)

    # Multinomial naive Bayes with the same smoothing value
    multinomial_clf = MultinomialNB(alpha=alpha)
    multinomial_clf, ms, scores = fit_chunks(
        multinomial_clf, X_train, y_train, X_cv, y_cv)
    multinomial_clfs.append(multinomial_clf)
    multinomial_score_lists.append(scores)
    
In [13]:
    
# Best score recorded for each model family, across every alpha and chunk
print(np.max(bernoulli_score_lists))
print(np.max(multinomial_score_lists))
    
    
In [ ]: