In [1]:
# Standard imports
from __future__ import print_function, division
# Third party
from pandas import read_csv, DataFrame
from sklearn import cross_validation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
import numpy as np
# My local imports
from basic_imports import (
X_train, y_train,
X_cv, y_cv,
X_test,
vectorizer, training_data, test_data)
In [2]:
# My local imports
from basic_imports import (
X_train, y_train,
X_cv, y_cv,
X_test,
vectorizer, training_data, test_data)
In [3]:
# Define a function that takes a classifier, trains it in chunks, and returns the training-progress data
def fit_chunks(clf, Xtrain, ytrain, Xcv, ycv, Nchunks=50):
    """
    Train the classifier incrementally in Nchunks and return training data.

    The training set is split into ``Nchunks`` interleaved (strided) subsets.
    Each subset is fed to ``clf.partial_fit`` in turn, and after every update
    the classifier is scored on the cross-validation set, producing a
    learning curve.

    Parameters
    ----------
    clf : object
        Classifier exposing ``partial_fit(X, y, classes=...)`` and
        ``score(X, y)``. It is updated in place.
    Xtrain, ytrain : sliceable
        Training features and labels. Both must support ``[i::Nchunks]``
        slicing, and the label slices must expose ``.size``.
    Xcv, ycv : array-like
        Cross-validation features and labels, passed to ``clf.score``.
    Nchunks : int, optional
        Number of interleaved chunks the training data is split into.

    Returns
    -------
    clf : object
        The same classifier instance, after all partial fits.
    Mlist : numpy.ndarray
        Cumulative number of training labels seen after each chunk.
    scores : numpy.ndarray
        ``clf.score(Xcv, ycv)`` evaluated after each chunk.
    """
    # Break the training data passed by the caller (not notebook globals)
    # into interleaved chunks: chunk i holds rows i, i + Nchunks, ...
    Xchunks = [Xtrain[i::Nchunks] for i in range(Nchunks)]
    ychunks = [ytrain[i::Nchunks] for i in range(Nchunks)]

    # Loop over the chunks and train the classifier
    M = 0
    Mlist = []
    scores = []
    for Xchunk, ychunk in zip(Xchunks, ychunks):
        # Update measurement count
        M += ychunk.size
        Mlist.append(M)
        # Update the model with one more chunk; the full label set is given
        # on every call because a single chunk may not contain both classes.
        clf.partial_fit(Xchunk, ychunk, classes=[0, 1])
        # Score on the caller-supplied cross-validation data
        scores.append(clf.score(Xcv, ycv))
    return clf, np.array(Mlist), np.array(scores)
In [5]:
# Sweep the smoothing parameter alpha. For every value, train one Bernoulli
# and one Multinomial naive Bayes classifier chunk-by-chunk, keeping each
# fitted model together with the scores recorded during its training.
alphas = np.linspace(.7, 1.0, 3)
bernoulli_clfs, bernoulli_score_lists = [], []
multinomial_clfs, multinomial_score_lists = [], []

for alpha in alphas:
    # Bernoulli naive Bayes (features are binarized with threshold 0.1)
    bernoulli_clf, ms, scores = fit_chunks(
        BernoulliNB(binarize=.1, alpha=alpha),
        X_train, y_train,
        X_cv, y_cv)
    bernoulli_clfs.append(bernoulli_clf)
    bernoulli_score_lists.append(scores)

    # Multinomial naive Bayes with the same smoothing value
    multinomial_clf, ms, scores = fit_chunks(
        MultinomialNB(alpha=alpha),
        X_train, y_train,
        X_cv, y_cv)
    multinomial_clfs.append(multinomial_clf)
    multinomial_score_lists.append(scores)
In [13]:
# Report the single best score each model family reached, taken over every
# alpha and every training chunk.
print(np.max(np.asarray(bernoulli_score_lists)))
print(np.max(np.asarray(multinomial_score_lists)))
In [ ]: