In [1]:
import cPickle
from scipy.io import loadmat
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import StratifiedKFold, train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.grid_search import GridSearchCV

Load processed data


In [2]:
productCategory='Video_Games'
tfIdfArr=loadmat('/home/hencrice/Downloads/AsterixDBClassData/processedData/TfIdf_{0}.mat'.format(productCategory))['data']
scores=load('/home/hencrice/Downloads/AsterixDBClassData/processedData/score_{0}.npy'.format(productCategory))
scores.shape


Out[2]:
(100000,)

Split data into training+validation (used gridSearch to pick hyper-parameters), and test set (evaluate model performance)


In [3]:
tfIdfArr_trVaSet, tfIdfArr_teSet, scores_trVaSet, scores_teSet = train_test_split(tfIdfArr, scores, test_size=0.1)

In [4]:
hist(scores_teSet)


Out[4]:
(array([ 1991.,     0.,  2060.,     0.,     0.,  1995.,     0.,  1995.,
           0.,  1959.]),
 array([ 1. ,  1.4,  1.8,  2.2,  2.6,  3. ,  3.4,  3.8,  4.2,  4.6,  5. ]),
 <a list of 10 Patch objects>)

Pick hyper-parameters for SGDClassifier using grid search:


In [4]:
hyperParam={'n_iter':range(5, 20, 5),
            # strength of regularization
            'alpha': logspace(-5, -3, 10)
            }
clf = GridSearchCV(SGDClassifier(loss='log', class_weight={1:0.5, 2:0.4, 3:0.05, 4:0.1, 5:0.25}), hyperParam, n_jobs=8, verbose=1)
clf.fit(tfIdfArr_trVaSet, scores_trVaSet)
bestClf=clf.best_estimator_
clf.best_params_


Fitting 3 folds for each of 30 candidates, totalling 90 fits
[Parallel(n_jobs=8)]: Done   1 jobs       | elapsed:   11.1s
[Parallel(n_jobs=8)]: Done  50 jobs       | elapsed:  2.6min
[Parallel(n_jobs=8)]: Done  76 out of  90 | elapsed:  4.0min remaining:   44.0s
[Parallel(n_jobs=8)]: Done  90 out of  90 | elapsed:  4.5min finished
Out[4]:
{'alpha': 1.0000000000000001e-05, 'n_iter': 10}

Prediction accuracy of each class:


In [5]:
te_cm=confusion_matrix(scores_teSet, bestClf.predict(tfIdfArr_teSet))
te_cm.diagonal()/sum(te_cm,1,dtype=float32)


Out[5]:
array([ 0.81291645,  0.62580967,  0.        ,  0.18110236,  0.80654912])

Save the resulting model:


In [6]:
with open('/home/hencrice/Downloads/AsterixDBClassData/models/clf_{0}.pkl'.format(productCategory),'wb') as fp:
    cPickle.dump(bestClf, fp, -1)