In [1]:
import cPickle

import numpy as np
from scipy.io import loadmat
from sklearn.cross_validation import StratifiedKFold, train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
Load processed data
In [2]:
# Product category whose pre-processed features and labels are loaded below.
productCategory = 'Toys_Games'

# Directory holding the pre-processed inputs; edit here when running on another machine.
PROCESSED_DIR = '/home/hencrice/Downloads/AsterixDBClassData/processedData'

# TF-IDF features: the variable named 'data' stored in the MATLAB file.
tfIdfArr = loadmat('{0}/TfIdf_{1}.mat'.format(PROCESSED_DIR, productCategory))['data']

# Target labels. Use numpy's loader explicitly rather than a bare `load`,
# which only exists when the kernel was started in pylab mode.
scores = np.load('{0}/score_{1}.npy'.format(PROCESSED_DIR, productCategory))

# Cell output: shape of the label array, as a quick sanity check.
scores.shape
Out[2]:
Split the data into a training+validation set (used by the grid search to pick hyper-parameters) and a test set (used to evaluate model performance):
In [3]:
# Fixed seed so the train+validation / test partition is identical on every
# run; without it the held-out 10% changes each time the cell is executed.
SPLIT_SEED = 0

# Hold out 10% of the samples as the test set; the remaining 90% is used for
# training and validation.
tfIdfArr_trVaSet, tfIdfArr_teSet, scores_trVaSet, scores_teSet = train_test_split(
    tfIdfArr, scores, test_size=0.1, random_state=SPLIT_SEED)
In [4]:
# Histogram of the scores in the held-out test split.
# NOTE(review): `hist` is not imported anywhere in this file; presumably it is
# provided by a %pylab namespace (matplotlib's hist) — confirm.
hist(scores_teSet)
Out[4]:
Pick hyper-parameters for SGDClassifier using grid search:
In [7]:
# Seed for the SGD estimator so repeated runs of the search select the same
# hyper-parameters and produce the same fitted model.
SGD_SEED = 0

# Hyper-parameter grid explored by the search.
hyperParam = {
    # number of passes over the training data: 5, 10, 15
    'n_iter': range(5, 20, 5),
    # strength of regularization: 10 values log-spaced between 1e-5 and 1e-3
    'alpha': np.logspace(-5, -3, 10),
}

# Per-class weights, keyed by the score labels 1..5.
classWeights = {1: 0.5, 2: 0.4, 3: 0.05, 4: 0.1, 5: 0.25}

# SGD classifier with logistic loss; random_state pins the estimator's RNG.
baseClf = SGDClassifier(loss='log', class_weight=classWeights, random_state=SGD_SEED)

# Search over the grid using 8 parallel jobs (cross-validation scheme left at
# the GridSearchCV default).
clf = GridSearchCV(baseClf, hyperParam, n_jobs=8, verbose=1)
clf.fit(tfIdfArr_trVaSet, scores_trVaSet)

# Keep the best estimator found by the search for evaluation and saving.
bestClf = clf.best_estimator_

# Cell output: the winning hyper-parameter combination.
clf.best_params_
Out[7]:
Per-class prediction accuracy on the test set (for each true class, the fraction of its samples that were predicted correctly):
In [9]:
# Confusion matrix on the held-out test set: rows are the true scores,
# columns are the scores predicted by the selected classifier.
te_cm = confusion_matrix(scores_teSet, bestClf.predict(tfIdfArr_teSet))

# Per-class accuracy: correct predictions (the diagonal) divided by the number
# of true samples of each class (the row sums). The row sums are taken with
# the array's own .sum(axis=1) instead of a bare `sum(..., 1, dtype=float32)`,
# which only works when pylab has replaced the builtin `sum`.
te_cm.diagonal() / te_cm.sum(axis=1, dtype=np.float32)
Out[9]:
Save the resulting model:
In [10]:
# Destination of the pickled classifier for the current product category.
modelPath = '/home/hencrice/Downloads/AsterixDBClassData/models/clf_{0}.pkl'.format(productCategory)

# Serialize the selected estimator in binary mode using the highest pickle
# protocol available (what a protocol argument of -1 selects).
with open(modelPath, 'wb') as modelFile:
    cPickle.dump(bestClf, modelFile, cPickle.HIGHEST_PROTOCOL)