In [24]:
import cPickle
import os
from itertools import izip
from scipy.io import loadmat, savemat
from sklearn.linear_model import SGDClassifier
from numpy.core.defchararray import add as stringVecAdd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.grid_search import GridSearchCV
Load and concatenate the raw review data for each group of 8 product categories. Within every category, score-3 reviews and non-score-3 reviews are taken at a ratio of 1:3 (at most 2000 score-3 reviews per category). Each resulting mixture of categories is then saved to disk:
In [34]:
# Build one "mixture" per group of `numOfCategoriesInAMixture` category files
# and save its review texts and scores under mixtures/.
#
# NOTE(review): `load`, `save`, `hstack` and `sum` are not imported anywhere in
# this notebook; presumably they are NumPy's, pulled in by %pylab -- confirm.
# NOTE(review): the grouping follows the order returned by os.walk, which is
# not sorted; the mixture file names used further down depend on that order.
txtFileList=[fName for fName in next(os.walk('/home/hencrice/Downloads/AsterixDBClassData'))[2] if fName.endswith('.npy')]
numOfCategoriesInAMixture=8
# Upper bound on the number of score-3 reviews taken from a single category.
maxScore3PerCategory=2000
assert len(txtFileList)%numOfCategoriesInAMixture==0, "Can't divide {0} by {1}".format(len(txtFileList), numOfCategoriesInAMixture)
for s in range(0, len(txtFileList), numOfCategoriesInAMixture):
    e=s+numOfCategoriesInAMixture
    print((s, e))
    fileNameL=[]
    # Per-category slices are collected here and stacked once per mixture.
    # The previous version decided between "start a new array" and "append"
    # with `s%numOfCategoriesInAMixture==0`, which is always true because `s`
    # advances in steps of numOfCategoriesInAMixture; as a result every file
    # overwrote the accumulated data and only the last category was saved.
    textParts=[]
    scoreParts=[]
    for fname in txtFileList[s:e]:
        # fname[8:-4] drops the first 8 characters and the '.npy' extension.
        print('Loading table {0} ......'.format(fname[8:-4]))
        fileNameL.append(fname[8:-4])
        tempArr=load('/home/hencrice/Downloads/AsterixDBClassData/{0}'.format(fname))
        # Join columns 1 and 2 of every row into one string, separated by a space.
        tempArr=stringVecAdd(tempArr[:,1], stringVecAdd(" ", tempArr[:,2]))
        scores=load('/home/hencrice/Downloads/AsterixDBClassData/processedData/score_{0}'.format(fname[8:]))
        isScore3=scores==3
        score3Cnt=min(sum(isScore3), maxScore3PerCategory)
        # Keep the first score3Cnt score-3 reviews and up to three times as
        # many reviews with any other score (the 1:3 ratio).
        textParts.append(tempArr[isScore3][:score3Cnt])
        textParts.append(tempArr[~isScore3][:score3Cnt*3])
        scoreParts.append(scores[isScore3][:score3Cnt])
        scoreParts.append(scores[~isScore3][:score3Cnt*3])
        del tempArr, scores, isScore3
    rawDataArr=hstack(textParts)
    targets=hstack(scoreParts)
    save('/home/hencrice/Downloads/AsterixDBClassData/mixtures/rawData_mixtureOf_{0}'.format('_'.join(fileNameL)), rawDataArr)
    save('/home/hencrice/Downloads/AsterixDBClassData/mixtures/scores_mixtureOf_{0}'.format('_'.join(fileNameL)), targets)
    print('data shape: {0}'.format((rawDataArr.shape, targets.shape)))
    # Release the mixture before building the next one.
    del rawDataArr, targets, textParts, scoreParts
Transform raw data mixture into tf-idf format and save the vectorizer that is used to produce the results:
In [17]:
# Fit one TF-IDF vectorizer per saved raw-data mixture, store the transformed
# matrix as a .mat file and pickle the fitted vectorizer under models/.
mixtureDirFiles=next(os.walk('/home/hencrice/Downloads/AsterixDBClassData/mixtures/'))[2]
fileList=[]
for candidate in mixtureDirFiles:
    # Only the .npy files whose name starts with 'r' are kept; the first
    # 18 characters of the name are dropped.
    if candidate.endswith('.npy') and candidate[0]=='r':
        fileList.append(candidate[18:])
for fName in fileList:
    # fName still carries the '.npy' extension; strip it for the output names.
    mixtureName=fName[:-4]
    rawData=load('/home/hencrice/Downloads/AsterixDBClassData/mixtures/rawData_mixtureOf_{0}'.format(fName))
    vectorizer=TfidfVectorizer(min_df=9, max_df=0.7, ngram_range=(1, 3))
    tfIdfMatrix=vectorizer.fit_transform(rawData)
    savemat('/home/hencrice/Downloads/AsterixDBClassData/mixtures/TfIdf_{0}'.format(mixtureName), {'data': tfIdfMatrix})
    with open('/home/hencrice/Downloads/AsterixDBClassData/models/vectorizerTfIdf_{0}.pkl'.format(mixtureName),'wb') as fp:
        # Highest available pickle protocol.
        cPickle.dump(vectorizer, fp, cPickle.HIGHEST_PROTOCOL)
Split the data into a training+validation set (used by the grid search to pick hyper-parameters) and a test set (used to evaluate model performance):
In [20]:
# Select the mixture to model, then read its TF-IDF matrix and its scores.
productCategory='Home_Kitchen_Office_Products_Arts_Pet_Supplies_Toys_Games_Beauty_Gourmet_Foods_Software'
tfIdfPath='/home/hencrice/Downloads/AsterixDBClassData/mixtures/TfIdf_{0}.mat'.format(productCategory)
scoresPath='/home/hencrice/Downloads/AsterixDBClassData/mixtures/scores_mixtureOf_{0}.npy'.format(productCategory)
# The matrix is stored under the 'data' key of the .mat file.
tfIdfArr=loadmat(tfIdfPath)['data']
scores=load(scoresPath)
In [67]:
# Hold out 10% of the mixture as the test set; the remaining 90% is the
# training+validation set.
# random_state pins the shuffle so that the split, and every number computed
# on the held-out set, can be reproduced after a kernel restart.
tfIdfArr_trVaSet, tfIdfArr_teSet, scores_trVaSet, scores_teSet = train_test_split(tfIdfArr, scores, test_size=0.1, random_state=0)
# Histogram of the held-out scores.
hist(scores_teSet)
Out[67]:
Pick hyper-parameters for SGDClassifier using grid search:
In [63]:
# Grid-search SGDClassifier over n_iter (5, 10, 15) and ten log-spaced
# values of alpha between 1e-5 and 1e-3.
hyperParam={'n_iter':range(5, 20, 5),
            # strength of regularization
            'alpha': logspace(-5, -3, 10)
           }
# random_state fixes the classifier's internal shuffling so the selected
# hyper-parameters and the fitted model are reproducible across runs.
clf = GridSearchCV(SGDClassifier(loss='log', class_weight={1:0.2, 2:0.2, 3:0.8, 4:0.1, 5:0.25}, random_state=0), hyperParam, n_jobs=8, verbose=1)
clf.fit(tfIdfArr_trVaSet, scores_trVaSet)
# Keep the winning estimator for evaluation and saving in later cells.
bestClf=clf.best_estimator_
clf.best_params_
Out[63]:
Per-class accuracy on the test set (fraction of reviews of each true score that were predicted correctly):
In [64]:
# Confusion matrix of the held-out set, then the diagonal (correct
# predictions) divided by the total of each row.
predicted_teSet=bestClf.predict(tfIdfArr_teSet)
te_cm=confusion_matrix(scores_teSet, predicted_teSet)
# Row totals are computed as float32 so the division below is not integer division.
rowTotals=te_cm.sum(1, dtype=float32)
te_cm.diagonal()/rowTotals
Out[64]:
Save the resulting model:
In [66]:
# Persist the best estimator of the grid search for the current mixture.
modelPath='/home/hencrice/Downloads/AsterixDBClassData/models/clf_mixtureOf_{0}.pkl'.format(productCategory)
with open(modelPath,'wb') as fp:
    # Highest available pickle protocol.
    cPickle.dump(bestClf, fp, cPickle.HIGHEST_PROTOCOL)
In [68]:
# Select the mixture to model, then read its TF-IDF matrix and its scores.
productCategory='Jewelry_Patio_Watches_Video_Games_Clothing_Accessories_Health_Automotive_Musical_Instruments'
tfIdfPath='/home/hencrice/Downloads/AsterixDBClassData/mixtures/TfIdf_{0}.mat'.format(productCategory)
scoresPath='/home/hencrice/Downloads/AsterixDBClassData/mixtures/scores_mixtureOf_{0}.npy'.format(productCategory)
# The matrix is stored under the 'data' key of the .mat file.
tfIdfArr=loadmat(tfIdfPath)['data']
scores=load(scoresPath)
In [69]:
# Hold out 10% of the mixture as the test set; the remaining 90% is the
# training+validation set.
# random_state pins the shuffle so that the split, and every number computed
# on the held-out set, can be reproduced after a kernel restart.
tfIdfArr_trVaSet, tfIdfArr_teSet, scores_trVaSet, scores_teSet = train_test_split(tfIdfArr, scores, test_size=0.1, random_state=0)
# Histogram of the held-out scores.
hist(scores_teSet)
Out[69]:
Pick hyper-parameters for SGDClassifier using grid search:
In [74]:
# Grid-search SGDClassifier over n_iter (5, 10, 15) and ten log-spaced
# values of alpha between 1e-5 and 1e-3.
hyperParam={'n_iter':range(5, 20, 5),
            # strength of regularization
            'alpha': logspace(-5, -3, 10)
           }
# random_state fixes the classifier's internal shuffling so the selected
# hyper-parameters and the fitted model are reproducible across runs.
clf = GridSearchCV(SGDClassifier(loss='log', class_weight={1:0.2, 2:0.2, 3:0.65, 4:0.1, 5:0.25}, random_state=0), hyperParam, n_jobs=8, verbose=1)
clf.fit(tfIdfArr_trVaSet, scores_trVaSet)
# Keep the winning estimator for evaluation and saving in later cells.
bestClf=clf.best_estimator_
clf.best_params_
Out[74]:
Per-class accuracy on the test set (fraction of reviews of each true score that were predicted correctly):
In [76]:
# Confusion matrix of the held-out set, then the diagonal (correct
# predictions) divided by the total of each row.
predicted_teSet=bestClf.predict(tfIdfArr_teSet)
te_cm=confusion_matrix(scores_teSet, predicted_teSet)
# Row totals are computed as float32 so the division below is not integer division.
rowTotals=te_cm.sum(1, dtype=float32)
te_cm.diagonal()/rowTotals
Out[76]:
Save the resulting model:
In [77]:
# Persist the best estimator of the grid search for the current mixture.
modelPath='/home/hencrice/Downloads/AsterixDBClassData/models/clf_mixtureOf_{0}.pkl'.format(productCategory)
with open(modelPath,'wb') as fp:
    # Highest available pickle protocol.
    cPickle.dump(bestClf, fp, cPickle.HIGHEST_PROTOCOL)
In [78]:
# Select the mixture to model, then read its TF-IDF matrix and its scores.
productCategory='Shoes_Kindle_Store_Tools_Home_Improvement_Industrial_Scientific_Electronics_Baby_Sports_Outdoors_Cell_Phones_Accessories'
tfIdfPath='/home/hencrice/Downloads/AsterixDBClassData/mixtures/TfIdf_{0}.mat'.format(productCategory)
scoresPath='/home/hencrice/Downloads/AsterixDBClassData/mixtures/scores_mixtureOf_{0}.npy'.format(productCategory)
# The matrix is stored under the 'data' key of the .mat file.
tfIdfArr=loadmat(tfIdfPath)['data']
scores=load(scoresPath)
In [79]:
# Hold out 10% of the mixture as the test set; the remaining 90% is the
# training+validation set.
# random_state pins the shuffle so that the split, and every number computed
# on the held-out set, can be reproduced after a kernel restart.
tfIdfArr_trVaSet, tfIdfArr_teSet, scores_trVaSet, scores_teSet = train_test_split(tfIdfArr, scores, test_size=0.1, random_state=0)
# Histogram of the held-out scores.
hist(scores_teSet)
Out[79]:
Pick hyper-parameters for SGDClassifier using grid search:
In [84]:
# Grid-search SGDClassifier over n_iter (5, 10, 15) and ten log-spaced
# values of alpha between 1e-5 and 1e-3.
hyperParam={'n_iter':range(5, 20, 5),
            # strength of regularization
            'alpha': logspace(-5, -3, 10)
           }
# random_state fixes the classifier's internal shuffling so the selected
# hyper-parameters and the fitted model are reproducible across runs.
clf = GridSearchCV(SGDClassifier(loss='log', class_weight={1:0.2, 2:0.2, 3:0.8, 4:0.1, 5:0.25}, random_state=0), hyperParam, n_jobs=8, verbose=1)
clf.fit(tfIdfArr_trVaSet, scores_trVaSet)
# Keep the winning estimator for evaluation and saving in later cells.
bestClf=clf.best_estimator_
clf.best_params_
Out[84]:
Per-class accuracy on the test set (fraction of reviews of each true score that were predicted correctly):
In [85]:
# Confusion matrix of the held-out set, then the diagonal (correct
# predictions) divided by the total of each row.
predicted_teSet=bestClf.predict(tfIdfArr_teSet)
te_cm=confusion_matrix(scores_teSet, predicted_teSet)
# Row totals are computed as float32 so the division below is not integer division.
rowTotals=te_cm.sum(1, dtype=float32)
te_cm.diagonal()/rowTotals
Out[85]:
Save the resulting model:
In [86]:
# Persist the best estimator of the grid search for the current mixture.
modelPath='/home/hencrice/Downloads/AsterixDBClassData/models/clf_mixtureOf_{0}.pkl'.format(productCategory)
with open(modelPath,'wb') as fp:
    # Highest available pickle protocol.
    cPickle.dump(bestClf, fp, cPickle.HIGHEST_PROTOCOL)