In [24]:
import cPickle
import os
from itertools import izip
from scipy.io import loadmat, savemat
from sklearn.linear_model import SGDClassifier
from numpy.core.defchararray import add as stringVecAdd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.grid_search import GridSearchCV

Load and concatenate raw data for every 8 product categories, the ratio of score-3 reviews v.s. non score-3 reviews is 1:3, then store these mixtures of raw data from different categories=>


In [34]:
txtFileList=[fName for fName in next(os.walk('/home/hencrice/Downloads/AsterixDBClassData'))[2] if fName.endswith('.npy')]

numOfCategoriesInAMixture=8
assert len(txtFileList)%numOfCategoriesInAMixture==0, "Can't divide {0} by {1}".format(len(txtFileList), numOfCategoriesInAMixture)

for s,e in izip(range(0, len(txtFileList), numOfCategoriesInAMixture), range(numOfCategoriesInAMixture, len(txtFileList)+1, numOfCategoriesInAMixture)):
    print((s, e))
    fileNameL=[]
    for fname in txtFileList[s:e]:
        print('Loading table {0} ......'.format(fname[8:-4]))
        fileNameL.append(fname[8:-4])
        if s%numOfCategoriesInAMixture==0:
            rawDataArr=load('/home/hencrice/Downloads/AsterixDBClassData/{0}'.format(fname))
            rawDataArr=stringVecAdd(rawDataArr[:,1], stringVecAdd(" ", rawDataArr[:,2]))
            scores=load('/home/hencrice/Downloads/AsterixDBClassData/processedData/score_{0}'.format(fname[8:]))
            score3Cnt=sum(scores==3)
            score3Cnt=2000 if score3Cnt>2000 else score3Cnt
            rawDataArr=hstack((rawDataArr[scores==3][:score3Cnt], rawDataArr[scores!=3][:score3Cnt*3]))
            targets=hstack((scores[scores==3][:score3Cnt], scores[scores!=3][:score3Cnt*3]))
        else:
            tempArr=load('/home/hencrice/Downloads/AsterixDBClassData/{0}'.format(fname))
            tempArr=stringVecAdd(tempArr[:,1], stringVecAdd(" ", tempArr[:,2]))
            scores=load('/home/hencrice/Downloads/AsterixDBClassData/processedData/score_{0}'.format(fname[8:]))
            score3Cnt=sum(scores==3)
            score3Cnt=2000 if score3Cnt>2000 else score3Cnt
            rawDataArr=hstack((rawDataArr, tempArr[scores==3][:score3Cnt], tempArr[scores!=3][:score3Cnt*3]))
            targets=hstack((targets, scores[scores==3][:score3Cnt], scores[scores!=3][:score3Cnt*3]))
            del tempArr, scores
        if fname==txtFileList[e-1]:
            save('/home/hencrice/Downloads/AsterixDBClassData/mixtures/rawData_mixtureOf_{0}'.format('_'.join(fileNameL)), rawDataArr)
            save('/home/hencrice/Downloads/AsterixDBClassData/mixtures/scores_mixtureOf_{0}'.format('_'.join(fileNameL)), targets)
            print('data shape: {0}'.format((rawDataArr.shape, targets.shape)))
            del rawDataArr, scores
            fileNameL[:]=[]


(0, 8)
Loading table Home_Kitchen ......
Loading table Office_Products ......
Loading table Arts ......
Loading table Pet_Supplies ......
Loading table Toys_Games ......
Loading table Beauty ......
Loading table Gourmet_Foods ......
Loading table Software ......
data shape: ((8000,), (8000,))
(8, 16)
Loading table Shoes ......
Loading table Kindle_Store ......
Loading table Tools_Home_Improvement ......
Loading table Industrial_Scientific ......
Loading table Electronics ......
Loading table Baby ......
Loading table Sports_Outdoors ......
Loading table Cell_Phones_Accessories ......
data shape: ((8000,), (8000,))
(16, 24)
Loading table Jewelry ......
Loading table Patio ......
Loading table Watches ......
Loading table Video_Games ......
Loading table Clothing_Accessories ......
Loading table Health ......
Loading table Automotive ......
Loading table Musical_Instruments ......
data shape: ((8000,), (8000,))

Transform raw data mixture into tf-idf format and save the vectorizer that is used to produce the results:


In [17]:
fileList=[fName[18:] for fName in next(os.walk('/home/hencrice/Downloads/AsterixDBClassData/mixtures/'))[2] if fName.endswith('.npy') and fName[0]=='r']
for fName in fileList:
    rawData=load('/home/hencrice/Downloads/AsterixDBClassData/mixtures/rawData_mixtureOf_{0}'.format(fName))
    vectorizer=TfidfVectorizer(min_df=9, max_df=0.7, ngram_range=(1, 3))    
    savemat('/home/hencrice/Downloads/AsterixDBClassData/mixtures/TfIdf_{0}'.format(fName[:-4]), {'data': vectorizer.fit_transform(rawData)})
    with open('/home/hencrice/Downloads/AsterixDBClassData/models/vectorizerTfIdf_{0}.pkl'.format(fName[:-4]),'wb') as fp:
        cPickle.dump(vectorizer, fp, -1)

Split data into training+validation (used gridSearch to pick hyper-parameters), and test set (evaluate model performance)


In [20]:
productCategory='Home_Kitchen_Office_Products_Arts_Pet_Supplies_Toys_Games_Beauty_Gourmet_Foods_Software'
tfIdfArr=loadmat('/home/hencrice/Downloads/AsterixDBClassData/mixtures/TfIdf_{0}.mat'.format(productCategory))['data']
scores=load('/home/hencrice/Downloads/AsterixDBClassData/mixtures/scores_mixtureOf_{0}.npy'.format(productCategory))

In [67]:
tfIdfArr_trVaSet, tfIdfArr_teSet, scores_trVaSet, scores_teSet = train_test_split(tfIdfArr, scores, test_size=0.1)
hist(scores_teSet)


Out[67]:
(array([ 169.,    0.,   62.,    0.,    0.,  221.,    0.,  146.,    0.,  202.]),
 array([ 1. ,  1.4,  1.8,  2.2,  2.6,  3. ,  3.4,  3.8,  4.2,  4.6,  5. ]),
 <a list of 10 Patch objects>)

Pick hyper-parameters for SGDClassifier using grid search:


In [63]:
hyperParam={'n_iter':range(5, 20, 5),
            # strength of regularization
            'alpha': logspace(-5, -3, 10)
            }
clf = GridSearchCV(SGDClassifier(loss='log', class_weight={1:0.2, 2:0.2, 3:0.8, 4:0.1, 5:0.25}), hyperParam, n_jobs=8, verbose=1)
clf.fit(tfIdfArr_trVaSet, scores_trVaSet)
bestClf=clf.best_estimator_
clf.best_params_


Fitting 3 folds for each of 30 candidates, totalling 90 fits
[Parallel(n_jobs=8)]: Done   1 jobs       | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done  50 jobs       | elapsed:    5.5s
[Parallel(n_jobs=8)]: Done  76 out of  90 | elapsed:    8.0s remaining:    1.5s
[Parallel(n_jobs=8)]: Done  90 out of  90 | elapsed:    9.4s finished
Out[63]:
{'alpha': 1.0000000000000001e-05, 'n_iter': 15}

Prediction accuracy of each class:


In [64]:
te_cm=confusion_matrix(scores_teSet, bestClf.predict(tfIdfArr_teSet))
te_cm.diagonal()/sum(te_cm,1,dtype=float32)


Out[64]:
array([ 0.67553191,  0.        ,  0.79104478,  0.04516129,  0.77486911])

Save the resulting model:


In [66]:
with open('/home/hencrice/Downloads/AsterixDBClassData/models/clf_mixtureOf_{0}.pkl'.format(productCategory),'wb') as fp:
    cPickle.dump(bestClf, fp, -1)

Train another mixture:


In [68]:
productCategory='Jewelry_Patio_Watches_Video_Games_Clothing_Accessories_Health_Automotive_Musical_Instruments'
tfIdfArr=loadmat('/home/hencrice/Downloads/AsterixDBClassData/mixtures/TfIdf_{0}.mat'.format(productCategory))['data']
scores=load('/home/hencrice/Downloads/AsterixDBClassData/mixtures/scores_mixtureOf_{0}.npy'.format(productCategory))

In [69]:
tfIdfArr_trVaSet, tfIdfArr_teSet, scores_trVaSet, scores_teSet = train_test_split(tfIdfArr, scores, test_size=0.1)
hist(scores_teSet)


Out[69]:
(array([ 172.,    0.,   91.,    0.,    0.,  188.,    0.,  181.,    0.,  168.]),
 array([ 1. ,  1.4,  1.8,  2.2,  2.6,  3. ,  3.4,  3.8,  4.2,  4.6,  5. ]),
 <a list of 10 Patch objects>)

Pick hyper-parameters for SGDClassifier using grid search:


In [74]:
hyperParam={'n_iter':range(5, 20, 5),
            # strength of regularization
            'alpha': logspace(-5, -3, 10)
            }
clf = GridSearchCV(SGDClassifier(loss='log', class_weight={1:0.2, 2:0.2, 3:0.65, 4:0.1, 5:0.25}), hyperParam, n_jobs=8, verbose=1)
clf.fit(tfIdfArr_trVaSet, scores_trVaSet)
bestClf=clf.best_estimator_
clf.best_params_


Fitting 3 folds for each of 30 candidates, totalling 90 fits
[Parallel(n_jobs=8)]: Done   1 jobs       | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done  50 jobs       | elapsed:    4.5s
[Parallel(n_jobs=8)]: Done  76 out of  90 | elapsed:    6.7s remaining:    1.2s
[Parallel(n_jobs=8)]: Done  90 out of  90 | elapsed:    7.6s finished
Out[74]:
{'alpha': 1.0000000000000001e-05, 'n_iter': 15}

Prediction accuracy of each class:


In [76]:
te_cm=confusion_matrix(scores_teSet, bestClf.predict(tfIdfArr_teSet))
te_cm.diagonal()/sum(te_cm,1,dtype=float32)


Out[76]:
array([ 0.73255814,  0.05494505,  0.84574468,  0.15469613,  0.75595238])

Save the resulting model:


In [77]:
with open('/home/hencrice/Downloads/AsterixDBClassData/models/clf_mixtureOf_{0}.pkl'.format(productCategory),'wb') as fp:
    cPickle.dump(bestClf, fp, -1)

Train the final mixture:


In [78]:
productCategory='Shoes_Kindle_Store_Tools_Home_Improvement_Industrial_Scientific_Electronics_Baby_Sports_Outdoors_Cell_Phones_Accessories'
tfIdfArr=loadmat('/home/hencrice/Downloads/AsterixDBClassData/mixtures/TfIdf_{0}.mat'.format(productCategory))['data']
scores=load('/home/hencrice/Downloads/AsterixDBClassData/mixtures/scores_mixtureOf_{0}.npy'.format(productCategory))

In [79]:
tfIdfArr_trVaSet, tfIdfArr_teSet, scores_trVaSet, scores_teSet = train_test_split(tfIdfArr, scores, test_size=0.1)
hist(scores_teSet)


Out[79]:
(array([ 171.,    0.,   79.,    0.,    0.,  196.,    0.,  178.,    0.,  176.]),
 array([ 1. ,  1.4,  1.8,  2.2,  2.6,  3. ,  3.4,  3.8,  4.2,  4.6,  5. ]),
 <a list of 10 Patch objects>)

Pick hyper-parameters for SGDClassifier using grid search:


In [84]:
hyperParam={'n_iter':range(5, 20, 5),
            # strength of regularization
            'alpha': logspace(-5, -3, 10)
            }
clf = GridSearchCV(SGDClassifier(loss='log', class_weight={1:0.2, 2:0.2, 3:0.8, 4:0.1, 5:0.25}), hyperParam, n_jobs=8, verbose=1)
clf.fit(tfIdfArr_trVaSet, scores_trVaSet)
bestClf=clf.best_estimator_
clf.best_params_


Fitting 3 folds for each of 30 candidates, totalling 90 fits
[Parallel(n_jobs=8)]: Done   1 jobs       | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done  50 jobs       | elapsed:    5.0s
[Parallel(n_jobs=8)]: Done  76 out of  90 | elapsed:    7.1s remaining:    1.3s
[Parallel(n_jobs=8)]: Done  90 out of  90 | elapsed:    8.2s finished
Out[84]:
{'alpha': 1.0000000000000001e-05, 'n_iter': 15}

Prediction accuracy of each class:


In [85]:
te_cm=confusion_matrix(scores_teSet, bestClf.predict(tfIdfArr_teSet))
te_cm.diagonal()/sum(te_cm,1,dtype=float32)


Out[85]:
array([ 0.64912281,  0.03797468,  0.78571429,  0.12359551,  0.71022727])

Save the resulting model:


In [86]:
with open('/home/hencrice/Downloads/AsterixDBClassData/models/clf_mixtureOf_{0}.pkl'.format(productCategory),'wb') as fp:
    cPickle.dump(bestClf, fp, -1)