In [1]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [149]:
import pandas as pd
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import linear_model, svm
from sklearn import model_selection
from sklearn import metrics, ensemble, multiclass, naive_bayes

# NOTE: importing `ensemble` from imblearn here used to shadow sklearn's
# `ensemble` imported above (so e.g. ensemble.GradientBoostingClassifier
# would resolve against the wrong package). Alias imblearn's module instead.
from imblearn import over_sampling, under_sampling, pipeline, combine
from imblearn import ensemble as imb_ensemble

import tabulate

# Plotting defaults: large "poster" context, tick-style axes, 10x7" figures.
sns.set_context('poster')
sns.set_style('ticks')
mpl.rcParams['figure.figsize'] = [10.0, 7.0]

In [177]:
# Make the project's source directory importable, then pull in local helpers.
import sys
sys.path.append("../src/")
import utils

In [3]:
# Load the preprocessed review dataset (gzip-pickled DataFrame produced by an
# upstream normalization step — presumably in ../processed; verify provenance).
data = pd.read_pickle('../processed/normalized.pkl.gz')

In [159]:
# Use the fully normalized review text; each entry is an iterable of token
# strings (they get ' '-joined before vectorization in a later cell).
sample = data.review_alpha_splitted_nostopwords_spellcorrected_normalized
# Alternative input tried: concatenate all non-rating columns instead.
# sample = data[[col for col in data.columns if not 'rating' in col]].sum(1)

In [160]:
# Unigram TF-IDF features, capped at the 2000 most frequent terms.
vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=2000)

In [161]:
# Join each token list back into a single string, fit the TF-IDF vocabulary,
# and densify. Use .toarray() (plain ndarray) rather than .todense(), which
# returns the deprecated np.matrix type that sklearn warns about.
X = vectorizer.fit_transform(sample.apply(lambda x: ' '.join(x))).toarray()

In [162]:
# Sanity check: (n_documents, n_tfidf_features).
X.shape


Out[162]:
(18340, 2000)

In [163]:
# Rating -> class-label mapping. Currently the identity (so .replace(mapping)
# is a no-op); kept as a hook for experimenting with coarser class groupings
# (e.g. {1:1, 2:1, 3:2, 4:3, 5:3}).
mapping = {1:1,2:2,3:3,4:4,5:5}

In [164]:
# Target: round fractional ratings to integer classes, then apply the
# (currently identity) class-grouping map.
y = data.rating.round().astype(int).replace(mapping)
# Keep only rows with a positive rating. NOTE: this mask previously appeared
# BEFORE `y` was assigned, so the cell only ran thanks to a stale `y` left in
# the kernel from out-of-order execution; it must be computed after `y`.
idx = y > 0

# Alternative classifiers tried (kept for reference):
# clf = linear_model.LogisticRegression(penalty='l1', class_weight='balanced', solver='liblinear')
# clf = naive_bayes.BernoulliNB()
# clf = ensemble.GradientBoostingClassifier()
# clf = svm.NuSVC(nu=0.1, kernel='linear', class_weight='balanced')
clf = svm.LinearSVC(penalty='l1', dual=False, class_weight='balanced')

# Out-of-fold predictions via stratified 10-fold CV (seeded for reproducibility),
# then per-class precision/recall/F1 and the raw confusion matrix.
y_pred = model_selection.cross_val_predict(clf, X[idx], y[idx], 
                                           n_jobs=-1, cv=model_selection.StratifiedKFold(10, shuffle=True, random_state=42))
print(metrics.classification_report(y[idx], y_pred))
print(metrics.confusion_matrix(y[idx], y_pred))


             precision    recall  f1-score   support

          1       0.41      0.50      0.45      1700
          2       0.17      0.24      0.20      1025
          3       0.21      0.26      0.24      1500
          4       0.39      0.32      0.36      3159
          5       0.83      0.79      0.81     10956

avg / total       0.63      0.61      0.62     18340

[[ 850  338  260   76  176]
 [ 338  250  235   98  104]
 [ 282  280  394  296  248]
 [ 207  263  474 1021 1194]
 [ 377  341  481 1096 8661]]

In [152]:
# Render the confusion matrix as an org-mode table; swap the '+' corner
# characters for '|' so it pastes cleanly into org/markdown documents.
cm = metrics.confusion_matrix(y[idx], y_pred)
labels = range(1, 6)
table = tabulate.tabulate(cm, headers=labels, tablefmt='orgtbl', showindex=labels)
print(table.replace('+', '|'))


|    |   1 |   2 |   3 |    4 |    5 |
|----|-----|-----|-----|------|------|
|  1 | 850 | 338 | 260 |   76 |  176 |
|  2 | 338 | 250 | 235 |   98 |  104 |
|  3 | 282 | 280 | 394 |  296 |  248 |
|  4 | 207 | 263 | 474 | 1021 | 1194 |
|  5 | 377 | 341 | 481 | 1096 | 8661 |

In [ ]: