In [1]:
%pylab inline
In [149]:
import matplotlib as mpl  # %pylab does not bind `mpl`, so import it explicitly for the rcParams call below
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import linear_model, svm
from sklearn import model_selection
from sklearn import metrics, ensemble, multiclass, naive_bayes
from imblearn import over_sampling, under_sampling, pipeline, combine
from imblearn import ensemble as imb_ensemble  # aliased so it does not shadow sklearn.ensemble above
import tabulate
sns.set_context('poster')
sns.set_style('ticks')
mpl.rcParams['figure.figsize'] = [10.0, 7.0]
In [177]:
import sys
sys.path.append("../src/")
import utils
In [3]:
data = pd.read_pickle('../processed/normalized.pkl.gz')
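The ratings are presumably imbalanced (the classifier below uses `class_weight='balanced'` and imblearn is imported), so a quick look at the label distribution right after loading is useful; a minimal sketch, relying only on the `rating` column used later:
In [ ]:
# Class distribution of the (rounded) ratings; heavy imbalance is what
# motivates class_weight='balanced' and the imblearn imports above.
data.rating.round().astype(int).value_counts().sort_index()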
In [159]:
sample = data.review_alpha_splitted_nostopwords_spellcorrected_normalized
# sample = data[[col for col in data.columns if not 'rating' in col]].sum(1)
In [160]:
vectorizer = TfidfVectorizer(ngram_range=(1, 1), max_features=2000)
In [161]:
X = vectorizer.fit_transform(sample.apply(lambda x: ' '.join(x))).todense()
In [162]:
X.shape
Out[162]:
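A quick sanity check on the vocabulary the vectorizer kept can be worthwhile here; a sketch assuming an older scikit-learn API (newer releases use `get_feature_names_out()`):
In [ ]:
# Inspect a handful of the 2000 unigram features retained by TF-IDF.
feature_names = vectorizer.get_feature_names()
feature_names[:10], feature_names[-10:]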
In [163]:
mapping = {1: 1, 2: 2, 3: 3, 4: 4, 5: 5}  # identity mapping keeps all five rating classes; edit to merge classes
In [164]:
y = data.rating.round().astype(int).replace(mapping)
idx = y > 0  # keep only reviews with a valid (positive) rating
# clf = linear_model.LogisticRegression(penalty='l1', class_weight='balanced', solver='liblinear')
# clf = naive_bayes.BernoulliNB()
# clf = ensemble.GradientBoostingClassifier()
# clf = svm.NuSVC(nu=0.1, kernel='linear', class_weight='balanced')
clf = svm.LinearSVC(penalty='l1', dual=False, class_weight='balanced')
y_pred = model_selection.cross_val_predict(clf, X[idx], y[idx],
n_jobs=-1, cv=model_selection.StratifiedKFold(10, shuffle=True, random_state=42))
print(metrics.classification_report(y[idx], y_pred))
print(metrics.confusion_matrix(y[idx], y_pred))
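The imblearn modules imported above are not exercised in this cell; as an alternative to `class_weight='balanced'`, the same cross-validation can wrap an explicit over-sampler and the classifier in an imblearn pipeline. A sketch only, assuming `RandomOverSampler` is an acceptable resampling strategy here:
In [ ]:
# Resampling happens inside each training fold only, because the sampler
# and classifier are combined in an imblearn pipeline.
imb_clf = pipeline.make_pipeline(
    over_sampling.RandomOverSampler(random_state=42),
    svm.LinearSVC(penalty='l1', dual=False))
y_pred_os = model_selection.cross_val_predict(
    imb_clf, X[idx], y[idx], n_jobs=-1,
    cv=model_selection.StratifiedKFold(10, shuffle=True, random_state=42))
print(metrics.classification_report(y[idx], y_pred_os))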
In [152]:
print(tabulate.tabulate(metrics.confusion_matrix(y[idx], y_pred),
headers=range(1,6), tablefmt='orgtbl', showindex=range(1,6)).replace('+', '|'))
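Since seaborn is already loaded, the same confusion matrix can also be shown as a row-normalized heatmap; a sketch only:
In [ ]:
# Row-normalized confusion matrix (rows = true rating, columns = predicted).
cm = metrics.confusion_matrix(y[idx], y_pred)
sns.heatmap(cm / cm.sum(axis=1, keepdims=True), annot=True, fmt='.2f',
            xticklabels=list(range(1, 6)), yticklabels=list(range(1, 6)))
plt.xlabel('predicted rating')
plt.ylabel('true rating')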
In [ ]: