In [1]:
%pylab inline
In [16]:
import pandas as pd
from sklearn import svm, model_selection, pipeline, linear_model, preprocessing, feature_selection
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.exceptions import UndefinedMetricWarning
import nltk
from tabulate import tabulate
In [3]:
import sys
sys.path.append("../src/")
import utils
In [4]:
import warnings
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
In [5]:
data = pd.read_csv('~/cloud/data/mvideo/X_train.csv')
data.columns = ['product_id', 'category_level1', 'category_level2', 'brand', 'property',
                'user_name', 'rating', 'date', 'review', 'negative', 'positive']
data['date'] = pd.to_datetime(data.date)
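A quick sanity check of the loaded frame (a sketch added here for orientation; its output is not part of the original run):
In [ ]:
# Peek at the review text and the rounded rating distribution before modelling.
data[['rating', 'review']].head()
data['rating'].round().value_counts().sort_index()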
In [5]:
tmp = TfidfVectorizer().fit_transform(data['review'])
tmp.shape
Out[5]:
In [6]:
tmp = utils.WordNormalizer(['review']).fit_transform(utils.NonAlphaRemover(['review']).fit_transform(data[['rating', 'review']]))
In [ ]:
svm.SVC(kernel='linear').fit(tmp, data.rating.round())
In [ ]:
data.rating
In [6]:
clf = pipeline.Pipeline([('nonalpha', utils.NonAlphaRemover(['review'])),
                         ('wordnorm', utils.WordNormalizer(['review'])),
                         ('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1, 3), stop_words=None)),
                         ('svm', svm.LinearSVC(penalty='l1', multi_class='ovr', C=1, dual=False, class_weight='balanced'))])
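The nonalpha and wordnorm steps come from ../src/utils.py, which is not shown in this notebook. They are assumed to be ordinary scikit-learn transformers roughly along these lines (a hedged sketch, not the actual implementation; WordNormalizer presumably stems or lemmatizes tokens, which would explain the nltk import):
In [ ]:
# Illustrative sketch only: the real classes live in ../src/utils.py.
from sklearn.base import BaseEstimator, TransformerMixin

class NonAlphaRemoverSketch(BaseEstimator, TransformerMixin):
    """Keep only letters and whitespace in the review text."""
    def __init__(self, columns=None):
        self.columns = columns  # kept for signature parity with utils.NonAlphaRemover
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        # X is the review text passed through the pipeline as a pandas Series
        return X.str.replace(r'[^A-Za-zА-Яа-яЁё\s]', ' ', regex=True)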
In [69]:
res = model_selection.cross_validate(clf, data['review'], data['rating'].round(),
                                     cv=model_selection.StratifiedKFold(10, shuffle=True, random_state=42), n_jobs=-1,
                                     scoring=['f1_micro', 'f1_macro', 'f1_weighted'], return_train_score=False)
In [70]:
for k, v in res.items():
    print([k, v.mean()])
In [ ]:
preprocessing.MinMaxScaler()
In [7]:
param_grid = {# 'nonalpha__doit': [True, False],
              # 'wordnorm__doit': [True, False],
              'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
              # 'tfidf__stop_words': [None, nltk.corpus.stopwords.words('russian')],
              'tfidf__max_features': [None, 500, 1000, 2000, 5000, 10000],
              # 'tfidf__sublinear_tf': [True, False],
              # 'tfidf__norm': ['l1', 'l2'],
              'svm__penalty': ['l1', 'l2'],
              'svm__C': logspace(-10, 10, num=20, base=e),
              # 'svm__class_weight': [None, 'balanced']
             }
model = model_selection.RandomizedSearchCV(clf, param_grid, n_iter=100, n_jobs=4, scoring='f1_weighted',
                                           random_state=42, error_score=0, verbose=1,
                                           cv=model_selection.StratifiedKFold(10, shuffle=True, random_state=42))
In [8]:
model.fit(data['review'], data['rating'].round())
Out[8]:
In [9]:
model.best_params_
Out[9]:
In [10]:
model.best_score_
Out[10]:
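Beyond the single best point, the full search history is available in cv_results_; a sketch (not in the original run) for ranking the parameter combinations that were tried:
In [ ]:
# Top parameter combinations by mean weighted F1 across the 10 folds.
(pd.DataFrame(model.cv_results_)
   .sort_values('rank_test_score')
   [['params', 'mean_test_score', 'std_test_score']]
   .head())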
In [13]:
clf_best = clf.set_params(**model.best_params_)
In [ ]:
clf_best
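Before the final cross-validation below, a hedged sketch of a single stratified hold-out evaluation with the tuned pipeline (not part of the original run):
In [ ]:
from sklearn.metrics import classification_report

X_tr, X_te, y_tr, y_te = model_selection.train_test_split(
    data['review'], data['rating'].round(), test_size=0.2,
    stratify=data['rating'].round(), random_state=42)
clf_best.fit(X_tr, y_tr)
print(classification_report(y_te, clf_best.predict(X_te)))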
In [24]:
# Build micro/macro/weighted variants of precision, recall and F1,
# then drop the plain (binary-only) base names.
scoring = ['precision', 'recall', 'f1']
for val in list(scoring):
    scoring.append(val + '_micro')
    scoring.append(val + '_macro')
    scoring.append(val + '_weighted')
scoring = scoring[3:]
In [25]:
scoring
Out[25]:
In [26]:
res = model_selection.cross_validate(clf_best, data['review'], data['rating'].round(),
                                     cv=model_selection.StratifiedKFold(10, shuffle=True, random_state=42), n_jobs=-1,
                                     scoring=scoring, return_train_score=False)
In [27]:
toshow = []
for k, v in res.items():
    toshow.append([k, v.mean()])
In [28]:
print(tabulate(toshow, headers=['metric', 'score']))
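Finally, a sketch of refitting the tuned pipeline on all labelled data and persisting it; the file name below is illustrative, not taken from the original notebook:
In [ ]:
import joblib

clf_best.fit(data['review'], data['rating'].round())
joblib.dump(clf_best, 'svm_rating_pipeline.joblib')  # hypothetical output path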
In [ ]: