In [1]:
# NOTE(review): %pylab wildcard-imports numpy and matplotlib into the global
# namespace (the bare `logspace`, `e` and `copy` used in later cells come from
# here). Prefer explicit `import numpy as np` + `%matplotlib inline` in new code.
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [16]:
import pandas as pd
from sklearn import svm, model_selection, pipeline, linear_model, preprocessing, feature_selection
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.exceptions import UndefinedMetricWarning
import nltk
from tabulate import tabulate

In [3]:
# Make the project's source directory importable, then load the shared
# helpers (NonAlphaRemover / WordNormalizer live in ../src/utils.py).
import sys
sys.path.append("../src/")
import utils

In [4]:
# Silence sklearn's UndefinedMetricWarning during cross-validation
# (emitted when a fold predicts no samples for some class, making
# precision/recall ill-defined for that class).
import warnings
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

In [5]:
# Load the labelled review dataset and assign explicit column names.
# NOTE(review): path is hardcoded to the user's home directory — not portable;
# consider a configurable DATA_DIR.
data = pd.read_csv('~/cloud/data/mvideo/X_train.csv')
data.columns = ['product_id', 'category_level1', 'category_level2', 'brand', 'property', 'user_name', 'rating', 'date',
               'review', 'negative', 'positive']
# Parse the date strings into proper datetimes.
data['date'] = pd.to_datetime(data.date)

In [5]:
# Quick sanity check: vocabulary size of a default TF-IDF over the raw
# reviews (Out[5]: 15587 documents x 55458 terms).
tmp = TfidfVectorizer().fit_transform(data['review'])
tmp.shape


Out[5]:
(15587, 55458)

In [6]:
# Exploratory: run the custom cleaning transformers on the review column
# (non-alpha removal, then word normalisation — implementations live in
# ../src/utils.py and are not visible here).
tmp = utils.WordNormalizer(['review']).fit_transform(utils.NonAlphaRemover(['review']).fit_transform(data[['rating', 'review']]))

In [ ]:
# Exploratory cell (never executed — In [ ]): fit a linear SVC on the
# cleaned frame. NOTE(review): `tmp` at this point is a DataFrame still
# containing text, not a numeric matrix — presumably this raises; verify
# before re-running, or delete the cell.
svm.SVC(kernel='linear').fit(tmp, data.rating.round())

In [ ]:
# Inspect the raw (fractional) rating column; later cells round it to
# integer class labels.
data.rating

In [6]:
# Full model pipeline: custom text cleaning -> word-level 1-3gram TF-IDF ->
# L1-regularised linear SVM (one-vs-rest, balanced class weights).
clf = pipeline.Pipeline([('nonalpha', utils.NonAlphaRemover(['review'])),
                         ('wordnorm', utils.WordNormalizer(['review'])),
                         ('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1,3), stop_words=None)),
                         ('svm', svm.LinearSVC(penalty='l1', multi_class='ovr', C=1, dual=False, class_weight='balanced'))])

In [69]:
# Baseline: 10-fold stratified CV of the default pipeline, predicting the
# rounded rating, scored with three F1 averagings.
res = model_selection.cross_validate(clf, data['review'], data['rating'].round(), 
                                     cv=model_selection.StratifiedKFold(10, shuffle=True, random_state=42), n_jobs=-1, 
                                     scoring=['f1_micro', 'f1_macro', 'f1_weighted'], return_train_score=False)

In [70]:
# Print each cross-validation metric with its mean across folds, as
# a [name, value] pair (same plain-list format as the original).
for metric_name in res:
    print([metric_name, res[metric_name].mean()])


['fit_time', 52.932166481018065]
['score_time', 16.82380223274231]
['test_f1_micro', 0.62013310614770278]
['test_f1_macro', 0.42482592868066327]
['test_f1_weighted', 0.6304038514314273]

In [ ]:
# Dead exploratory cell (never executed — In [ ]): constructs a scaler and
# discards the result. NOTE(review): candidate for deletion.
preprocessing.MinMaxScaler()

In [16]:



Out[16]:
array([  4.53999298e-05,   1.30079023e-04,   3.72699966e-04,
         1.06785292e-03,   3.05959206e-03,   8.76628553e-03,
         2.51169961e-02,   7.19647439e-02,   2.06192028e-01,
         5.90777514e-01,   1.69268460e+00,   4.84984802e+00,
         1.38956932e+01,   3.98136782e+01,   1.14073401e+02,
         3.26840958e+02,   9.36458553e+02,   2.68312340e+03,
         7.68763460e+03,   2.20264658e+04])

In [7]:
# Hyper-parameter search space for the TF-IDF + LinearSVC pipeline.
# Commented-out entries were explored in earlier runs and dropped.
param_grid = {#'nonalpha__doit': [True, False],
              #'wordnorm__doit': [True, False],
              'tfidf__ngram_range': [(1,1), (1,2), (1,3), (1,4)],
#               'tfidf__stop_words': [None, nltk.corpus.stopwords.words('russian')],
              'tfidf__max_features': [None, 500,1000,2000,5000, 10000],
#               'tfidf__sublinear_tf': [True, False],
#               'tfidf__norm': ['l1', 'l2'],
              'svm__penalty': ['l1', 'l2'],
              # 20 C values log-spaced over e^-10..e^10
              # (logspace/e come from the %pylab namespace, i.e. numpy)
              'svm__C': logspace(-10, 10, num=20, base=e),
#               'svm__class_weight': [None, 'balanced']
             }
# 100 random draws from the grid, 10-fold stratified CV, weighted-F1 scoring.
# FIX(review): removed a stray keyboard-mash token ("fynjybyf") that sat
# inside this call in the original cell and made it a SyntaxError.
model = model_selection.RandomizedSearchCV(clf, param_grid, n_iter=100, n_jobs=4, scoring='f1_weighted',
                                           random_state=42, error_score=0, verbose=1,
                                           cv=model_selection.StratifiedKFold(10, shuffle=True, random_state=42))

In [8]:
# Run the randomized search (expensive: ~10.5 hours total per the
# Parallel log below).
model.fit(data['review'], data['rating'].round())


Fitting 10 folds for each of 100 candidates, totalling 1000 fits
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  7.0min
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed: 31.4min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed: 460.0min
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed: 533.7min
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed: 632.6min finished
Out[8]:
RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=42, shuffle=True),
          error_score=0,
          estimator=Pipeline(memory=None,
     steps=[('nonalpha', NonAlphaRemover(cols=['review'], doit=True)), ('wordnorm', WordNormalizer(cols=['review'], doit=True)), ('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, ...ax_iter=1000,
     multi_class='ovr', penalty='l1', random_state=None, tol=0.0001,
     verbose=0))]),
          fit_params=None, iid=True, n_iter=100, n_jobs=4,
          param_distributions={'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)], 'tfidf__max_features': [None, 500, 1000, 2000, 5000, 10000], 'svm__penalty': ['l1', 'l2'], 'svm__C': array([  4.53999e-05,   1.30079e-04,   3.72700e-04,   1.06785e-03,
         3.05959e-03,   8.76629e-03,   2.51170e-02,   7...,   1.14073e+02,   3.26841e+02,
         9.36459e+02,   2.68312e+03,   7.68763e+03,   2.20265e+04])},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score=True, scoring='f1_weighted', verbose=1)

In [9]:
# Best hyper-parameters found by the search (see Out[9]).
model.best_params_


Out[9]:
{'svm__C': 0.59077751390123123,
 'svm__penalty': 'l2',
 'tfidf__max_features': None,
 'tfidf__ngram_range': (1, 2)}

In [10]:
# Best mean cross-validated weighted-F1 achieved during the search.
model.best_score_


Out[10]:
0.64352777323448507

In [13]:
# Configure the pipeline with the winning hyper-parameters.
# NOTE(review): Pipeline.set_params mutates `clf` in place and returns it,
# so `clf_best` and `clf` are the same object afterwards.
clf_best = clf.set_params(**model.best_params_)

In [ ]:
# Display the tuned pipeline (never executed — In [ ]).
clf_best

In [24]:
# Build the nine averaged scorer names expected by cross_validate:
# 'precision_micro', ..., 'f1_weighted'.
# FIX(review): the original grew the list in place while iterating over
# copy(scoring) — where `copy` is numpy.copy leaked from %pylab — and then
# sliced off the three base names. Replaced with a direct comprehension
# that yields the identical list in the identical order.
scoring = [metric + '_' + average
           for metric in ['precision', 'recall', 'f1']
           for average in ['micro', 'macro', 'weighted']]

In [25]:
# Show the assembled scorer names (see Out[25]).
scoring


Out[25]:
['precision_micro',
 'precision_macro',
 'precision_weighted',
 'recall_micro',
 'recall_macro',
 'recall_weighted',
 'f1_micro',
 'f1_macro',
 'f1_weighted']

In [ ]:


In [26]:
# Final evaluation: 10-fold stratified CV of the tuned pipeline over all
# nine precision/recall/F1 averagings.
res = model_selection.cross_validate(clf_best, data['review'], data['rating'].round(), 
                                     cv=model_selection.StratifiedKFold(10, shuffle=True, random_state=42), n_jobs=-1, 
                                     scoring=scoring, return_train_score=False)

In [27]:
# Collect [metric name, mean CV score] rows for the summary table.
toshow = [[metric, scores.mean()] for metric, scores in res.items()]

In [28]:
# Render the cross-validation summary as a plain-text table.
print(tabulate(toshow, headers=['metric', 'score']))


metric                        score
-----------------------  ----------
fit_time                  54.5067
score_time               353.617
test_precision_micro       0.667544
test_precision_macro       0.446301
test_precision_weighted    0.628906
test_recall_micro          0.667544
test_recall_macro          0.417641
test_recall_weighted       0.667544
test_f1_micro              0.667544
test_f1_macro              0.423346
test_f1_weighted           0.643529

In [ ]: