In [1]:
# %pylab star-imports numpy and matplotlib.pyplot into the notebook namespace;
# later cells depend on the injected names `where` and `mean`.
# NOTE(review): deprecated in modern IPython — prefer explicit imports plus
# %matplotlib inline, then np.where / np.mean.
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
import pandas as pd
from sklearn import svm, model_selection, pipeline, linear_model, preprocessing, feature_selection, ensemble
from imblearn import pipeline as imbpipeline
from imblearn import over_sampling
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.exceptions import UndefinedMetricWarning, DataConversionWarning
import nltk
from tabulate import tabulate

from sklearn.pipeline import Pipeline

from mlxtend.feature_selection import ColumnSelector
from mlxtend.classifier import StackingClassifier

from gensim.models.word2vec import Word2Vec
from gensim.models.wrappers import FastText
import gensim

In [3]:
# Silence sklearn's DataConversionWarning (raised e.g. when a column vector is
# passed where a 1d array is expected) so the CV logs below stay readable.
import warnings
warnings.filterwarnings('ignore', category=DataConversionWarning)

In [4]:
# Make the project's helper module (../src/utils.py) importable from this notebook.
# NOTE(review): relative path assumes the kernel's cwd is the notebook's folder.
import sys
sys.path.append("../src/")
import utils

In [5]:
# Load the raw training reviews and assign readable column names.
# NOTE(review): hardcoded home-relative path — consider a config constant.
data = pd.read_csv('~/cloud/data/mvideo/X_train.csv')
data.columns = ['product_id', 'category_level1', 'category_level2', 'brand', 'property', 'user_name', 'rating', 'date',
               'review', 'negative', 'positive']
# Parse dates up front so datetime-derived features can be built downstream.
data['date'] = pd.to_datetime(data.date)

In [6]:
# Precomputed meta features, row-aligned with `data` (15587 rows per Out[6]).
# Identifier/label columns are dropped; NaNs become 0 before the float cast.
meta = pd.read_pickle('../processed/meta_features.pkl.gz').fillna(0).drop(['product_id', 'rating_cat'], axis=1).astype(float)
meta.shape


Out[6]:
(15587, 114)

In [7]:
# Precomputed per-review polarity scores (4 columns); missing values treated as neutral 0.
polarity = pd.read_pickle('../processed/polarity.pkl.gz').fillna(0); polarity.shape


Out[7]:
(15587, 4)

In [8]:
# One-hot category dummies — very wide (11896 columns per Out[8]).
categories = pd.read_pickle('../processed/categories_dummies.pkl.gz').fillna(0); categories.shape


Out[8]:
(15587, 11896)

In [9]:
# Column-wise concat of all feature groups; axis=1 aligns on the row index, so
# this relies on every frame sharing the same default RangeIndex.
# NOTE(review): `meta` was already cast to float at load time — the cast here is redundant.
X = pd.concat((data.drop(['product_id', 'rating'], axis=1), 
               meta.astype(float), 
               polarity.astype(float),
              categories.astype(float)), axis=1); X.shape


Out[9]:
(15587, 12023)

In [10]:
# Target: ratings rounded to integer classes.
# NOTE: pandas .round() rounds half to even (banker's rounding), not half up.
y = data.rating.round().astype(int)

In [11]:
# Models: shared base estimators and one pipeline per feature group

In [12]:
# Shared base estimators reused by the per-feature-group pipelines below.
# probability=True is needed so the stacking classifier can consume class
# probabilities (use_probas=True later); class_weight='balanced' counters the
# heavy skew toward rating 5 (see y.value_counts() below).
base_svm = svm.SVC(kernel='linear', C=0.5, class_weight='balanced', probability=True, decision_function_shape='ovr', 
                   random_state=42)
base_gboost = ensemble.GradientBoostingClassifier(n_estimators=100, random_state=42)

In [33]:
# Sanity check: how many category-dummy columns survive a 0.01 variance filter
# (507 of 11896 per Out[33]). `where` is numpy.where from the %pylab namespace;
# it converts the boolean column mask into positional indices for ColumnSelector.
pipeline.Pipeline([('selector', ColumnSelector(where(X.columns.isin(categories.columns))[0])),
                    ('fs', feature_selection.VarianceThreshold(threshold=0.01))]).fit_transform(X.values).shape


Out[33]:
(15587, 507)

In [21]:
# Boosted-tree pipelines, one per numeric feature group. Each selects its
# group's columns from X.values by positional index (numpy.where over column
# membership), optionally filters/scales, then fits the shared base_gboost.
# Oversampling steps are commented out in this run.
meta_gboost = imbpipeline.Pipeline([('selector', ColumnSelector(where(X.columns.isin(meta.columns))[0])),
                                    ('scaler', preprocessing.MinMaxScaler()), 
                                    ('fs', feature_selection.VarianceThreshold(threshold=0.01)),
#                                     ('resample', over_sampling.RandomOverSampler()),
                                    ('gboost', base_gboost)])

polarity_gboost = imbpipeline.Pipeline([('selector', ColumnSelector(where(X.columns.isin(polarity.columns))[0])),
#                                         ('resample', over_sampling.RandomOverSampler()),
                                        ('gboost', base_gboost)])

# NOTE(review): variance threshold here is 0.1 vs 0.01 for meta above — confirm
# the stricter cut on category dummies is intentional.
cat_gboost = imbpipeline.Pipeline([('selector', ColumnSelector(where(X.columns.isin(categories.columns))[0])),
                                    ('fs', feature_selection.VarianceThreshold(threshold=0.1)),
                                    ('gboost', base_gboost)])


# Text pipelines: select the raw `review` column, strip non-alphabetic
# characters and normalize words (project utils), then TF-IDF with unigrams
# and bigrams into the shared SVM. The second variant additionally random-
# oversamples minority rating classes before fitting (imblearn pipeline).
tfidf_svm = pipeline.Pipeline([('selector', ColumnSelector(where(X.columns.isin(['review']))[0])),
                         ('nonalpha', utils.NonAlphaRemover(['review'])),
                         ('wordnorm', utils.WordNormalizer(['review'])),
                         ('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1,2), stop_words=None)),
                         ('svm', base_svm)])

tfidf_over_svm = imbpipeline.Pipeline([('selector', ColumnSelector(where(X.columns.isin(['review']))[0])),
                         ('nonalpha', utils.NonAlphaRemover(['review'])),
                         ('wordnorm', utils.WordNormalizer(['review'])),
                         ('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1,2), stop_words=None)),
                         ('resample', over_sampling.RandomOverSampler()),
                         ('svm', base_svm)])

# Pretrained fastText embeddings (wiki, 256-dim) feeding a TF-IDF-weighted
# embedding vectorizer (project utils) into the shared SVM.
# NOTE(review): load_fasttext_format is usually called on the class, not an
# instance; calling it on the throwaway instance works in older gensim but the
# first FastText() object is discarded — confirm gensim version / intent.
w2v_model = FastText()
w2v_model = w2v_model.load_fasttext_format("../external/wiki_w2v_256.bin")
# w2v = {w: vec for w, vec in zip(w2v_model.wv.index2word, w2v_model.wv.syn0)}

tfidf_word2vec_wiki_svm = pipeline.Pipeline([('selector', ColumnSelector(where(X.columns.isin(['review']))[0])),
                        ("word2vec", utils.Word2VecTfidfEmbeddingVectorizer(w2v_model)), 
                        ('svm', base_svm)])



# w2v_model = gensim.models.KeyedVectors.load_word2vec_format("../external/news_rusvectores2.bin.gz", binary=True)
# w2v = {w: vec for w, vec in zip(w2v_model.wv.index2word, w2v_model.wv.syn0)}


# tfidf_word2vec_news_svm = pipeline.Pipeline([('selector', ColumnSelector(where(X.columns.isin(['review']))[0])),
#                         ("word2vec", utils.TfidfEmbeddingVectorizer(w2v)), 
#                         ('svm', svm.SVC(kernel='linear', C=0.5, class_weight='balanced', probability=True))])

# w2v_model = gensim.models.KeyedVectors.load_word2vec_format("../external/araneum_1_600_2.bin.gz", binary=True)
# w2v = {w: vec for w, vec in zip(w2v_model.wv.index2word, w2v_model.wv.syn0)}

# tfidf_word2vec_araneum_svm = pipeline.Pipeline([('selector', ColumnSelector(where(X.columns.isin(['review']))[0])),
#                         ("word2vec", utils.TfidfEmbeddingVectorizer(w2v)), 
#                         ('svm', svm.SVC(kernel='linear', C=0.5, class_weight='balanced', probability=True))])

# cols_non_digit = ['category_level1', 'category_level2', 'brand', 'property', 'user_name', 'rating', 'date',
#                'review', 'negative', 'positive']

In [22]:
# Stack the per-feature-group models: each base classifier's class
# probabilities (use_probas=True; kept separate, not averaged) become the
# meta-features for a class-balanced logistic regression.
step_clfs = [meta_gboost, polarity_gboost, cat_gboost, tfidf_svm, #tfidf_over_svm,
             tfidf_word2vec_wiki_svm]#tfidf_word2vec_news_svm, tfidf_word2vec_araneum_svm]
sclf = StackingClassifier(classifiers=step_clfs, verbose=1,
                          use_probas=True,
                          average_probas=False,
                          meta_classifier=linear_model.LogisticRegression(class_weight='balanced'))

In [23]:
# Metrics collected per CV fold: F1 / precision / recall at micro, macro and
# weighted averaging, plus regression-style error metrics on the integer ratings.
scoring = (
    ['{}_{}'.format(metric, avg)
     for metric in ('f1', 'precision', 'recall')
     for avg in ('micro', 'macro', 'weighted')]
    + ['neg_mean_absolute_error', 'neg_mean_squared_error']
    # dropped from this run: 'explained_variance', 'neg_mean_squared_log_error',
    # 'neg_median_absolute_error', 'r2'
)

In [24]:
# 10-fold stratified CV of the full stack. n_jobs=1 keeps the heavy base
# pipelines in-process. (This run was interrupted — see the KeyboardInterrupt
# traceback below; `result` is therefore not defined by this cell.)
result = model_selection.cross_validate(sclf, X.values, y.values, 
                                        cv=model_selection.StratifiedKFold(10, shuffle=True, random_state=42), n_jobs=1, 
                                        scoring=scoring, return_train_score=False, verbose=3)


[CV]  ................................................................
Fitting 5 classifiers...
Fitting classifier1: pipeline (1/5)
Fitting classifier2: pipeline (2/5)
Fitting classifier3: pipeline (3/5)
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-24-b13c69cc6859> in <module>()
      1 result = model_selection.cross_validate(sclf, X.values, y.values, 
      2                                         cv=model_selection.StratifiedKFold(10, shuffle=True, random_state=42), n_jobs=1,
----> 3                                         scoring=scoring, return_train_score=False, verbose=3)

~/anaconda/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score)
    193             fit_params, return_train_score=return_train_score,
    194             return_times=True)
--> 195         for train, test in cv.split(X, y, groups))
    196 
    197     if return_train_score:

~/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
    777             # was dispatched. In particular this covers the edge
    778             # case of Parallel used with an exhausted iterator.
--> 779             while self.dispatch_one_batch(iterator):
    780                 self._iterating = True
    781             else:

~/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
    623                 return False
    624             else:
--> 625                 self._dispatch(tasks)
    626                 return True
    627 

~/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
    586         dispatch_timestamp = time.time()
    587         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588         job = self._backend.apply_async(batch, callback=cb)
    589         self._jobs.append(job)
    590 

~/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
    109     def apply_async(self, func, callback=None):
    110         """Schedule a func to be run"""
--> 111         result = ImmediateResult(func)
    112         if callback:
    113             callback(result)

~/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
    330         # Don't delay the application, to avoid keeping the input
    331         # arguments in memory
--> 332         self.results = batch()
    333 
    334     def get(self):

~/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

~/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

~/anaconda/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
    435             estimator.fit(X_train, **fit_params)
    436         else:
--> 437             estimator.fit(X_train, y_train, **fit_params)
    438 
    439     except Exception as e:

~/anaconda/lib/python3.6/site-packages/mlxtend/classifier/stacking_classification.py in fit(self, X, y)
    113                 print(_name_estimators((clf,))[0][1])
    114 
--> 115             clf.fit(X, y)
    116 
    117         meta_features = self._predict_meta_features(X)

~/anaconda/lib/python3.6/site-packages/imblearn/pipeline.py in fit(self, X, y, **fit_params)
    240 
    241         """
--> 242         Xt, yt, fit_params = self._fit(X, y, **fit_params)
    243         if self._final_estimator is not None:
    244             self._final_estimator.fit(Xt, yt, **fit_params)

~/anaconda/lib/python3.6/site-packages/imblearn/pipeline.py in _fit(self, X, y, **fit_params)
    199                     Xt, fitted_transformer = fit_transform_one_cached(
    200                         cloned_transformer, None, Xt, yt,
--> 201                         **fit_params_steps[name])
    202                 elif hasattr(cloned_transformer, "sample"):
    203                     Xt, yt, fitted_transformer = fit_sample_one_cached(

~/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/memory.py in __call__(self, *args, **kwargs)
    360 
    361     def __call__(self, *args, **kwargs):
--> 362         return self.func(*args, **kwargs)
    363 
    364     def call_and_shelve(self, *args, **kwargs):

~/anaconda/lib/python3.6/site-packages/imblearn/pipeline.py in _fit_transform_one(transformer, weight, X, y, **fit_params)
    591                        **fit_params):
    592     if hasattr(transformer, 'fit_transform'):
--> 593         res = transformer.fit_transform(X, y, **fit_params)
    594     else:
    595         res = transformer.fit(X, y, **fit_params).transform(X)

~/anaconda/lib/python3.6/site-packages/mlxtend/feature_selection/column_selector.py in fit_transform(self, X, y)
     42 
     43         """
---> 44         return self.transform(X=X, y=y)
     45 
     46     def transform(self, X, y=None):

~/anaconda/lib/python3.6/site-packages/mlxtend/feature_selection/column_selector.py in transform(self, X, y)
     60 
     61         """
---> 62         t = X[:, self.cols]
     63         if len(t.shape) == 1:
     64             t = t[:, np.newaxis]

KeyboardInterrupt: 

In [ ]:
# Average each collected CV metric across folds and print an aligned summary
# table (`mean` comes from the %pylab namespace, i.e. numpy.mean).
toprint = [[metric_name, mean(fold_scores)] for metric_name, fold_scores in result.items()]
print(tabulate(toprint, headers=['Metric', 'Average K-Fold']))

In [ ]:


In [20]:
# Alternative to stacking: a single FeatureUnion of TF-IDF text features,
# min-max-scaled meta features, and polarity scores (category dummies are
# commented out), feeding one downstream classifier.
union = pipeline.FeatureUnion([('tfidf', Pipeline([('selector', ColumnSelector(where(X.columns.isin(['review']))[0])),
                                                  ('nonalpha', utils.NonAlphaRemover(['review'])),
                                                  ('wordnorm', utils.WordNormalizer(['review'])),
                                                  ('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1,2)))])),
                               ('metaf', Pipeline([('selector', ColumnSelector(where(X.columns.isin(meta.columns))[0])),
                                                  ('scaler', preprocessing.MinMaxScaler()), 
                                                  ('fs', feature_selection.VarianceThreshold(threshold=0.1))])),
                               ('polar', Pipeline([('selector', ColumnSelector(where(X.columns.isin(polarity.columns))[0])),
                                                   ('fs', feature_selection.VarianceThreshold(threshold=0.1))])),
#                                ('categ', Pipeline([('selector', ColumnSelector(where(X.columns.isin(categories.columns))[0])),
#                                                   ('fs', feature_selection.VarianceThreshold(threshold=0.1))]))
                              ])

In [21]:
# Linear SVM over the unioned features (no probability estimates needed here,
# so the faster LinearSVC is used instead of SVC).
model = Pipeline([('union', union), ('clf', svm.LinearSVC(multi_class='ovr', C=0.6, class_weight='balanced', 
                                                          random_state=42))])

In [ ]:
# 10-fold stratified CV of the union + LinearSVC model. Per-fold scores are
# printed below (~4 min/fold); only two folds' output is shown for this run.
result = model_selection.cross_validate(model, X.values, y.values, 
                                        cv=model_selection.StratifiedKFold(10, shuffle=True, random_state=42), n_jobs=1, 
                                        scoring=scoring, return_train_score=False, verbose=3)


[CV]  ................................................................
[CV]  , f1_micro=0.6576923076923077, f1_macro=0.4244570724172136, f1_weighted=0.6395957143492131, precision_micro=0.6576923076923077, precision_macro=0.4331187725246489, precision_weighted=0.625988429650122, recall_micro=0.6576923076923077, recall_macro=0.4245666956679329, recall_weighted=0.6576923076923077, neg_mean_absolute_error=-0.5288461538461539, neg_mean_squared_error=-1.0826923076923076, total= 4.1min
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.2min remaining:    0.0s
[CV]  ................................................................
[CV]  , f1_micro=0.675, f1_macro=0.4344067331159778, f1_weighted=0.6522872596110262, precision_micro=0.675, precision_macro=0.4499493858081565, precision_weighted=0.6377278462129062, recall_micro=0.675, recall_macro=0.4328676040030068, recall_weighted=0.675, neg_mean_absolute_error=-0.5025641025641026, neg_mean_squared_error=-1.0487179487179488, total= 4.0min
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  8.2min remaining:    0.0s

In [42]:
# Same feature union, but with gradient boosting as the final classifier.
model = Pipeline([('union', union), ('clf', base_gboost)])

In [43]:
# Cheaper 3-fold CV for the boosted variant (~7 min per fold, per the log below).
result = model_selection.cross_validate(model, X.values, y.values, 
                                        cv=model_selection.StratifiedKFold(3, shuffle=True, random_state=42), n_jobs=1, 
                                        scoring=scoring, return_train_score=False, verbose=3)


[CV]  ................................................................
[CV]  , f1_micro=0.6338976529434398, f1_macro=0.3047014626373738, f1_weighted=0.5546328641956374, precision_micro=0.6338976529434398, precision_macro=0.4303048089224098, precision_weighted=0.5568678326192937, recall_micro=0.6338976529434398, recall_macro=0.2947903538265619, recall_weighted=0.6338976529434398, neg_mean_absolute_error=-0.7079646017699115, neg_mean_squared_error=-1.8407079646017699, total= 7.3min
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  7.4min remaining:    0.0s
[CV]  ................................................................
[CV]  , f1_micro=0.6304138594802695, f1_macro=0.2964404584088821, f1_weighted=0.5479284246703516, precision_micro=0.6304138594802695, precision_macro=0.4232679443621123, precision_weighted=0.5485782143430727, recall_micro=0.6304138594802695, recall_macro=0.2898385932140076, recall_weighted=0.6304138594802695, neg_mean_absolute_error=-0.7195380173243503, neg_mean_squared_error=-1.8756496631376323, total= 7.3min
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 14.8min remaining:    0.0s
[CV]  ................................................................
[CV]  , f1_micro=0.6370812475933769, f1_macro=0.3159084041002867, f1_weighted=0.5632094163155642, precision_micro=0.6370812475933769, precision_macro=0.4471225520612476, precision_weighted=0.5643720631602934, recall_micro=0.6370812475933769, recall_macro=0.30382204436534666, recall_weighted=0.6370812475933769, neg_mean_absolute_error=-0.702926453600308, neg_mean_squared_error=-1.8234501347708896, total= 7.4min
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 22.4min finished

In [19]:
# Text-only baseline: TF-IDF of the raw review text into an ElasticNet.
# NOTE(review): ElasticNet is a regressor, while `scoring` includes
# classification metrics (f1_*, precision_*, recall_*) — those scorers are not
# meaningful on continuous predictions and may error; confirm intent.
# (This run was interrupted — see the traceback below.)
model = pipeline.Pipeline([('nonalpha', utils.NonAlphaRemover(['review'])),
                         ('wordnorm', utils.WordNormalizer(['review'])),
                         ('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1,2))),
#                          ('svm', svm.LinearSVC(multi_class='ovr', C=0.6, class_weight='balanced'))
                         ('elastic', linear_model.ElasticNet(alpha=0.001, l1_ratio=0.01))
                          ])
result = model_selection.cross_validate(model, data['review'], y.values, 
                                        cv=model_selection.StratifiedKFold(10, shuffle=True, random_state=42), n_jobs=-1, 
                                        scoring=scoring, return_train_score=False, verbose=3)


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-19-8d22b9607ee9> in <module>()
      7 result = model_selection.cross_validate(model, data['review'], y.values, 
      8                                         cv=model_selection.StratifiedKFold(10, shuffle=True, random_state=42), n_jobs=-1,
----> 9                                         scoring=scoring, return_train_score=False, verbose=3)

~/anaconda/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score)
    193             fit_params, return_train_score=return_train_score,
    194             return_times=True)
--> 195         for train, test in cv.split(X, y, groups))
    196 
    197     if return_train_score:

~/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
    787                 # consumption.
    788                 self._iterating = False
--> 789             self.retrieve()
    790             # Make sure that we get a last message telling us we are done
    791             elapsed_time = time.time() - self._start_time

~/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in retrieve(self)
    697             try:
    698                 if getattr(self._backend, 'supports_timeout', False):
--> 699                     self._output.extend(job.get(timeout=self.timeout))
    700                 else:
    701                     self._output.extend(job.get())

~/anaconda/lib/python3.6/multiprocessing/pool.py in get(self, timeout)
    636 
    637     def get(self, timeout=None):
--> 638         self.wait(timeout)
    639         if not self.ready():
    640             raise TimeoutError

~/anaconda/lib/python3.6/multiprocessing/pool.py in wait(self, timeout)
    633 
    634     def wait(self, timeout=None):
--> 635         self._event.wait(timeout)
    636 
    637     def get(self, timeout=None):

~/anaconda/lib/python3.6/threading.py in wait(self, timeout)
    549             signaled = self._flag
    550             if not signaled:
--> 551                 signaled = self._cond.wait(timeout)
    552             return signaled
    553 

~/anaconda/lib/python3.6/threading.py in wait(self, timeout)
    293         try:    # restore state no matter what (e.g., KeyboardInterrupt)
    294             if timeout is None:
--> 295                 waiter.acquire()
    296                 gotit = True
    297             else:

KeyboardInterrupt: 

In [17]:
# Class balance check: ratings are heavily skewed toward 5 (see Out[17]),
# which motivates the class_weight='balanced' / oversampling choices above.
y.value_counts()


Out[17]:
5    9289
4    2679
1    1474
3    1276
2     869
Name: rating, dtype: int64

In [15]:
# Summarize the last cross-validation run: average every metric over its folds
# and render an aligned table (`mean` is numpy.mean via %pylab).
toprint = [[metric_name, mean(fold_scores)] for metric_name, fold_scores in result.items()]

print(tabulate(toprint, headers=['Metric', 'Average K-Fold']))


Metric                          Average K-Fold
----------------------------  ----------------
fit_time                             61.4599
score_time                           59.379
test_f1_micro                         0.665556
test_f1_macro                         0.410998
test_f1_weighted                      0.63986
test_precision_micro                  0.665556
test_precision_macro                  0.436835
test_precision_weighted               0.626508
test_recall_micro                     0.665556
test_recall_macro                     0.413511
test_recall_weighted                  0.665556
test_neg_mean_absolute_error         -0.539871
test_neg_mean_squared_error          -1.16847

In [ ]: