In [1]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [26]:
import nltk
import seaborn as sns

import pandas as pd

from sklearn import svm, model_selection, pipeline, linear_model, preprocessing, feature_selection, ensemble

from imblearn import pipeline as imbpipeline
from imblearn import over_sampling

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.exceptions import UndefinedMetricWarning, DataConversionWarning
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import make_scorer, confusion_matrix
from sklearn.pipeline import Pipeline, FeatureUnion

from mlxtend.feature_selection import ColumnSelector
from mlxtend.regressor import StackingRegressor, StackingCVRegressor
from mlxtend.preprocessing import DenseTransformer

from gensim.models.word2vec import Word2Vec
from gensim.models.wrappers import FastText
import gensim

import pymystem3

from tabulate import tabulate

from lightgbm.sklearn import LGBMRegressor, LGBMClassifier
from xgboost import XGBRegressor, XGBClassifier

In [3]:
import warnings
# Ignore every DataConversionWarning raised from this point on.
warnings.filterwarnings('ignore', category=DataConversionWarning)

In [4]:
import sys
# Extend the import path with ../src/ so that `utils` (presumably located
# there) can be imported below.
sys.path.append("../src/")
import utils

In [5]:
def _rmse(y_true, y_pred):
    """Root mean squared error: the square root of ``mse(y_true, y_pred)``."""
    return sqrt(mse(y_true, y_pred))


# Scorers used to report cross-validation errors. Both rely on make_scorer's
# defaults, so the values are reported as plain (positive) errors.
# NOTE(review): pass greater_is_better=False before using these for model
# selection, where a lower error must rank higher.
rmse_error = make_scorer(_rmse)
mae_error = make_scorer(mae)

In [6]:
# Load the raw training reviews and replace the header with explicit names.
data = pd.read_csv('~/cloud/data/mvideo/X_train.csv')
data.columns = [
    'product_id', 'category_level1', 'category_level2', 'brand', 'property',
    'user_name', 'rating', 'date', 'review', 'negative', 'positive',
]
# Parse the date column into pandas datetimes.
data['date'] = pd.to_datetime(data['date'])

In [7]:
# Precomputed meta features: missing values become 0, the identifier and
# `rating_cat` columns are dropped, and everything is cast to float.
# NOTE(review): read_pickle executes arbitrary code; only load trusted files.
meta = (
    pd.read_pickle('../processed/meta_features.pkl.gz')
    .fillna(0)
    .drop(['product_id', 'rating_cat'], axis=1)
    .astype(float)
)
meta.shape


Out[7]:
(15587, 114)

In [8]:
# Precomputed polarity features; missing values are replaced with 0.
polarity = pd.read_pickle('../processed/polarity.pkl.gz').fillna(0)
polarity.shape


Out[8]:
(15587, 4)

In [9]:
# Precomputed category dummy columns; missing values are replaced with 0.
categories = pd.read_pickle('../processed/categories_dummies.pkl.gz').fillna(0)
categories.shape


Out[9]:
(15587, 11896)

In [10]:
# Display the column names of `data`.
data.columns


Out[10]:
Index(['product_id', 'category_level1', 'category_level2', 'brand', 'property',
       'user_name', 'rating', 'date', 'review', 'negative', 'positive'],
      dtype='object')

In [11]:
# Assemble the full feature frame column-wise: the raw review text followed
# by the meta, polarity and category features.
# NOTE(review): `meta` is cast to int here, which truncates any fractional
# meta feature -- confirm that this is intended.
X = pd.concat(
    [
        data[['review']],
        meta.astype(int),
        polarity.astype(float),
        categories.astype(int),
    ],
    axis=1,
)
X.shape


Out[11]:
(15587, 12015)

In [12]:
# Scaler for the target; kept in a variable so predictions can later be
# mapped back with its inverse_transform.
y_pre = preprocessing.MinMaxScaler()

In [13]:
# Fit the scaler on the rating column and keep the scaled target. The
# double-bracket selection passes `rating` as a one-column frame.
y = y_pre.fit_transform(data[['rating']])

In [14]:
# Tests

In [15]:
# Metric name -> scorer mapping handed to cross_validate.
scoring = {"RMSE": rmse_error, "MAE": mae_error}

In [16]:
class FlattenTransformer:
    """Stateless pipeline step that flattens its input.

    ``fit`` learns nothing; ``transform`` returns ``X.flatten()``.
    """

    def transform(self, X, y=None):
        """Return ``X.flatten()``; ``y`` is accepted but ignored."""
        flattened = X.flatten()
        return flattened

    def fit(self, X, y=None):
        """No-op fit that returns the transformer itself."""
        return self

In [17]:
# X = data.copy()
# X = pd.get_dummies(X, columns=['brand', 'category_level1', 'category_level2'])
# X["date"] = pd.to_datetime(X["date"]).astype(int)
# X.drop(['user_name', 'rating', 'product_id', "negative", "positive", "property"], inplace=True, axis=1)

In [18]:
# Load the review word vectors from the text (non-binary) word2vec file;
# undecodable bytes are ignored rather than raising.
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
    '../external/reviews_w2v.vec',
    binary=False,
    unicode_errors='ignore',
)

In [19]:
# Stop-word sets keyed by language. Filled on first use so the list is fetched
# from NLTK once instead of once per candidate word.
_STOPWORDS_CACHE = {}


def _stopwords(language='russian'):
    """Return the NLTK stop words for ``language`` as a cached frozenset."""
    if language not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE[language] = frozenset(nltk.corpus.stopwords.words(language))
    return _STOPWORDS_CACHE[language]


def parse_line(x, w2v, stemmer):
    """Turn one review text into a list of word vectors.

    The text is lemmatized with ``stemmer.lemmatize``, stripped of every
    character that is neither alphabetic nor a space, and split on single
    spaces. A word contributes a vector when it is non-empty, present in
    ``w2v.vocab`` and not a Russian stop word.

    Parameters
    ----------
    x : str
        Raw review text.
    w2v : object
        Word-vector model exposing ``vocab`` and ``wv.word_vec``.
    stemmer : object
        Lemmatizer whose ``lemmatize(text)`` returns an iterable of strings.

    Returns
    -------
    list
        One unnormalized vector per kept word. When no word is kept, a list
        holding a single zero vector of length ``w2v.vector_size`` (100 if
        the model does not expose that attribute), so callers can always
        average the result.
    """
    # Periods become spaces so that sentence boundaries also separate words.
    lemmas = ''.join(stemmer.lemmatize(x.replace('.', '  ')))
    cleaned = ''.join(ch for ch in lemmas if ch.isalpha() or ch == ' ')
    # The vocabulary test comes first so that the stop-word list is only
    # consulted for words that could actually produce a vector.
    out = [w2v.wv.word_vec(w, use_norm=False)
           for w in cleaned.split(" ")
           if w != '' and w in w2v.vocab and w not in _stopwords()]
    if len(out) == 0:
        # Match the model's dimensionality instead of assuming 100.
        return [zeros(getattr(w2v, 'vector_size', 100))]
    return out

class Stemmer:
    """Pipeline step mapping each input row to the mean of its word vectors.

    Stores a word-vector model and a lemmatizer; both are handed to
    ``parse_line`` for every row.
    """

    def __init__(self, w2v, stemmer):
        self.stemmer = stemmer
        self.w2v = w2v

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return one averaged vector per row; the text is read from ``row[0]``."""
        averaged = []
        for row in X:
            vectors = parse_line(row[0], self.w2v, self.stemmer)
            averaged.append(mean(vectors, axis=0))
        return averaged

    def fit_transform(self, X, y=None):
        """Equivalent to ``transform``, since fitting is a no-op."""
        return self.transform(X, y)

In [44]:
# Step lists for the individual feature pipelines. The ColumnSelector steps
# receive positional column indices, hence the where(...)[0] lookups on
# X.columns.

# Word-level TF-IDF (1- and 2-grams, min_df=0.01) over the `review` column,
# after the NonAlphaRemover and WordNormalizer steps from `utils`; the result
# is converted to a dense matrix.
tfidf_pipeline = [('selector', ColumnSelector(where(X.columns.isin(['review']))[0])),
                    ('nonalpha', utils.NonAlphaRemover(['review'])),
                    ('wordnorm', utils.WordNormalizer(['review'])),
                    ('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1,2), min_df=0.01)),
                    ('dense', DenseTransformer())]
# Character-level TF-IDF (1- to 4-grams) over the flattened `review` column.
# Only referenced from commented-out code in this cell.
tfidf_char_pipeline = [('selector', ColumnSelector(where(X.columns.isin(['review']))[0])),
                       ('flatten', FlattenTransformer()),
                    ('tfidf', TfidfVectorizer(analyzer="char", ngram_range=(1, 4), min_df=0.01)),
                      ('dense', DenseTransformer())]

# Averaged word vectors of the `review` column (see the Stemmer class).
w2v_pipeline = [('selector', ColumnSelector(where(X.columns.isin(['review']))[0])),
                ('w2v', Stemmer(w2v_model, pymystem3.Mystem()))]

# Category columns with a variance filter (threshold 0.01).
categories_pipeline = [('selector', ColumnSelector(where(X.columns.isin(categories.columns))[0])),
                       ('fs', feature_selection.VarianceThreshold(0.01))]

# Polarity columns, passed through unchanged.
polarity_pipeline = [('selector', ColumnSelector(where(X.columns.isin(polarity.columns))[0]))]

# Meta columns scaled with MinMaxScaler, then variance-filtered (threshold 0.01).
meta_pipeline = [('selector', ColumnSelector(where(X.columns.isin(meta.columns))[0])),
                 ('scaler', preprocessing.MinMaxScaler()), 
                 ('fs', feature_selection.VarianceThreshold(threshold=0.01))]

# Every column except `review`, variance-filtered (threshold 0.001).
other_pipeline = [('selector', ColumnSelector(where(logical_not(X.columns.isin(['review'])))[0])),
                 ('fs', feature_selection.VarianceThreshold(threshold=0.001))]

# Union of the word TF-IDF features and all non-review columns. It is built
# here but not used by the models assembled below in this cell.
feature_union = pipeline.make_union(Pipeline(tfidf_pipeline), 
#                                     Pipeline(tfidf_char_pipeline), 
                                    Pipeline(other_pipeline))

# feature_union2 = pipeline.FeatureUnion([
#                     ('tfidf', tfidf_pipeline), 
#                     ('categ', categories_pipeline),
#                     ('polarity', utils.ColumnSelector(polarity.columns))
#                                       ])
# models = []
# for reg in [LGBMRegressor(objective="mse"), XGBRegressor(objective="reg:logistic")]:
#     for pipe in [tfidf_pipeline, tfidf_char_pipeline]:# categories_pipeline, polarity_pipeline, meta_pipeline]:
#         models.append(pipeline.Pipeline(pipe + [('reg', reg)]))

# Base models: 4 regressors x 2 text pipelines = 8 pipelines. Each one feeds
# its regressor the union of one text representation ('main') and the meta
# features ('meta'). The append order determines the order of the base models
# passed to the stacker.
# NOTE(review): the same `reg` instance (and the same meta_pipeline step
# objects) are shared by both pipelines of an iteration; presumably safe if
# StackingRegressor clones its regressors before fitting -- confirm.
models = []
for reg in [LGBMRegressor(objective="mse"), XGBRegressor(objective="reg:logistic"), svm.LinearSVR(C=0.6),
           linear_model.ElasticNetCV()]:
#     models.append(pipeline.Pipeline(w2v_pipeline + [('reg', reg)]))
#     models.append(pipeline.Pipeline([('union', feature_union), ('reg', reg)]))
    for pipe in [w2v_pipeline, tfidf_pipeline]:#, polarity_pipeline, meta_pipeline]:
        union = FeatureUnion([('main', Pipeline(pipe)), ('meta',Pipeline(meta_pipeline))])
        models.append(Pipeline([('union', union), ('reg', reg)]))

# Stack the base models under an XGBRegressor meta-regressor.
model = StackingRegressor(models, XGBRegressor(objective="reg:logistic"), verbose=1)

In [47]:
# 10-fold cross-validation (shuffled, seed 42) of the stacked model on the
# scaled target, run in a single process and scored with the RMSE/MAE scorers
# in `scoring`. Train scores are not computed.
res = model_selection.cross_validate(model, X.values, y.flatten(), 
                                            cv=model_selection.KFold(10, shuffle=True, random_state=42), 
                                            n_jobs=1, scoring=scoring, return_train_score=False, verbose=3)


[CV]  ................................................................
Fitting 8 regressors...
Fitting regressor1: pipeline (1/8)
Fitting regressor2: pipeline (2/8)
Fitting regressor3: pipeline (3/8)
Fitting regressor4: pipeline (4/8)
Fitting regressor5: pipeline (5/8)
Fitting regressor6: pipeline (6/8)
Fitting regressor7: pipeline (7/8)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
Fitting regressor8: pipeline (8/8)
[CV]  , RMSE=0.23706786289827864, MAE=0.15933346079854802, total=25.4min
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 25.5min remaining:    0.0s
[CV]  ................................................................
Fitting 8 regressors...
Fitting regressor1: pipeline (1/8)
Fitting regressor2: pipeline (2/8)
Fitting regressor3: pipeline (3/8)
Fitting regressor4: pipeline (4/8)
Fitting regressor5: pipeline (5/8)
Fitting regressor6: pipeline (6/8)
Fitting regressor7: pipeline (7/8)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
Fitting regressor8: pipeline (8/8)
[CV]  , RMSE=0.2522859493273965, MAE=0.1718618172858295, total=18.8min
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 44.3min remaining:    0.0s
[CV]  ................................................................
Fitting 8 regressors...
Fitting regressor1: pipeline (1/8)
Fitting regressor2: pipeline (2/8)
Fitting regressor3: pipeline (3/8)
Fitting regressor4: pipeline (4/8)
Fitting regressor5: pipeline (5/8)
Fitting regressor6: pipeline (6/8)
Fitting regressor7: pipeline (7/8)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
Fitting regressor8: pipeline (8/8)
[CV]  , RMSE=0.2480830985088645, MAE=0.17031552512831916, total=18.8min
[CV]  ................................................................
Fitting 8 regressors...
Fitting regressor1: pipeline (1/8)
Fitting regressor2: pipeline (2/8)
Fitting regressor3: pipeline (3/8)
Fitting regressor4: pipeline (4/8)
Fitting regressor5: pipeline (5/8)
Fitting regressor6: pipeline (6/8)
Fitting regressor7: pipeline (7/8)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
Fitting regressor8: pipeline (8/8)
[CV]  , RMSE=0.2435396978781024, MAE=0.1639499787128912, total=18.8min
[CV]  ................................................................
Fitting 8 regressors...
Fitting regressor1: pipeline (1/8)
Fitting regressor2: pipeline (2/8)
Fitting regressor3: pipeline (3/8)
Fitting regressor4: pipeline (4/8)
Fitting regressor5: pipeline (5/8)
Fitting regressor6: pipeline (6/8)
Fitting regressor7: pipeline (7/8)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
Fitting regressor8: pipeline (8/8)
[CV]  , RMSE=0.23794635914536993, MAE=0.16066632217382912, total=18.8min
[CV]  ................................................................
Fitting 8 regressors...
Fitting regressor1: pipeline (1/8)
Fitting regressor2: pipeline (2/8)
Fitting regressor3: pipeline (3/8)
Fitting regressor4: pipeline (4/8)
Fitting regressor5: pipeline (5/8)
Fitting regressor6: pipeline (6/8)
Fitting regressor7: pipeline (7/8)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
Fitting regressor8: pipeline (8/8)
[CV]  , RMSE=0.24383035869299444, MAE=0.16467581284139846, total=18.7min
[CV]  ................................................................
Fitting 8 regressors...
Fitting regressor1: pipeline (1/8)
Fitting regressor2: pipeline (2/8)
Fitting regressor3: pipeline (3/8)
Fitting regressor4: pipeline (4/8)
Fitting regressor5: pipeline (5/8)
Fitting regressor6: pipeline (6/8)
Fitting regressor7: pipeline (7/8)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
Fitting regressor8: pipeline (8/8)
[CV]  , RMSE=0.24399459389229813, MAE=0.16329174034340477, total=18.9min
[CV]  ................................................................
Fitting 8 regressors...
Fitting regressor1: pipeline (1/8)
Fitting regressor2: pipeline (2/8)
Fitting regressor3: pipeline (3/8)
Fitting regressor4: pipeline (4/8)
Fitting regressor5: pipeline (5/8)
Fitting regressor6: pipeline (6/8)
Fitting regressor7: pipeline (7/8)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
Fitting regressor8: pipeline (8/8)
[CV]  , RMSE=0.22759906563841872, MAE=0.1553571406354953, total=18.8min
[CV]  ................................................................
Fitting 8 regressors...
Fitting regressor1: pipeline (1/8)
Fitting regressor2: pipeline (2/8)
Fitting regressor3: pipeline (3/8)
Fitting regressor4: pipeline (4/8)
Fitting regressor5: pipeline (5/8)
Fitting regressor6: pipeline (6/8)
Fitting regressor7: pipeline (7/8)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
Fitting regressor8: pipeline (8/8)
[CV]  , RMSE=0.23422170609794263, MAE=0.16008129541371235, total=18.8min
[CV]  ................................................................
Fitting 8 regressors...
Fitting regressor1: pipeline (1/8)
Fitting regressor2: pipeline (2/8)
Fitting regressor3: pipeline (3/8)
Fitting regressor4: pipeline (4/8)
Fitting regressor5: pipeline (5/8)
Fitting regressor6: pipeline (6/8)
Fitting regressor7: pipeline (7/8)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
Fitting regressor8: pipeline (8/8)
[CV]  , RMSE=0.24024123596499097, MAE=0.16397174910160406, total=18.8min
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 194.9min finished

In [48]:
# Cross-validated predictions for every row, using a KFold configured like the
# one in the cross_validate call (10 folds, shuffled, seed 42) and 4 parallel
# jobs. The predictions are on the scaled target.
prediction = model_selection.cross_val_predict(model, X.values, y.flatten(), 
                                            cv=model_selection.KFold(10, shuffle=True, random_state=42), 
                                            n_jobs=4, verbose=3)
#                                         scoring=scoring, return_train_score=False, verbose=3)


Fitting 8 regressors...
Fitting regressor1: pipeline (1/8)
Fitting 8 regressors...
Fitting regressor1: pipeline (1/8)
Fitting 8 regressors...
Fitting regressor1: pipeline (1/8)
Fitting 8 regressors...
Fitting regressor1: pipeline (1/8)
Fitting regressor2: pipeline (2/8)
Fitting regressor3: pipeline (3/8)
Fitting regressor2: pipeline (2/8)
Fitting regressor3: pipeline (3/8)
Fitting regressor2: pipeline (2/8)
Fitting regressor3: pipeline (3/8)
Fitting regressor2: pipeline (2/8)
Fitting regressor3: pipeline (3/8)
Fitting regressor4: pipeline (4/8)
Fitting regressor5: pipeline (5/8)
Fitting regressor4: pipeline (4/8)
Fitting regressor5: pipeline (5/8)
Fitting regressor4: pipeline (4/8)
Fitting regressor5: pipeline (5/8)
Fitting regressor4: pipeline (4/8)
Fitting regressor5: pipeline (5/8)
Fitting regressor6: pipeline (6/8)
Fitting regressor7: pipeline (7/8)
Fitting regressor6: pipeline (6/8)
Fitting regressor7: pipeline (7/8)
Fitting regressor6: pipeline (6/8)
Fitting regressor7: pipeline (7/8)
Fitting regressor6: pipeline (6/8)
Fitting regressor7: pipeline (7/8)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
Fitting regressor8: pipeline (8/8)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
Fitting regressor8: pipeline (8/8)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
Fitting regressor8: pipeline (8/8)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
Fitting regressor8: pipeline (8/8)
Fitting 8 regressors...
Fitting regressor1: pipeline (1/8)
Fitting 8 regressors...
Fitting regressor1: pipeline (1/8)
Fitting 8 regressors...
Fitting regressor1: pipeline (1/8)
Fitting 8 regressors...
Fitting regressor1: pipeline (1/8)
Fitting regressor2: pipeline (2/8)
Fitting regressor3: pipeline (3/8)
Fitting regressor2: pipeline (2/8)
Fitting regressor3: pipeline (3/8)
Fitting regressor2: pipeline (2/8)
Fitting regressor3: pipeline (3/8)
Fitting regressor2: pipeline (2/8)
Fitting regressor3: pipeline (3/8)
Fitting regressor4: pipeline (4/8)
Fitting regressor5: pipeline (5/8)
Fitting regressor4: pipeline (4/8)
Fitting regressor5: pipeline (5/8)
Fitting regressor4: pipeline (4/8)
Fitting regressor5: pipeline (5/8)
Fitting regressor4: pipeline (4/8)
Fitting regressor5: pipeline (5/8)
Fitting regressor6: pipeline (6/8)
Fitting regressor7: pipeline (7/8)
Fitting regressor6: pipeline (6/8)
Fitting regressor7: pipeline (7/8)
Fitting regressor6: pipeline (6/8)
Fitting regressor7: pipeline (7/8)
Fitting regressor6: pipeline (6/8)
Fitting regressor7: pipeline (7/8)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
Fitting regressor8: pipeline (8/8)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
Fitting regressor8: pipeline (8/8)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
Fitting regressor8: pipeline (8/8)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
Fitting regressor8: pipeline (8/8)
Fitting 8 regressors...
Fitting regressor1: pipeline (1/8)
[Parallel(n_jobs=4)]: Done   7 out of  10 | elapsed: 58.9min remaining: 25.2min
Fitting 8 regressors...
Fitting regressor1: pipeline (1/8)
Fitting regressor2: pipeline (2/8)
Fitting regressor3: pipeline (3/8)
Fitting regressor2: pipeline (2/8)
Fitting regressor3: pipeline (3/8)
Fitting regressor4: pipeline (4/8)
Fitting regressor5: pipeline (5/8)
Fitting regressor4: pipeline (4/8)
Fitting regressor5: pipeline (5/8)
Fitting regressor6: pipeline (6/8)
Fitting regressor7: pipeline (7/8)
Fitting regressor6: pipeline (6/8)
Fitting regressor7: pipeline (7/8)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
Fitting regressor8: pipeline (8/8)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
Fitting regressor8: pipeline (8/8)
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed: 80.6min finished

In [51]:
# Persist the predictions, indexed like X, to results.pkl in the working directory.
pd.Series(prediction, index=X.index).to_pickle('results.pkl')

In [49]:
def y_inv(x):
    """Map scaled targets back to the original rating scale.

    ``x`` is promoted with ``np.atleast_2d`` before being handed to
    ``y_pre.inverse_transform``, so a 1-D input comes back as a 2-D array;
    callers flatten the result when they need a 1-D array.
    """
    as_2d = np.atleast_2d(x)
    return y_pre.inverse_transform(as_2d)

In [84]:
# True and predicted ratings on the original scale, as flat 1-D arrays.
y_true = y_inv(y.flatten()).flatten()
y_pred = y_inv(prediction).flatten()

# Split with the same KFold settings as the cross-validated predictions so
# the errors can be summarised per test fold.
cv = model_selection.KFold(10, shuffle=True, random_state=42)
test_rmse, test_mae = [], []
for train, test in cv.split(y):
    test_rmse += [sqrt(mse(y_true[test], y_pred[test]))]
    test_mae += [mae(y_true[test], y_pred[test])]

# Mean and standard deviation of each error across the folds.
print('RMSE: {:.3f} ± {:.3f}'.format(mean(test_rmse), std(test_rmse)))
print('MAE: {:.3f} ± {:.3f}'.format(mean(test_mae), std(test_mae)))


RMSE: 0.963 ± 0.027
MAE: 0.653 ± 0.019

In [96]:
# Predicted vs. true ratings. The true ratings get a small vertical jitter
# (std 0.1) so overlapping points become visible. The jitter comes from a
# locally seeded generator, so the figure is reproducible and the global
# random state is left untouched.
figure(figsize=(10,10))
xlabel('Predicted rating')
ylabel('True rating (jittered)')
jitter = np.random.RandomState(42).randn(len(y_true)) * 0.1
scatter(y_pred, y_true + jitter, alpha=0.1, edgecolor='none')


Out[96]:
<matplotlib.collections.PathCollection at 0x2273755f8>

In [85]:
# Overall (RMSE, MAE) on the original rating scale, computed over all
# predictions at once rather than per fold.
sqrt(mse(y_inv(y.flatten()), y_inv(prediction))), mae(y_inv(y.flatten()), y_inv(prediction))


Out[85]:
(0.96373528160792443, 0.65342371915160269)

In [86]:
# One histogram of the predictions per true rating, from 5 down to 1.
# The two arrays do not depend on the rating, so they are built once.
# NOTE(review): the mask relies on exact float equality between the
# inverse-transformed targets and the integer rating -- confirm this holds.
true_ratings = y_inv(y.flatten()).flatten()
predicted_ratings = y_inv(prediction).flatten()
for rating in (5, 4, 3, 2, 1):
    figure(figsize=(10,4))
    selected = predicted_ratings[true_ratings == rating]
    hist(selected, alpha=0.8, label='Rating: ' + str(rating), bins=20)
    legend(loc='best')



In [87]:
# Confusion matrix of rounded true vs. rounded predicted ratings, printed as
# a pipe-format table. confusion_matrix takes the true labels first, so rows
# are true ratings and columns are predicted ratings.
rating_labels = [1,2,3,4,5]
rounded_true = y_inv(y.flatten()).round().flatten()
rounded_pred = y_inv(prediction).round().flatten()
print(tabulate(confusion_matrix(rounded_true, rounded_pred),
               headers=rating_labels, showindex=rating_labels, tablefmt='pipe'))


|    |   1 |   2 |   3 |    4 |    5 |
|---:|----:|----:|----:|-----:|-----:|
|  1 | 108 | 599 | 388 |  332 |   47 |
|  2 |  30 | 309 | 262 |  216 |   52 |
|  3 |  24 | 241 | 394 |  496 |  121 |
|  4 |   1 | 116 | 379 | 1162 | 1021 |
|  5 |   2 | 121 | 417 | 2091 | 6658 |

In [28]:
# Inspect the true ratings recovered from the scaled target, rounded.
y_inv(y.flatten()).round()


Out[28]:
array([[ 2.,  2.,  4., ...,  5.,  1.,  5.]])

In [ ]: