In [1]:
# NOTE(review): %pylab star-imports numpy and matplotlib.pyplot into the global
# namespace; later cells depend on the bare names it provides (sqrt, mean, std,
# zeros, where, logical_not, figure, hist, scatter, legend, randn, np, ...).
# Deprecated style -- prefer explicit imports + %matplotlib inline, but removing
# it here would break every cell below.
%pylab inline
In [26]:
# --- Imports ---------------------------------------------------------------
# NOTE(review): several names below (seaborn, imblearn, Word2Vec, FastText,
# StackingCVRegressor, UndefinedMetricWarning, ensemble, over_sampling, the
# *Classifier variants) are not referenced in the cells shown here; they may
# belong to other experiments -- consider pruning.  TODO confirm before removal.
import nltk
import seaborn as sns
import pandas as pd
from sklearn import svm, model_selection, pipeline, linear_model, preprocessing, feature_selection, ensemble
from imblearn import pipeline as imbpipeline
from imblearn import over_sampling
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.exceptions import UndefinedMetricWarning, DataConversionWarning
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import make_scorer, confusion_matrix
from sklearn.pipeline import Pipeline, FeatureUnion
from mlxtend.feature_selection import ColumnSelector
from mlxtend.regressor import StackingRegressor, StackingCVRegressor
from mlxtend.preprocessing import DenseTransformer
from gensim.models.word2vec import Word2Vec
from gensim.models.wrappers import FastText
import gensim
import pymystem3
from tabulate import tabulate
from lightgbm.sklearn import LGBMRegressor, LGBMClassifier
from xgboost import XGBRegressor, XGBClassifier
In [3]:
import warnings
# Silence sklearn's DataConversionWarning (dtype casts during fitting) so the
# verbose cross-validation logs below stay readable.
warnings.filterwarnings('ignore', category=DataConversionWarning)
In [4]:
import sys
# Make the project's helper module importable; `utils` supplies
# NonAlphaRemover and WordNormalizer used in the pipelines below.
sys.path.append("../src/")
import utils
In [5]:
def _rmse(y_true, y_pred):
    """Root-mean-squared error; `sqrt` comes from numpy via %pylab."""
    return sqrt(mse(y_true, y_pred))

# NOTE(review): make_scorer defaults to greater_is_better=True.  That is
# harmless for plain cross_validate reporting (values come out as raw errors),
# but these scorers would be *maximized* by any hyper-parameter search --
# confirm before reusing them in GridSearchCV and the like.
rmse_error = make_scorer(_rmse)
mae_error = make_scorer(mae)
In [6]:
# Raw training reviews (M.Video dataset).  NOTE(review): hard-coded
# home-relative path -- consider a configurable DATA_DIR.
data = pd.read_csv('~/cloud/data/mvideo/X_train.csv')
# Replace the original headers with stable snake_case names.
data.columns = ['product_id', 'category_level1', 'category_level2', 'brand', 'property', 'user_name', 'rating', 'date',
                'review', 'negative', 'positive']
data['date'] = pd.to_datetime(data.date)
In [7]:
# Precomputed per-review meta features; rows are assumed to be aligned with
# `data` by position -- TODO confirm (no join key is kept after the drop).
meta = pd.read_pickle('../processed/meta_features.pkl.gz').fillna(0).drop(['product_id', 'rating_cat'], axis=1).astype(float)
meta.shape
Out[7]:
In [8]:
# Precomputed polarity/sentiment features (rows assumed aligned with `data`).
polarity = pd.read_pickle('../processed/polarity.pkl.gz').fillna(0); polarity.shape
Out[8]:
In [9]:
# One-hot encoded category columns (rows assumed aligned with `data`).
categories = pd.read_pickle('../processed/categories_dummies.pkl.gz').fillna(0); categories.shape
Out[9]:
In [10]:
# Sanity check: column names after the rename above.
data.columns
Out[10]:
In [11]:
# Design matrix: raw review text + meta + polarity + category dummies, joined
# column-wise.  Relies on identical row order across all four frames.
X = pd.concat((data[['review']],
               meta.astype(int),
               polarity.astype(float),
               categories.astype(int)), axis=1); X.shape
Out[11]:
In [12]:
# Target scaler; kept as a module-level object so predictions can be mapped
# back to the original rating scale via inverse_transform.
y_pre = preprocessing.MinMaxScaler()
In [13]:
# Ratings scaled into [0, 1] (2-D, shape (n, 1)); needed because
# XGBRegressor(objective="reg:logistic") below expects targets in [0, 1].
y = y_pre.fit_transform(data[['rating']])
In [14]:
# Tests
In [15]:
# Metrics reported by cross_validate; both are error metrics (see the scorer
# definitions above for the greater_is_better caveat).
scoring = {"RMSE": rmse_error, "MAE": mae_error}
In [16]:
class FlattenTransformer(object):
    """Stateless sklearn-style transformer that collapses an array to 1-D.

    Used to turn the (n, 1) review-text column selected by ColumnSelector
    into the flat sequence of documents that TfidfVectorizer expects.
    """

    def fit(self, X, y=None):
        # Nothing to learn -- flattening is purely structural.
        return self

    def transform(self, X, y=None):
        flattened = X.flatten()
        return flattened
In [17]:
# X = data.copy()
# X = pd.get_dummies(X, columns=['brand', 'category_level1', 'category_level2'])
# X["date"] = pd.to_datetime(X["date"]).astype(int)
# X.drop(['user_name', 'rating', 'product_id', "negative", "positive", "property"], inplace=True, axis=1)
In [18]:
# Pretrained Russian review embeddings in word2vec text format.
# Presumably 100-dimensional, matching the zeros(100) fallback in parse_line
# below -- TODO confirm against the .vec header.
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('../external/reviews_w2v.vec',
                                                            binary=False, unicode_errors='ignore')
In [19]:
def parse_line(x, w2v, stemmer):
    """Convert one raw review string into a list of word vectors.

    Steps: replace '.' with spaces -> lemmatize via pymystem3 -> keep only
    alphabetic characters and spaces -> split on single spaces -> drop empty
    tokens, Russian stopwords and out-of-vocabulary words -> look up the
    unnormalized embedding of each surviving token.

    Returns a single 100-dim zero vector (the embedding size of the loaded
    model, judging by usage) when no token survives, so downstream averaging
    never sees an empty list.  `zeros` comes from numpy via %pylab.
    """
    # Fix: the original rebuilt the stopword list via
    # nltk.corpus.stopwords.words('russian') inside the comprehension filter,
    # i.e. once per token, and scanned it linearly.  Fetch it once and use a
    # set for O(1) membership -- identical results, far less work.
    stopwords = set(nltk.corpus.stopwords.words('russian'))
    lemmatized = ''.join(stemmer.lemmatize(x.replace('.', ' ')))
    cleaned = ''.join(ch for ch in lemmatized if ch.isalpha() or ch == ' ')
    out = [w2v.wv.word_vec(w, use_norm=False)
           for w in cleaned.split(" ")
           if w != '' and w in w2v.vocab and w not in stopwords]
    if len(out) == 0:
        return [zeros(100)]
    return out
class Stemmer(object):
    """sklearn-style transformer producing one mean word2vec embedding per row.

    Each input row is a sequence whose first element is the raw review text;
    `transform` returns a list with one averaged embedding per document.
    `mean` comes from numpy via %pylab.
    """

    def __init__(self, w2v, stemmer):
        self.w2v = w2v
        self.stemmer = stemmer

    def fit(self, X, y=None):
        # Stateless: the embedding model and lemmatizer are fixed at construction.
        return self

    def transform(self, X, y=None):
        embeddings = []
        for row in X:
            vectors = parse_line(row[0], self.w2v, self.stemmer)
            embeddings.append(mean(vectors, axis=0))
        return embeddings

    def fit_transform(self, X, y=None):
        return self.transform(X, y)
In [44]:
# --- Feature pipelines and the stacked model -------------------------------
# ColumnSelector works positionally on X.values, so column positions are
# resolved with numpy's `where` (from %pylab) against X.columns.

# TF-IDF over the cleaned/normalized review text (word uni- and bi-grams).
tfidf_pipeline = [('selector', ColumnSelector(where(X.columns.isin(['review']))[0])),
                  ('nonalpha', utils.NonAlphaRemover(['review'])),
                  ('wordnorm', utils.WordNormalizer(['review'])),
                  ('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1,2), min_df=0.01)),
                  ('dense', DenseTransformer())]
# Character n-gram TF-IDF variant (currently unused -- see the commented-out
# entry in feature_union below).
tfidf_char_pipeline = [('selector', ColumnSelector(where(X.columns.isin(['review']))[0])),
                       ('flatten', FlattenTransformer()),
                       ('tfidf', TfidfVectorizer(analyzer="char", ngram_range=(1, 4), min_df=0.01)),
                       ('dense', DenseTransformer())]
# Averaged word2vec embedding of each review.
w2v_pipeline = [('selector', ColumnSelector(where(X.columns.isin(['review']))[0])),
                ('w2v', Stemmer(w2v_model, pymystem3.Mystem()))]
# Category dummies, with near-constant columns removed.
categories_pipeline = [('selector', ColumnSelector(where(X.columns.isin(categories.columns))[0])),
                       ('fs', feature_selection.VarianceThreshold(0.01))]
# Polarity features, passed through as-is.
polarity_pipeline = [('selector', ColumnSelector(where(X.columns.isin(polarity.columns))[0]))]
# Meta features: scaled to [0, 1], near-constant columns removed.
meta_pipeline = [('selector', ColumnSelector(where(X.columns.isin(meta.columns))[0])),
                 ('scaler', preprocessing.MinMaxScaler()),
                 ('fs', feature_selection.VarianceThreshold(threshold=0.01))]
# Everything except the raw text column (meta + polarity + categories together).
other_pipeline = [('selector', ColumnSelector(where(logical_not(X.columns.isin(['review'])))[0])),
                  ('fs', feature_selection.VarianceThreshold(threshold=0.001))]
# NOTE(review): feature_union is built but not used by the models below --
# the loop constructs its own FeatureUnion per base learner.
feature_union = pipeline.make_union(Pipeline(tfidf_pipeline),
                                    # Pipeline(tfidf_char_pipeline),
                                    Pipeline(other_pipeline))
# feature_union2 = pipeline.FeatureUnion([
#     ('tfidf', tfidf_pipeline),
#     ('categ', categories_pipeline),
#     ('polarity', utils.ColumnSelector(polarity.columns))
# ])
# models = []
# for reg in [LGBMRegressor(objective="mse"), XGBRegressor(objective="reg:logistic")]:
#     for pipe in [tfidf_pipeline, tfidf_char_pipeline]:# categories_pipeline, polarity_pipeline, meta_pipeline]:
#         models.append(pipeline.Pipeline(pipe + [('reg', reg)]))
# Base learners: each of the 4 regressors is paired with each of the 2 text
# representations (w2v and word TF-IDF), always combined with the meta
# features -- 8 pipelines in total.
models = []
for reg in [LGBMRegressor(objective="mse"), XGBRegressor(objective="reg:logistic"), svm.LinearSVR(C=0.6),
            linear_model.ElasticNetCV()]:
    # models.append(pipeline.Pipeline(w2v_pipeline + [('reg', reg)]))
    # models.append(pipeline.Pipeline([('union', feature_union), ('reg', reg)]))
    for pipe in [w2v_pipeline, tfidf_pipeline]:#, polarity_pipeline, meta_pipeline]:
        union = FeatureUnion([('main', Pipeline(pipe)), ('meta',Pipeline(meta_pipeline))])
        models.append(Pipeline([('union', union), ('reg', reg)]))
# Level-2 meta-regressor stacks the 8 base predictions; reg:logistic keeps the
# stacked output inside [0, 1], matching the scaled target.
model = StackingRegressor(models, XGBRegressor(objective="reg:logistic"), verbose=1)
In [47]:
# 10-fold CV scores for the stacked model.  NOTE(review): n_jobs=1 here vs 4
# below -- presumably because the w2v model / Mystem state is expensive to
# replicate across workers; confirm before parallelizing.
res = model_selection.cross_validate(model, X.values, y.flatten(),
                                     cv=model_selection.KFold(10, shuffle=True, random_state=42),
                                     n_jobs=1, scoring=scoring, return_train_score=False, verbose=3)
In [48]:
# Out-of-fold predictions using the same KFold seed as above, so every row's
# prediction comes from a model that never saw that row during fitting.
prediction = model_selection.cross_val_predict(model, X.values, y.flatten(),
                                               cv=model_selection.KFold(10, shuffle=True, random_state=42),
                                               n_jobs=4, verbose=3)
# scoring=scoring, return_train_score=False, verbose=3)
In [51]:
# Persist the out-of-fold predictions (still on the scaled [0, 1] target).
pd.Series(prediction, index=X.index).to_pickle('results.pkl')
In [49]:
def y_inv(x):
    """Map scaled targets back to the original rating scale.

    The input is coerced to 2-D with np.atleast_2d because MinMaxScaler
    only accepts 2-D arrays; the result is therefore also 2-D.
    """
    return y_pre.inverse_transform(np.atleast_2d(x))
In [84]:
# Per-fold RMSE/MAE on the original rating scale, using the same KFold seed
# as the cross_val_predict call so fold boundaries match.
y_true = y_inv(y.flatten()).flatten()
y_pred = y_inv(prediction).flatten()
cv = model_selection.KFold(10, shuffle=True, random_state=42)
test_rmse = []
test_mae = []
for _, test_idx in cv.split(y):
    fold_true = y_true[test_idx]
    fold_pred = y_pred[test_idx]
    test_rmse.append(sqrt(mse(fold_true, fold_pred)))
    test_mae.append(mae(fold_true, fold_pred))
print('RMSE: {:.3f} ± {:.3f}'.format(mean(test_rmse), std(test_rmse)))
print('MAE: {:.3f} ± {:.3f}'.format(mean(test_mae), std(test_mae)))
In [96]:
# Predicted vs. true rating; vertical jitter is added to the discrete true
# ratings so overlapping points stay visible.
figure(figsize=(10,10))
scatter(y_pred, y_true+randn(len(y_true))*0.1, alpha=0.1, edgecolor='none')
Out[96]:
In [85]:
# Overall (not fold-averaged) RMSE and MAE on the original rating scale.
sqrt(mse(y_inv(y.flatten()), y_inv(prediction))), mae(y_inv(y.flatten()), y_inv(prediction))
Out[85]:
In [86]:
# One histogram of predicted ratings per true rating class (5 down to 1),
# on the original scale -- shows how well the classes separate.
pred_orig = y_inv(prediction).flatten()
true_orig = y_inv(y.flatten()).flatten()
for rating in range(5, 0, -1):
    figure(figsize=(10,4))
    hist(pred_orig[true_orig == rating],
         alpha=0.8, label='Rating: ' + str(rating), bins=20)
    legend(loc='best')
In [87]:
# Confusion matrix after rounding predictions to the nearest integer rating,
# rendered as a markdown table.
print(tabulate(confusion_matrix(y_inv(y.flatten()).round().flatten(), y_inv(prediction).round().flatten()),
               headers=[1,2,3,4,5], showindex=[1,2,3,4,5], tablefmt='pipe'))
In [28]:
# Sanity check: the inverse-transformed targets recover integer ratings.
y_inv(y.flatten()).round()
Out[28]:
In [ ]: