In [1]:
# NOTE(review): %pylab star-imports numpy and matplotlib.pyplot into the global
# namespace; later cells depend on the bare names it provides (sqrt, mean, std,
# zeros, where, logical_not, figure, hist, scatter, legend, randn, np, ...).
# Deprecated style -- prefer explicit imports + %matplotlib inline, but removing
# it here would break every cell below.
%pylab inline
In [26]:
# --- Imports ---------------------------------------------------------------
# NOTE(review): several names below (seaborn, imblearn, Word2Vec, FastText,
# StackingCVRegressor, UndefinedMetricWarning, ensemble, over_sampling, the
# *Classifier variants) are not referenced in the cells shown here; they may
# belong to other experiments -- consider pruning.  TODO confirm before removal.
import nltk
import seaborn as sns
import pandas as pd
from sklearn import svm, model_selection, pipeline, linear_model, preprocessing, feature_selection, ensemble
from imblearn import pipeline as imbpipeline
from imblearn import over_sampling
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.exceptions import UndefinedMetricWarning, DataConversionWarning
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import make_scorer, confusion_matrix
from sklearn.pipeline import Pipeline, FeatureUnion
from mlxtend.feature_selection import ColumnSelector
from mlxtend.regressor import StackingRegressor, StackingCVRegressor
from mlxtend.preprocessing import DenseTransformer
from gensim.models.word2vec import Word2Vec
from gensim.models.wrappers import FastText
import gensim
import pymystem3
from tabulate import tabulate
from lightgbm.sklearn import LGBMRegressor, LGBMClassifier
from xgboost import XGBRegressor, XGBClassifier
In [3]:
import warnings
# Silence sklearn's DataConversionWarning (dtype casts during fitting) so the
# verbose cross-validation logs below stay readable.
warnings.filterwarnings('ignore', category=DataConversionWarning)
In [4]:
import sys
# Make the project's helper module importable; `utils` supplies
# NonAlphaRemover and WordNormalizer used in the pipelines below.
sys.path.append("../src/")
import utils
In [5]:
def _rmse(y_true, y_pred):
    """Root-mean-squared error; `sqrt` comes from numpy via %pylab."""
    return sqrt(mse(y_true, y_pred))

# NOTE(review): make_scorer defaults to greater_is_better=True.  That is
# harmless for plain cross_validate reporting (values come out as raw errors),
# but these scorers would be *maximized* by any hyper-parameter search --
# confirm before reusing them in GridSearchCV and the like.
rmse_error = make_scorer(_rmse)
mae_error = make_scorer(mae)
In [6]:
# Raw training reviews (M.Video dataset).  NOTE(review): hard-coded
# home-relative path -- consider a configurable DATA_DIR.
data = pd.read_csv('~/cloud/data/mvideo/X_train.csv')
# Replace the original headers with stable snake_case names.
data.columns = ['product_id', 'category_level1', 'category_level2', 'brand', 'property', 'user_name', 'rating', 'date',
                'review', 'negative', 'positive']
data['date'] = pd.to_datetime(data.date)
In [7]:
# Precomputed per-review meta features; rows are assumed to be aligned with
# `data` by position -- TODO confirm (no join key is kept after the drop).
meta = pd.read_pickle('../processed/meta_features.pkl.gz').fillna(0).drop(['product_id', 'rating_cat'], axis=1).astype(float)
meta.shape
Out[7]:
In [8]:
# Precomputed polarity/sentiment features (rows assumed aligned with `data`).
polarity = pd.read_pickle('../processed/polarity.pkl.gz').fillna(0); polarity.shape
Out[8]:
In [9]:
# One-hot encoded category columns (rows assumed aligned with `data`).
categories = pd.read_pickle('../processed/categories_dummies.pkl.gz').fillna(0); categories.shape
Out[9]:
In [10]:
# Sanity check: column names after the rename above.
data.columns
Out[10]:
In [11]:
# Design matrix: raw review text + meta + polarity + category dummies, joined
# column-wise.  Relies on identical row order across all four frames.
X = pd.concat((data[['review']],
               meta.astype(int),
               polarity.astype(float),
               categories.astype(int)), axis=1); X.shape
Out[11]:
In [12]:
# Target scaler; kept as a module-level object so predictions can be mapped
# back to the original rating scale via inverse_transform.
y_pre = preprocessing.MinMaxScaler()
In [13]:
# Ratings scaled into [0, 1] (2-D, shape (n, 1)); needed because
# XGBRegressor(objective="reg:logistic") below expects targets in [0, 1].
y = y_pre.fit_transform(data[['rating']])
In [14]:
# Tests
In [15]:
# Metrics reported by cross_validate; both are error metrics (see the scorer
# definitions above for the greater_is_better caveat).
scoring = {"RMSE": rmse_error, "MAE": mae_error}
In [16]:
class FlattenTransformer(object):
    """Stateless sklearn-style transformer that collapses an array to 1-D.

    Used to turn the (n, 1) review-text column selected by ColumnSelector
    into the flat sequence of documents that TfidfVectorizer expects.
    """

    def fit(self, X, y=None):
        # Nothing to learn -- flattening is purely structural.
        return self

    def transform(self, X, y=None):
        flattened = X.flatten()
        return flattened
In [17]:
# X = data.copy()
# X = pd.get_dummies(X, columns=['brand', 'category_level1', 'category_level2'])
# X["date"] = pd.to_datetime(X["date"]).astype(int)
# X.drop(['user_name', 'rating', 'product_id', "negative", "positive", "property"], inplace=True, axis=1)
In [18]:
# Pretrained Russian review embeddings in word2vec text format.
# Presumably 100-dimensional, matching the zeros(100) fallback in parse_line
# below -- TODO confirm against the .vec header.
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('../external/reviews_w2v.vec',
                                                            binary=False, unicode_errors='ignore')
In [19]:
def parse_line(x, w2v, stemmer):
    """Convert one raw review string into a list of word vectors.

    Steps: replace '.' with spaces -> lemmatize via pymystem3 -> keep only
    alphabetic characters and spaces -> split on single spaces -> drop empty
    tokens, Russian stopwords and out-of-vocabulary words -> look up the
    unnormalized embedding of each surviving token.

    Returns a single 100-dim zero vector (the embedding size of the loaded
    model, judging by usage) when no token survives, so downstream averaging
    never sees an empty list.  `zeros` comes from numpy via %pylab.
    """
    # Fix: the original rebuilt the stopword list via
    # nltk.corpus.stopwords.words('russian') inside the comprehension filter,
    # i.e. once per token, and scanned it linearly.  Fetch it once and use a
    # set for O(1) membership -- identical results, far less work.
    stopwords = set(nltk.corpus.stopwords.words('russian'))
    lemmatized = ''.join(stemmer.lemmatize(x.replace('.', ' ')))
    cleaned = ''.join(ch for ch in lemmatized if ch.isalpha() or ch == ' ')
    out = [w2v.wv.word_vec(w, use_norm=False)
           for w in cleaned.split(" ")
           if w != '' and w in w2v.vocab and w not in stopwords]
    if len(out) == 0:
        return [zeros(100)]
    return out
class Stemmer(object):
    """sklearn-style transformer producing one mean word2vec embedding per row.

    Each input row is a sequence whose first element is the raw review text;
    `transform` returns a list with one averaged embedding per document.
    `mean` comes from numpy via %pylab.
    """

    def __init__(self, w2v, stemmer):
        self.w2v = w2v
        self.stemmer = stemmer

    def fit(self, X, y=None):
        # Stateless: the embedding model and lemmatizer are fixed at construction.
        return self

    def transform(self, X, y=None):
        embeddings = []
        for row in X:
            vectors = parse_line(row[0], self.w2v, self.stemmer)
            embeddings.append(mean(vectors, axis=0))
        return embeddings

    def fit_transform(self, X, y=None):
        return self.transform(X, y)
In [44]:
# --- Feature pipelines and the stacked model -------------------------------
# ColumnSelector works positionally on X.values, so column positions are
# resolved with numpy's `where` (from %pylab) against X.columns.

# TF-IDF over the cleaned/normalized review text (word uni- and bi-grams).
tfidf_pipeline = [('selector', ColumnSelector(where(X.columns.isin(['review']))[0])),
                  ('nonalpha', utils.NonAlphaRemover(['review'])),
                  ('wordnorm', utils.WordNormalizer(['review'])),
                  ('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1,2), min_df=0.01)),
                  ('dense', DenseTransformer())]
# Character n-gram TF-IDF variant (currently unused -- see the commented-out
# entry in feature_union below).
tfidf_char_pipeline = [('selector', ColumnSelector(where(X.columns.isin(['review']))[0])),
                       ('flatten', FlattenTransformer()),
                       ('tfidf', TfidfVectorizer(analyzer="char", ngram_range=(1, 4), min_df=0.01)),
                       ('dense', DenseTransformer())]
# Averaged word2vec embedding of each review.
w2v_pipeline = [('selector', ColumnSelector(where(X.columns.isin(['review']))[0])),
                ('w2v', Stemmer(w2v_model, pymystem3.Mystem()))]
# Category dummies, with near-constant columns removed.
categories_pipeline = [('selector', ColumnSelector(where(X.columns.isin(categories.columns))[0])),
                       ('fs', feature_selection.VarianceThreshold(0.01))]
# Polarity features, passed through as-is.
polarity_pipeline = [('selector', ColumnSelector(where(X.columns.isin(polarity.columns))[0]))]
# Meta features: scaled to [0, 1], near-constant columns removed.
meta_pipeline = [('selector', ColumnSelector(where(X.columns.isin(meta.columns))[0])),
                 ('scaler', preprocessing.MinMaxScaler()),
                 ('fs', feature_selection.VarianceThreshold(threshold=0.01))]
# Everything except the raw text column (meta + polarity + categories together).
other_pipeline = [('selector', ColumnSelector(where(logical_not(X.columns.isin(['review'])))[0])),
                  ('fs', feature_selection.VarianceThreshold(threshold=0.001))]
# NOTE(review): feature_union is built but not used by the models below --
# the loop constructs its own FeatureUnion per base learner.
feature_union = pipeline.make_union(Pipeline(tfidf_pipeline),
                                    # Pipeline(tfidf_char_pipeline),
                                    Pipeline(other_pipeline))
# feature_union2 = pipeline.FeatureUnion([
#     ('tfidf', tfidf_pipeline),
#     ('categ', categories_pipeline),
#     ('polarity', utils.ColumnSelector(polarity.columns))
# ])
# models = []
# for reg in [LGBMRegressor(objective="mse"), XGBRegressor(objective="reg:logistic")]:
#     for pipe in [tfidf_pipeline, tfidf_char_pipeline]:# categories_pipeline, polarity_pipeline, meta_pipeline]:
#         models.append(pipeline.Pipeline(pipe + [('reg', reg)]))
# Base learners: each of the 4 regressors is paired with each of the 2 text
# representations (w2v and word TF-IDF), always combined with the meta
# features -- 8 pipelines in total.
models = []
for reg in [LGBMRegressor(objective="mse"), XGBRegressor(objective="reg:logistic"), svm.LinearSVR(C=0.6),
            linear_model.ElasticNetCV()]:
    # models.append(pipeline.Pipeline(w2v_pipeline + [('reg', reg)]))
    # models.append(pipeline.Pipeline([('union', feature_union), ('reg', reg)]))
    for pipe in [w2v_pipeline, tfidf_pipeline]:#, polarity_pipeline, meta_pipeline]:
        union = FeatureUnion([('main', Pipeline(pipe)), ('meta',Pipeline(meta_pipeline))])
        models.append(Pipeline([('union', union), ('reg', reg)]))
# Level-2 meta-regressor stacks the 8 base predictions; reg:logistic keeps the
# stacked output inside [0, 1], matching the scaled target.
model = StackingRegressor(models, XGBRegressor(objective="reg:logistic"), verbose=1)
In [47]:
# 10-fold CV scores for the stacked model.  NOTE(review): n_jobs=1 here vs 4
# below -- presumably because the w2v model / Mystem state is expensive to
# replicate across workers; confirm before parallelizing.
res = model_selection.cross_validate(model, X.values, y.flatten(),
                                     cv=model_selection.KFold(10, shuffle=True, random_state=42),
                                     n_jobs=1, scoring=scoring, return_train_score=False, verbose=3)
In [48]:
# Out-of-fold predictions using the same KFold seed as above, so every row's
# prediction comes from a model that never saw that row during fitting.
prediction = model_selection.cross_val_predict(model, X.values, y.flatten(),
                                               cv=model_selection.KFold(10, shuffle=True, random_state=42),
                                               n_jobs=4, verbose=3)
# scoring=scoring, return_train_score=False, verbose=3)
In [51]:
# Persist the out-of-fold predictions (still on the scaled [0, 1] target).
pd.Series(prediction, index=X.index).to_pickle('results.pkl')
In [49]:
def y_inv(x):
    """Map scaled targets back to the original rating scale.

    The input is coerced to 2-D with np.atleast_2d because MinMaxScaler
    only accepts 2-D arrays; the result is therefore also 2-D.
    """
    return y_pre.inverse_transform(np.atleast_2d(x))
In [84]:
# Per-fold RMSE/MAE on the original rating scale, using the same KFold seed
# as the cross_val_predict call so fold boundaries match.
y_true = y_inv(y.flatten()).flatten()
y_pred = y_inv(prediction).flatten()
cv = model_selection.KFold(10, shuffle=True, random_state=42)
test_rmse = []
test_mae = []
for _, test_idx in cv.split(y):
    fold_true = y_true[test_idx]
    fold_pred = y_pred[test_idx]
    test_rmse.append(sqrt(mse(fold_true, fold_pred)))
    test_mae.append(mae(fold_true, fold_pred))
print('RMSE: {:.3f} ± {:.3f}'.format(mean(test_rmse), std(test_rmse)))
print('MAE: {:.3f} ± {:.3f}'.format(mean(test_mae), std(test_mae)))
In [96]:
# Predicted vs. true rating; vertical jitter is added to the discrete true
# ratings so overlapping points stay visible.
figure(figsize=(10,10))
scatter(y_pred, y_true+randn(len(y_true))*0.1, alpha=0.1, edgecolor='none')
Out[96]:
In [85]:
# Overall (not fold-averaged) RMSE and MAE on the original rating scale.
sqrt(mse(y_inv(y.flatten()), y_inv(prediction))), mae(y_inv(y.flatten()), y_inv(prediction))
Out[85]:
In [86]:
# One histogram of predicted ratings per true rating class (5 down to 1),
# on the original scale -- shows how well the classes separate.
pred_orig = y_inv(prediction).flatten()
true_orig = y_inv(y.flatten()).flatten()
for rating in range(5, 0, -1):
    figure(figsize=(10,4))
    hist(pred_orig[true_orig == rating],
         alpha=0.8, label='Rating: ' + str(rating), bins=20)
    legend(loc='best')
In [87]:
# Confusion matrix after rounding predictions to the nearest integer rating,
# rendered as a markdown table.
print(tabulate(confusion_matrix(y_inv(y.flatten()).round().flatten(), y_inv(prediction).round().flatten()),
               headers=[1,2,3,4,5], showindex=[1,2,3,4,5], tablefmt='pipe'))
In [28]:
# Sanity check: the inverse-transformed targets recover integer ratings.
y_inv(y.flatten()).round()
Out[28]:
In [ ]: