In [1]:
%pylab inline
# NOTE(review): %pylab is deprecated — it star-imports numpy and matplotlib into
# the notebook namespace. Later cells rely on the bare `where` and `mean` names
# it provides, so replacing it requires touching those cells too.
In [2]:
import pandas as pd
from sklearn import svm, model_selection, pipeline, linear_model, preprocessing, feature_selection, ensemble
from imblearn import pipeline as imbpipeline
from imblearn import over_sampling
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.exceptions import UndefinedMetricWarning, DataConversionWarning
import nltk
from tabulate import tabulate
from sklearn.pipeline import Pipeline
from mlxtend.feature_selection import ColumnSelector
from mlxtend.classifier import StackingClassifier
from gensim.models.word2vec import Word2Vec
from gensim.models.wrappers import FastText
import gensim
In [3]:
import warnings
# Silence sklearn's DataConversionWarning for the cross-validation runs below.
# NOTE(review): UndefinedMetricWarning is imported in the imports cell but never
# filtered here — confirm whether that was intended.
warnings.filterwarnings('ignore', category=DataConversionWarning)
In [4]:
import sys
# Make the project's src/ directory importable: the local `utils` module supplies
# NonAlphaRemover, WordNormalizer and Word2VecTfidfEmbeddingVectorizer used below.
sys.path.append("../src/")
import utils
In [5]:
# Load the raw training reviews.
# NOTE(review): user-specific home path — consider a configurable DATA_DIR so the
# notebook runs on other machines.
data = pd.read_csv('~/cloud/data/mvideo/X_train.csv')
# Replace the CSV's original headers wholesale with English identifiers.
data.columns = ['product_id', 'category_level1', 'category_level2', 'brand', 'property', 'user_name', 'rating', 'date',
'review', 'negative', 'positive']
# Parse review dates into proper datetimes.
data['date'] = pd.to_datetime(data.date)
In [6]:
# Precomputed meta-features: drop the id and label columns, fill gaps with 0 and
# coerce everything to float so downstream scalers/selectors accept it.
meta = pd.read_pickle('../processed/meta_features.pkl.gz').fillna(0).drop(['product_id', 'rating_cat'], axis=1).astype(float)
meta.shape
Out[6]:
In [7]:
# Precomputed polarity features (presumably per-review sentiment scores — verify
# against the notebook that produced this pickle).
polarity = pd.read_pickle('../processed/polarity.pkl.gz').fillna(0); polarity.shape
Out[7]:
In [8]:
# Precomputed one-hot category indicators (dummies), NaNs treated as "absent".
categories = pd.read_pickle('../processed/categories_dummies.pkl.gz').fillna(0); categories.shape
Out[8]:
In [9]:
# Assemble the design matrix: raw columns (minus id and target) plus the three
# engineered feature frames. pd.concat(axis=1) aligns on the index, so all four
# frames must share the same row index as `data` — assumed, not checked here.
X = pd.concat((data.drop(['product_id', 'rating'], axis=1),
meta.astype(float),
polarity.astype(float),
categories.astype(float)), axis=1); X.shape
Out[9]:
In [10]:
# Target labels: ratings rounded to the nearest whole star, as integers.
y = data['rating'].round().astype(int)
In [11]:
# Model definitions
In [12]:
# Shared base learners for the stacking ensemble below.
# probability=True is required because the StackingClassifier is configured with
# use_probas=True; class_weight='balanced' counters the skewed rating classes.
base_svm = svm.SVC(kernel='linear', C=0.5, class_weight='balanced', probability=True, decision_function_shape='ovr',
random_state=42)
base_gboost = ensemble.GradientBoostingClassifier(n_estimators=100, random_state=42)
In [33]:
# Scratch cell: counts how many category-dummy columns survive the 0.01 variance
# filter (`where` comes from %pylab). NOTE(review): execution count In[33] is out
# of order — rerun the notebook top-to-bottom before sharing.
pipeline.Pipeline([('selector', ColumnSelector(where(X.columns.isin(categories.columns))[0])),
('fs', feature_selection.VarianceThreshold(threshold=0.01))]).fit_transform(X.values).shape
Out[33]:
In [21]:
# Per-feature-group base pipelines. Each one selects its own column subset of the
# concatenated X by positional index (numpy `where` from %pylab over a column-name
# mask), so they can all be fed the same X.values by the stacker.

# Gradient boosting on the numeric meta-features (scaled, low-variance columns dropped).
meta_gboost = imbpipeline.Pipeline([('selector', ColumnSelector(where(X.columns.isin(meta.columns))[0])),
('scaler', preprocessing.MinMaxScaler()),
('fs', feature_selection.VarianceThreshold(threshold=0.01)),
# ('resample', over_sampling.RandomOverSampler()),
('gboost', base_gboost)])
# Gradient boosting on the polarity features, used as-is.
polarity_gboost = imbpipeline.Pipeline([('selector', ColumnSelector(where(X.columns.isin(polarity.columns))[0])),
# ('resample', over_sampling.RandomOverSampler()),
('gboost', base_gboost)])
# Gradient boosting on the category dummies (rare categories removed by variance).
cat_gboost = imbpipeline.Pipeline([('selector', ColumnSelector(where(X.columns.isin(categories.columns))[0])),
('fs', feature_selection.VarianceThreshold(threshold=0.1)),
('gboost', base_gboost)])
# Linear SVM over word 1-2gram TF-IDF of the cleaned review text.
tfidf_svm = pipeline.Pipeline([('selector', ColumnSelector(where(X.columns.isin(['review']))[0])),
('nonalpha', utils.NonAlphaRemover(['review'])),
('wordnorm', utils.WordNormalizer(['review'])),
('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1,2), stop_words=None)),
('svm', base_svm)])
# Same as tfidf_svm but with random oversampling after vectorization
# (defined but currently excluded from the stack below).
tfidf_over_svm = imbpipeline.Pipeline([('selector', ColumnSelector(where(X.columns.isin(['review']))[0])),
('nonalpha', utils.NonAlphaRemover(['review'])),
('wordnorm', utils.WordNormalizer(['review'])),
('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1,2), stop_words=None)),
('resample', over_sampling.RandomOverSampler()),
('svm', base_svm)])
# Pretrained wiki FastText embeddings for the embedding-based SVM.
# NOTE(review): load_fasttext_format is typically a classmethod in gensim —
# calling it on a fresh instance and rebinding w2v_model works on older gensim
# versions, but confirm against the installed version (it is removed in gensim 4).
w2v_model = FastText()
w2v_model = w2v_model.load_fasttext_format("../external/wiki_w2v_256.bin")
# w2v = {w: vec for w, vec in zip(w2v_model.wv.index2word, w2v_model.wv.syn0)}
# SVM over TF-IDF-weighted averaged word embeddings of the review text.
tfidf_word2vec_wiki_svm = pipeline.Pipeline([('selector', ColumnSelector(where(X.columns.isin(['review']))[0])),
("word2vec", utils.Word2VecTfidfEmbeddingVectorizer(w2v_model)),
('svm', base_svm)])
# Alternative embedding sources tried earlier (news / araneum) — kept for reference.
# w2v_model = gensim.models.KeyedVectors.load_word2vec_format("../external/news_rusvectores2.bin.gz", binary=True)
# w2v = {w: vec for w, vec in zip(w2v_model.wv.index2word, w2v_model.wv.syn0)}
# tfidf_word2vec_news_svm = pipeline.Pipeline([('selector', ColumnSelector(where(X.columns.isin(['review']))[0])),
# ("word2vec", utils.TfidfEmbeddingVectorizer(w2v)),
# ('svm', svm.SVC(kernel='linear', C=0.5, class_weight='balanced', probability=True))])
# w2v_model = gensim.models.KeyedVectors.load_word2vec_format("../external/araneum_1_600_2.bin.gz", binary=True)
# w2v = {w: vec for w, vec in zip(w2v_model.wv.index2word, w2v_model.wv.syn0)}
# tfidf_word2vec_araneum_svm = pipeline.Pipeline([('selector', ColumnSelector(where(X.columns.isin(['review']))[0])),
# ("word2vec", utils.TfidfEmbeddingVectorizer(w2v)),
# ('svm', svm.SVC(kernel='linear', C=0.5, class_weight='balanced', probability=True))])
# cols_non_digit = ['category_level1', 'category_level2', 'brand', 'property', 'user_name', 'rating', 'date',
# 'review', 'negative', 'positive']
In [22]:
# Stack the per-feature-group pipelines; the oversampled TF-IDF variant and the
# alternative embedding models are currently excluded.
step_clfs = [meta_gboost, polarity_gboost, cat_gboost, tfidf_svm, #tfidf_over_svm,
tfidf_word2vec_wiki_svm]#tfidf_word2vec_news_svm, tfidf_word2vec_araneum_svm]
# Meta-learner: balanced logistic regression over the concatenated (not averaged)
# class-probability outputs of every base pipeline.
sclf = StackingClassifier(classifiers=step_clfs, verbose=1,
use_probas=True,
average_probas=False,
meta_classifier=linear_model.LogisticRegression(class_weight='balanced'))
In [23]:
# Multi-class evaluation metrics: F1 / precision / recall under each averaging
# scheme, plus two regression-style error scores on the ordinal rating labels.
scoring = (['%s_%s' % (metric, avg)
            for metric in ('f1', 'precision', 'recall')
            for avg in ('micro', 'macro', 'weighted')]
           + ['neg_mean_absolute_error', 'neg_mean_squared_error'])
# Other candidates considered: 'explained_variance', 'neg_mean_squared_log_error',
# 'neg_median_absolute_error', 'r2'
In [24]:
# 10-fold stratified CV of the stacking ensemble (test scores only).
# n_jobs=1 keeps everything in one process — presumably to avoid duplicating the
# in-memory FastText vectors per worker; confirm before parallelizing.
result = model_selection.cross_validate(sclf, X.values, y.values,
cv=model_selection.StratifiedKFold(10, shuffle=True, random_state=42), n_jobs=1,
scoring=scoring, return_train_score=False, verbose=3)
In [ ]:
# Report the mean of each CV metric across folds (`mean` comes from %pylab).
fold_means = [[metric, mean(scores)] for metric, scores in result.items()]
print(tabulate(fold_means, headers=['Metric', 'Average K-Fold']))
In [ ]:
In [20]:
# Alternative to stacking: a single flat feature space concatenating TF-IDF text
# features, scaled meta-features and polarity features (category dummies disabled).
union = pipeline.FeatureUnion([('tfidf', Pipeline([('selector', ColumnSelector(where(X.columns.isin(['review']))[0])),
('nonalpha', utils.NonAlphaRemover(['review'])),
('wordnorm', utils.WordNormalizer(['review'])),
('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1,2)))])),
('metaf', Pipeline([('selector', ColumnSelector(where(X.columns.isin(meta.columns))[0])),
('scaler', preprocessing.MinMaxScaler()),
('fs', feature_selection.VarianceThreshold(threshold=0.1))])),
('polar', Pipeline([('selector', ColumnSelector(where(X.columns.isin(polarity.columns))[0])),
('fs', feature_selection.VarianceThreshold(threshold=0.1))])),
# ('categ', Pipeline([('selector', ColumnSelector(where(X.columns.isin(categories.columns))[0])),
# ('fs', feature_selection.VarianceThreshold(threshold=0.1))]))
])
In [21]:
# Single linear SVM over the unioned feature space (no stacking).
model = Pipeline([('union', union), ('clf', svm.LinearSVC(multi_class='ovr', C=0.6, class_weight='balanced',
random_state=42))])
In [ ]:
# 10-fold CV of the flat LinearSVC model (same protocol as the stacking run above;
# consider extracting a helper to avoid repeating this call).
result = model_selection.cross_validate(model, X.values, y.values,
cv=model_selection.StratifiedKFold(10, shuffle=True, random_state=42), n_jobs=1,
scoring=scoring, return_train_score=False, verbose=3)
In [42]:
# Same unioned features, but with gradient boosting as the classifier.
model = Pipeline([('union', union), ('clf', base_gboost)])
In [43]:
# Only 3 folds here — presumably to keep the slower boosting run tractable.
result = model_selection.cross_validate(model, X.values, y.values,
cv=model_selection.StratifiedKFold(3, shuffle=True, random_state=42), n_jobs=1,
scoring=scoring, return_train_score=False, verbose=3)
In [19]:
# Text-only baseline fed the raw `review` Series directly (no ColumnSelector).
# NOTE(review): ElasticNet is a regressor, but `scoring` includes classification
# metrics (f1_*, precision_*, recall_*) — those will be undefined/fail on
# continuous predictions. Confirm whether this cell should use the commented-out
# LinearSVC or a regression-only scoring list.
model = pipeline.Pipeline([('nonalpha', utils.NonAlphaRemover(['review'])),
('wordnorm', utils.WordNormalizer(['review'])),
('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1,2))),
# ('svm', svm.LinearSVC(multi_class='ovr', C=0.6, class_weight='balanced'))
('elastic', linear_model.ElasticNet(alpha=0.001, l1_ratio=0.01))
])
result = model_selection.cross_validate(model, data['review'], y.values,
cv=model_selection.StratifiedKFold(10, shuffle=True, random_state=42), n_jobs=-1,
scoring=scoring, return_train_score=False, verbose=3)
In [17]:
# Class balance of the rounded ratings (motivates class_weight='balanced' above).
y.value_counts()
Out[17]:
In [15]:
# Report the mean of each CV metric across folds (`mean` comes from %pylab).
# Duplicate of the reporting cell further up — candidate for a shared helper.
fold_means = [[metric, mean(scores)] for metric, scores in result.items()]
print(tabulate(fold_means, headers=['Metric', 'Average K-Fold']))
In [ ]: