%pylab inline

import pandas as pd
from polyglot.text import Text, Word
import polyglot

import tabulate

from sklearn import model_selection, ensemble, metrics, linear_model, neighbors, pipeline, preprocessing, feature_extraction, svm
# Make the project-local helpers in ../src importable before loading `utils`.
import sys
sys.path.append("../src/")
import utils

# Load the training reviews and replace the CSV header with short column names.
data = pd.read_csv('~/cloud/data/mvideo/X_train.csv')
data.columns = ['product_id', 'category_level1', 'category_level2', 'brand', 'property', 'user_name', 'rating', 'date',
               'review', 'negative', 'positive']
# NOTE(review): the argument of this call was truncated in the source; parsing
# the existing 'date' column in place is the presumed intent -- confirm.
data['date'] = pd.to_datetime(data['date'])

# Clean the review text with the project-local transformers from `utils`.
# NOTE(review): the innermost argument was truncated in the source; the raw
# 'review' column is assumed here -- confirm against the original notebook.
data['review'] = utils.WordNormalizer(None).fit_transform(
    utils.NonAlphaRemover(None).fit_transform(data['review']))

# Split each cleaned review on whitespace and wrap every token in a polyglot
# Word tagged as Russian.
words = data['review'].apply(lambda x: [Word(w, language='ru') for w in x.split()])

def embed_words(words):
    """Collect the embedding vector of every word that has one.

    Args:
        words: iterable of word objects exposing a ``vector`` attribute.

    Returns:
        A list of the vectors, in input order. Words whose lookup raises
        ``KeyError`` are skipped, so the result may be shorter than the
        input (and empty when nothing could be embedded).
    """
    out = []
    for word in words:
        # NOTE(review): the loop body was lost in the source (only the
        # ``except KeyError:`` line survived); the ``word.vector`` lookup is
        # reconstructed from the surrounding code -- confirm.
        try:
            vector = word.vector
        except KeyError:
            # No embedding available for this word; leave it out.
            continue
        out.append(vector)
    return out

# Turn every review's word list into a list of embedding vectors.
embedding = words.apply(embed_words)

# Add up the vectors of each review along axis 0, expand each result into
# columns, and fill any cells that are missing with 0.
summed_vectors = embedding.apply(lambda vectors: np.sum(vectors, 0))
embedding_df = summed_vectors.apply(pd.Series).fillna(0)

# Feature matrix for the classifier.
X = embedding_df

# Target: the rating rounded to the nearest integer.
y = data['rating'].round().astype(int)
# Candidate classifiers for this experiment; exactly one is left active.
# clf = svm.LinearSVC()
# clf = ensemble.GradientBoostingClassifier(n_estimators=100)
# Seed the forest as well: the CV splitter below is seeded, but an unseeded
# forest would still make the scores differ from run to run.
clf = ensemble.RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
# clf = linear_model.LogisticRegression(penalty='l1', class_weight='balanced')
# clf = pipeline.make_pipeline(preprocessing.StandardScaler(),
#                              neighbors.KNeighborsClassifier(n_neighbors=50, weights='distance'))

# 10-fold stratified cross-validation scored with three F1 averages.
# The integer target `y` is passed instead of recomputing the rounded rating
# inline, so the label definition lives in one place.
res = model_selection.cross_validate(clf, X, y,
                                     cv=model_selection.StratifiedKFold(10, shuffle=True, random_state=42), n_jobs=-1,
                                     scoring=['f1_micro', 'f1_macro', 'f1_weighted'], return_train_score=False)

# Report the mean of every timing / score array across the folds.
for k, v in res.items():
    print([k, v.mean()])

# Output recorded from the cross-validation run above (mean over the 10 folds):
# ['fit_time', 12.839696240425109]
# ['score_time', 0.22497053146362306]
# ['test_f1_micro', 0.22351027449165692]
# ['test_f1_macro', 0.17473678968736497]
# ['test_f1_weighted', 0.2667063295684905]

