In [2]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/IPython/core/magics/pylab.py:160: UserWarning: pylab import has clobbered these variables: ['Text']
`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"

In [36]:
import pandas as pd
from polyglot.text import Text, Word
import polyglot

import tabulate

from sklearn import model_selection, ensemble, metrics, linear_model, neighbors, pipeline, preprocessing, feature_extraction, svm
import sys sys.path.append("../src/") import utils

In [10]:
data = pd.read_csv('~/cloud/data/mvideo/X_train.csv')
data.columns = ['product_id', 'category_level1', 'category_level2', 'brand', 'property', 'user_name', 'rating', 'date',
               'review', 'negative', 'positive']
data['date'] = pd.to_datetime(data.date)

In [24]:
data['review'] = utils.WordNormalizer(None).fit_transform(utils.NonAlphaRemover(None).fit_transform(data.review))

In [25]:
words = data['review'].apply(lambda x: [Word(w, language='ru') for w in x.split()])

In [26]:
def embed_words(words):
    out = []
    for word in words:
        try:
            out.append(w.vector)
        except KeyError:
            out.append(zeros(256))
    return out

In [27]:
embedding = words.apply(embed_words)

In [28]:
embedding_df = embedding.apply(lambda x: np.sum(x,0)).apply(pd.Series).fillna(0)

In [29]:
X = embedding_df

In [40]:
y =  data.rating.round().astype(int)
# clf = svm.LinearSVC()
# clf = ensemble.GradientBoostingClassifier(n_estimators=100)
clf = ensemble.RandomForestClassifier(n_estimators=100, class_weight='balanced')
# clf = linear_model.LogisticRegression(penalty='l1', class_weight='balanced')
# clf = pipeline.make_pipeline(preprocessing.StandardScaler(),
#                              neighbors.KNeighborsClassifier(n_neighbors=50, weights='distance'))

In [41]:
res = model_selection.cross_validate(clf, X, data['rating'].round(), 
                                     cv=model_selection.StratifiedKFold(10, shuffle=True, random_state=42), n_jobs=-1, 
                                     scoring=['f1_micro', 'f1_macro', 'f1_weighted'], return_train_score=False)

In [42]:
for k, v in res.items():
    print([k, v.mean()])


['fit_time', 12.839696240425109]
['score_time', 0.22497053146362306]
['test_f1_micro', 0.22351027449165692]
['test_f1_macro', 0.17473678968736497]
['test_f1_weighted', 0.2667063295684905]

In [ ]: