notebook.community

Edit and run



In [2]:

    
%pylab inline









    



Populating the interactive namespace from numpy and matplotlib






    



/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/IPython/core/magics/pylab.py:160: UserWarning: pylab import has clobbered these variables: ['Text']
`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"



In [36]:

    
import pandas as pd
from polyglot.text import Text, Word
import polyglot

import tabulate

from sklearn import model_selection, ensemble, metrics, linear_model, neighbors, pipeline, preprocessing, feature_extraction, svm

import sys sys.path.append("../src/") import utils



In [10]:

    
data = pd.read_csv('~/cloud/data/mvideo/X_train.csv')
data.columns = ['product_id', 'category_level1', 'category_level2', 'brand', 'property', 'user_name', 'rating', 'date',
               'review', 'negative', 'positive']
data['date'] = pd.to_datetime(data.date)



In [24]:

    
data['review'] = utils.WordNormalizer(None).fit_transform(utils.NonAlphaRemover(None).fit_transform(data.review))



In [25]:

    
words = data['review'].apply(lambda x: [Word(w, language='ru') for w in x.split()])



In [26]:

    
def embed_words(words):
    out = []
    for word in words:
        try:
            out.append(w.vector)
        except KeyError:
            out.append(zeros(256))
    return out



In [27]:

    
embedding = words.apply(embed_words)



In [28]:

    
embedding_df = embedding.apply(lambda x: np.sum(x,0)).apply(pd.Series).fillna(0)



In [29]:

    
X = embedding_df



In [40]:

    
y =  data.rating.round().astype(int)
# clf = svm.LinearSVC()
# clf = ensemble.GradientBoostingClassifier(n_estimators=100)
clf = ensemble.RandomForestClassifier(n_estimators=100, class_weight='balanced')
# clf = linear_model.LogisticRegression(penalty='l1', class_weight='balanced')
# clf = pipeline.make_pipeline(preprocessing.StandardScaler(),
#                              neighbors.KNeighborsClassifier(n_neighbors=50, weights='distance'))



In [41]:

    
res = model_selection.cross_validate(clf, X, data['rating'].round(), 
                                     cv=model_selection.StratifiedKFold(10, shuffle=True, random_state=42), n_jobs=-1, 
                                     scoring=['f1_micro', 'f1_macro', 'f1_weighted'], return_train_score=False)



In [42]:

    
for k, v in res.items():
    print([k, v.mean()])









    



['fit_time', 12.839696240425109]
['score_time', 0.22497053146362306]
['test_f1_micro', 0.22351027449165692]
['test_f1_macro', 0.17473678968736497]
['test_f1_weighted', 0.2667063295684905]



In [ ]: