In [55]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib
/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/IPython/core/magics/pylab.py:160: UserWarning: pylab import has clobbered these variables: ['Text', 'clf', 'select']
`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"

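The warning above shows that re-running %pylab after the later imports clobbers Text, clf and select; in particular, polyglot's Text gets replaced by matplotlib's Text class. A less intrusive setup (a sketch, not part of the original run) is %matplotlib inline with explicit imports:

In [ ]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
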
In [6]:
from polyglot.downloader import downloader
downloader.download("LANG:ru")


[polyglot_data] Downloading collection 'LANG:ru'
[polyglot_data]    | 
[polyglot_data]    | Downloading package sgns2.ru to
[polyglot_data]    |     /Users/pavelerofeev/polyglot_data...
[polyglot_data]    |   Package sgns2.ru is already up-to-date!
[polyglot_data]    | Downloading package unipos.ru to
[polyglot_data]    |     /Users/pavelerofeev/polyglot_data...
[polyglot_data]    | Downloading package ner2.ru to
[polyglot_data]    |     /Users/pavelerofeev/polyglot_data...
[polyglot_data]    |   Package ner2.ru is already up-to-date!
[polyglot_data]    | Downloading package counts2.ru to
[polyglot_data]    |     /Users/pavelerofeev/polyglot_data...
[polyglot_data]    |   Package counts2.ru is already up-to-date!
[polyglot_data]    | Downloading package transliteration2.ru to
[polyglot_data]    |     /Users/pavelerofeev/polyglot_data...
[polyglot_data]    |   Package transliteration2.ru is already up-to-
[polyglot_data]    |       date!
[polyglot_data]    | Downloading package embeddings2.ru to
[polyglot_data]    |     /Users/pavelerofeev/polyglot_data...
[polyglot_data]    |   Package embeddings2.ru is already up-to-date!
[polyglot_data]    | Downloading package uniemb.ru to
[polyglot_data]    |     /Users/pavelerofeev/polyglot_data...
[polyglot_data]    | Downloading package sentiment2.ru to
[polyglot_data]    |     /Users/pavelerofeev/polyglot_data...
[polyglot_data]    |   Package sentiment2.ru is already up-to-date!
[polyglot_data]    | Downloading package tsne2.ru to
[polyglot_data]    |     /Users/pavelerofeev/polyglot_data...
[polyglot_data]    |   Package tsne2.ru is already up-to-date!
[polyglot_data]    | Downloading package morph2.ru to
[polyglot_data]    |     /Users/pavelerofeev/polyglot_data...
[polyglot_data]    |   Package morph2.ru is already up-to-date!
[polyglot_data]    | 
[polyglot_data]  Done downloading collection LANG:ru
Out[6]:
True
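
The available Russian models can also be inspected without re-downloading the whole collection; a sketch, assuming downloader.supported_tasks is available in this polyglot version:

In [ ]:
# Hypothetical check: which tasks have Russian models
print(downloader.supported_tasks(lang="ru"))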

In [7]:
from polyglot.text import Text
import pandas as pd
import seaborn as sns

import tabulate

from sklearn import model_selection, ensemble, metrics, linear_model, neighbors, pipeline, preprocessing

In [23]:
Text(data.review[0])


Out[23]:
-1.0
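
A bare Text(...) call would normally echo the review text, so the -1.0 above presumably reflects a polarity lookup; a hypothetical reconstruction using the per-word polarity API used below (the variable t is illustrative only):

In [ ]:
# Hypothetical: mean per-word polarity of the first review
t = Text(data.review[0], hint_language_code='ru')
np.mean([w.polarity for w in t.words])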

In [8]:
data = pd.read_csv('~/cloud/data/mvideo/X_train.csv')
data.columns = ['product_id', 'category_level1', 'category_level2', 'brand', 'property', 'user_name', 'rating', 'date',
               'review', 'negative', 'positive']
data['date'] = pd.to_datetime(data.date)

In [24]:
def get_polarity(text):
    # Per-token polarity scores from polyglot's sentiment lexicon;
    # empty reviews yield an empty list.
    if len(text) > 0:
        text = Text(text, hint_language_code='ru')
        return [word.polarity for word in text.words]
    else:
        return []
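
As a quick sanity check, get_polarity can be applied to a short Russian phrase ("excellent phone, but the battery is bad"); polyglot scores each token as -1, 0 or +1. A hypothetical example, not from the original run:

In [ ]:
get_polarity('отличный телефон, но плохая батарея')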

In [25]:
polarity = data.review.apply(get_polarity)

In [57]:
# Four review-level features from the per-word polarities; fillna(0) covers empty
# reviews, and mean here is numpy's mean pulled in by %pylab. The pd.merge calls
# reuse the original column name, hence the review_x/review_y labels below.
X = pd.merge(polarity.apply(sum).fillna(0).to_frame(),
             polarity.apply(mean).fillna(0).to_frame(),
             left_index=True, right_index=True)
X = pd.merge(X, polarity.apply(lambda x: len([v for v in x if v > 0])).fillna(0).to_frame(),
             left_index=True, right_index=True)
X = pd.merge(X, polarity.apply(lambda x: len([v for v in x if v < 0])).fillna(0).to_frame(),
             left_index=True, right_index=True)
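
The same four features can be built in one pass (a sketch; X_alt is a hypothetical name), which also avoids the duplicated review_x/review_y column names visible in the next output:

In [ ]:
X_alt = pd.DataFrame({
    'polarity_sum': polarity.apply(sum),
    'polarity_mean': polarity.apply(lambda p: np.mean(p) if len(p) else 0.0),
    'polarity_count_positive': polarity.apply(lambda p: sum(v > 0 for v in p)),
    'polarity_count_negative': polarity.apply(lambda p: sum(v < 0 for v in p)),
})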

In [58]:
X.mean()


Out[58]:
review_x    1.482967
review_y    0.034228
review_x    2.461923
review_y    0.978957
dtype: float64

In [59]:
X.columns = ['polarity_sum', 'polarity_mean', 'polarity_count_positive', 'polarity_count_negative']

In [54]:
y = data.rating.round().astype(int)
# Keep only clearly polarized reviews: ratings 1, 4 and 5.
select = (y < 2) | (y >= 4)
clf = ensemble.GradientBoostingClassifier(n_estimators=100)
# clf = ensemble.RandomForestClassifier(n_estimators=200, class_weight='balanced')
# clf = linear_model.LogisticRegression(penalty='l2', class_weight='balanced')
# clf = pipeline.make_pipeline(preprocessing.StandardScaler(),
#                              neighbors.KNeighborsClassifier(n_neighbors=50, weights='distance'))
# Out-of-fold predictions from 10-fold stratified cross-validation.
y_pred = model_selection.cross_val_predict(clf, X[select.values], y[select],
                                           n_jobs=-1, cv=model_selection.StratifiedKFold(10, shuffle=True, random_state=42))
print(metrics.classification_report(y[select], y_pred))
print(metrics.confusion_matrix(y[select], y_pred))


             precision    recall  f1-score   support

          1       0.31      0.02      0.03      1474
          4       0.31      0.01      0.02      2679
          5       0.69      0.99      0.82      9289

avg / total       0.58      0.69      0.57     13442

[[  26   21 1427]
 [  31   23 2625]
 [  27   31 9231]]
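
The confusion matrix shows that nearly every review is predicted as class 5, so the overall accuracy of 0.69 is roughly the majority-class share (9289 / 13442). Macro-averaged scores, which weight the three classes equally, make this explicit; a follow-up sketch, not part of the original run:

In [ ]:
print('macro precision:', metrics.precision_score(y[select], y_pred, average='macro'))
print('macro recall:   ', metrics.recall_score(y[select], y_pred, average='macro'))
print('macro F1:       ', metrics.f1_score(y[select], y_pred, average='macro'))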

In [47]:
print(tabulate.tabulate(metrics.confusion_matrix(y, y_pred), 
                        headers=range(1,6), tablefmt='orgtbl', showindex=range(1,6)).replace('+', '|'))


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-47-78a0f0c0267e> in <module>()
----> 1 print(tabulate.tabulate(metrics.confusion_matrix(y, y_pred), 
      2                         headers=range(1,6), tablefmt='orgtbl', showindex=range(1,6)).replace('+', '|'))

~/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py in confusion_matrix(y_true, y_pred, labels, sample_weight)
    248 
    249     """
--> 250     y_type, y_true, y_pred = _check_targets(y_true, y_pred)
    251     if y_type not in ("binary", "multiclass"):
    252         raise ValueError("%s is not supported" % y_type)

~/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py in _check_targets(y_true, y_pred)
     69     y_pred : array or indicator matrix
     70     """
---> 71     check_consistent_length(y_true, y_pred)
     72     type_true = type_of_target(y_true)
     73     type_pred = type_of_target(y_pred)

~/anaconda/lib/python3.6/site-packages/sklearn/utils/validation.py in check_consistent_length(*arrays)
    171     if len(uniques) > 1:
    172         raise ValueError("Found input variables with inconsistent numbers of"
--> 173                          " samples: %r" % [int(l) for l in lengths])
    174 
    175 

ValueError: Found input variables with inconsistent numbers of samples: [15587, 14718]
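
The ValueError is a length mismatch: the full y (15587 rows) is compared against a y_pred of a different length, produced only for a filtered subset. A corrected sketch that restricts both arguments to the rows selected in In [54] and uses the three classes actually present:

In [ ]:
labels = sorted(y[select].unique())  # [1, 4, 5]
print(tabulate.tabulate(metrics.confusion_matrix(y[select], y_pred, labels=labels),
                        headers=labels, tablefmt='orgtbl', showindex=labels).replace('+', '|'))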

In [60]:
X.to_pickle('../processed/polarity.pkl.gz')

In [40]:
pd.Categorical(y).describe()


Out[40]:
            counts     freqs
categories
1             1474  0.094566
2              869  0.055752
3             1276  0.081863
4             2679  0.171874
5             9289  0.595945
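
seaborn is imported above but never used; a hypothetical one-liner plotting the same rating distribution:

In [ ]:
# Hypothetical: rating distribution as a bar chart
sns.countplot(x=y)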

In [ ]: