notebook.community

Edit and run



In [55]:

    
%pylab inline









    



Populating the interactive namespace from numpy and matplotlib






    



/Users/pavelerofeev/anaconda/lib/python3.6/site-packages/IPython/core/magics/pylab.py:160: UserWarning: pylab import has clobbered these variables: ['Text', 'clf', 'select']
`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"



In [6]:

    
from polyglot.downloader import downloader
downloader.download("LANG:ru")









    



[polyglot_data] Downloading collection 'LANG:ru'
[polyglot_data]    | 
[polyglot_data]    | Downloading package sgns2.ru to
[polyglot_data]    |     /Users/pavelerofeev/polyglot_data...
[polyglot_data]    |   Package sgns2.ru is already up-to-date!
[polyglot_data]    | Downloading package unipos.ru to
[polyglot_data]    |     /Users/pavelerofeev/polyglot_data...
[polyglot_data]    | Downloading package ner2.ru to
[polyglot_data]    |     /Users/pavelerofeev/polyglot_data...
[polyglot_data]    |   Package ner2.ru is already up-to-date!
[polyglot_data]    | Downloading package counts2.ru to
[polyglot_data]    |     /Users/pavelerofeev/polyglot_data...
[polyglot_data]    |   Package counts2.ru is already up-to-date!
[polyglot_data]    | Downloading package transliteration2.ru to
[polyglot_data]    |     /Users/pavelerofeev/polyglot_data...
[polyglot_data]    |   Package transliteration2.ru is already up-to-
[polyglot_data]    |       date!
[polyglot_data]    | Downloading package embeddings2.ru to
[polyglot_data]    |     /Users/pavelerofeev/polyglot_data...
[polyglot_data]    |   Package embeddings2.ru is already up-to-date!
[polyglot_data]    | Downloading package uniemb.ru to
[polyglot_data]    |     /Users/pavelerofeev/polyglot_data...
[polyglot_data]    | Downloading package sentiment2.ru to
[polyglot_data]    |     /Users/pavelerofeev/polyglot_data...
[polyglot_data]    |   Package sentiment2.ru is already up-to-date!
[polyglot_data]    | Downloading package tsne2.ru to
[polyglot_data]    |     /Users/pavelerofeev/polyglot_data...
[polyglot_data]    |   Package tsne2.ru is already up-to-date!
[polyglot_data]    | Downloading package morph2.ru to
[polyglot_data]    |     /Users/pavelerofeev/polyglot_data...
[polyglot_data]    |   Package morph2.ru is already up-to-date!
[polyglot_data]    | 
[polyglot_data]  Done downloading collection LANG:ru






    Out[6]:





True



In [7]:

    
from polyglot.text import Text
import pandas as pd
import seaborn as sns

import tabulate

from sklearn import model_selection, ensemble, metrics, linear_model, neighbors, pipeline, preprocessing



In [23]:

    
Text(data.review[0])









    Out[23]:





-1.0



In [8]:

    
data = pd.read_csv('~/cloud/data/mvideo/X_train.csv')
data.columns = ['product_id', 'category_level1', 'category_level2', 'brand', 'property', 'user_name', 'rating', 'date',
               'review', 'negative', 'positive']
data['date'] = pd.to_datetime(data.date)



In [24]:

    
def get_polarity(text):
    if len(text) > 0:
        text = Text(text, hint_language_code='ru')
        return [word.polarity for word in  text.words]
    else:
        return []



In [25]:

    
polarity = data.review.apply(get_polarity)



In [57]:

    
X = pd.merge(polarity.apply(sum).fillna(0).to_frame(), 
             polarity.apply(mean).fillna(0).to_frame(), 
             left_index=True, right_index=True)
X = pd.merge(X, polarity.apply(lambda x: len([v for v in x if v > 0])).fillna(0).to_frame(), 
             left_index=True, right_index=True)
X = pd.merge(X, polarity.apply(lambda x: len([v for v in x if v < 0])).fillna(0).to_frame(), 
             left_index=True, right_index=True)



In [58]:

    
X.mean()









    Out[58]:





review_x    1.482967
review_y    0.034228
review_x    2.461923
review_y    0.978957
dtype: float64



In [59]:

    
X.columns = ['polarity_sum', 'polarity_mean', 'polarity_count_positive', 'polarity_count_negative']



In [54]:

    
y =  data.rating.round().astype(int)
select = (y < 2) | (y >= 4)
clf = ensemble.GradientBoostingClassifier(n_estimators=100)
# clf = ensemble.RandomForestClassifier(n_estimators=200, class_weight='balanced')
# clf = linear_model.LogisticRegression(penalty='l2', class_weight='balanced')
# clf = pipeline.make_pipeline(preprocessing.StandardScaler(),
#                              neighbors.KNeighborsClassifier(n_neighbors=50, weights='distance'))
y_pred = model_selection.cross_val_predict(clf, X[select.values], y[select], 
                                           n_jobs=-1, cv=model_selection.StratifiedKFold(10, shuffle=True, random_state=42))
print(metrics.classification_report(y[select], y_pred))
print(metrics.confusion_matrix(y[select], y_pred))









    



             precision    recall  f1-score   support

          1       0.31      0.02      0.03      1474
          4       0.31      0.01      0.02      2679
          5       0.69      0.99      0.82      9289

avg / total       0.58      0.69      0.57     13442

[[  26   21 1427]
 [  31   23 2625]
 [  27   31 9231]]



In [47]:

    
print(tabulate.tabulate(metrics.confusion_matrix(y, y_pred), 
                        headers=range(1,6), tablefmt='orgtbl', showindex=range(1,6)).replace('+', '|'))









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-47-78a0f0c0267e> in <module>()
----> 1 print(tabulate.tabulate(metrics.confusion_matrix(y, y_pred), 
      2                         headers=range(1,6), tablefmt='orgtbl', showindex=range(1,6)).replace('+', '|'))

~/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py in confusion_matrix(y_true, y_pred, labels, sample_weight)
    248 
    249     """
--> 250     y_type, y_true, y_pred = _check_targets(y_true, y_pred)
    251     if y_type not in ("binary", "multiclass"):
    252         raise ValueError("%s is not supported" % y_type)

~/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py in _check_targets(y_true, y_pred)
     69     y_pred : array or indicator matrix
     70     """
---> 71     check_consistent_length(y_true, y_pred)
     72     type_true = type_of_target(y_true)
     73     type_pred = type_of_target(y_pred)

~/anaconda/lib/python3.6/site-packages/sklearn/utils/validation.py in check_consistent_length(*arrays)
    171     if len(uniques) > 1:
    172         raise ValueError("Found input variables with inconsistent numbers of"
--> 173                          " samples: %r" % [int(l) for l in lengths])
    174 
    175 

ValueError: Found input variables with inconsistent numbers of samples: [15587, 14718]



In [60]:

    
X.to_pickle('../processed/polarity.pkl.gz')



In [40]:

    
pd.Categorical(y).describe()



In [ ]:

	counts	freqs
categories
1	1474	0.094566
2	869	0.055752
3	1276	0.081863
4	2679	0.171874
5	9289	0.595945