In [55]:
%pylab inline
In [6]:
from polyglot.downloader import downloader
# One-time setup: ask polyglot's downloader for the "LANG:ru" package set.
# NOTE(review): presumably this fetches the Russian models that the polarity
# lookups later in the notebook rely on — confirm against the polyglot docs.
downloader.download("LANG:ru")
Out[6]:
In [7]:
from polyglot.text import Text
import pandas as pd
import seaborn as sns
import tabulate
from sklearn import model_selection, ensemble, metrics, linear_model, neighbors, pipeline, preprocessing
In [23]:
# Quick look at how polyglot parses the very first review.
# NOTE: `data` is created by the loading cell that follows this one, so that
# cell has to be executed first on a fresh kernel.
Text(data['review'][0])
Out[23]:
In [8]:
# Load the training reviews and replace the file's header with the column
# names used throughout this notebook.
data = pd.read_csv('~/cloud/data/mvideo/X_train.csv')
data.columns = [
    'product_id',
    'category_level1',
    'category_level2',
    'brand',
    'property',
    'user_name',
    'rating',
    'date',
    'review',
    'negative',
    'positive',
]
# Convert the date column to pandas datetimes.
data['date'] = pd.to_datetime(data['date'])
In [24]:
def get_polarity(text, language_code='ru'):
    """Return the polyglot polarity of every word in ``text``.

    Parameters
    ----------
    text : str
        Raw review text. Anything that is not a string (``None``, or the
        float NaN that pandas uses for an empty CSV cell) and any blank
        string is treated as "no text".
    language_code : str, optional
        Language hint forwarded to polyglot's ``Text``; defaults to ``'ru'``,
        the value this notebook has always used.

    Returns
    -------
    list
        One polarity value per word, in word order; ``[]`` when there is no
        usable text.
    """
    # Guard first: calling len() on a NaN review raises TypeError, which would
    # abort a whole `Series.apply(get_polarity)` run over the review column.
    if not isinstance(text, str) or not text.strip():
        return []
    parsed = Text(text, hint_language_code=language_code)
    return [word.polarity for word in parsed.words]
In [25]:
# Per-review list of word polarities (one list per row of `data`).
polarity = data['review'].apply(get_polarity)
In [57]:
# Collapse each review's list of word polarities into four numeric features.
#
# The frame is built in one step with explicit column names. Chaining index
# merges of four Series that all carry the same name makes pandas suffix them
# into duplicate `review_x` / `review_y` columns, which recent pandas versions
# reject with a MergeError.
#
# The aggregations only use plain list operations, so they do not depend on
# `mean` / `sum` being injected into the namespace by `%pylab`, and an empty
# review yields 0 directly instead of a NaN that has to be filled afterwards.
X = pd.DataFrame(
    {
        'polarity_sum': polarity.apply(lambda scores: sum(scores)),
        'polarity_mean': polarity.apply(
            lambda scores: float(sum(scores)) / len(scores) if len(scores) else 0.0),
        'polarity_count_positive': polarity.apply(
            lambda scores: len([v for v in scores if v > 0])),
        'polarity_count_negative': polarity.apply(
            lambda scores: len([v for v in scores if v < 0])),
    },
    # Pin the column order explicitly; it must match the names assigned to
    # `X.columns` later in the notebook.
    columns=['polarity_sum', 'polarity_mean',
             'polarity_count_positive', 'polarity_count_negative'],
)
In [58]:
# Sanity check: column-wise average of every feature in X.
X.mean(axis=0)
Out[58]:
In [59]:
# Give the four feature columns descriptive names (order matters: sum, mean,
# positive-word count, negative-word count).
X.columns = [
    'polarity_sum',
    'polarity_mean',
    'polarity_count_positive',
    'polarity_count_negative',
]
In [54]:
# Target: the review rating rounded to the nearest integer.
y = data.rating.round().astype(int)
# Keep only the clearly negative (< 2) and clearly positive (>= 4) reviews;
# the ratings in between are left out of this experiment.
select = (y < 2) | (y >= 4)

# Filter once and reuse, so the features and the target are guaranteed to
# describe exactly the same rows everywhere below.
X_selected = X[select.values]
y_selected = y[select]

# Seed the estimator as well as the CV splitter so that re-running the
# notebook reproduces the same predictions.
clf = ensemble.GradientBoostingClassifier(n_estimators=100, random_state=42)
# Alternative models that were tried:
# clf = ensemble.RandomForestClassifier(n_estimators=200, class_weight='balanced')
# clf = linear_model.LogisticRegression(penalty='l2', class_weight='balanced')
# clf = pipeline.make_pipeline(preprocessing.StandardScaler(),
#                              neighbors.KNeighborsClassifier(n_neighbors=50, weights='distance'))

# Out-of-fold predictions from a shuffled, stratified 10-fold split.
cv = model_selection.StratifiedKFold(10, shuffle=True, random_state=42)
y_pred = model_selection.cross_val_predict(clf, X_selected, y_selected,
                                           n_jobs=-1, cv=cv)

print(metrics.classification_report(y_selected, y_pred))
print(metrics.confusion_matrix(y_selected, y_pred))
In [47]:
# Print the cross-validated confusion matrix as a pipe-delimited text table.
#
# `y_pred` only covers the rows kept by `select`, so the ground truth has to be
# filtered the same way; comparing it against the full `y` fails with a
# length mismatch. The row/column labels are taken from the classes that
# actually occur rather than a hard-coded 1..5 range, so the headers always
# match the shape of the matrix.
y_evaluated = y[select]
class_labels = sorted({int(v) for v in y_evaluated} | {int(v) for v in y_pred})
confusion = metrics.confusion_matrix(y_evaluated, y_pred, labels=class_labels)
# The 'orgtbl' format draws '+' at the junctions of its separator row; these
# are swapped for '|' (presumably so the table can be pasted as Markdown).
print(tabulate.tabulate(confusion, headers=class_labels, tablefmt='orgtbl',
                        showindex=class_labels).replace('+', '|'))
In [60]:
# Persist the polarity feature frame to a path relative to the notebook's
# working directory so it can be reloaded without recomputing the polarities.
# NOTE(review): pandas is expected to infer gzip compression from the ".gz"
# suffix — confirm for the installed pandas version.
X.to_pickle('../processed/polarity.pkl.gz')
In [40]:
# Summarise the distribution of the rounded rating classes held in `y`.
pd.Categorical(y).describe()
Out[40]:
In [ ]: