In [2]:
%pylab inline
In [36]:
import pandas as pd
from polyglot.text import Text, Word
import polyglot
import tabulate
from sklearn import model_selection, ensemble, metrics, linear_model, neighbors, pipeline, preprocessing, feature_extraction, svm
import utils  # local helper module providing the WordNormalizer / NonAlphaRemover transformers used below
In [10]:
data = pd.read_csv('~/cloud/data/mvideo/X_train.csv')
data.columns = ['product_id', 'category_level1', 'category_level2', 'brand', 'property', 'user_name', 'rating', 'date',
'review', 'negative', 'positive']
data['date'] = pd.to_datetime(data.date)
In [24]:
# Clean the raw review text: strip non-alphabetic characters, then normalize the words
data['review'] = utils.WordNormalizer(None).fit_transform(utils.NonAlphaRemover(None).fit_transform(data.review))
In [25]:
# Wrap each token as a polyglot Word so its pretrained Russian embedding is accessible
words = data['review'].apply(lambda x: [Word(w, language='ru') for w in x.split()])
In [26]:
def embed_words(words):
    """Map each polyglot Word to its embedding vector, zero-filling out-of-vocabulary words."""
    out = []
    for word in words:
        try:
            out.append(word.vector)
        except KeyError:
            # OOV word: fall back to a zero vector matching the embedding dimensionality
            out.append(np.zeros(256))
    return out
In [27]:
embedding = words.apply(embed_words)
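A quick sanity check, as a sketch on top of the cell above: estimate how many tokens fell back to the zero vector, i.e. had no polyglot embedding.
In [ ]:
# Sketch: rough out-of-vocabulary rate, the share of tokens that got the zero-vector fallback
n_total = sum(len(vecs) for vecs in embedding)
n_oov = sum(sum(1 for v in vecs if not v.any()) for vecs in embedding)
print('OOV token share: %.3f' % (n_oov / max(n_total, 1)))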
In [28]:
# Sum-pool the word vectors of each review into one fixed-length feature vector
embedding_df = embedding.apply(lambda x: np.sum(x, 0)).apply(pd.Series).fillna(0)
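Sum-pooling lets long reviews dominate by vector norm; a mean-pooled variant, sketched here as an untried alternative, keeps features comparable across review lengths.
In [ ]:
# Sketch (alternative, not what was run above): mean-pool so review length does not scale the features
embedding_mean_df = embedding.apply(lambda x: np.mean(x, 0) if len(x) else np.zeros(256)).apply(pd.Series).fillna(0)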
In [29]:
X = embedding_df
In [40]:
# Round fractional ratings to integer class labels
y = data.rating.round().astype(int)
# clf = svm.LinearSVC()
# clf = ensemble.GradientBoostingClassifier(n_estimators=100)
clf = ensemble.RandomForestClassifier(n_estimators=100, class_weight='balanced')
# clf = linear_model.LogisticRegression(penalty='l1', class_weight='balanced')
# clf = pipeline.make_pipeline(preprocessing.StandardScaler(),
# neighbors.KNeighborsClassifier(n_neighbors=50, weights='distance'))
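class_weight='balanced' reweights classes inversely to their frequency; a quick look at the label distribution, sketched below, shows how skewed the ratings are.
In [ ]:
# Sketch: the class balance that motivates class_weight='balanced'
print(y.value_counts(normalize=True).sort_index())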
In [41]:
res = model_selection.cross_validate(clf, X, y,
                                     cv=model_selection.StratifiedKFold(10, shuffle=True, random_state=42),
                                     n_jobs=-1, scoring=['f1_micro', 'f1_macro', 'f1_weighted'],
                                     return_train_score=False)
In [42]:
for k, v in res.items():
    print([k, v.mean()])
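tabulate is imported above but never used; a sketch that renders the same fold-averaged scores as a readable table:
In [ ]:
# Sketch: the same cross-validation scores, tabulated with mean and standard deviation per metric
print(tabulate.tabulate([(k, v.mean(), v.std()) for k, v in res.items()],
                        headers=['metric', 'mean', 'std']))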