In [1]:
import numpy as np
import pandas as pd
%pylab inline
In [2]:
# Both files are comma-separated and carry their column names in the first row.
read_options = dict(sep=',', header=0)
train_data = pd.read_csv('./data/evo_train.csv', **read_options)
test_data = pd.read_csv('./data/evo_test.csv', **read_options)
In [3]:
# (rows, columns) of the train table, then of the test table.
print(train_data.shape)
print(test_data.shape)
In [4]:
# Preview the first five training rows.
train_data.head(n=5)
Out[4]:
In [5]:
# Number of training rows per GROUP_ID value, most frequent first.
train_data['GROUP_ID'].value_counts()
Out[5]:
Посмотрим самые частые и самые редкие категории.
In [6]:
# First ten NAME values among the rows whose GROUP_ID equals 6.
train_data.loc[train_data['GROUP_ID'] == 6, 'NAME'].head(10)
Out[6]:
По-хорошему, перед обучением названия нужно предобработать: удалить спецсимволы и лемматизировать.
In [7]:
# Special-character removal
import re

# One or more characters that are NOT a Latin letter, a Cyrillic letter, a digit
# or an underscore. 'ё'/'Ё' are listed explicitly because they lie outside the
# contiguous 'а-я'/'А-Я' code-point ranges and would otherwise be stripped.
# Compiled once here instead of on every call.
_SPECIAL_CHARS = re.compile(u'[^a-zA-Zа-яА-ЯёЁ0-9_]+')


def remove_special_character(somestring):
    """Replace every run of special characters with a single space.

    Parameters
    ----------
    somestring : unicode text
        The text to clean.

    Returns
    -------
    The text with each run of characters other than Latin/Cyrillic letters,
    digits and underscores collapsed into one space, and with leading and
    trailing whitespace removed.
    """
    return _SPECIAL_CHARS.sub(' ', somestring).strip()
def _decode_utf8(value):
    """Return `value` decoded from UTF-8 if it is a byte string, else unchanged."""
    return value.decode('utf-8') if isinstance(value, bytes) else value


# Decode only raw byte strings, then strip special characters. Values that are
# already text pass through the decode step untouched, so this cell can be
# re-run without trying to decode the same column twice.
for frame in (train_data, test_data):
    frame['NAME'] = frame['NAME'].map(_decode_utf8).apply(remove_special_character)
In [8]:
# Lemmatization
from pymystem3 import Mystem

# A single analyzer instance shared by every lemmatize() call.
m = Mystem()


def lemmatize(text):
    """Lemmatize `text` with the shared Mystem instance.

    The pieces returned by `m.lemmatize` are concatenated without any
    separator and the result is stripped of leading/trailing whitespace.
    """
    pieces = m.lemmatize(text)
    joined = ''.join(pieces)
    return joined.strip()
# Replace NAME with its lemmatized form in both tables (train first, then test).
for frame in (train_data, test_data):
    frame['NAME'] = frame['NAME'].apply(lemmatize)
In [9]:
# First ten NAME values after the preprocessing above.
train_data['NAME'].iloc[:10]
Out[9]:
In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
In [14]:
# Sweep the vocabulary size (100, 200, ..., 900) and report, for each value,
# the mean 2-fold cross-validated accuracy of a 200-tree random forest trained
# on the bag-of-words counts.
# NOTE(review): the vectorizer is fit on the whole training set before the CV
# split, so vocabulary selection sees the validation folds; wrapping both steps
# in a Pipeline would avoid that.
max_features = range(100, 1000, 100)
for mf in max_features:
    cv = CountVectorizer(max_features=mf)
    cv_train_data = cv.fit_transform(train_data['NAME'])
    rf = RandomForestClassifier(n_estimators=200, random_state=11)
    scores = cross_val_score(
        rf,
        cv_train_data,
        train_data['GROUP_ID'],
        cv=2,
        scoring='accuracy'
    )
    # Single pre-formatted strings keep the printed text the same as the
    # comma-separated print statement (label, one extra space, value).
    print('Max features:  %s' % mf)
    print('Score:  %s' % scores.mean())
In [11]:
# Trial solution: bag-of-words over the 1200 most frequent terms feeding a
# 400-tree random forest.
# NOTE(review): 1200 lies outside the 100..900 range explored in the sweep cell.
cv = CountVectorizer(max_features=1200)
# The vocabulary is learned from the training names only; the test names are
# projected onto that same vocabulary.
cv_train_data = cv.fit_transform(train_data['NAME'])
cv_test_data = cv.transform(test_data['NAME'])

rf = RandomForestClassifier(n_estimators=400, random_state=11)
rf.fit(X=cv_train_data, y=train_data['GROUP_ID'])
Out[11]:
In [13]:
# Predict a GROUP_ID for every test row with the fitted forest and store the
# predictions as a new column of the test table.
predict_rf = rf.predict(X=cv_test_data)
test_data['GROUP_ID'] = predict_rf
In [14]:
# Preview the first five test rows, now including the predicted GROUP_ID.
test_data.head(n=5)
Out[14]:
In [19]:
# Write the two result columns to a comma-separated file with a header row and
# without the DataFrame index.
test_data.to_csv(
    'res_3105.csv',
    columns=['id', 'GROUP_ID'],
    sep=',',
    header=True,
    index=False
)
In [16]:
from sklearn.naive_bayes import MultinomialNB
In [17]:
# 3-fold cross-validated accuracy of a default multinomial naive Bayes model on
# the bag-of-words counts.
# NOTE(review): `cv_train_data` is whatever matrix was built last in the kernel
# (the sweep cell or the trial-solution cell) — confirm which vocabulary size
# this score refers to.
mnb = MultinomialNB()
scores = cross_val_score(
    mnb,
    cv_train_data,
    train_data['GROUP_ID'],
    cv=3,
    scoring='accuracy'
)
In [18]:
# Per-fold accuracies from the cross-validation above.
print(scores)
In [ ]: