In [1]:
import numpy as np
import pandas as pd
%pylab inline
In [2]:
# Load both splits; the files are comma-separated and their first row
# holds the column names.
train_data = pd.read_csv(
    './data/evo_train.csv',
    header=0,
    sep=',',
)
test_data = pd.read_csv(
    './data/evo_test.csv',
    header=0,
    sep=',',
)
In [3]:
# Report the shape of each split.  The single-argument parenthesised form
# prints exactly the same text as the bare print statement.
print(train_data.shape)
print(test_data.shape)
In [4]:
# Preview the first rows of the training frame.
train_data.head()
Out[4]:
In [5]:
# Count how many training rows carry each GROUP_ID value.
train_data.GROUP_ID.value_counts()
Out[5]:
Посмотрим самые частые и самые редкие категории.
In [6]:
# First 10 NAME values of the training rows whose GROUP_ID equals 6,
# selected with one row-mask / column-label lookup.
train_data.loc[train_data.GROUP_ID == 6, 'NAME'].head(10)
Out[6]:
По-хорошему, перед обучением названия нужно предобработать: удалить спецсимволы и лемматизировать.
In [7]:
# Special-character removal.
import re

# Matches every run of characters that are NOT an ASCII letter, a Cyrillic
# letter, a digit or an underscore.  The letter "yo" (U+0451 / U+0401) is
# listed explicitly because it lies outside the contiguous а-я / А-Я
# code-point ranges and would otherwise be treated as a special character.
_SPECIAL_CHARS_RE = re.compile(u'[^a-zA-Zа-яА-ЯёЁ0-9_]+')


def remove_special_character(somestring):
    """Replace each run of special characters in `somestring` with one space.

    Latin and Cyrillic letters, digits and underscores are kept; every
    maximal run of any other characters (punctuation, whitespace, symbols)
    collapses into a single space.  Leading and trailing whitespace is
    stripped from the result.

    Parameters:
        somestring: the text to clean.

    Returns:
        The cleaned text; an empty string if nothing but special
        characters was present.
    """
    return _SPECIAL_CHARS_RE.sub(' ', somestring).strip()
def _ensure_unicode(value):
    """Decode a UTF-8 byte string to text; return any other value untouched.

    Decoding only real byte strings keeps this cell re-runnable: values that
    are already text (a second run of the cell, or a reader that returned
    decoded text in the first place) pass through unchanged instead of being
    decoded a second time.
    """
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


# Bring NAME to text, then strip special characters, for both splits.
train_data['NAME'] = train_data['NAME'].apply(_ensure_unicode).apply(remove_special_character)
test_data['NAME'] = test_data['NAME'].apply(_ensure_unicode).apply(remove_special_character)
In [8]:
# лемматизация
from pymystem3 import Mystem
# One shared analyzer instance, reused by every lemmatize() call.
m = Mystem()


def lemmatize(text):
    """Return `text` rebuilt from the pieces produced by ``m.lemmatize``.

    The pieces are concatenated without any separator and the result is
    stripped of leading and trailing whitespace.
    """
    return ''.join(m.lemmatize(text)).strip()
# Lemmatize the NAME column of both splits in place.
for frame in (train_data, test_data):
    frame['NAME'] = frame['NAME'].apply(lemmatize)
In [11]:
# Preview the first 10 rows of the test frame.
test_data.head(10)
Out[11]:
In [13]:
# Persist both frames as ';'-separated UTF-8 CSV files with a header row
# and without the index column.
train_data.to_csv(
    './data/lem_train_data.csv',
    sep=';',
    encoding='utf-8',
    index=False,
    header=True,
)
test_data.to_csv(
    './data/lem_test_data.csv',
    sep=';',
    encoding='utf-8',
    index=False,
    header=True,
)
In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
In [14]:
from sklearn.pipeline import make_pipeline

# Compare TF-IDF vocabulary sizes (4000, 5000, 6000) by 2-fold
# cross-validated accuracy of a 200-tree random forest.
#
# The vectorizer is part of the cross-validated pipeline, so its vocabulary
# and IDF weights are re-fitted on the training part of every fold.  Fitting
# it once on all rows beforehand would let the held-out rows influence the
# very setting (vocabulary size) that is being compared here.
max_features = range(4000, 7000, 1000)
for mf in max_features:
    model = make_pipeline(
        TfidfVectorizer(max_features=mf),
        RandomForestClassifier(n_estimators=200, random_state=11),
    )
    scores = cross_val_score(model, train_data['NAME'], train_data['GROUP_ID'], cv=2, scoring='accuracy')
    # '%s %s' reproduces the "label<space>value" layout of the two-item print
    # statement and is valid on both Python 2 and Python 3.
    print('%s %s' % ('Max features: ', mf))
    print('%s %s' % ('Score: ', scores.mean()))
In [15]:
# Trial solution: fit the TF-IDF vectorizer (max_features=8000) on the
# training names only, reuse the fitted vectorizer to transform the test
# names, then train a 400-tree random forest on the training matrix.
cv = TfidfVectorizer(max_features=8000)
cv_train_data = cv.fit_transform(train_data['NAME'])
cv_test_data = cv.transform(test_data['NAME'])
rf = RandomForestClassifier(n_estimators=400, random_state=11)
# Last expression of the cell, so its return value is what the cell displays.
rf.fit(cv_train_data, train_data['GROUP_ID'])
Out[15]:
In [16]:
# Predict a GROUP_ID for every test row with the fitted forest and store
# the predictions as a new GROUP_ID column on the test frame.
predict_rf = rf.predict(cv_test_data)
test_data['GROUP_ID'] = predict_rf
In [17]:
# Preview the first rows of the test frame.
test_data.head()
Out[17]:
In [18]:
# Write only the id and GROUP_ID columns to the result file: comma-separated,
# with a header row and without the index column.
test_data[['id', 'GROUP_ID']].to_csv('res2_rf_3105.csv', sep=',', header=True, index=False)
In [19]:
from sklearn.naive_bayes import MultinomialNB
In [20]:
from sklearn.pipeline import make_pipeline

# 3-fold cross-validated accuracy of a multinomial naive Bayes classifier on
# TF-IDF features (max_features=8000, the same setting as the trial
# solution's vectorizer).
#
# The vectorizer is placed inside the cross-validated pipeline and fed the
# NAME text directly, so it is re-fitted on the training part of each fold
# rather than on all rows, and the held-out rows do not shape the features
# they are later scored on.
mnb = MultinomialNB()
mnb_pipeline = make_pipeline(TfidfVectorizer(max_features=8000), mnb)
scores = cross_val_score(mnb_pipeline, train_data['NAME'], train_data['GROUP_ID'], cv=3, scoring='accuracy')
In [21]:
# Show the per-fold scores; the single-argument parenthesised form prints
# the same text as the bare print statement.
print(scores)
In [ ]: