In [ ]:
import numpy as np
import pandas as pd
import xgboost as xgb
# NOTE: sklearn.cross_validation and sklearn.learning_curve were deprecated in
# scikit-learn 0.18 and removed in 0.20; model_selection is their replacement.
from sklearn import cross_validation, ensemble, learning_curve, metrics, model_selection
In [ ]:
%pylab inline
Задача на kaggle: https://www.kaggle.com/c/bioresponse
Данные: https://www.kaggle.com/c/bioresponse/data
По данным характеристикам молекулы требуется определить, будет ли дан биологический ответ (biological response).
Признаки нормализованы.
Для демонстрации используется обучающая выборка из исходных данных train.csv, файл с данными прилагается.
In [ ]:
# Read the comma-separated data file; row 0 of the file supplies the column names.
bioresponce = pd.read_csv(
    'bioresponse.csv',
    sep=',',
    header=0,
)
In [ ]:
# Preview the first rows of the loaded frame as the cell output.
bioresponce.head()
In [ ]:
# Target vector: the raw values of the `Activity` column.
bioresponce_target = bioresponce['Activity'].values
In [ ]:
# Feature matrix: all rows, every column except the first one.
bioresponce_data = bioresponce.iloc[:, slice(1, None)]
In [ ]:
# Ensemble sizes to evaluate: a single tree, then 10 through 50 in steps of 5.
# range() is wrapped in list() because on Python 3 it returns a lazy range
# object, and `list + range` raises TypeError there.
n_trees = [1] + list(range(10, 55, 5))
In [ ]:
%%time
scoring = []
for n_tree in n_trees:
estimator = ensemble.RandomForestClassifier(n_estimators = n_tree, min_samples_split=5, random_state=1)
score = cross_validation.cross_val_score(estimator, bioresponce_data, bioresponce_target,
scoring = 'accuracy', cv = 3)
scoring.append(score)
scoring = np.asmatrix(scoring)
In [ ]:
# Display the matrix of per-fold accuracy scores collected for the random forest.
scoring
In [ ]:
# Accuracy averaged over the CV folds, plotted against the number of trees.
pylab.title('Accuracy score')
pylab.xlabel('n_trees')
pylab.ylabel('score')
pylab.plot(n_trees, scoring.mean(axis=1), label='RandomForest', marker='.')
pylab.grid(True)
pylab.legend(loc='lower right')
In [ ]:
%%time
xgb_scoring = []
for n_tree in n_trees:
estimator = xgb.XGBClassifier(learning_rate=0.1, max_depth=5, n_estimators=n_tree, min_child_weight=3)
score = cross_validation.cross_val_score(estimator, bioresponce_data, bioresponce_target,
scoring = 'accuracy', cv = 3)
xgb_scoring.append(score)
xgb_scoring = np.asmatrix(xgb_scoring)
In [ ]:
# Display the matrix of per-fold accuracy scores collected for XGBoost.
xgb_scoring
In [ ]:
# Both models on one set of axes: fold-averaged accuracy against the number of trees.
pylab.title('Accuracy score')
pylab.xlabel('n_trees')
pylab.ylabel('score')
pylab.plot(n_trees, scoring.mean(axis=1), label='RandomForest', marker='.')
pylab.plot(n_trees, xgb_scoring.mean(axis=1), label='XGBoost', marker='.')
pylab.grid(True)
pylab.legend(loc='lower right')
Python API: http://xgboost.readthedocs.io/en/latest/python/python_api.html
установка: http://xgboost.readthedocs.io/en/latest/build.html