In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn import ensemble
from sklearn.model_selection import train_test_split
# Imputer was deprecated in scikit-learn 0.20 and removed in 0.22;
# SimpleImputer (in sklearn.impute) is its replacement.
from sklearn.impute import SimpleImputer
In [32]:
# Load the raw passenger data; PassengerId becomes the frame index.
# NOTE(review): path is relative — assumes the notebook is run from the
# project root with a data/titanic/ directory present.
df = pd.read_csv('data/titanic/train.csv',index_col='PassengerId')
In [34]:
# One-hot encode the categorical columns. pd.get_dummies(df, columns=...)
# replaces each listed column with its dummy columns in a single call,
# avoiding the concat/drop(inplace=True) round-trips of the manual approach
# (column names produced are identical: Pclass_1..3, Sex_female, Sex_male).
df = pd.get_dummies(df, columns=['Pclass', 'Sex'])

# The modelling cells below use train_df / test_df, but the saved notebook
# never defined them (train_test_split was imported but unused) — so a
# fresh Restart-&-Run-All failed. Split here, with a fixed seed so the
# split is reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
In [20]:
# Assemble the training design matrix and fill missing values (Age and Fare
# contain NaNs) with the per-column mean, fit on the training data only.
# SimpleImputer replaces sklearn.preprocessing.Imputer, which was removed
# in scikit-learn 0.22; the old string sentinel 'NaN' and the axis argument
# are gone — missing values are np.nan and imputation is per-column.
from sklearn.impute import SimpleImputer

X_train = train_df[['Pclass_1','Pclass_2','Pclass_3', 'Sex_female','Sex_male','Age','Fare']].values
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train = imp.fit_transform(X_train)
y_train = train_df[['Survived']].values
In [21]:
# Build the held-out design matrix with the same feature columns as the
# training set, reusing the already-fitted imputer (no refitting — test
# rows are filled with the training-set means).
feature_cols = ['Pclass_1','Pclass_2','Pclass_3', 'Sex_female','Sex_male','Age','Fare']
X_test = imp.transform(test_df[feature_cols].values)
y_test = test_df[['Survived']].values
In [22]:
# Fit a gradient-boosted tree classifier. random_state is pinned so the
# fitted model — and every downstream number (AUC, feature importances) —
# is reproducible across kernel restarts; the default (None) makes each
# run stochastic. ravel() flattens the (n, 1) label column to the 1-D
# array sklearn expects.
clf = ensemble.GradientBoostingClassifier(random_state=42)
clf.fit(X_train, y_train.ravel())
Out[22]:
In [23]:
# Score the held-out set and plot the ROC curve with its AUC.
y_preds = clf.predict_proba(X_test)
# take the second column because the classifier outputs scores for
# the 0 class as well
preds = y_preds[:,1]
# fpr means false-positive-rate
# tpr means true-positive-rate
# y_test is an (n, 1) column vector (it came from test_df[['Survived']]);
# roc_curve expects 1-D labels, so ravel() explicitly — otherwise sklearn
# has to coerce it and emits a DataConversionWarning.
fpr, tpr, _ = metrics.roc_curve(y_test.ravel(), preds)
auc_score = metrics.auc(fpr, tpr)
plt.title('ROC Curve')
plt.plot(fpr, tpr, label='AUC = {:.2f}'.format(auc_score))
# it's helpful to add a diagonal to indicate where chance
# scores lie (i.e. just flipping a coin)
plt.plot([0,1],[0,1],'r--')
# pad the axes slightly so the curve's endpoints at (0,0)/(1,1) are visible
plt.xlim([-0.1,1.1])
plt.ylim([-0.1,1.1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc='lower right')
plt.show()
In [16]:
# Raw importance scores, in the same order as the feature columns the
# model was trained on (see the column list used to build X_train).
clf.feature_importances_
Out[16]:
In [17]:
# Pair each importance score with its feature name and rank the features
# by how much the boosted trees relied on them.
feature_names = ['Pclass_1','Pclass_2','Pclass_3', 'Sex_female','Sex_male','Age','Fare']
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': clf.feature_importances_,
})
importance_df.sort_values("importance", ascending=False)
Out[17]: