In [14]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns',1000)

from xgboost import XGBClassifier

In [20]:
df = pd.read_csv('train.csv')

In [21]:
df.head()


Out[21]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

In [4]:
np.random.seed(42)

data=df[['Age','Fare']]
target = df['Survived']

X_train, X_test, y_train, y_test = train_test_split(data.values, target.values, test_size=0.4)
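
A side note: Survived is imbalanced (only about 38% positives), so a stratified split keeps that ratio identical in the train and test sets. This is optional and not used below; a sketch:

In [ ]:
# optional alternative: stratify on the target to preserve the class balance
X_train, X_test, y_train, y_test = train_test_split(
    data.values, target.values, test_size=0.4, stratify=target.values)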

In [5]:
clf = XGBClassifier()
clf.fit(X_train,y_train)


Out[5]:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [6]:
y_preds = clf.predict_proba(X_test)
preds = y_preds[:,1]
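
Quick sanity check (optional): predict_proba returns one column per class in the order given by clf.classes_, so for a 0/1 target column index 1 is the probability of Survived == 1.

In [ ]:
# the class order behind predict_proba's columns
print(clf.classes_)  # expected: [0 1]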

In [7]:
# fpr means false-positive-rate
# tpr means true-positive-rate
fpr, tpr, _ = metrics.roc_curve(y_test, preds)

auc_score = metrics.auc(fpr, tpr)

plt.title('ROC Curve')
plt.plot(fpr, tpr, label='AUC = {:.3f}'.format(auc_score))

# it's helpful to add a diagonal to indicate where chance 
# scores lie (i.e. just flipping a coin)
plt.plot([0,1],[0,1],'r--')

plt.xlim([-0.1,1.1])
plt.ylim([-0.1,1.1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')

plt.legend(loc='lower right')
plt.show()
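
As an aside, sklearn can also compute the AUC in one step from the labels and scores; this shorthand should agree with auc_score above.

In [ ]:
# equivalent shortcut for the area under the ROC curve
print(metrics.roc_auc_score(y_test, preds))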



In [8]:
importance_data = sorted(list(zip(data.columns,clf.feature_importances_)),key=lambda tpl:tpl[1],reverse=True)

xs = range(len(importance_data))
labels = [x for (x,_) in importance_data]

ys = [y for (_,y) in importance_data]

In [10]:
plt.clf()
plt.bar(xs,ys,width=0.5)
plt.xticks(xs,labels)

plt.gca().grid(True)
# grab the gridlines on both the x- and y-axes
gridlines = plt.gca().get_xgridlines() + plt.gca().get_ygridlines()
# choose line width
line_width = 0.7

plt.ylabel('relative feature importance')

for line in gridlines:
    line.set_linestyle(':')
    line.set_linewidth(line_width)
plt.show()
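
xgboost also provides a built-in importance plot. It works off the booster directly and may rank features by a different importance type than feature_importances_; also, because we trained on a plain NumPy array, features show up as f0, f1, ... rather than by name. A minimal sketch:

In [ ]:
from xgboost import plot_importance

# built-in importance plot; feature names default to f0, f1, ...
plot_importance(clf)
plt.show()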


All variables, including categorical ones (one-hot encoded below)


In [22]:
df = df[['Pclass','Sex','SibSp','Embarked','Age','Fare','Survived']]

df = pd.concat([df,pd.get_dummies(df['Pclass'], prefix='Pclass',dummy_na=True)],axis=1).drop(['Pclass'],axis=1)
df = pd.concat([df,pd.get_dummies(df['Sex'], prefix='Sex',dummy_na=True)],axis=1).drop(['Sex'],axis=1)
df = pd.concat([df,pd.get_dummies(df['SibSp'], prefix='SibSp',dummy_na=True)],axis=1).drop(['SibSp'],axis=1)
df = pd.concat([df,pd.get_dummies(df['Embarked'], prefix='Embarked',dummy_na=True)],axis=1).drop(['Embarked'],axis=1)
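
The four concat/drop lines above can be collapsed into a single call, since get_dummies accepts a columns argument and leaves the remaining columns untouched. An equivalent sketch:

In [ ]:
# one-step encoding of the listed categorical columns; Age, Fare and
# Survived pass through unchanged, prefixes default to the column names
df = pd.get_dummies(df, columns=['Pclass','Sex','SibSp','Embarked'], dummy_na=True)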

In [23]:
df.head()


Out[23]:
Age Fare Survived Pclass_1.0 Pclass_2.0 Pclass_3.0 Pclass_nan Sex_female Sex_male Sex_nan SibSp_0.0 SibSp_1.0 SibSp_2.0 SibSp_3.0 SibSp_4.0 SibSp_5.0 SibSp_8.0 SibSp_nan Embarked_C Embarked_Q Embarked_S Embarked_nan
0 22.0 7.2500 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0
1 38.0 71.2833 1 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0
2 26.0 7.9250 1 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0
3 35.0 53.1000 1 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0
4 35.0 8.0500 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0

In [24]:
np.random.seed(42)

data=df.drop(['Survived'],axis=1)
target = df['Survived']

X_train, X_test, y_train, y_test = train_test_split(data.values, target.values, test_size=0.4)

In [26]:
clf = XGBClassifier()
clf.fit(X_train,y_train)

y_preds = clf.predict_proba(X_test)
preds = y_preds[:,1]

# fpr means false-positive-rate
# tpr means true-positive-rate
fpr, tpr, _ = metrics.roc_curve(y_test, preds)

auc_score = metrics.auc(fpr, tpr)

plt.title('ROC Curve')
plt.plot(fpr, tpr, label='AUC = {:.3f}'.format(auc_score))

# it's helpful to add a diagonal to indicate where chance 
# scores lie (i.e. just flipping a coin)
plt.plot([0,1],[0,1],'r--')

plt.xlim([-0.1,1.1])
plt.ylim([-0.1,1.1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')

plt.legend(loc='lower right')
plt.show()



In [27]:
importance_data = sorted(list(zip(data.columns,clf.feature_importances_)),key=lambda tpl:tpl[1],reverse=True)

xs = range(len(importance_data))
labels = [x for (x,_) in importance_data]

ys = [y for (_,y) in importance_data]

In [35]:
plt.clf()
plt.bar(xs,ys,width=0.5)
plt.xticks(xs,labels,rotation=45,ha='right')

plt.gca().grid(True)
# grab the gridlines on both the x- and y-axes
gridlines = plt.gca().get_xgridlines() + plt.gca().get_ygridlines()
# choose line width
line_width = 0.7

plt.ylabel('relative feature importance')

for line in gridlines:
    line.set_linestyle(':')
    line.set_linewidth(line_width)
plt.show()



In [38]:
# collect the per-column importances into a dict so the dummy
# columns can be summed back into their source features below
importances_dict = dict(importance_data)

In [40]:
categorical_columns_names = ['Pclass','Sex','SibSp','Embarked']

for column_name in categorical_columns_names:
    
    all_values_sum = 0
    
    # iterate over a copy of the items so the dict can be modified in the loop
    for key, value in list(importances_dict.items()):
        if key.startswith(column_name):
            all_values_sum += value
            del importances_dict[key]
            
    importances_dict[column_name] = all_values_sum
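
The same grouping can also be done in one pass with a defaultdict, assuming the dummy columns keep the 'Feature_value' naming produced by get_dummies above; a sketch:

In [ ]:
from collections import defaultdict

# sum each dummy column's importance back onto its source feature;
# Age and Fare have no dummies and pass through under their own names
grouped = defaultdict(float)
for name, score in zip(data.columns, clf.feature_importances_):
    source = next((c for c in categorical_columns_names if name.startswith(c + '_')), name)
    grouped[source] += score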

In [43]:
importance_data = sorted(list(importances_dict.items()),key=lambda tpl:tpl[1],reverse=True)

xs = range(len(importance_data))
labels = [x for (x,_) in importance_data]

ys = [y for (_,y) in importance_data]

In [45]:
plt.clf()
plt.bar(xs,ys,width=0.5)
plt.xticks(xs,labels,rotation=45,ha='center')

plt.gca().grid(True)
# grab the gridlines on both the x- and y-axes
gridlines = plt.gca().get_xgridlines() + plt.gca().get_ygridlines()
# choose line width
line_width = 0.7

plt.ylabel('relative feature importance')

for line in gridlines:
    line.set_linestyle(':')
    line.set_linewidth(line_width)
plt.show()