In [14]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns',1000)

from xgboost import XGBClassifier

In [20]:
df = pd.read_csv('train.csv')

In [21]:
df.head()


Out[21]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

In [4]:
np.random.seed(42)

data=df[['Age','Fare']]
target = df['Survived']

X_train, X_test, y_train, y_test = train_test_split(data.values, target.values, test_size=0.4)
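
A side note: Survived is imbalanced (only about 38% positives), so a stratified split keeps that ratio identical in the train and test sets. This is optional and not used below; a sketch:

In [ ]:
# optional alternative: stratify on the target to preserve the class balance
X_train, X_test, y_train, y_test = train_test_split(
    data.values, target.values, test_size=0.4, stratify=target.values)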

In [5]:
clf = XGBClassifier()
clf.fit(X_train,y_train)


Out[5]:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [6]:
y_preds = clf.predict_proba(X_test)
preds = y_preds[:,1]
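
Quick sanity check (optional): predict_proba returns one column per class in the order given by clf.classes_, so for a 0/1 target column index 1 is the probability of Survived == 1.

In [ ]:
# the class order behind predict_proba's columns
print(clf.classes_)  # expected: [0 1]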

In [7]:
# fpr means false-positive-rate
# tpr means true-positive-rate
fpr, tpr, _ = metrics.roc_curve(y_test, preds)

auc_score = metrics.auc(fpr, tpr)

plt.title('ROC Curve')
plt.plot(fpr, tpr, label='AUC = {:.3f}'.format(auc_score))

# it's helpful to add a diagonal to indicate where chance 
# scores lie (i.e. just flipping a coin)
plt.plot([0,1],[0,1],'r--')

plt.xlim([-0.1,1.1])
plt.ylim([-0.1,1.1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')

plt.legend(loc='lower right')
plt.show()
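
As an aside, sklearn can also compute the AUC in one step from the labels and scores; this shorthand should agree with auc_score above.

In [ ]:
# equivalent shortcut for the area under the ROC curve
print(metrics.roc_auc_score(y_test, preds))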



In [8]:
importance_data = sorted(list(zip(data.columns,clf.feature_importances_)),key=lambda tpl:tpl[1],reverse=True)

xs = range(len(importance_data))
labels = [x for (x,_) in importance_data]

ys = [y for (_,y) in importance_data]

In [10]:
plt.clf()
plt.bar(xs,ys,width=0.5)
plt.xticks(xs,labels)

plt.gca().grid(True)
# grab the gridlines on both the x- and y-axes
gridlines = plt.gca().get_xgridlines() + plt.gca().get_ygridlines()
# choose line width
line_width = 0.7

plt.ylabel('relative feature importance')

for line in gridlines:
    line.set_linestyle(':')
    line.set_linewidth(line_width)
plt.show()
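
xgboost also provides a built-in importance plot. It works off the booster directly and may rank features by a different importance type than feature_importances_; also, because we trained on a plain NumPy array, features show up as f0, f1, ... rather than by name. A minimal sketch:

In [ ]:
from xgboost import plot_importance

# built-in importance plot; feature names default to f0, f1, ...
plot_importance(clf)
plt.show()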


All variables, including categorical ones (one-hot encoded below)


In [22]:
df = df[['Pclass','Sex','SibSp','Embarked','Age','Fare','Survived']]

df = pd.concat([df,pd.get_dummies(df['Pclass'], prefix='Pclass',dummy_na=True)],axis=1).drop(['Pclass'],axis=1)
df = pd.concat([df,pd.get_dummies(df['Sex'], prefix='Sex',dummy_na=True)],axis=1).drop(['Sex'],axis=1)
df = pd.concat([df,pd.get_dummies(df['SibSp'], prefix='SibSp',dummy_na=True)],axis=1).drop(['SibSp'],axis=1)
df = pd.concat([df,pd.get_dummies(df['Embarked'], prefix='Embarked',dummy_na=True)],axis=1).drop(['Embarked'],axis=1)
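
The four concat/drop lines above can be collapsed into a single call, since get_dummies accepts a columns argument and leaves the remaining columns untouched. An equivalent sketch:

In [ ]:
# one-step encoding of the listed categorical columns; Age, Fare and
# Survived pass through unchanged, prefixes default to the column names
df = pd.get_dummies(df, columns=['Pclass','Sex','SibSp','Embarked'], dummy_na=True)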

In [23]:
df.head()


Out[23]:
Age Fare Survived Pclass_1.0 Pclass_2.0 Pclass_3.0 Pclass_nan Sex_female Sex_male Sex_nan SibSp_0.0 SibSp_1.0 SibSp_2.0 SibSp_3.0 SibSp_4.0 SibSp_5.0 SibSp_8.0 SibSp_nan Embarked_C Embarked_Q Embarked_S Embarked_nan
0 22.0 7.2500 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0
1 38.0 71.2833 1 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0
2 26.0 7.9250 1 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0
3 35.0 53.1000 1 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0
4 35.0 8.0500 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0

In [24]:
np.random.seed(42)

data=df.drop(['Survived'],axis=1)
target = df['Survived']

X_train, X_test, y_train, y_test = train_test_split(data.values, target.values, test_size=0.4)

In [26]:
clf = XGBClassifier()
clf.fit(X_train,y_train)

y_preds = clf.predict_proba(X_test)
preds = y_preds[:,1]

# fpr means false-positive-rate
# tpr means true-positive-rate
fpr, tpr, _ = metrics.roc_curve(y_test, preds)

auc_score = metrics.auc(fpr, tpr)

plt.title('ROC Curve')
plt.plot(fpr, tpr, label='AUC = {:.3f}'.format(auc_score))

# it's helpful to add a diagonal to indicate where chance 
# scores lie (i.e. just flipping a coin)
plt.plot([0,1],[0,1],'r--')

plt.xlim([-0.1,1.1])
plt.ylim([-0.1,1.1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')

plt.legend(loc='lower right')
plt.show()



In [27]:
importance_data = sorted(list(zip(data.columns,clf.feature_importances_)),key=lambda tpl:tpl[1],reverse=True)

xs = range(len(importance_data))
labels = [x for (x,_) in importance_data]

ys = [y for (_,y) in importance_data]

In [35]:
plt.clf()
plt.bar(xs,ys,width=0.5)
plt.xticks(xs,labels,rotation=45,ha='right')

plt.gca().grid(True)
# grab the gridlines on both the x- and y-axes
gridlines = plt.gca().get_xgridlines() + plt.gca().get_ygridlines()
# choose line width
line_width = 0.7

plt.ylabel('relative feature importance')

for line in gridlines:
    line.set_linestyle(':')
    line.set_linewidth(line_width)
plt.show()



In [38]:
# collect the per-column importances into a dict so the dummy
# columns can be summed back into their source features below
importances_dict = dict(importance_data)

In [40]:
categorical_columns_names = ['Pclass','Sex','SibSp','Embarked']

for column_name in categorical_columns_names:
    
    all_values_sum = 0
    
    # iterate over a copy of the items so the dict can be modified in the loop
    for key, value in list(importances_dict.items()):
        if key.startswith(column_name):
            all_values_sum += value
            del importances_dict[key]
            
    importances_dict[column_name] = all_values_sum
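
The same grouping can also be done in one pass with a defaultdict, assuming the dummy columns keep the 'Feature_value' naming produced by get_dummies above; a sketch:

In [ ]:
from collections import defaultdict

# sum each dummy column's importance back onto its source feature;
# Age and Fare have no dummies and pass through under their own names
grouped = defaultdict(float)
for name, score in zip(data.columns, clf.feature_importances_):
    source = next((c for c in categorical_columns_names if name.startswith(c + '_')), name)
    grouped[source] += score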

In [43]:
importance_data = sorted(list(importances_dict.items()),key=lambda tpl:tpl[1],reverse=True)

xs = range(len(importance_data))
labels = [x for (x,_) in importance_data]

ys = [y for (_,y) in importance_data]

In [45]:
plt.clf()
plt.bar(xs,ys,width=0.5)
plt.xticks(xs,labels,rotation=45,ha='center')

plt.gca().grid(True)
# grab the gridlines on both the x- and y-axes
gridlines = plt.gca().get_xgridlines() + plt.gca().get_ygridlines()
# choose line width
line_width = 0.7

plt.ylabel('relative feature importance')

for line in gridlines:
    line.set_linestyle(':')
    line.set_linewidth(line_width)
plt.show()