In [14]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns',1000)
from xgboost import XGBClassifier
In [20]:
# Load the training data from 'train.csv' (path is relative to the working directory).
df = pd.read_csv('train.csv')
In [21]:
# Preview the first rows of the freshly loaded frame.
df.head()
Out[21]:
In [4]:
# Seed NumPy's global RNG. train_test_split below is not given a random_state,
# so this seed is what makes the shuffle repeatable from run to run.
np.random.seed(42)
# Baseline experiment: only the Age and Fare columns are used as features.
data=df[['Age','Fare']]
target = df['Survived']
# Hold out 40% of the rows for testing; .values passes plain NumPy arrays to the splitter.
X_train, X_test, y_train, y_test = train_test_split(data.values, target.values, test_size=0.4)
In [5]:
# Fit an XGBoost classifier, constructed with no arguments (library defaults),
# on the training split produced by the previous cell.
clf = XGBClassifier()
clf.fit(X_train,y_train)
Out[5]:
In [6]:
# Class-probability estimates for the held-out rows.
y_preds = clf.predict_proba(X_test)
# Keep only column 1 as the score used for the ROC curve
# (presumably the probability of Survived == 1 -- confirm the label encoding).
preds = y_preds[:,1]
In [7]:
# ROC analysis of the scores in `preds` against the held-out labels.
# fpr = false-positive rate, tpr = true-positive rate; the thresholds are not needed.
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
auc_score = metrics.auc(fpr, tpr)

# Model curve first, with the area under it shown in the legend.
plt.plot(fpr, tpr, label='AUC = %.3f' % auc_score)
# Dashed red diagonal: where a coin-flip classifier would land.
plt.plot([0, 1], [0, 1], 'r--')

plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# Leave a small margin around the unit square so the curve does not sit on the frame.
plt.xlim(-0.1, 1.1)
plt.ylim(-0.1, 1.1)
plt.legend(loc='lower right')
plt.show()
In [8]:
# Pair each feature name with the importance the fitted model assigned to it,
# then rank the pairs from most to least important.
importance_data = list(zip(data.columns, clf.feature_importances_))
importance_data.sort(key=lambda pair: pair[1], reverse=True)

# Split the ranked pairs into the inputs the bar chart needs.
xs = range(len(importance_data))
labels = [pair[0] for pair in importance_data]
ys = [pair[1] for pair in importance_data]
In [10]:
# Bar chart of the ranked importances prepared in the previous cell (xs, ys, labels).
plt.clf()
ax = plt.gca()
ax.bar(xs, ys, width=0.5)
plt.xticks(xs, labels)
plt.ylabel('relative feature importance')

# Turn the grid on, then restyle every grid line on both axes as a thin dotted line.
ax.grid(True)
line_width = 0.7
gridlines = ax.get_xgridlines() + ax.get_ygridlines()
for line in gridlines:
    line.set_linestyle(':')
    line.set_linewidth(line_width)
plt.show()
In [22]:
# Keep only the modelling columns and one-hot encode the categorical ones.
# NOTE: this rebinds `df`, so the cell cannot be re-run without reloading the CSV first.
one_hot_columns = ['Pclass', 'Sex', 'SibSp', 'Embarked']
df = pd.get_dummies(
    df[one_hot_columns + ['Age', 'Fare', 'Survived']],
    # Each listed column is replaced by '<column>_<value>' indicator columns,
    # appended after the untouched columns in the order given here.
    columns=one_hot_columns,
    # Also emit a '<column>_nan' indicator so missing values get their own feature.
    dummy_na=True,
)
In [23]:
# Preview the frame after one-hot encoding to check the generated indicator columns.
df.head()
Out[23]:
In [24]:
# Re-seed NumPy's global RNG. train_test_split below is not given a random_state,
# so this seed is what makes the shuffle repeatable from run to run.
np.random.seed(42)
# Features are every column of the encoded frame except the Survived label.
data=df.drop(['Survived'],axis=1)
target = df['Survived']
# Hold out 40% of the rows for testing; .values passes plain NumPy arrays to the splitter.
X_train, X_test, y_train, y_test = train_test_split(data.values, target.values, test_size=0.4)
In [26]:
# --- Train on the one-hot encoded feature set -------------------------------
clf = XGBClassifier()
clf.fit(X_train, y_train)

# --- Score the held-out rows; column 1 of predict_proba is used as the score --
y_preds = clf.predict_proba(X_test)
preds = y_preds[:, 1]

# --- ROC curve ---------------------------------------------------------------
# fpr = false-positive rate, tpr = true-positive rate; the thresholds are not needed.
fpr, tpr, _ = metrics.roc_curve(y_test, preds)
auc_score = metrics.auc(fpr, tpr)

# Model curve first, with the area under it shown in the legend.
plt.plot(fpr, tpr, label='AUC = %.3f' % auc_score)
# Dashed red diagonal: where a coin-flip classifier would land.
plt.plot([0, 1], [0, 1], 'r--')

plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# Leave a small margin around the unit square so the curve does not sit on the frame.
plt.xlim(-0.1, 1.1)
plt.ylim(-0.1, 1.1)
plt.legend(loc='lower right')
plt.show()
In [27]:
# Pair each (one-hot encoded) feature name with the importance the fitted model
# assigned to it, then rank the pairs from most to least important.
importance_data = list(zip(data.columns, clf.feature_importances_))
importance_data.sort(key=lambda pair: pair[1], reverse=True)

# Split the ranked pairs into the inputs the bar chart needs.
xs = range(len(importance_data))
labels = [pair[0] for pair in importance_data]
ys = [pair[1] for pair in importance_data]
In [35]:
# Bar chart of the ranked importances prepared in the previous cell (xs, ys, labels).
plt.clf()
ax = plt.gca()
ax.bar(xs, ys, width=0.5)
# Tick labels are rotated and right-aligned.
plt.xticks(xs, labels, rotation=45, ha='right')
plt.ylabel('relative feature importance')

# Turn the grid on, then restyle every grid line on both axes as a thin dotted line.
ax.grid(True)
line_width = 0.7
gridlines = ax.get_xgridlines() + ax.get_ygridlines()
for line in gridlines:
    line.set_linestyle(':')
    line.set_linewidth(line_width)
plt.show()
In [38]:
In [40]:
# Collapse the importances of the one-hot indicator columns back onto the
# categorical feature they were generated from, so that e.g. every 'Sex_*'
# column is reported as a single 'Sex' entry.
#
# The mapping is rebuilt here from the fitted model on every run: no visible
# cell defines `importances_dict`, and rebuilding it keeps this cell correct
# under Restart & Run All and when it is executed more than once.
# NOTE(review): assumes the dict is meant to hold the same name/importance
# pairing used for the previous chart (data.columns vs clf.feature_importances_).
importances_dict = dict(zip(data.columns, clf.feature_importances_))

categorical_columns_names = ['Pclass','Sex','SibSp','Embarked']
for column_name in categorical_columns_names:
    # Indicator columns are named '<column>_<value>'; matching on the trailing
    # underscore avoids swallowing an unrelated feature that merely shares the prefix.
    dummy_keys = [key for key in importances_dict if key.startswith(column_name + '_')]
    # Remove the individual indicators and store their combined importance
    # under the original column name (0 if no indicator column matched).
    importances_dict[column_name] = sum(importances_dict.pop(key) for key in dummy_keys)
In [43]:
# Rank the aggregated (per original feature) importances from most to least important.
importance_data = list(importances_dict.items())
importance_data.sort(key=lambda pair: pair[1], reverse=True)

# Split the ranked pairs into the inputs the bar chart needs.
xs = range(len(importance_data))
labels = [pair[0] for pair in importance_data]
ys = [pair[1] for pair in importance_data]
In [45]:
# Bar chart of the aggregated importances prepared in the previous cell (xs, ys, labels).
plt.clf()
ax = plt.gca()
ax.bar(xs, ys, width=0.5)
# Tick labels are rotated and centred under their bars.
plt.xticks(xs, labels, rotation=45, ha='center')
plt.ylabel('relative feature importance')

# Turn the grid on, then restyle every grid line on both axes as a thin dotted line.
ax.grid(True)
line_width = 0.7
gridlines = ax.get_xgridlines() + ax.get_ygridlines()
for line in gridlines:
    line.set_linestyle(':')
    line.set_linewidth(line_width)
plt.show()