UpX Academy - Machine Learning Track

Decision Trees & Random Forest

Titanic Dataset


In [0]:
import pandas as pd
from sklearn.preprocessing import Imputer  # removed in scikit-learn >= 0.22; see the SimpleImputer note below
from sklearn import tree
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
#train_df = pd.read_csv("titanic.csv")
#test_df = pd.read_csv("titanic_test.csv")

In [27]:
from google.colab import files
import io

uploaded = files.upload()
train_df = pd.read_csv(io.StringIO(uploaded['titanic.csv'].decode('utf-8')))


uploaded = files.upload()
test_df = pd.read_csv(io.StringIO(uploaded['titanic_test.csv'].decode('utf-8')))


Saving titanic.csv to titanic (1).csv
Saving titanic_test.csv to titanic_test (1).csv

In [28]:
train_df.head()


Out[28]:
   PassengerId  Survived  Pclass  Name                                                Sex     Age   SibSp  Parch  Ticket            Fare     Cabin  Embarked
0  1            0         3       Braund, Mr. Owen Harris                             male    22.0  1      0      A/5 21171         7.2500   NaN    S
1  2            1         1       Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0  1      0      PC 17599          71.2833  C85    C
2  3            1         3       Heikkinen, Miss. Laina                              female  26.0  0      0      STON/O2. 3101282  7.9250   NaN    S
3  4            1         1       Futrelle, Mrs. Jacques Heath (Lily May Peel)        female  35.0  1      0      113803            53.1000  C123   S
4  5            0         3       Allen, Mr. William Henry                            male    35.0  0      0      373450            8.0500   NaN    S
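
Before choosing features, it helps to check which columns contain missing values; a quick sketch:

# Count missing values per column (Age, Cabin and Embarked contain NaNs in this dataset)
print(train_df.isnull().sum())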

In [0]:
train_df["Sex"] = train_df["Sex"].apply(lambda sex: 0 if sex == 'male' else 1)

In [5]:
# Target labels and the feature columns used for training
y = train_df["Survived"].values

columns = ["Fare", "Pclass", "Sex", "Age", "SibSp"]
features = train_df[columns].values
features


Out[5]:
array([[ 7.25  ,  3.    ,  0.    , 22.    ,  1.    ],
       [71.2833,  1.    ,  1.    , 38.    ,  1.    ],
       [ 7.925 ,  3.    ,  1.    , 26.    ,  0.    ],
       ...,
       [23.45  ,  3.    ,  1.    ,     nan,  1.    ],
       [30.    ,  1.    ,  0.    , 26.    ,  0.    ],
       [ 7.75  ,  3.    ,  0.    , 32.    ,  0.    ]])

In [6]:
# Fill missing values (NaN) with the column mean
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
X = imp.fit_transform(features)
X


Out[6]:
array([[ 7.25      ,  3.        ,  0.        , 22.        ,  1.        ],
       [71.2833    ,  1.        ,  1.        , 38.        ,  1.        ],
       [ 7.925     ,  3.        ,  1.        , 26.        ,  0.        ],
       ...,
       [23.45      ,  3.        ,  1.        , 29.69911765,  1.        ],
       [30.        ,  1.        ,  0.        , 26.        ,  0.        ],
       [ 7.75      ,  3.        ,  0.        , 32.        ,  0.        ]])
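
Imputer was removed in scikit-learn 0.22. On a modern installation the same mean imputation would use SimpleImputer (a minimal sketch, assuming scikit-learn >= 0.22):

# Modern replacement for the deprecated Imputer
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imp.fit_transform(features)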

In [0]:
my_tree_one = tree.DecisionTreeClassifier(criterion="entropy", max_depth=3)
my_tree_one = my_tree_one.fit(X, y)

In [8]:
# The feature_importances_ attribute makes it simple to interpret the significance of the predictors you include
print(my_tree_one.feature_importances_) 
print(my_tree_one.score(X, y))


[0.12330431 0.18665493 0.5670424  0.09423074 0.02876762]
0.8226711560044894
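
The raw array is easier to read when paired with the column names; a small sketch using the columns list defined above:

# Pair each feature name with its importance score
for name, importance in zip(columns, my_tree_one.feature_importances_):
    print(f"{name}: {importance:.3f}")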

In [0]:
with open("titanic.dot", 'w') as f:
    f = tree.export_graphviz(my_tree_one, out_file=f, feature_names=columns)

In [0]:
files.download('titanic.dot')

Visit http://webgraphviz.com/ to visualize the exported tree.
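
Alternatively, on scikit-learn 0.21 or newer the fitted tree can be drawn directly in the notebook without exporting a .dot file (a minimal sketch):

# Draw the fitted tree inline (requires scikit-learn >= 0.21)
plt.figure(figsize=(16, 8))
tree.plot_tree(my_tree_one, feature_names=columns, class_names=["Died", "Survived"], filled=True)
plt.show()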


In [11]:
test_df.head()


Out[11]:
   PassengerId  Pclass  Name                                          Sex     Age   SibSp  Parch  Ticket   Fare     Cabin  Embarked
0  892          3       Kelly, Mr. James                              male    34.5  0      0      330911   7.8292   NaN    Q
1  893          3       Wilkes, Mrs. James (Ellen Needs)              female  47.0  1      0      363272   7.0000   NaN    S
2  894          2       Myles, Mr. Thomas Francis                     male    62.0  0      0      240276   9.6875   NaN    Q
3  895          3       Wirz, Mr. Albert                              male    27.0  0      0      315154   8.6625   NaN    S
4  896          3       Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female  22.0  1      1      3101298  12.2875  NaN    S

In [12]:
test_df["Sex"] = test_df["Sex"].apply(lambda sex: 0 if sex == 'male' else 1)
#features_test = train_df[list(columns)].values
features_test = test_df[list(columns)].values
imp_test = Imputer(missing_values='NaN', strategy='mean', axis=0)
X_test = imp_test.fit_transform(features_test)
X_test


Out[12]:
array([[ 7.8292    ,  3.        ,  0.        , 34.5       ,  0.        ],
       [ 7.        ,  3.        ,  1.        , 47.        ,  1.        ],
       [ 9.6875    ,  2.        ,  0.        , 62.        ,  0.        ],
       ...,
       [ 7.25      ,  3.        ,  0.        , 38.5       ,  0.        ],
       [ 8.05      ,  3.        ,  0.        , 30.27259036,  0.        ],
       [22.3583    ,  3.        ,  0.        , 30.27259036,  1.        ]])

In [13]:
pred = my_tree_one.predict(X_test)
pred


Out[13]:
array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0])
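
These test predictions are not used further below; to turn them into a Kaggle-style submission, a minimal sketch (the file name submission.csv is just an example) would be:

# Pair each PassengerId with its predicted Survived value and download the CSV
submission = pd.DataFrame({"PassengerId": test_df["PassengerId"], "Survived": pred})
submission.to_csv("submission.csv", index=False)
files.download("submission.csv")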

In [14]:
# Confusion matrix for the depth-3 tree on the training data
pred = my_tree_one.predict(X)
df_confusion = metrics.confusion_matrix(y, pred)
df_confusion


Out[14]:
array([[490,  59],
       [ 99, 243]])

In [15]:
def plot_confusion_matrix(df_confusion, title='Confusion matrix', cmap=plt.cm.gray_r):
    plt.matshow(df_confusion, cmap=cmap)  # imshow would also work
    plt.title(title)
    plt.colorbar()
    plt.ylabel('Actual')
    plt.xlabel('Predicted')

plot_confusion_matrix(df_confusion)
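
Precision and recall can be read off the same predictions with a classification report (a small sketch):

# Precision/recall summary for the depth-3 tree on the training data
print(metrics.classification_report(y, pred))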


What happens when we change the tree depth?


In [16]:
#Setting "max_depth" to 10 and "min_samples_split" to 5 : my_tree_two
max_depth = 10
min_samples_split = 5
my_tree_two = tree.DecisionTreeClassifier(max_depth = 10, min_samples_split = 5, random_state = 1)
my_tree_two = my_tree_two.fit(X, y)

#Print the score of the new decison tree
print(my_tree_two.score(X, y))


0.9012345679012346

In [0]:
pred = my_tree_two.predict(X)

In [18]:
df_confusion = metrics.confusion_matrix(y, pred)
df_confusion


Out[18]:
array([[533,  16],
       [ 72, 270]])

In [19]:
# Reuse the plotting helper defined above
plot_confusion_matrix(df_confusion)
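
Both scores above are measured on the same data the trees were trained on, so the deeper tree's higher score may partly reflect overfitting. A quick check is cross-validation (a minimal sketch, assuming 5 folds):

# Compare the two trees on held-out folds instead of the training data
from sklearn.model_selection import cross_val_score
for name, model in [("depth 3", my_tree_one), ("depth 10", my_tree_two)]:
    scores = cross_val_score(model, X, y, cv=5)
    print(name, scores.mean())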


Try it yourself


In [0]:
# Add a new feature and then retrain the model (a sketch follows below)

train_df["family_size"] = train_df.SibSp + train_df.Parch + 1

Random Forest


In [21]:
from sklearn.ensemble import RandomForestClassifier

# Building and fitting my_forest
forest = RandomForestClassifier(max_depth = 10, min_samples_split=2, n_estimators = 100, random_state = 1)
my_forest = forest.fit(X, y)

# Print the score of the fitted random forest
print(my_forest.score(X, y))


0.9393939393939394
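
As with the trees, this score is computed on the training data. A random forest can also report an out-of-bag estimate, which acts as a built-in validation score (a minimal sketch):

# Out-of-bag accuracy: each tree is evaluated on the samples left out of its bootstrap sample
forest_oob = RandomForestClassifier(max_depth=10, min_samples_split=2, n_estimators=100,
                                    oob_score=True, random_state=1)
forest_oob.fit(X, y)
print(forest_oob.oob_score_)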

In [0]:
pred = my_forest.predict(X)

In [23]:
df_confusion = metrics.confusion_matrix(y, pred)
df_confusion


Out[23]:
array([[538,  11],
       [ 43, 299]])

In [24]:
# ROC curve from the hard 0/1 predictions
fpr, tpr, _ = metrics.roc_curve(y, pred)
roc_auc = metrics.auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([-0.1, 1.2])
plt.ylim([-0.1, 1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()



In [25]:
# ROC curve on predicted probabilities (smoother and more informative than hard 0/1 predictions)
pred_proba = my_forest.predict_proba(X)
fpr, tpr, _ = metrics.roc_curve(y, pred_proba[:, 1])
roc_auc = metrics.auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([-0.1, 1.2])
plt.ylim([-0.1, 1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()