UpX Academy - Machine Learning Track

Decision Trees & Random Forest

Titanic Dataset


In [0]:
import pandas as pd
from sklearn.preprocessing import Imputer  # removed in scikit-learn >= 0.22; see the SimpleImputer note below
from sklearn import tree
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
#train_df = pd.read_csv("titanic.csv")
#test_df = pd.read_csv("titanic_test.csv")

In [27]:
from google.colab import files
import io

uploaded = files.upload()
train_df = pd.read_csv(io.StringIO(uploaded['titanic.csv'].decode('utf-8')))


uploaded = files.upload()
test_df = pd.read_csv(io.StringIO(uploaded['titanic_test.csv'].decode('utf-8')))


Saving titanic.csv to titanic (1).csv
Saving titanic_test.csv to titanic_test (1).csv

In [28]:
train_df.head()


Out[28]:
   PassengerId  Survived  Pclass  Name                                                Sex     Age   SibSp  Parch  Ticket            Fare     Cabin  Embarked
0  1            0         3       Braund, Mr. Owen Harris                             male    22.0  1      0      A/5 21171         7.2500   NaN    S
1  2            1         1       Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0  1      0      PC 17599          71.2833  C85    C
2  3            1         3       Heikkinen, Miss. Laina                              female  26.0  0      0      STON/O2. 3101282  7.9250   NaN    S
3  4            1         1       Futrelle, Mrs. Jacques Heath (Lily May Peel)        female  35.0  1      0      113803            53.1000  C123   S
4  5            0         3       Allen, Mr. William Henry                            male    35.0  0      0      373450            8.0500   NaN    S
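
Before choosing features, it helps to check which columns contain missing values; a quick sketch:

# Count missing values per column (Age, Cabin and Embarked contain NaNs in this dataset)
print(train_df.isnull().sum())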

In [0]:
train_df["Sex"] = train_df["Sex"].apply(lambda sex: 0 if sex == 'male' else 1)

In [5]:
# Target labels and the feature columns used for training
y = train_df["Survived"].values

columns = ["Fare", "Pclass", "Sex", "Age", "SibSp"]
features = train_df[columns].values
features


Out[5]:
array([[ 7.25  ,  3.    ,  0.    , 22.    ,  1.    ],
       [71.2833,  1.    ,  1.    , 38.    ,  1.    ],
       [ 7.925 ,  3.    ,  1.    , 26.    ,  0.    ],
       ...,
       [23.45  ,  3.    ,  1.    ,     nan,  1.    ],
       [30.    ,  1.    ,  0.    , 26.    ,  0.    ],
       [ 7.75  ,  3.    ,  0.    , 32.    ,  0.    ]])

In [6]:
# Fill missing values (NaN) with the column mean
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
X = imp.fit_transform(features)
X


Out[6]:
array([[ 7.25      ,  3.        ,  0.        , 22.        ,  1.        ],
       [71.2833    ,  1.        ,  1.        , 38.        ,  1.        ],
       [ 7.925     ,  3.        ,  1.        , 26.        ,  0.        ],
       ...,
       [23.45      ,  3.        ,  1.        , 29.69911765,  1.        ],
       [30.        ,  1.        ,  0.        , 26.        ,  0.        ],
       [ 7.75      ,  3.        ,  0.        , 32.        ,  0.        ]])
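
Imputer was removed in scikit-learn 0.22. On a modern installation the same mean imputation would use SimpleImputer (a minimal sketch, assuming scikit-learn >= 0.22):

# Modern replacement for the deprecated Imputer
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imp.fit_transform(features)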

In [0]:
my_tree_one = tree.DecisionTreeClassifier(criterion="entropy", max_depth=3)
my_tree_one = my_tree_one.fit(X, y)

In [8]:
# The feature_importances_ attribute makes it simple to interpret the significance of the predictors you include
print(my_tree_one.feature_importances_) 
print(my_tree_one.score(X, y))


[0.12330431 0.18665493 0.5670424  0.09423074 0.02876762]
0.8226711560044894
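
The raw array is easier to read when paired with the column names; a small sketch using the columns list defined above:

# Pair each feature name with its importance score
for name, importance in zip(columns, my_tree_one.feature_importances_):
    print(f"{name}: {importance:.3f}")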

In [0]:
with open("titanic.dot", 'w') as f:
    f = tree.export_graphviz(my_tree_one, out_file=f, feature_names=columns)

In [0]:
files.download('titanic.dot')

Visit http://webgraphviz.com/ to visualize the exported tree.
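
Alternatively, on scikit-learn 0.21 or newer the fitted tree can be drawn directly in the notebook without exporting a .dot file (a minimal sketch):

# Draw the fitted tree inline (requires scikit-learn >= 0.21)
plt.figure(figsize=(16, 8))
tree.plot_tree(my_tree_one, feature_names=columns, class_names=["Died", "Survived"], filled=True)
plt.show()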


In [11]:
test_df.head()


Out[11]:
   PassengerId  Pclass  Name                                          Sex     Age   SibSp  Parch  Ticket   Fare     Cabin  Embarked
0  892          3       Kelly, Mr. James                              male    34.5  0      0      330911   7.8292   NaN    Q
1  893          3       Wilkes, Mrs. James (Ellen Needs)              female  47.0  1      0      363272   7.0000   NaN    S
2  894          2       Myles, Mr. Thomas Francis                     male    62.0  0      0      240276   9.6875   NaN    Q
3  895          3       Wirz, Mr. Albert                              male    27.0  0      0      315154   8.6625   NaN    S
4  896          3       Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female  22.0  1      1      3101298  12.2875  NaN    S

In [12]:
test_df["Sex"] = test_df["Sex"].apply(lambda sex: 0 if sex == 'male' else 1)
#features_test = train_df[list(columns)].values
features_test = test_df[list(columns)].values
imp_test = Imputer(missing_values='NaN', strategy='mean', axis=0)
X_test = imp_test.fit_transform(features_test)
X_test


Out[12]:
array([[ 7.8292    ,  3.        ,  0.        , 34.5       ,  0.        ],
       [ 7.        ,  3.        ,  1.        , 47.        ,  1.        ],
       [ 9.6875    ,  2.        ,  0.        , 62.        ,  0.        ],
       ...,
       [ 7.25      ,  3.        ,  0.        , 38.5       ,  0.        ],
       [ 8.05      ,  3.        ,  0.        , 30.27259036,  0.        ],
       [22.3583    ,  3.        ,  0.        , 30.27259036,  1.        ]])

In [13]:
pred = my_tree_one.predict(X_test)
pred


Out[13]:
array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0])
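
These test predictions are not used further below; to turn them into a Kaggle-style submission, a minimal sketch (the file name submission.csv is just an example) would be:

# Pair each PassengerId with its predicted Survived value and download the CSV
submission = pd.DataFrame({"PassengerId": test_df["PassengerId"], "Survived": pred})
submission.to_csv("submission.csv", index=False)
files.download("submission.csv")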

In [14]:
# Confusion matrix for the depth-3 tree on the training data
pred = my_tree_one.predict(X)
df_confusion = metrics.confusion_matrix(y, pred)
df_confusion


Out[14]:
array([[490,  59],
       [ 99, 243]])

In [15]:
def plot_confusion_matrix(df_confusion, title='Confusion matrix', cmap=plt.cm.gray_r):
    plt.matshow(df_confusion, cmap=cmap)  # imshow would also work
    plt.title(title)
    plt.colorbar()
    plt.ylabel('Actual')
    plt.xlabel('Predicted')

plot_confusion_matrix(df_confusion)
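
Precision and recall can be read off the same predictions with a classification report (a small sketch):

# Precision/recall summary for the depth-3 tree on the training data
print(metrics.classification_report(y, pred))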


What happens when we change the tree depth?


In [16]:
#Setting "max_depth" to 10 and "min_samples_split" to 5 : my_tree_two
max_depth = 10
min_samples_split = 5
my_tree_two = tree.DecisionTreeClassifier(max_depth = 10, min_samples_split = 5, random_state = 1)
my_tree_two = my_tree_two.fit(X, y)

#Print the score of the new decison tree
print(my_tree_two.score(X, y))


0.9012345679012346

In [0]:
pred = my_tree_two.predict(X)

In [18]:
df_confusion = metrics.confusion_matrix(y, pred)
df_confusion


Out[18]:
array([[533,  16],
       [ 72, 270]])

In [19]:
# Reuse the plotting helper defined above
plot_confusion_matrix(df_confusion)
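
Both scores above are measured on the same data the trees were trained on, so the deeper tree's higher score may partly reflect overfitting. A quick check is cross-validation (a minimal sketch, assuming 5 folds):

# Compare the two trees on held-out folds instead of the training data
from sklearn.model_selection import cross_val_score
for name, model in [("depth 3", my_tree_one), ("depth 10", my_tree_two)]:
    scores = cross_val_score(model, X, y, cv=5)
    print(name, scores.mean())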


Try it yourself


In [0]:
# Add a new feature and then retrain the model (a sketch follows below)

train_df["family_size"] = train_df.SibSp + train_df.Parch + 1

Random Forest


In [21]:
from sklearn.ensemble import RandomForestClassifier

# Building and fitting my_forest
forest = RandomForestClassifier(max_depth = 10, min_samples_split=2, n_estimators = 100, random_state = 1)
my_forest = forest.fit(X, y)

# Print the score of the fitted random forest
print(my_forest.score(X, y))


0.9393939393939394
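
As with the trees, this score is computed on the training data. A random forest can also report an out-of-bag estimate, which acts as a built-in validation score (a minimal sketch):

# Out-of-bag accuracy: each tree is evaluated on the samples left out of its bootstrap sample
forest_oob = RandomForestClassifier(max_depth=10, min_samples_split=2, n_estimators=100,
                                    oob_score=True, random_state=1)
forest_oob.fit(X, y)
print(forest_oob.oob_score_)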

In [0]:
pred = my_forest.predict(X)

In [23]:
df_confusion = metrics.confusion_matrix(y, pred)
df_confusion


Out[23]:
array([[538,  11],
       [ 43, 299]])

In [24]:
# ROC curve from the hard 0/1 predictions
fpr, tpr, _ = metrics.roc_curve(y, pred)
roc_auc = metrics.auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([-0.1, 1.2])
plt.ylim([-0.1, 1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()



In [25]:
# ROC curve on predicted probabilities (smoother and more informative than hard 0/1 predictions)
pred_proba = my_forest.predict_proba(X)
fpr, tpr, _ = metrics.roc_curve(y, pred_proba[:, 1])
roc_auc = metrics.auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([-0.1, 1.2])
plt.ylim([-0.1, 1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()