notebook.community

Edit and run



In [116]:

    
# This line configures matplotlib to show figures embedded in the notebook, 
# instead of opening a new window for each figure. 
%matplotlib inline



In [117]:

    
# Use the pandas library to read in the csv file
import pandas as pd

# Read the passenger records from a CSV file in the working directory into a
# pandas dataframe and assign it to the titanic variable
titanic = pd.read_csv("titanic_data.csv")



In [118]:

    
# Preview the first 5 rows to see which columns exist and what the values look like
titanic.head(5)









    Out[118]:






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      0
      1
      0
      3
      Braund, Mr. Owen Harris
      male
      22
      1
      0
      A/5 21171
      7.2500
      NaN
      S
    
    
      1
      2
      1
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38
      1
      0
      PC 17599
      71.2833
      C85
      C
    
    
      2
      3
      1
      3
      Heikkinen, Miss. Laina
      female
      26
      0
      0
      STON/O2. 3101282
      7.9250
      NaN
      S
    
    
      3
      4
      1
      1
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      female
      35
      1
      0
      113803
      53.1000
      C123
      S
    
    
      4
      5
      0
      3
      Allen, Mr. William Henry
      male
      35
      0
      0
      373450
      8.0500
      NaN
      S



In [119]:

    
# Dimensions of the dataframe as a (rows, columns) tuple
titanic.shape









    Out[119]:





(891, 12)



In [120]:

    
# Show the data type pandas inferred for the class label (Survived) and for each feature
titanic.dtypes









    Out[120]:





PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object



In [121]:

    
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
titanic.describe()









    Out[121]:






  
    
      
      PassengerId
      Survived
      Pclass
      Age
      SibSp
      Parch
      Fare
    
  
  
    
      count
      891.000000
      891.000000
      891.000000
      714.000000
      891.000000
      891.000000
      891.000000
    
    
      mean
      446.000000
      0.383838
      2.308642
      29.699118
      0.523008
      0.381594
      32.204208
    
    
      std
      257.353842
      0.486592
      0.836071
      14.526497
      1.102743
      0.806057
      49.693429
    
    
      min
      1.000000
      0.000000
      1.000000
      0.420000
      0.000000
      0.000000
      0.000000
    
    
      25%
      223.500000
      0.000000
      2.000000
      20.125000
      0.000000
      0.000000
      7.910400
    
    
      50%
      446.000000
      0.000000
      3.000000
      28.000000
      0.000000
      0.000000
      14.454200
    
    
      75%
      668.500000
      1.000000
      3.000000
      38.000000
      1.000000
      0.000000
      31.000000
    
    
      max
      891.000000
      1.000000
      3.000000
      80.000000
      8.000000
      6.000000
      512.329200



In [122]:

    
# Plot the distribution of passenger ages as a 10-bin histogram.
# Import the pyplot submodule directly: a bare `import matplotlib` does not
# guarantee that `matplotlib.pyplot` has been loaded, and binding the top-level
# package to `plt` conflicts with the conventional meaning of that name.
import matplotlib.pyplot as plt

# Age contains missing values; drop them so only recorded ages are binned
ages = titanic['Age'].dropna()

fig, axis = plt.subplots()
axis.hist(ages, bins=10, range=(ages.min(), ages.max()))
axis.set_title('Age Distribution')
axis.set_xlabel('Age')
axis.set_ylabel('# Passengers')
plt.show()



In [123]:

    
# Group passengers by Sex and count the non-null entries in every other column.
# The first columns give the number of females and males; columns with missing
# values (e.g. Age, Cabin) show smaller counts.
titanic.groupby('Sex').count()









    Out[123]:






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
    
      Sex
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      female
      314
      314
      314
      314
      261
      314
      314
      314
      314
      97
      312
    
    
      male
      577
      577
      577
      577
      453
      577
      577
      577
      577
      107
      577



In [124]:

    
# Summary statistics for each numeric column, computed separately for each Sex group
titanic.groupby('Sex').describe()









    Out[124]:






  
    
      
      
      Age
      Fare
      Parch
      PassengerId
      Pclass
      SibSp
      Survived
    
    
      Sex
      
      
      
      
      
      
      
      
    
  
  
    
      female
      count
      261.000000
      314.000000
      314.000000
      314.000000
      314.000000
      314.000000
      314.000000
    
    
      mean
      27.915709
      44.479818
      0.649682
      431.028662
      2.159236
      0.694268
      0.742038
    
    
      std
      14.110146
      57.997698
      1.022846
      256.846324
      0.857290
      1.156520
      0.438211
    
    
      min
      0.750000
      6.750000
      0.000000
      2.000000
      1.000000
      0.000000
      0.000000
    
    
      25%
      18.000000
      12.071875
      0.000000
      231.750000
      1.000000
      0.000000
      0.000000
    
    
      50%
      27.000000
      23.000000
      0.000000
      414.500000
      2.000000
      0.000000
      1.000000
    
    
      75%
      37.000000
      55.000000
      1.000000
      641.250000
      3.000000
      1.000000
      1.000000
    
    
      max
      63.000000
      512.329200
      6.000000
      889.000000
      3.000000
      8.000000
      1.000000
    
    
      male
      count
      453.000000
      577.000000
      577.000000
      577.000000
      577.000000
      577.000000
      577.000000
    
    
      mean
      30.726645
      25.523893
      0.235702
      454.147314
      2.389948
      0.429809
      0.188908
    
    
      std
      14.678201
      43.138263
      0.612294
      257.486139
      0.813580
      1.061811
      0.391775
    
    
      min
      0.420000
      0.000000
      0.000000
      1.000000
      1.000000
      0.000000
      0.000000
    
    
      25%
      21.000000
      7.895800
      0.000000
      222.000000
      2.000000
      0.000000
      0.000000
    
    
      50%
      29.000000
      10.500000
      0.000000
      464.000000
      3.000000
      0.000000
      0.000000
    
    
      75%
      39.000000
      26.550000
      0.000000
      680.000000
      3.000000
      0.000000
      0.000000
    
    
      max
      80.000000
      512.329200
      5.000000
      891.000000
      3.000000
      8.000000
      1.000000



In [125]:

    
# List the distinct values that occur in the Sex column
titanic['Sex'].unique()









    Out[125]:





array(['male', 'female'], dtype=object)



In [126]:

    
# Cross-tabulate Sex against survival (Survived cast to True/False) and draw
# the counts as stacked bars, one bar per sex
crosstab = pd.crosstab(
    index=titanic['Sex'],
    columns=titanic['Survived'].astype(bool),
)
crosstab.plot(kind='bar', stacked=True, grid=False, color=['red', 'green'])









    Out[126]:





<matplotlib.axes._subplots.AxesSubplot at 0x117522fd0>



In [127]:

    
# Start here with post 2 code



In [128]:

    
import matplotlib.pyplot as plt

# Set display to a more appealing style. The pandas option `display.mpl_style`
# was deprecated and later removed, so assigning to it raises an error on
# current pandas; select the style through matplotlib instead.
plt.style.use('ggplot')

# Plot the numeric variables using a box plot chart
titanic.boxplot(column=['Age', 'SibSp', 'Parch', 'Fare'])









    Out[128]:





{'boxes': [<matplotlib.lines.Line2D at 0x118025090>,
  <matplotlib.lines.Line2D at 0x11803ab10>,
  <matplotlib.lines.Line2D at 0x118145750>,
  <matplotlib.lines.Line2D at 0x118167710>],
 'caps': [<matplotlib.lines.Line2D at 0x1180311d0>,
  <matplotlib.lines.Line2D at 0x118031810>,
  <matplotlib.lines.Line2D at 0x11812edd0>,
  <matplotlib.lines.Line2D at 0x118138450>,
  <matplotlib.lines.Line2D at 0x118150a10>,
  <matplotlib.lines.Line2D at 0x11815b090>,
  <matplotlib.lines.Line2D at 0x118173650>,
  <matplotlib.lines.Line2D at 0x118173c90>],
 'fliers': [<matplotlib.lines.Line2D at 0x11803a4d0>,
  <matplotlib.lines.Line2D at 0x118145110>,
  <matplotlib.lines.Line2D at 0x11815bd10>,
  <matplotlib.lines.Line2D at 0x11817e950>],
 'means': [],
 'medians': [<matplotlib.lines.Line2D at 0x118031e50>,
  <matplotlib.lines.Line2D at 0x118138a90>,
  <matplotlib.lines.Line2D at 0x11815b6d0>,
  <matplotlib.lines.Line2D at 0x11817e310>],
 'whiskers': [<matplotlib.lines.Line2D at 0x118025490>,
  <matplotlib.lines.Line2D at 0x118025b50>,
  <matplotlib.lines.Line2D at 0x11812e150>,
  <matplotlib.lines.Line2D at 0x11812e790>,
  <matplotlib.lines.Line2D at 0x118145d50>,
  <matplotlib.lines.Line2D at 0x1181503d0>,
  <matplotlib.lines.Line2D at 0x118167990>,
  <matplotlib.lines.Line2D at 0x118167fd0>]}



In [129]:

    
# Draw one histogram per listed numeric column, laid out as a grid on a single figure
titanic.hist(column=['Age', 'SibSp', 'Parch', 'Fare'], figsize=[10,10])









    Out[129]:





array([[<matplotlib.axes._subplots.AxesSubplot object at 0x118193390>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1182d5210>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x11844de10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1184bd5d0>]], dtype=object)



In [130]:

    
# Inspect the passengers whose fare exceeds 300, showing only a few identifying columns
titanic[titanic['Fare'] > 300][['PassengerId', 'Name', 'Sex', 'Fare']]









    Out[130]:






  
    
      
      PassengerId
      Name
      Sex
      Fare
    
  
  
    
      258
      259
      Ward, Miss. Anna
      female
      512.3292
    
    
      679
      680
      Cardeza, Mr. Thomas Drake Martinez
      male
      512.3292
    
    
      737
      738
      Lesurer, Mr. Gustave J
      male
      512.3292



In [131]:

    
# Column-by-column overview of non-null counts and dtypes; shows which columns have missing values
titanic.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 90.5+ KB



In [132]:

    
# Look at the first 10 Age values (note the NaN for a passenger with no recorded age)
titanic['Age'][0:10]









    Out[132]:





0    22
1    38
2    26
3    35
4    35
5   NaN
6    54
7     2
8    27
9    14
Name: Age, dtype: float64



In [133]:

    
# Mean of the Age column
titanic['Age'].mean()









    Out[133]:





29.69911764705882



In [134]:

    
# Median of the Age column; this is the value used below to fill in missing ages
titanic['Age'].median()









    Out[134]:





28.0



In [135]:

    
# Create a new column in the data frame, initialised with the values of the
# existing Age column, so missing ages can be filled in without altering Age itself
titanic['AgeBackFill'] = titanic['Age']



In [136]:

    
# Check that the values were copied over by viewing Age and AgeBackFill side by side
titanic[['Sex', 'Age', 'AgeBackFill']].head(10)









    Out[136]:






  
    
      
      Sex
      Age
      AgeBackFill
    
  
  
    
      0
      male
      22
      22
    
    
      1
      female
      38
      38
    
    
      2
      female
      26
      26
    
    
      3
      female
      35
      35
    
    
      4
      male
      35
      35
    
    
      5
      male
      NaN
      NaN
    
    
      6
      male
      54
      54
    
    
      7
      male
      2
      2
    
    
      8
      female
      27
      27
    
    
      9
      female
      14
      14



In [137]:

    
# Wherever Age is missing, write the median of the Age column into AgeBackFill
age_is_missing = titanic['Age'].isnull()
median_age = titanic['Age'].median()
titanic.loc[age_is_missing, 'AgeBackFill'] = median_age



In [138]:

    
# Check that the missing ages were filled: Age stays NaN while AgeBackFill holds the median
titanic[['Sex', 'Age', 'AgeBackFill']].head(10)









    Out[138]:






  
    
      
      Sex
      Age
      AgeBackFill
    
  
  
    
      0
      male
      22
      22
    
    
      1
      female
      38
      38
    
    
      2
      female
      26
      26
    
    
      3
      female
      35
      35
    
    
      4
      male
      35
      35
    
    
      5
      male
      NaN
      28
    
    
      6
      male
      54
      54
    
    
      7
      male
      2
      2
    
    
      8
      female
      27
      27
    
    
      9
      female
      14
      14



In [139]:

    
# Encode Sex as an integer column: female -> 0, male -> 1
sex_to_code = {'female': 0, 'male': 1}
titanic['Gender'] = titanic['Sex'].map(sex_to_code).astype(int)



In [140]:

    
# Check the transformation by viewing the original Sex values next to the encoded Gender values
titanic[['Sex', 'Gender']].head(10)



In [141]:

    
# Count passengers per Embarked value; rows with a missing Embarked value are not counted in any group
titanic[['PassengerId', 'Embarked']].groupby('Embarked').count()









    Out[141]:






  
    
      
      PassengerId
    
    
      Embarked
      
    
  
  
    
      C
      168
    
    
      Q
      77
    
    
      S
      644



In [142]:

    
# Build EmbarkedBackFill from Embarked, replacing missing values with 'S'
titanic['EmbarkedBackFill'] = titanic['Embarked'].fillna('S')

# Confirm the fill worked: show the rows where the original Embarked is missing
embarked_is_missing = titanic['Embarked'].isnull()
titanic.loc[embarked_is_missing, ['Embarked', 'EmbarkedBackFill']]









    Out[142]:






  
    
      
      Embarked
      EmbarkedBackFill
    
  
  
    
      61
      NaN
      S
    
    
      829
      NaN
      S



In [143]:

    
# Replace each port letter with an integer code (S -> 0, C -> 1, Q -> 2),
# then store the column with an integer dtype
for port_letter, port_code in (('S', 0), ('C', 1), ('Q', 2)):
    is_port = titanic['EmbarkedBackFill'] == port_letter
    titanic.loc[is_port, 'EmbarkedBackFill'] = port_code
titanic['EmbarkedBackFill'] = titanic['EmbarkedBackFill'].astype(int)



In [144]:

    
# FamilySize: the SibSp and Parch counts added together
titanic['FamilySize'] = titanic['SibSp'].add(titanic['Parch'])

# Age*Class: the filled-in age multiplied by the passenger class
titanic['Age*Class'] = titanic['AgeBackFill'].mul(titanic['Pclass'])



In [145]:

    
# Re-check the frame: the new columns should be present and fully populated
titanic.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 17 columns):
PassengerId         891 non-null int64
Survived            891 non-null int64
Pclass              891 non-null int64
Name                891 non-null object
Sex                 891 non-null object
Age                 714 non-null float64
SibSp               891 non-null int64
Parch               891 non-null int64
Ticket              891 non-null object
Fare                891 non-null float64
Cabin               204 non-null object
Embarked            889 non-null object
AgeBackFill         891 non-null float64
Gender              891 non-null int64
EmbarkedBackFill    891 non-null int64
FamilySize          891 non-null int64
Age*Class           891 non-null float64
dtypes: float64(4), int64(8), object(5)
memory usage: 125.3+ KB



In [146]:

    
# Create a new data frame without the columns we will not use
unused_columns = ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Age']
titanic_clean = titanic.drop(unused_columns, axis=1)



In [147]:

    
# Confirm the cleaned frame holds only numeric columns with no missing values
titanic_clean.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 11 columns):
PassengerId         891 non-null int64
Survived            891 non-null int64
Pclass              891 non-null int64
SibSp               891 non-null int64
Parch               891 non-null int64
Fare                891 non-null float64
AgeBackFill         891 non-null float64
Gender              891 non-null int64
EmbarkedBackFill    891 non-null int64
FamilySize          891 non-null int64
Age*Class           891 non-null float64
dtypes: float64(3), int64(8)
memory usage: 83.5 KB



In [148]:

    
# Extract the cleaned frame's values as an array and display it
train_data = titanic_clean.values
train_data









    Out[148]:





array([[   1.,    0.,    3., ...,    0.,    1.,   66.],
       [   2.,    1.,    1., ...,    1.,    1.,   38.],
       [   3.,    1.,    3., ...,    0.,    0.,   78.],
       ..., 
       [ 889.,    0.,    3., ...,    0.,    3.,   84.],
       [ 890.,    1.,    1., ...,    1.,    0.,   26.],
       [ 891.,    0.,    3., ...,    2.,    0.,   96.]])



In [149]:

    
# Overlay semi-transparent histograms of AgeBackFill, one per Survived group
titanic_clean.groupby('Survived').AgeBackFill.hist(alpha=0.4)









    Out[149]:





Survived
0    Axes(0.125,0.125;0.775x0.775)
1    Axes(0.125,0.125;0.775x0.775)
Name: AgeBackFill, dtype: object



In [150]:

    
# Overlay semi-transparent histograms of Fare, one per Survived group
titanic_clean.groupby('Survived').Fare.hist(alpha=0.4)









    Out[150]:





Survived
0    Axes(0.125,0.125;0.775x0.775)
1    Axes(0.125,0.125;0.775x0.775)
Name: Fare, dtype: object



In [151]:

    
# Overlay semi-transparent histograms of FamilySize, one per Survived group
titanic_clean.groupby('Survived').FamilySize.hist(alpha=0.4)









    Out[151]:





Survived
0    Axes(0.125,0.125;0.775x0.775)
1    Axes(0.125,0.125;0.775x0.775)
Name: FamilySize, dtype: object



In [152]:

    
# scatter_matrix is exposed from pandas.plotting; the older pandas.tools.plotting
# import path no longer exists in current pandas releases.
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of the target and the selected features, with a
# kernel density estimate of each variable on the diagonal
scatter_columns = ['Survived', 'Pclass', 'AgeBackFill', 'Gender', 'EmbarkedBackFill', 'FamilySize', 'Fare']
scatter_matrix(titanic_clean[scatter_columns], alpha=0.2, figsize=(15, 15), diagonal='kde')









    Out[152]:





array([[<matplotlib.axes._subplots.AxesSubplot object at 0x118f66250>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11710dcd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1170e8910>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x116625610>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1160da750>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x115756e10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1156a7390>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x11534f750>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1150b5ad0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1150cae50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11199e9d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10ecd99d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10e2021d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11505b110>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1161c8190>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11926be90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11648af10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x116439c50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1153c7ad0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x116220810>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11545f790>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1191b65d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x106123250>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10ea293d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10eaac110>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x10ec0e850>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11717f690>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11930f3d0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1193932d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119409fd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119477fd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1194fad10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119561c90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1195e2ad0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119584690>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1196d58d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11987f610>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1198e4d50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119a68b90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119acd8d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119b507d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119bd4510>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x119c42510>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119cc7250>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119e2b1d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119ea0fd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119e453d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119f93dd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11a917b10>]], dtype=object)



In [153]:

    
# Start here with post 3 code



In [154]:

    
# Import function to split the data.
# train_test_split is provided by sklearn.model_selection; the older
# sklearn.cross_validation module has been removed from scikit-learn.
from sklearn.model_selection import train_test_split

# Select the columns to predict the target
predictors = ["Pclass","Fare", "AgeBackFill", "Gender", "EmbarkedBackFill", "FamilySize"]

# Specify target column
target = "Survived"

# Set features
X = titanic_clean[predictors]

# Set targets
y = titanic_clean[target]

# Split data into 80% training and 20% test datasets; the fixed random_state
# makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=0)



In [155]:

    
import numpy as np
# Utility method to draw a confusion matrix
def plot_confusion_matrix(cm, target, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : 2-D array
        Confusion matrix counts, indexed as ``cm[true, predicted]``.
    target : array-like
        Values containing every class label; the distinct labels are used
        as tick labels on both axes.
    title : str
        Title shown above the plot.
    cmap : matplotlib colormap
        Colormap used to shade the cells.
    """
    # sklearn's confusion_matrix orders rows and columns by sorted label.
    # Use the sorted unique labels here too, so the tick labels line up with
    # the cells regardless of the order in which labels appear in `target`.
    labels = np.unique(target)
    tick_marks = np.arange(len(labels))

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    plt.xticks(tick_marks, labels, rotation=45)
    plt.yticks(tick_marks, labels)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

    # Write each count in the centre of its cell
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            plt.text(col, row, '%.0f' % cm[row, col],
                     horizontalalignment='center',
                     verticalalignment='center',
                     color='red',
                     fontsize=20
                     )

    # Adjust the layout last so the axis labels are taken into account
    plt.tight_layout()



In [156]:

    
# Import the linear regression class
from sklearn.linear_model import LinearRegression

# Create the model and train it on the training split in one step
# (fit returns the fitted estimator)
algo = LinearRegression().fit(X_train, y_train)

# Score the held-out test rows; the outputs are continuous values, not class labels
y_pred = algo.predict(X_test)



In [157]:

    
from sklearn.metrics import confusion_matrix

# Cut the continuous regression outputs at 0.5 to obtain 0/1 labels.
# y_pred is modified in place; both masks are computed before any value is overwritten.
predicted_positive = y_pred > .5
predicted_negative = y_pred <= .5
y_pred[predicted_positive] = 1
y_pred[predicted_negative] = 0

# Compare the thresholded predictions with the true test labels and plot the result
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
plt.figure()
plot_confusion_matrix(cm, target=titanic_clean[target])



In [158]:

    
# Import the logistic regression class
from sklearn.linear_model import LogisticRegression

# Create the classifier with a fixed seed and train it on the training split
# (fit returns the fitted estimator)
algo = LogisticRegression(random_state=0).fit(X_train, y_train)

# Predict class labels for the held-out test rows
y_pred = algo.predict(X_test)



In [159]:

    
from sklearn.metrics import confusion_matrix

# Compare the predictions with the true test labels and plot the result on a new figure
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
plt.figure()
plot_confusion_matrix(cm, target=titanic_clean[target])



In [160]:

    
# Import the Decision tree classifier
from sklearn.tree import DecisionTreeClassifier

# Create the classifier with a fixed seed and train it on the training split
# (fit returns the fitted estimator)
algo = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

# Predict class labels for the held-out test rows
y_pred = algo.predict(X_test)



In [161]:

    
# Compare the predictions with the true test labels and plot the result on a new figure
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
plt.figure()
plot_confusion_matrix(cm, target=titanic_clean[target])



In [162]:

    
# Import the Random Forest classifier
from sklearn.ensemble import RandomForestClassifier

# Create a 100-tree forest with a fixed seed and train it on the training split
# (fit returns the fitted estimator)
algo = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)

# Predict class labels for the held-out test rows
y_pred = algo.predict(X_test)



In [163]:

    
# Compare the predictions with the true test labels and plot the result on a new figure
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
plt.figure()
plot_confusion_matrix(cm, target=titanic_clean[target])



In [ ]:

    
# Start here with post 4 code



In [164]:

    
# Use the pandas library to read in the csv file
import pandas as pd

# This will create a pandas dataframe from the validation file and assign it to the validation variable
validation = pd.read_csv("validation_ob.csv")



In [165]:

    
# transform
# Apply to the validation frame the same feature engineering that was applied
# to the training frame.

# Fill missing ages with the median age of the TRAINING data. Deriving it from
# the training frame (instead of hard-coding a copy of the number) keeps the
# two in sync if the training data ever changes.
training_age_median = titanic['Age'].median()
validation['AgeBackFill'] = validation['Age'].fillna(training_age_median)

# Encode Sex as an integer: female -> 0, male -> 1
validation['Gender'] = validation['Sex'].map({'female': 0, 'male': 1}).astype(int)

# Fill missing embarkation ports with 'S', then encode S -> 0, C -> 1, Q -> 2
validation['EmbarkedBackFill'] = validation['Embarked'].fillna('S')
for port_letter, port_code in (('S', 0), ('C', 1), ('Q', 2)):
    is_port = validation['EmbarkedBackFill'] == port_letter
    validation.loc[is_port, 'EmbarkedBackFill'] = port_code
validation['EmbarkedBackFill'] = validation['EmbarkedBackFill'].astype(int)

# Derived features
validation['FamilySize'] = validation['SibSp'] + validation['Parch']
validation['Age*Class'] = validation['AgeBackFill'] * validation['Pclass']

# Drop the columns that are not used for modelling
data = validation.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Age'], axis=1)
# transform end



In [166]:

    
# Columns used as model inputs, and the column holding the label to predict
predictors = ["Pclass","Fare", "AgeBackFill", "Gender", "EmbarkedBackFill", "FamilySize"]
target = "Survived"

# Features and labels of the validation frame.
# Note: X and y are rebound here and from now on refer to the validation data.
X, y = data[predictors], data[target]



In [167]:

    
# Predict on the validation features with the model currently bound to `algo`
# (the most recently fitted one)
y_pred = algo.predict(X)

# Compare the predictions with the true validation labels and plot the result on a new figure
cm = confusion_matrix(y_true=y, y_pred=y_pred)
plt.figure()
plot_confusion_matrix(cm, target=data[target])

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35	0	373450	8.0500	NaN	S

	PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare
count	891.000000	891.000000	891.000000	714.000000	891.000000	891.000000	891.000000
mean	446.000000	0.383838	2.308642	29.699118	0.523008	0.381594	32.204208
std	257.353842	0.486592	0.836071	14.526497	1.102743	0.806057	49.693429
min	1.000000	0.000000	1.000000	0.420000	0.000000	0.000000	0.000000
25%	223.500000	0.000000	2.000000	20.125000	0.000000	0.000000	7.910400
50%	446.000000	0.000000	3.000000	28.000000	0.000000	0.000000	14.454200
75%	668.500000	1.000000	3.000000	38.000000	1.000000	0.000000	31.000000
max	891.000000	1.000000	3.000000	80.000000	8.000000	6.000000	512.329200

		Age	Fare	Parch	PassengerId	Pclass	SibSp	Survived
Sex
female	count	261.000000	314.000000	314.000000	314.000000	314.000000	314.000000	314.000000
	mean	27.915709	44.479818	0.649682	431.028662	2.159236	0.694268	0.742038
	std	14.110146	57.997698	1.022846	256.846324	0.857290	1.156520	0.438211
	min	0.750000	6.750000	0.000000	2.000000	1.000000	0.000000	0.000000
	25%	18.000000	12.071875	0.000000	231.750000	1.000000	0.000000	0.000000
	50%	27.000000	23.000000	0.000000	414.500000	2.000000	0.000000	1.000000
	75%	37.000000	55.000000	1.000000	641.250000	3.000000	1.000000	1.000000
	max	63.000000	512.329200	6.000000	889.000000	3.000000	8.000000	1.000000
male	count	453.000000	577.000000	577.000000	577.000000	577.000000	577.000000	577.000000
	mean	30.726645	25.523893	0.235702	454.147314	2.389948	0.429809	0.188908
	std	14.678201	43.138263	0.612294	257.486139	0.813580	1.061811	0.391775
	min	0.420000	0.000000	0.000000	1.000000	1.000000	0.000000	0.000000
	25%	21.000000	7.895800	0.000000	222.000000	2.000000	0.000000	0.000000
	50%	29.000000	10.500000	0.000000	464.000000	3.000000	0.000000	0.000000
	75%	39.000000	26.550000	0.000000	680.000000	3.000000	0.000000	0.000000
	max	80.000000	512.329200	5.000000	891.000000	3.000000	8.000000	1.000000

	PassengerId	Name	Sex	Fare
258	259	Ward, Miss. Anna	female	512.3292
679	680	Cardeza, Mr. Thomas Drake Martinez	male	512.3292
737	738	Lesurer, Mr. Gustave J	male	512.3292