Tools We'll Need



In [72]:

    
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.stats import pearsonr

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

What Does Our Data Look Like?



In [73]:

    
titanic_df = pd.read_csv('titanic.csv')



In [74]:

    
# Randomly sort the data, this'll be important later on
titanic_df = titanic_df.sample(frac = 1)



In [75]:

    
titanic_df.reset_index(drop=True, inplace=True)



In [76]:

    
titanic_df









    Out[76]:






  
    
      
      pclass
      survived
      name
      sex
      age
      sibsp
      parch
      ticket
      fare
      cabin
      embarked
      boat
      body
      home.dest
    
  
  
    
      0
      1.0
      0.0
      Gee, Mr. Arthur H
      male
      47.0
      0.0
      0.0
      111320
      38.5000
      E63
      S
      NaN
      275.0
      St Anne's-on-Sea, Lancashire
    
    
      1
      3.0
      0.0
      Goodwin, Mr. Charles Frederick
      male
      40.0
      1.0
      6.0
      CA 2144
      46.9000
      NaN
      S
      NaN
      NaN
      Wiltshire, England Niagara Falls, NY
    
    
      2
      2.0
      0.0
      Gale, Mr. Shadrach
      male
      34.0
      1.0
      0.0
      28664
      21.0000
      NaN
      S
      NaN
      NaN
      Cornwall / Clear Creek, CO
    
    
      3
      3.0
      0.0
      Carlsson, Mr. Carl Robert
      male
      24.0
      0.0
      0.0
      350409
      7.8542
      NaN
      S
      NaN
      NaN
      Goteborg, Sweden Huntley, IL
    
    
      4
      2.0
      0.0
      Fahlstrom, Mr. Arne Jonas
      male
      18.0
      0.0
      0.0
      236171
      13.0000
      NaN
      S
      NaN
      NaN
      Oslo, Norway Bayonne, NJ
    
    
      5
      2.0
      0.0
      Troupiansky, Mr. Moses Aaron
      male
      23.0
      0.0
      0.0
      233639
      13.0000
      NaN
      S
      NaN
      NaN
      NaN
    
    
      6
      2.0
      0.0
      Harris, Mr. Walter
      male
      30.0
      0.0
      0.0
      W/C 14208
      10.5000
      NaN
      S
      NaN
      NaN
      Walthamstow, England
    
    
      7
      3.0
      0.0
      Wittevrongel, Mr. Camille
      male
      36.0
      0.0
      0.0
      345771
      9.5000
      NaN
      S
      NaN
      NaN
      NaN
    
    
      8
      3.0
      0.0
      Johnson, Mr. Alfred
      male
      49.0
      0.0
      0.0
      LINE
      0.0000
      NaN
      S
      NaN
      NaN
      NaN
    
    
      9
      3.0
      0.0
      Peduzzi, Mr. Joseph
      male
      NaN
      0.0
      0.0
      A/5 2817
      8.0500
      NaN
      S
      NaN
      NaN
      NaN
    
    
      10
      3.0
      1.0
      Murphy, Miss. Margaret Jane
      female
      NaN
      1.0
      0.0
      367230
      15.5000
      NaN
      Q
      16
      NaN
      NaN
    
    
      11
      1.0
      1.0
      Beckwith, Mrs. Richard Leonard (Sallie Monypeny)
      female
      47.0
      1.0
      1.0
      11751
      52.5542
      D35
      S
      5
      NaN
      New York, NY
    
    
      12
      1.0
      1.0
      Minahan, Mrs. William Edward (Lillian E Thorpe)
      female
      37.0
      1.0
      0.0
      19928
      90.0000
      C78
      Q
      14
      NaN
      Fond du Lac, WI
    
    
      13
      1.0
      0.0
      Chaffee, Mr. Herbert Fuller
      male
      46.0
      1.0
      0.0
      W.E.P. 5734
      61.1750
      E31
      S
      NaN
      NaN
      Amenia, ND
    
    
      14
      3.0
      1.0
      Moubarek, Master. Gerios
      male
      NaN
      1.0
      1.0
      2661
      15.2458
      NaN
      C
      C
      NaN
      NaN
    
    
      15
      2.0
      0.0
      Funk, Miss. Annie Clemmer
      female
      38.0
      0.0
      0.0
      237671
      13.0000
      NaN
      S
      NaN
      NaN
      Janjgir, India / Pennsylvania
    
    
      16
      3.0
      1.0
      Olsson, Mr. Oscar Wilhelm
      male
      32.0
      0.0
      0.0
      347079
      7.7750
      NaN
      S
      A
      NaN
      NaN
    
    
      17
      2.0
      0.0
      Chapman, Mr. Charles Henry
      male
      52.0
      0.0
      0.0
      248731
      13.5000
      NaN
      S
      NaN
      130.0
      Bronx, NY
    
    
      18
      3.0
      1.0
      Kennedy, Mr. John
      male
      NaN
      0.0
      0.0
      368783
      7.7500
      NaN
      Q
      NaN
      NaN
      NaN
    
    
      19
      1.0
      1.0
      Willard, Miss. Constance
      female
      21.0
      0.0
      0.0
      113795
      26.5500
      NaN
      S
      8 10
      NaN
      Duluth, MN
    
    
      20
      2.0
      0.0
      Rogers, Mr. Reginald Harry
      male
      19.0
      0.0
      0.0
      28004
      10.5000
      NaN
      S
      NaN
      NaN
      NaN
    
    
      21
      3.0
      0.0
      Maenpaa, Mr. Matti Alexanteri
      male
      22.0
      0.0
      0.0
      STON/O 2. 3101275
      7.1250
      NaN
      S
      NaN
      NaN
      NaN
    
    
      22
      3.0
      0.0
      Gustafsson, Mr. Anders Vilhelm
      male
      37.0
      2.0
      0.0
      3101276
      7.9250
      NaN
      S
      NaN
      98.0
      Ruotsinphytaa, Finland New York, NY
    
    
      23
      3.0
      0.0
      Goodwin, Mr. Charles Edward
      male
      14.0
      5.0
      2.0
      CA 2144
      46.9000
      NaN
      S
      NaN
      NaN
      Wiltshire, England Niagara Falls, NY
    
    
      24
      3.0
      0.0
      Calic, Mr. Jovo
      male
      17.0
      0.0
      0.0
      315093
      8.6625
      NaN
      S
      NaN
      NaN
      NaN
    
    
      25
      2.0
      0.0
      Hickman, Mr. Leonard Mark
      male
      24.0
      2.0
      0.0
      S.O.C. 14879
      73.5000
      NaN
      S
      NaN
      NaN
      West Hampstead, London / Neepawa, MB
    
    
      26
      3.0
      0.0
      Caram, Mrs. Joseph (Maria Elias)
      female
      NaN
      1.0
      0.0
      2689
      14.4583
      NaN
      C
      NaN
      NaN
      Ottawa, ON
    
    
      27
      3.0
      0.0
      O'Brien, Mr. Timothy
      male
      NaN
      0.0
      0.0
      330979
      7.8292
      NaN
      Q
      NaN
      NaN
      NaN
    
    
      28
      2.0
      0.0
      Sobey, Mr. Samuel James Hayden
      male
      25.0
      0.0
      0.0
      C.A. 29178
      13.0000
      NaN
      S
      NaN
      NaN
      Cornwall / Houghton, MI
    
    
      29
      3.0
      1.0
      Nilsson, Miss. Berta Olivia
      female
      18.0
      0.0
      0.0
      347066
      7.7750
      NaN
      S
      D
      NaN
      NaN
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      1280
      2.0
      1.0
      Weisz, Mrs. Leopold (Mathilde Francoise Pede)
      female
      29.0
      1.0
      0.0
      228414
      26.0000
      NaN
      S
      10
      NaN
      Bromsgrove, England / Montreal, PQ
    
    
      1281
      3.0
      0.0
      Doyle, Miss. Elizabeth
      female
      24.0
      0.0
      0.0
      368702
      7.7500
      NaN
      Q
      NaN
      NaN
      Ireland New York, NY
    
    
      1282
      1.0
      0.0
      Jones, Mr. Charles Cresson
      male
      46.0
      0.0
      0.0
      694
      26.0000
      NaN
      S
      NaN
      80.0
      Bennington, VT
    
    
      1283
      3.0
      0.0
      Bourke, Mr. John
      male
      40.0
      1.0
      1.0
      364849
      15.5000
      NaN
      Q
      NaN
      NaN
      Ireland Chicago, IL
    
    
      1284
      3.0
      0.0
      Moutal, Mr. Rahamin Haim
      male
      NaN
      0.0
      0.0
      374746
      8.0500
      NaN
      S
      NaN
      NaN
      NaN
    
    
      1285
      3.0
      0.0
      Davison, Mr. Thomas Henry
      male
      NaN
      1.0
      0.0
      386525
      16.1000
      NaN
      S
      NaN
      NaN
      Liverpool, England Bedford, OH
    
    
      1286
      1.0
      0.0
      Parr, Mr. William Henry Marsh
      male
      NaN
      0.0
      0.0
      112052
      0.0000
      NaN
      S
      NaN
      NaN
      Belfast
    
    
      1287
      3.0
      0.0
      Rosblom, Miss. Salli Helena
      female
      2.0
      1.0
      1.0
      370129
      20.2125
      NaN
      S
      NaN
      NaN
      NaN
    
    
      1288
      3.0
      0.0
      Mitkoff, Mr. Mito
      male
      NaN
      0.0
      0.0
      349221
      7.8958
      NaN
      S
      NaN
      NaN
      NaN
    
    
      1289
      3.0
      0.0
      Pavlovic, Mr. Stefo
      male
      32.0
      0.0
      0.0
      349242
      7.8958
      NaN
      S
      NaN
      NaN
      NaN
    
    
      1290
      3.0
      0.0
      Palsson, Miss. Torborg Danira
      female
      8.0
      3.0
      1.0
      349909
      21.0750
      NaN
      S
      NaN
      NaN
      NaN
    
    
      1291
      2.0
      0.0
      Howard, Mrs. Benjamin (Ellen Truelove Arman)
      female
      60.0
      1.0
      0.0
      24065
      26.0000
      NaN
      S
      NaN
      NaN
      Swindon, England
    
    
      1292
      3.0
      0.0
      Nosworthy, Mr. Richard Cater
      male
      21.0
      0.0
      0.0
      A/4. 39886
      7.8000
      NaN
      S
      NaN
      NaN
      NaN
    
    
      1293
      2.0
      0.0
      Gaskell, Mr. Alfred
      male
      16.0
      0.0
      0.0
      239865
      26.0000
      NaN
      S
      NaN
      NaN
      Liverpool / Montreal, PQ
    
    
      1294
      3.0
      0.0
      Peltomaki, Mr. Nikolai Johannes
      male
      25.0
      0.0
      0.0
      STON/O 2. 3101291
      7.9250
      NaN
      S
      NaN
      NaN
      NaN
    
    
      1295
      1.0
      1.0
      Taussig, Mrs. Emil (Tillie Mandelbaum)
      female
      39.0
      1.0
      1.0
      110413
      79.6500
      E67
      S
      8
      NaN
      New York, NY
    
    
      1296
      1.0
      1.0
      Frolicher, Miss. Hedwig Margaritha
      female
      22.0
      0.0
      2.0
      13568
      49.5000
      B39
      C
      5
      NaN
      Zurich, Switzerland
    
    
      1297
      1.0
      1.0
      Spedden, Mr. Frederic Oakley
      male
      45.0
      1.0
      1.0
      16966
      134.5000
      E34
      C
      3
      NaN
      Tuxedo Park, NY
    
    
      1298
      3.0
      0.0
      Jensen, Mr. Hans Peder
      male
      20.0
      0.0
      0.0
      350050
      7.8542
      NaN
      S
      NaN
      NaN
      NaN
    
    
      1299
      3.0
      1.0
      Hansen, Mrs. Claus Peter (Jennie L Howard)
      female
      45.0
      1.0
      0.0
      350026
      14.1083
      NaN
      S
      11
      NaN
      NaN
    
    
      1300
      2.0
      1.0
      Watt, Miss. Bertha J
      female
      12.0
      0.0
      0.0
      C.A. 33595
      15.7500
      NaN
      S
      9
      NaN
      Aberdeen / Portland, OR
    
    
      1301
      1.0
      0.0
      Warren, Mr. Frank Manley
      male
      64.0
      1.0
      0.0
      110813
      75.2500
      D37
      C
      NaN
      NaN
      Portland, OR
    
    
      1302
      2.0
      1.0
      Beesley, Mr. Lawrence
      male
      34.0
      0.0
      0.0
      248698
      13.0000
      D56
      S
      13
      NaN
      London
    
    
      1303
      2.0
      1.0
      Slayter, Miss. Hilda Mary
      female
      30.0
      0.0
      0.0
      234818
      12.3500
      NaN
      Q
      13
      NaN
      Halifax, NS
    
    
      1304
      1.0
      0.0
      White, Mr. Percival Wayland
      male
      54.0
      0.0
      1.0
      35281
      77.2875
      D26
      S
      NaN
      NaN
      Brunswick, ME
    
    
      1305
      3.0
      1.0
      Salkjelsvik, Miss. Anna Kristine
      female
      21.0
      0.0
      0.0
      343120
      7.6500
      NaN
      S
      C
      NaN
      NaN
    
    
      1306
      3.0
      0.0
      Waelens, Mr. Achille
      male
      22.0
      0.0
      0.0
      345767
      9.0000
      NaN
      S
      NaN
      NaN
      Antwerp, Belgium / Stanton, OH
    
    
      1307
      3.0
      0.0
      Jussila, Miss. Katriina
      female
      20.0
      1.0
      0.0
      4136
      9.8250
      NaN
      S
      NaN
      NaN
      NaN
    
    
      1308
      3.0
      0.0
      Svensson, Mr. Johan
      male
      74.0
      0.0
      0.0
      347060
      7.7750
      NaN
      S
      NaN
      NaN
      NaN
    
    
      1309
      3.0
      0.0
      Hansen, Mr. Claus Peter
      male
      41.0
      2.0
      0.0
      350026
      14.1083
      NaN
      S
      NaN
      NaN
      NaN
    
  

1310 rows × 14 columns

Clean the dataset



In [77]:

    
#Select a subset of the columns
titanic_df = titanic_df[['pclass', 'survived', 'sex', 'age', 'fare']]



In [78]:

    
#Fill the NaN with -1
titanic_df.fillna(-1, inplace = True)



In [79]:

    
#A dictionary in python
sex_dict = {'male': 0, 'female': 1, -1: -1}



In [80]:

    
#Our first look at the magic of "apply"
titanic_df.sex = titanic_df.sex.apply(lambda x: sex_dict[x])



In [81]:

    
#How bad is our missing data situation?
for field in titanic_df.columns:
    print field, sum(titanic_df[field] == -1)









    



pclass 1
survived 1
sex 1
age 264
fare 2



In [82]:

    
#Is the fact that age is not reported relevant?
no_age_df = titanic_df[titanic_df.age == -1]
yes_age_df = titanic_df[titanic_df.age != -1]



In [83]:

    
for field in ['fare', 'sex', 'survived', 'pclass']:
    print field
    print 'Missing Age: ', no_age_df[field].mean()
    print 'Present Age: ', yes_age_df[field].mean()









    



fare
Missing Age:  19.7444431818
Present Age:  36.6500515296
sex
Missing Age:  0.291666666667
Present Age:  0.370936902486
survived
Missing Age:  0.272727272727
Present Age:  0.408221797323
pclass
Missing Age:  2.62878787879
Present Age:  2.20745697897



In [84]:

    
#Make a decision about what to do about NaN
titanic_df = titanic_df[(titanic_df.age != -1)&(titanic_df.survived != -1)&(titanic_df.sex != -1)&(titanic_df.fare != -1)&(titanic_df.pclass != -1)]

#More Elegant
#titanic_df = titanic_df[(titanic_df.T != -1).all()]



In [85]:

    
titanic_df









    Out[85]:






  
    
      
      pclass
      survived
      sex
      age
      fare
    
  
  
    
      0
      1.0
      0.0
      0
      47.0
      38.5000
    
    
      1
      3.0
      0.0
      0
      40.0
      46.9000
    
    
      2
      2.0
      0.0
      0
      34.0
      21.0000
    
    
      3
      3.0
      0.0
      0
      24.0
      7.8542
    
    
      4
      2.0
      0.0
      0
      18.0
      13.0000
    
    
      5
      2.0
      0.0
      0
      23.0
      13.0000
    
    
      6
      2.0
      0.0
      0
      30.0
      10.5000
    
    
      7
      3.0
      0.0
      0
      36.0
      9.5000
    
    
      8
      3.0
      0.0
      0
      49.0
      0.0000
    
    
      11
      1.0
      1.0
      1
      47.0
      52.5542
    
    
      12
      1.0
      1.0
      1
      37.0
      90.0000
    
    
      13
      1.0
      0.0
      0
      46.0
      61.1750
    
    
      15
      2.0
      0.0
      1
      38.0
      13.0000
    
    
      16
      3.0
      1.0
      0
      32.0
      7.7750
    
    
      17
      2.0
      0.0
      0
      52.0
      13.5000
    
    
      19
      1.0
      1.0
      1
      21.0
      26.5500
    
    
      20
      2.0
      0.0
      0
      19.0
      10.5000
    
    
      21
      3.0
      0.0
      0
      22.0
      7.1250
    
    
      22
      3.0
      0.0
      0
      37.0
      7.9250
    
    
      23
      3.0
      0.0
      0
      14.0
      46.9000
    
    
      24
      3.0
      0.0
      0
      17.0
      8.6625
    
    
      25
      2.0
      0.0
      0
      24.0
      73.5000
    
    
      28
      2.0
      0.0
      0
      25.0
      13.0000
    
    
      29
      3.0
      1.0
      1
      18.0
      7.7750
    
    
      30
      1.0
      0.0
      0
      33.0
      26.5500
    
    
      31
      1.0
      1.0
      0
      27.0
      76.7292
    
    
      36
      3.0
      0.0
      0
      18.0
      7.8542
    
    
      38
      3.0
      0.0
      1
      18.0
      7.7750
    
    
      39
      3.0
      0.0
      0
      16.0
      9.5000
    
    
      40
      3.0
      0.0
      1
      38.0
      7.7750
    
    
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      1275
      3.0
      0.0
      0
      29.0
      7.9250
    
    
      1277
      1.0
      1.0
      1
      53.0
      51.4792
    
    
      1278
      3.0
      0.0
      0
      21.0
      8.0500
    
    
      1279
      3.0
      0.0
      0
      33.0
      8.6625
    
    
      1280
      2.0
      1.0
      1
      29.0
      26.0000
    
    
      1281
      3.0
      0.0
      1
      24.0
      7.7500
    
    
      1282
      1.0
      0.0
      0
      46.0
      26.0000
    
    
      1283
      3.0
      0.0
      0
      40.0
      15.5000
    
    
      1287
      3.0
      0.0
      1
      2.0
      20.2125
    
    
      1289
      3.0
      0.0
      0
      32.0
      7.8958
    
    
      1290
      3.0
      0.0
      1
      8.0
      21.0750
    
    
      1291
      2.0
      0.0
      1
      60.0
      26.0000
    
    
      1292
      3.0
      0.0
      0
      21.0
      7.8000
    
    
      1293
      2.0
      0.0
      0
      16.0
      26.0000
    
    
      1294
      3.0
      0.0
      0
      25.0
      7.9250
    
    
      1295
      1.0
      1.0
      1
      39.0
      79.6500
    
    
      1296
      1.0
      1.0
      1
      22.0
      49.5000
    
    
      1297
      1.0
      1.0
      0
      45.0
      134.5000
    
    
      1298
      3.0
      0.0
      0
      20.0
      7.8542
    
    
      1299
      3.0
      1.0
      1
      45.0
      14.1083
    
    
      1300
      2.0
      1.0
      1
      12.0
      15.7500
    
    
      1301
      1.0
      0.0
      0
      64.0
      75.2500
    
    
      1302
      2.0
      1.0
      0
      34.0
      13.0000
    
    
      1303
      2.0
      1.0
      1
      30.0
      12.3500
    
    
      1304
      1.0
      0.0
      0
      54.0
      77.2875
    
    
      1305
      3.0
      1.0
      1
      21.0
      7.6500
    
    
      1306
      3.0
      0.0
      0
      22.0
      9.0000
    
    
      1307
      3.0
      0.0
      1
      20.0
      9.8250
    
    
      1308
      3.0
      0.0
      0
      74.0
      7.7750
    
    
      1309
      3.0
      0.0
      0
      41.0
      14.1083
    
  

1045 rows × 5 columns



In [86]:

    
#Again reset the index
titanic_df.reset_index(drop=True, inplace = True)

What's correlated with Survival?



In [87]:

    
#Set up our correlation matrix
correlation_matrix =np.zeros(shape=(5,5))



In [88]:

    
correlation_matrix









    Out[88]:





array([[ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.]])



In [89]:

    
#Populate it
for i, field1 in enumerate(titanic_df.columns):
    for j, field2 in enumerate(titanic_df.columns):
            correlation_matrix[i,j] = pearsonr(titanic_df[field1], titanic_df[field2])[0]



In [90]:

    
titanic_df.columns









    Out[90]:





Index([u'pclass', u'survived', u'sex', u'age', u'fare'], dtype='object')



In [91]:

    
correlation_matrix









    Out[91]:





array([[ 1.        , -0.319979  , -0.14410474, -0.41108588, -0.56525541],
       [-0.319979  ,  1.        ,  0.53771902, -0.05395784,  0.24916365],
       [-0.14410474,  0.53771902,  1.        , -0.06223607,  0.18792965],
       [-0.41108588, -0.05395784, -0.06223607,  1.        ,  0.17873932],
       [-0.56525541,  0.24916365,  0.18792965,  0.17873932,  1.        ]])

Let's make it look cooler



In [92]:

    
#Set figure size
plt.figure(figsize=(10,8))
#Specify we would like a heatmap
plt.imshow(correlation_matrix, interpolation = 'nearest', cmap = 'Greys')
#Specify the x and y labels
plt.xticks(range(5), titanic_df.columns, rotation = 90, fontsize = 16)
plt.yticks(range(5), titanic_df.columns, fontsize = 16)









    Out[92]:





([<matplotlib.axis.YTick at 0xdbae5f8>,
  <matplotlib.axis.YTick at 0xdbae128>,
  <matplotlib.axis.YTick at 0xdbfe588>,
  <matplotlib.axis.YTick at 0xdbfea20>,
  <matplotlib.axis.YTick at 0xdbfef98>],
 <a list of 5 Text yticklabel objects>)

Let's dig into the data further



In [93]:

    
#for each column, draw a historgram of the distribution
for field in titanic_df.columns:
    plt.clf()
    plt.hist(titanic_df[field], color = np.random.rand(3,1))
    plt.title(field)
    plt.show()

Similarity Between 2 Passengers



In [94]:

    
#Further subset the dataframe
titanic_df = titanic_df[['sex', 'age', 'fare', 'survived']]



In [95]:

    
titanic_df









    Out[95]:






  
    
      
      sex
      age
      fare
      survived
    
  
  
    
      0
      0
      47.0
      38.5000
      0.0
    
    
      1
      0
      40.0
      46.9000
      0.0
    
    
      2
      0
      34.0
      21.0000
      0.0
    
    
      3
      0
      24.0
      7.8542
      0.0
    
    
      4
      0
      18.0
      13.0000
      0.0
    
    
      5
      0
      23.0
      13.0000
      0.0
    
    
      6
      0
      30.0
      10.5000
      0.0
    
    
      7
      0
      36.0
      9.5000
      0.0
    
    
      8
      0
      49.0
      0.0000
      0.0
    
    
      9
      1
      47.0
      52.5542
      1.0
    
    
      10
      1
      37.0
      90.0000
      1.0
    
    
      11
      0
      46.0
      61.1750
      0.0
    
    
      12
      1
      38.0
      13.0000
      0.0
    
    
      13
      0
      32.0
      7.7750
      1.0
    
    
      14
      0
      52.0
      13.5000
      0.0
    
    
      15
      1
      21.0
      26.5500
      1.0
    
    
      16
      0
      19.0
      10.5000
      0.0
    
    
      17
      0
      22.0
      7.1250
      0.0
    
    
      18
      0
      37.0
      7.9250
      0.0
    
    
      19
      0
      14.0
      46.9000
      0.0
    
    
      20
      0
      17.0
      8.6625
      0.0
    
    
      21
      0
      24.0
      73.5000
      0.0
    
    
      22
      0
      25.0
      13.0000
      0.0
    
    
      23
      1
      18.0
      7.7750
      1.0
    
    
      24
      0
      33.0
      26.5500
      0.0
    
    
      25
      0
      27.0
      76.7292
      1.0
    
    
      26
      0
      18.0
      7.8542
      0.0
    
    
      27
      1
      18.0
      7.7750
      0.0
    
    
      28
      0
      16.0
      9.5000
      0.0
    
    
      29
      1
      38.0
      7.7750
      0.0
    
    
      ...
      ...
      ...
      ...
      ...
    
    
      1015
      0
      29.0
      7.9250
      0.0
    
    
      1016
      1
      53.0
      51.4792
      1.0
    
    
      1017
      0
      21.0
      8.0500
      0.0
    
    
      1018
      0
      33.0
      8.6625
      0.0
    
    
      1019
      1
      29.0
      26.0000
      1.0
    
    
      1020
      1
      24.0
      7.7500
      0.0
    
    
      1021
      0
      46.0
      26.0000
      0.0
    
    
      1022
      0
      40.0
      15.5000
      0.0
    
    
      1023
      1
      2.0
      20.2125
      0.0
    
    
      1024
      0
      32.0
      7.8958
      0.0
    
    
      1025
      1
      8.0
      21.0750
      0.0
    
    
      1026
      1
      60.0
      26.0000
      0.0
    
    
      1027
      0
      21.0
      7.8000
      0.0
    
    
      1028
      0
      16.0
      26.0000
      0.0
    
    
      1029
      0
      25.0
      7.9250
      0.0
    
    
      1030
      1
      39.0
      79.6500
      1.0
    
    
      1031
      1
      22.0
      49.5000
      1.0
    
    
      1032
      0
      45.0
      134.5000
      1.0
    
    
      1033
      0
      20.0
      7.8542
      0.0
    
    
      1034
      1
      45.0
      14.1083
      1.0
    
    
      1035
      1
      12.0
      15.7500
      1.0
    
    
      1036
      0
      64.0
      75.2500
      0.0
    
    
      1037
      0
      34.0
      13.0000
      1.0
    
    
      1038
      1
      30.0
      12.3500
      1.0
    
    
      1039
      0
      54.0
      77.2875
      0.0
    
    
      1040
      1
      21.0
      7.6500
      1.0
    
    
      1041
      0
      22.0
      9.0000
      0.0
    
    
      1042
      1
      20.0
      9.8250
      0.0
    
    
      1043
      0
      74.0
      7.7750
      0.0
    
    
      1044
      0
      41.0
      14.1083
      0.0
    
  

1045 rows × 4 columns



In [96]:

    
#Normalize age
titanic_df['n_age'] = titanic_df.age.apply(lambda x: (x-titanic_df.age.mean())/titanic_df.age.std())



In [97]:

    
#Take the log of fare
titanic_df['logfare'] = titanic_df.fare.apply(lambda x: np.log(x))



In [98]:

    
#Draw the histogram of logfare
plt.hist(titanic_df[np.isfinite(titanic_df.logfare)].logfare, color = np.random.rand(3,1))









    Out[98]:





(array([   3.,  294.,  199.,  110.,  197.,   83.,   81.,   34.,   32.,    4.]),
 array([ 1.15398392,  1.66248227,  2.17098062,  2.67947896,  3.18797731,
         3.69647565,  4.204974  ,  4.71347235,  5.22197069,  5.73046904,
         6.23896739]),
 <a list of 10 Patch objects>)



In [99]:

    
#a log transformation will cause 0 --> -infinify
titanic_df = titanic_df[np.isfinite(titanic_df.logfare)]



In [100]:

    
#Now normalize the log of fare
titanic_df['n_logfare'] = titanic_df.logfare.apply(lambda x: (x-titanic_df.logfare.mean())/titanic_df.logfare.std())









    



C:\Users\Ritvik\Anaconda2\lib\site-packages\ipykernel\__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app



In [101]:

    
titanic_df









    Out[101]:






  
    
      
      sex
      age
      fare
      survived
      n_age
      logfare
      n_logfare
    
  
  
    
      0
      0
      47.0
      38.5000
      0.0
      1.191739
      3.650658
      0.636174
    
    
      1
      0
      40.0
      46.9000
      0.0
      0.705263
      3.848018
      0.839037
    
    
      2
      0
      34.0
      21.0000
      0.0
      0.288283
      3.044522
      0.013137
    
    
      3
      0
      24.0
      7.8542
      0.0
      -0.406682
      2.061048
      -0.997760
    
    
      4
      0
      18.0
      13.0000
      0.0
      -0.823662
      2.564949
      -0.479808
    
    
      5
      0
      23.0
      13.0000
      0.0
      -0.476179
      2.564949
      -0.479808
    
    
      6
      0
      30.0
      10.5000
      0.0
      0.010297
      2.351375
      -0.699338
    
    
      7
      0
      36.0
      9.5000
      0.0
      0.427276
      2.251292
      -0.802212
    
    
      9
      1
      47.0
      52.5542
      1.0
      1.191739
      3.961845
      0.956038
    
    
      10
      1
      37.0
      90.0000
      1.0
      0.496773
      4.499810
      1.509003
    
    
      11
      0
      46.0
      61.1750
      0.0
      1.122242
      4.113739
      1.112167
    
    
      12
      1
      38.0
      13.0000
      0.0
      0.566270
      2.564949
      -0.479808
    
    
      13
      0
      32.0
      7.7750
      1.0
      0.149290
      2.050913
      -1.008177
    
    
      14
      0
      52.0
      13.5000
      0.0
      1.539221
      2.602690
      -0.441016
    
    
      15
      1
      21.0
      26.5500
      1.0
      -0.615172
      3.279030
      0.254183
    
    
      16
      0
      19.0
      10.5000
      0.0
      -0.754165
      2.351375
      -0.699338
    
    
      17
      0
      22.0
      7.1250
      0.0
      -0.545675
      1.963610
      -1.097915
    
    
      18
      0
      37.0
      7.9250
      0.0
      0.496773
      2.070022
      -0.988536
    
    
      19
      0
      14.0
      46.9000
      0.0
      -1.101648
      3.848018
      0.839037
    
    
      20
      0
      17.0
      8.6625
      0.0
      -0.893158
      2.159003
      -0.897074
    
    
      21
      0
      24.0
      73.5000
      0.0
      -0.406682
      4.297285
      1.300831
    
    
      22
      0
      25.0
      13.0000
      0.0
      -0.337186
      2.564949
      -0.479808
    
    
      23
      1
      18.0
      7.7750
      1.0
      -0.823662
      2.050913
      -1.008177
    
    
      24
      0
      33.0
      26.5500
      0.0
      0.218787
      3.279030
      0.254183
    
    
      25
      0
      27.0
      76.7292
      1.0
      -0.198193
      4.340282
      1.345027
    
    
      26
      0
      18.0
      7.8542
      0.0
      -0.823662
      2.061048
      -0.997760
    
    
      27
      1
      18.0
      7.7750
      0.0
      -0.823662
      2.050913
      -1.008177
    
    
      28
      0
      16.0
      9.5000
      0.0
      -0.962655
      2.251292
      -0.802212
    
    
      29
      1
      38.0
      7.7750
      0.0
      0.566270
      2.050913
      -1.008177
    
    
      30
      0
      3.0
      26.0000
      1.0
      -1.866110
      3.258097
      0.232666
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      1015
      0
      29.0
      7.9250
      0.0
      -0.059200
      2.070022
      -0.988536
    
    
      1016
      1
      53.0
      51.4792
      1.0
      1.608718
      3.941178
      0.934794
    
    
      1017
      0
      21.0
      8.0500
      0.0
      -0.615172
      2.085672
      -0.972450
    
    
      1018
      0
      33.0
      8.6625
      0.0
      0.218787
      2.159003
      -0.897074
    
    
      1019
      1
      29.0
      26.0000
      1.0
      -0.059200
      3.258097
      0.232666
    
    
      1020
      1
      24.0
      7.7500
      0.0
      -0.406682
      2.047693
      -1.011488
    
    
      1021
      0
      46.0
      26.0000
      0.0
      1.122242
      3.258097
      0.232666
    
    
      1022
      0
      40.0
      15.5000
      0.0
      0.705263
      2.740840
      -0.299013
    
    
      1023
      1
      2.0
      20.2125
      0.0
      -1.935607
      3.006301
      -0.026150
    
    
      1024
      0
      32.0
      7.8958
      0.0
      0.149290
      2.066331
      -0.992330
    
    
      1025
      1
      8.0
      21.0750
      0.0
      -1.518627
      3.048088
      0.016801
    
    
      1026
      1
      60.0
      26.0000
      0.0
      2.095194
      3.258097
      0.232666
    
    
      1027
      0
      21.0
      7.8000
      0.0
      -0.615172
      2.054124
      -1.004878
    
    
      1028
      0
      16.0
      26.0000
      0.0
      -0.962655
      3.258097
      0.232666
    
    
      1029
      0
      25.0
      7.9250
      0.0
      -0.337186
      2.070022
      -0.988536
    
    
      1030
      1
      39.0
      79.6500
      1.0
      0.635766
      4.377642
      1.383429
    
    
      1031
      1
      22.0
      49.5000
      1.0
      -0.545675
      3.901973
      0.894496
    
    
      1032
      0
      45.0
      134.5000
      1.0
      1.052745
      4.901564
      1.921960
    
    
      1033
      0
      20.0
      7.8542
      0.0
      -0.684669
      2.061048
      -0.997760
    
    
      1034
      1
      45.0
      14.1083
      1.0
      1.052745
      2.646763
      -0.395713
    
    
      1035
      1
      12.0
      15.7500
      1.0
      -1.240641
      2.756840
      -0.282567
    
    
      1036
      0
      64.0
      75.2500
      0.0
      2.373180
      4.320816
      1.325018
    
    
      1037
      0
      34.0
      13.0000
      1.0
      0.288283
      2.564949
      -0.479808
    
    
      1038
      1
      30.0
      12.3500
      1.0
      0.010297
      2.513656
      -0.532532
    
    
      1039
      0
      54.0
      77.2875
      0.0
      1.678215
      4.347532
      1.352479
    
    
      1040
      1
      21.0
      7.6500
      1.0
      -0.615172
      2.034706
      -1.024837
    
    
      1041
      0
      22.0
      9.0000
      0.0
      -0.545675
      2.197225
      -0.857787
    
    
      1042
      1
      20.0
      9.8250
      0.0
      -0.684669
      2.284930
      -0.767635
    
    
      1043
      0
      74.0
      7.7750
      0.0
      3.068146
      2.050913
      -1.008177
    
    
      1044
      0
      41.0
      14.1083
      0.0
      0.774759
      2.646763
      -0.395713
    
  

1037 rows × 7 columns



In [102]:

    
#Create the dataframe we will use for machine learning
sim_df = titanic_df[['survived', 'sex', 'n_age', 'n_logfare']]



In [103]:

    
#Randomly sample 600 people from the dataset
lim_sim_df = sim_df.sample(600).reset_index()



In [104]:

    
#initialize our similarity matrix
sim_mtx = np.zeros(shape=(len(lim_sim_df), len(lim_sim_df)))



In [105]:

    
#Get a list of who survived and who didn't from our 600
surv_list = lim_sim_df.survived



In [106]:

    
%%time
#populate the similarity matrix
for i in range(len(sim_mtx)):
    if i%100 == 0:
            print i
    v1 = lim_sim_df.iloc[i]
    for j in range(i, len(sim_mtx)):
        norm = np.exp(-np.linalg.norm(v1 - lim_sim_df.iloc[j]))
        sim_mtx[i,j] = norm
        sim_mtx[j,i] = norm









    



0
100
200
300
400
500
Wall time: 45.4 s

Manual Machine Learning (KNN)



In [107]:

    
#our test set will be 15%
pred_size = int(0.15*len(sim_mtx))
print pred_size



In [108]:

    
#make our predictions based on a majority of the relevant neighbors
pred_manual = []
for i in range(pred_size):
    indices_to_use = sorted(range(pred_size, len(sim_mtx)), key=lambda j: sim_mtx[i][j])
    indices_to_use = indices_to_use[-149:]
    sim_list = surv_list[indices_to_use].tolist()
    mode = max(set(sim_list), key=sim_list.count)
    pred_manual.append(mode)



In [109]:

    
sum(((pred_manual - surv_list[:pred_size]) == 0))/float(pred_size)









    Out[109]:





0.65555555555555556

Using SciKitLearn



In [110]:

    
#initialize the KNN
neigh = KNeighborsClassifier(n_neighbors=149)



In [111]:

    
pred_size = int(0.15*len(titanic_df))
auto_surv_list = titanic_df.survived
print pred_size



In [112]:

    
for feat_list in [['sex'], ['age'], ['fare'], ['sex', 'age'], ['sex', 'fare'], ['age', 'fare'], ['sex', 'fare', 'age']]:
    #fit the model with the training data
    neigh.fit(titanic_df[feat_list][pred_size:].as_matrix(), titanic_df['survived'][pred_size:]) 
    pred_auto = neigh.predict(titanic_df[feat_list][:pred_size].as_matrix())
    print feat_list
    print sum(((pred_auto - auto_surv_list[:pred_size]) == 0))/float(pred_size)









    



['sex']
0.761290322581
['age']
0.567741935484
['fare']
0.61935483871
['sex', 'age']
0.58064516129
['sex', 'fare']
0.612903225806
['age', 'fare']
0.6
['sex', 'fare', 'age']
0.6

What is the Best k?



In [113]:

    
#Graph accuracy vs k for our manual KNN
k_list = []
pred_size = int(0.15*len(sim_mtx))

for k in range(1, 200):
    pred_manual = []
    for i in range(pred_size):
        sim_list = surv_list[sorted(range(pred_size, len(sim_mtx)), key=lambda j: sim_mtx[i][j])[-k:]].tolist()
        pred_manual.append(max(set(sim_list), key=sim_list.count))
    acc = sum(((pred_manual - surv_list[:pred_size]) == 0))/float(pred_size)
    k_list.append(acc)
plt.figure(figsize=(10,8))
plt.plot(range(1,200), k_list)









    Out[113]:





[<matplotlib.lines.Line2D at 0xe492588>]



In [114]:

    
#Graph accuracy vs k for SKL KNN
k_list_auto = []
pred_size = int(0.15*len(titanic_df))
feat_list = ['sex', 'age', 'fare']
for k in range(1,800):
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(titanic_df[feat_list][pred_size:].as_matrix(), titanic_df['survived'][pred_size:]) 
    pred_auto = neigh.predict(titanic_df[feat_list][:pred_size].as_matrix())
    acc = sum(((pred_auto - auto_surv_list[:pred_size]) == 0))/float(pred_size)
    k_list_auto.append(acc)
plt.figure(figsize=(10,8))
plt.plot(range(1,800), k_list_auto, color = 'r')









    Out[114]:





[<matplotlib.lines.Line2D at 0xe2e7080>]



In [115]:

    
#Side by side
plt.figure(figsize=(10,8))
plt.plot(range(1,800), k_list_auto, color = 'red')
plt.plot(range(1,200), k_list, color = 'blue')
plt.axhline(0.62, color = 'k', linewidth = 1.5)









    Out[115]:





<matplotlib.lines.Line2D at 0xdfe50f0>

There's more than one way to measure success



In [116]:

    
#define precision and recall function
def precision_recall(pred, true):
    pred = np.asarray(pred)
    true = np.asarray(true)
    if (sum(pred+true == 2) + sum(pred-true == 1)) != 0:
        precision = float(sum(pred+true == 2))/(sum(pred+true == 2) + sum(pred-true == 1))
    else:
        precision = 0
    if (sum(pred+true == 2) + sum(pred-true == -1)) != 0:
        recall = float(sum(pred+true == 2))/(sum(pred+true == 2) + sum(pred-true == -1))
    else:
        recall = 0
    return (precision, recall)



In [117]:

    
#Graph precision and recall vs k for SKL KNN
k_list_auto = []
pred_size = int(0.15*len(titanic_df))
feat_list = ['sex', 'age', 'fare']
for k in range(1,550):
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(titanic_df[feat_list][pred_size:].as_matrix(), titanic_df['survived'][pred_size:]) 
    pred_auto = neigh.predict(titanic_df[feat_list][:pred_size].as_matrix())
    p_r = precision_recall(pred_auto, auto_surv_list[:pred_size])
    k_list_auto.append(p_r)
plt.figure(figsize=(10,8))
plt.plot(range(1,550), [i[0] for i in k_list_auto], color = 'r')
plt.plot(range(1,550), [i[1] for i in k_list_auto], color = 'g')

plt.axhline(0.32, color = 'red', linewidth=2, alpha = 0.5)









    Out[117]:





<matplotlib.lines.Line2D at 0xdaf43c8>

Let's finish with a bunch of other classifiers



In [118]:

    
#A magical loop
pred_size = int(0.15*len(titanic_df))
feat_list = ['sex', 'age', 'fare']
clfs = {
    'RF': RandomForestClassifier(),
    'LR': LogisticRegression(),
    'GB': GradientBoostingClassifier(),
    'ET': ExtraTreesClassifier(),
    'KNN': KNeighborsClassifier(n_neighbors=300),
    'AB': AdaBoostClassifier()
}

for clf_name in clfs.keys():
    print clf_name
    clf = clfs[clf_name]
    clf.fit(titanic_df[feat_list][pred_size:].as_matrix(), titanic_df['survived'][pred_size:]) 
    pred_auto = clf.predict(titanic_df[feat_list][:pred_size].as_matrix())

    acc = sum(((pred_auto - auto_surv_list[:pred_size]) == 0))/float(pred_size)
    print 'Accuracy: ', acc
    p_r = precision_recall(pred_auto, auto_surv_list[:pred_size])
    print 'Precision: ', p_r[0]
    print 'Recall: ', p_r[1]
    print '----------------------------------------------'









    



KNN
Accuracy:  0.61935483871
Precision:  0.7
Recall:  0.208955223881
----------------------------------------------
AB
Accuracy:  0.735483870968
Precision:  0.685714285714
Recall:  0.716417910448
----------------------------------------------
RF
Accuracy:  0.664516129032
Precision:  0.631578947368
Recall:  0.537313432836
----------------------------------------------
LR
Accuracy:  0.748387096774
Precision:  0.705882352941
Recall:  0.716417910448
----------------------------------------------
GB
Accuracy:  0.787096774194
Precision:  0.783333333333
Recall:  0.701492537313
----------------------------------------------
ET
Accuracy:  0.690322580645
Precision:  0.672727272727
Recall:  0.55223880597
----------------------------------------------

Beware of Machine Learning Pitfalls



In [119]:

    
#WRONG WRONG WRONG!!!!!!
pred_size = int(0.15*len(titanic_df))
feat_list = ['sex', 'age', 'fare']
clfs = {
    'RF': RandomForestClassifier(),
    'LR': LogisticRegression(),
    'GB': GradientBoostingClassifier(),
    'ET': ExtraTreesClassifier(),
    'KNN': KNeighborsClassifier(),
    'AB': AdaBoostClassifier()
}

for clf_name in clfs.keys():
    print clf_name + ' - WRONG!'
    clf = clfs[clf_name]
    clf.fit(titanic_df[feat_list].as_matrix(), titanic_df['survived']) 
    pred_auto = clf.predict(titanic_df[feat_list].as_matrix())

    acc = sum(((pred_auto - auto_surv_list) == 0))/float(len(titanic_df))
    print 'Accuracy: ', acc
    p_r = precision_recall(pred_auto, auto_surv_list)
    print 'Precision: ', p_r[0]
    print 'Recall: ', p_r[1]
    print '----------------------------------------------'









    



KNN - WRONG!
Accuracy:  0.772420443587
Precision:  0.74293059126
Recall:  0.68
----------------------------------------------
AB - WRONG!
Accuracy:  0.790742526519
Precision:  0.738532110092
Recall:  0.757647058824
----------------------------------------------
RF - WRONG!
Accuracy:  0.965284474446
Precision:  0.977886977887
Recall:  0.936470588235
----------------------------------------------
LR - WRONG!
Accuracy:  0.77531340405
Precision:  0.738805970149
Recall:  0.698823529412
----------------------------------------------
GB - WRONG!
Accuracy:  0.854387656702
Precision:  0.862433862434
Recall:  0.767058823529
----------------------------------------------
ET - WRONG!
Accuracy:  0.976856316297
Precision:  0.995061728395
Recall:  0.948235294118
----------------------------------------------

What is the relative importance of our features?

Pitfall



In [120]:

    
clf = LogisticRegression()
clf.fit(titanic_df[['sex', 'age', 'fare']][pred_size:].as_matrix(), titanic_df['survived'][pred_size:]) 
plt.figure(figsize=(10,8))
plt.bar([1,2,3], clf.coef_[0], tick_label = ['sex', 'age', 'fare'])
plt.xticks([1.5,2.5,3.5])









    Out[120]:





([<matplotlib.axis.XTick at 0xe2e7e10>,
  <matplotlib.axis.XTick at 0xdd57d68>,
  <matplotlib.axis.XTick at 0x1076a208>],
 <a list of 3 Text xticklabel objects>)

More Correct ... but still not great



In [121]:

    
clf = LogisticRegression()
clf.fit(titanic_df[['sex', 'n_age', 'n_logfare']][pred_size:].as_matrix(), titanic_df['survived'][pred_size:]) 
plt.figure(figsize=(10,8))
plt.bar([1,2,3], clf.coef_[0], tick_label = ['sex', 'age', 'fare'])
plt.xticks([1.5,2.5,3.5])









    Out[121]:





([<matplotlib.axis.XTick at 0xe1304e0>,
  <matplotlib.axis.XTick at 0xc378208>,
  <matplotlib.axis.XTick at 0x1069ccc0>],
 <a list of 3 Text xticklabel objects>)

Let's use a more robust method



In [122]:

    
clf = RandomForestClassifier()
clf.fit(titanic_df[['sex', 'n_age', 'n_logfare']][pred_size:].as_matrix(), titanic_df['survived'][pred_size:]) 
plt.figure(figsize=(10,8))
plt.bar([1,2,3], clf.feature_importances_, tick_label = ['sex', 'age', 'fare'])
plt.xticks([1.5,2.5,3.5])









    Out[122]:





([<matplotlib.axis.XTick at 0xe150208>,
  <matplotlib.axis.XTick at 0xe150f28>,
  <matplotlib.axis.XTick at 0x108d2cf8>],
 <a list of 3 Text xticklabel objects>)



In [ ]:

	pclass	survived	name	sex	age	sibsp	parch	ticket	fare	cabin	embarked	boat	body	home.dest
0	1.0	0.0	Gee, Mr. Arthur H	male	47.0	0.0	0.0	111320	38.5000	E63	S	NaN	275.0	St Anne's-on-Sea, Lancashire
1	3.0	0.0	Goodwin, Mr. Charles Frederick	male	40.0	1.0	6.0	CA 2144	46.9000	NaN	S	NaN	NaN	Wiltshire, England Niagara Falls, NY
2	2.0	0.0	Gale, Mr. Shadrach	male	34.0	1.0	0.0	28664	21.0000	NaN	S	NaN	NaN	Cornwall / Clear Creek, CO
3	3.0	0.0	Carlsson, Mr. Carl Robert	male	24.0	0.0	0.0	350409	7.8542	NaN	S	NaN	NaN	Goteborg, Sweden Huntley, IL
4	2.0	0.0	Fahlstrom, Mr. Arne Jonas	male	18.0	0.0	0.0	236171	13.0000	NaN	S	NaN	NaN	Oslo, Norway Bayonne, NJ
5	2.0	0.0	Troupiansky, Mr. Moses Aaron	male	23.0	0.0	0.0	233639	13.0000	NaN	S	NaN	NaN	NaN
6	2.0	0.0	Harris, Mr. Walter	male	30.0	0.0	0.0	W/C 14208	10.5000	NaN	S	NaN	NaN	Walthamstow, England
7	3.0	0.0	Wittevrongel, Mr. Camille	male	36.0	0.0	0.0	345771	9.5000	NaN	S	NaN	NaN	NaN
8	3.0	0.0	Johnson, Mr. Alfred	male	49.0	0.0	0.0	LINE	0.0000	NaN	S	NaN	NaN	NaN
9	3.0	0.0	Peduzzi, Mr. Joseph	male	NaN	0.0	0.0	A/5 2817	8.0500	NaN	S	NaN	NaN	NaN
10	3.0	1.0	Murphy, Miss. Margaret Jane	female	NaN	1.0	0.0	367230	15.5000	NaN	Q	16	NaN	NaN
11	1.0	1.0	Beckwith, Mrs. Richard Leonard (Sallie Monypeny)	female	47.0	1.0	1.0	11751	52.5542	D35	S	5	NaN	New York, NY
12	1.0	1.0	Minahan, Mrs. William Edward (Lillian E Thorpe)	female	37.0	1.0	0.0	19928	90.0000	C78	Q	14	NaN	Fond du Lac, WI
13	1.0	0.0	Chaffee, Mr. Herbert Fuller	male	46.0	1.0	0.0	W.E.P. 5734	61.1750	E31	S	NaN	NaN	Amenia, ND
14	3.0	1.0	Moubarek, Master. Gerios	male	NaN	1.0	1.0	2661	15.2458	NaN	C	C	NaN	NaN
15	2.0	0.0	Funk, Miss. Annie Clemmer	female	38.0	0.0	0.0	237671	13.0000	NaN	S	NaN	NaN	Janjgir, India / Pennsylvania
16	3.0	1.0	Olsson, Mr. Oscar Wilhelm	male	32.0	0.0	0.0	347079	7.7750	NaN	S	A	NaN	NaN
17	2.0	0.0	Chapman, Mr. Charles Henry	male	52.0	0.0	0.0	248731	13.5000	NaN	S	NaN	130.0	Bronx, NY
18	3.0	1.0	Kennedy, Mr. John	male	NaN	0.0	0.0	368783	7.7500	NaN	Q	NaN	NaN	NaN
19	1.0	1.0	Willard, Miss. Constance	female	21.0	0.0	0.0	113795	26.5500	NaN	S	8 10	NaN	Duluth, MN
20	2.0	0.0	Rogers, Mr. Reginald Harry	male	19.0	0.0	0.0	28004	10.5000	NaN	S	NaN	NaN	NaN
21	3.0	0.0	Maenpaa, Mr. Matti Alexanteri	male	22.0	0.0	0.0	STON/O 2. 3101275	7.1250	NaN	S	NaN	NaN	NaN
22	3.0	0.0	Gustafsson, Mr. Anders Vilhelm	male	37.0	2.0	0.0	3101276	7.9250	NaN	S	NaN	98.0	Ruotsinphytaa, Finland New York, NY
23	3.0	0.0	Goodwin, Mr. Charles Edward	male	14.0	5.0	2.0	CA 2144	46.9000	NaN	S	NaN	NaN	Wiltshire, England Niagara Falls, NY
24	3.0	0.0	Calic, Mr. Jovo	male	17.0	0.0	0.0	315093	8.6625	NaN	S	NaN	NaN	NaN
25	2.0	0.0	Hickman, Mr. Leonard Mark	male	24.0	2.0	0.0	S.O.C. 14879	73.5000	NaN	S	NaN	NaN	West Hampstead, London / Neepawa, MB
26	3.0	0.0	Caram, Mrs. Joseph (Maria Elias)	female	NaN	1.0	0.0	2689	14.4583	NaN	C	NaN	NaN	Ottawa, ON
27	3.0	0.0	O'Brien, Mr. Timothy	male	NaN	0.0	0.0	330979	7.8292	NaN	Q	NaN	NaN	NaN
28	2.0	0.0	Sobey, Mr. Samuel James Hayden	male	25.0	0.0	0.0	C.A. 29178	13.0000	NaN	S	NaN	NaN	Cornwall / Houghton, MI
29	3.0	1.0	Nilsson, Miss. Berta Olivia	female	18.0	0.0	0.0	347066	7.7750	NaN	S	D	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1280	2.0	1.0	Weisz, Mrs. Leopold (Mathilde Francoise Pede)	female	29.0	1.0	0.0	228414	26.0000	NaN	S	10	NaN	Bromsgrove, England / Montreal, PQ
1281	3.0	0.0	Doyle, Miss. Elizabeth	female	24.0	0.0	0.0	368702	7.7500	NaN	Q	NaN	NaN	Ireland New York, NY
1282	1.0	0.0	Jones, Mr. Charles Cresson	male	46.0	0.0	0.0	694	26.0000	NaN	S	NaN	80.0	Bennington, VT
1283	3.0	0.0	Bourke, Mr. John	male	40.0	1.0	1.0	364849	15.5000	NaN	Q	NaN	NaN	Ireland Chicago, IL
1284	3.0	0.0	Moutal, Mr. Rahamin Haim	male	NaN	0.0	0.0	374746	8.0500	NaN	S	NaN	NaN	NaN
1285	3.0	0.0	Davison, Mr. Thomas Henry	male	NaN	1.0	0.0	386525	16.1000	NaN	S	NaN	NaN	Liverpool, England Bedford, OH
1286	1.0	0.0	Parr, Mr. William Henry Marsh	male	NaN	0.0	0.0	112052	0.0000	NaN	S	NaN	NaN	Belfast
1287	3.0	0.0	Rosblom, Miss. Salli Helena	female	2.0	1.0	1.0	370129	20.2125	NaN	S	NaN	NaN	NaN
1288	3.0	0.0	Mitkoff, Mr. Mito	male	NaN	0.0	0.0	349221	7.8958	NaN	S	NaN	NaN	NaN
1289	3.0	0.0	Pavlovic, Mr. Stefo	male	32.0	0.0	0.0	349242	7.8958	NaN	S	NaN	NaN	NaN
1290	3.0	0.0	Palsson, Miss. Torborg Danira	female	8.0	3.0	1.0	349909	21.0750	NaN	S	NaN	NaN	NaN
1291	2.0	0.0	Howard, Mrs. Benjamin (Ellen Truelove Arman)	female	60.0	1.0	0.0	24065	26.0000	NaN	S	NaN	NaN	Swindon, England
1292	3.0	0.0	Nosworthy, Mr. Richard Cater	male	21.0	0.0	0.0	A/4. 39886	7.8000	NaN	S	NaN	NaN	NaN
1293	2.0	0.0	Gaskell, Mr. Alfred	male	16.0	0.0	0.0	239865	26.0000	NaN	S	NaN	NaN	Liverpool / Montreal, PQ
1294	3.0	0.0	Peltomaki, Mr. Nikolai Johannes	male	25.0	0.0	0.0	STON/O 2. 3101291	7.9250	NaN	S	NaN	NaN	NaN
1295	1.0	1.0	Taussig, Mrs. Emil (Tillie Mandelbaum)	female	39.0	1.0	1.0	110413	79.6500	E67	S	8	NaN	New York, NY
1296	1.0	1.0	Frolicher, Miss. Hedwig Margaritha	female	22.0	0.0	2.0	13568	49.5000	B39	C	5	NaN	Zurich, Switzerland
1297	1.0	1.0	Spedden, Mr. Frederic Oakley	male	45.0	1.0	1.0	16966	134.5000	E34	C	3	NaN	Tuxedo Park, NY
1298	3.0	0.0	Jensen, Mr. Hans Peder	male	20.0	0.0	0.0	350050	7.8542	NaN	S	NaN	NaN	NaN
1299	3.0	1.0	Hansen, Mrs. Claus Peter (Jennie L Howard)	female	45.0	1.0	0.0	350026	14.1083	NaN	S	11	NaN	NaN
1300	2.0	1.0	Watt, Miss. Bertha J	female	12.0	0.0	0.0	C.A. 33595	15.7500	NaN	S	9	NaN	Aberdeen / Portland, OR
1301	1.0	0.0	Warren, Mr. Frank Manley	male	64.0	1.0	0.0	110813	75.2500	D37	C	NaN	NaN	Portland, OR
1302	2.0	1.0	Beesley, Mr. Lawrence	male	34.0	0.0	0.0	248698	13.0000	D56	S	13	NaN	London
1303	2.0	1.0	Slayter, Miss. Hilda Mary	female	30.0	0.0	0.0	234818	12.3500	NaN	Q	13	NaN	Halifax, NS
1304	1.0	0.0	White, Mr. Percival Wayland	male	54.0	0.0	1.0	35281	77.2875	D26	S	NaN	NaN	Brunswick, ME
1305	3.0	1.0	Salkjelsvik, Miss. Anna Kristine	female	21.0	0.0	0.0	343120	7.6500	NaN	S	C	NaN	NaN
1306	3.0	0.0	Waelens, Mr. Achille	male	22.0	0.0	0.0	345767	9.0000	NaN	S	NaN	NaN	Antwerp, Belgium / Stanton, OH
1307	3.0	0.0	Jussila, Miss. Katriina	female	20.0	1.0	0.0	4136	9.8250	NaN	S	NaN	NaN	NaN
1308	3.0	0.0	Svensson, Mr. Johan	male	74.0	0.0	0.0	347060	7.7750	NaN	S	NaN	NaN	NaN
1309	3.0	0.0	Hansen, Mr. Claus Peter	male	41.0	2.0	0.0	350026	14.1083	NaN	S	NaN	NaN	NaN

	sex	age	fare	survived	n_age	logfare	n_logfare
0	0	47.0	38.5000	0.0	1.191739	3.650658	0.636174
1	0	40.0	46.9000	0.0	0.705263	3.848018	0.839037
2	0	34.0	21.0000	0.0	0.288283	3.044522	0.013137
3	0	24.0	7.8542	0.0	-0.406682	2.061048	-0.997760
4	0	18.0	13.0000	0.0	-0.823662	2.564949	-0.479808
5	0	23.0	13.0000	0.0	-0.476179	2.564949	-0.479808
6	0	30.0	10.5000	0.0	0.010297	2.351375	-0.699338
7	0	36.0	9.5000	0.0	0.427276	2.251292	-0.802212
9	1	47.0	52.5542	1.0	1.191739	3.961845	0.956038
10	1	37.0	90.0000	1.0	0.496773	4.499810	1.509003
11	0	46.0	61.1750	0.0	1.122242	4.113739	1.112167
12	1	38.0	13.0000	0.0	0.566270	2.564949	-0.479808
13	0	32.0	7.7750	1.0	0.149290	2.050913	-1.008177
14	0	52.0	13.5000	0.0	1.539221	2.602690	-0.441016
15	1	21.0	26.5500	1.0	-0.615172	3.279030	0.254183
16	0	19.0	10.5000	0.0	-0.754165	2.351375	-0.699338
17	0	22.0	7.1250	0.0	-0.545675	1.963610	-1.097915
18	0	37.0	7.9250	0.0	0.496773	2.070022	-0.988536
19	0	14.0	46.9000	0.0	-1.101648	3.848018	0.839037
20	0	17.0	8.6625	0.0	-0.893158	2.159003	-0.897074
21	0	24.0	73.5000	0.0	-0.406682	4.297285	1.300831
22	0	25.0	13.0000	0.0	-0.337186	2.564949	-0.479808
23	1	18.0	7.7750	1.0	-0.823662	2.050913	-1.008177
24	0	33.0	26.5500	0.0	0.218787	3.279030	0.254183
25	0	27.0	76.7292	1.0	-0.198193	4.340282	1.345027
26	0	18.0	7.8542	0.0	-0.823662	2.061048	-0.997760
27	1	18.0	7.7750	0.0	-0.823662	2.050913	-1.008177
28	0	16.0	9.5000	0.0	-0.962655	2.251292	-0.802212
29	1	38.0	7.7750	0.0	0.566270	2.050913	-1.008177
30	0	3.0	26.0000	1.0	-1.866110	3.258097	0.232666
...	...	...	...	...	...	...	...
1015	0	29.0	7.9250	0.0	-0.059200	2.070022	-0.988536
1016	1	53.0	51.4792	1.0	1.608718	3.941178	0.934794
1017	0	21.0	8.0500	0.0	-0.615172	2.085672	-0.972450
1018	0	33.0	8.6625	0.0	0.218787	2.159003	-0.897074
1019	1	29.0	26.0000	1.0	-0.059200	3.258097	0.232666
1020	1	24.0	7.7500	0.0	-0.406682	2.047693	-1.011488
1021	0	46.0	26.0000	0.0	1.122242	3.258097	0.232666
1022	0	40.0	15.5000	0.0	0.705263	2.740840	-0.299013
1023	1	2.0	20.2125	0.0	-1.935607	3.006301	-0.026150
1024	0	32.0	7.8958	0.0	0.149290	2.066331	-0.992330
1025	1	8.0	21.0750	0.0	-1.518627	3.048088	0.016801
1026	1	60.0	26.0000	0.0	2.095194	3.258097	0.232666
1027	0	21.0	7.8000	0.0	-0.615172	2.054124	-1.004878
1028	0	16.0	26.0000	0.0	-0.962655	3.258097	0.232666
1029	0	25.0	7.9250	0.0	-0.337186	2.070022	-0.988536
1030	1	39.0	79.6500	1.0	0.635766	4.377642	1.383429
1031	1	22.0	49.5000	1.0	-0.545675	3.901973	0.894496
1032	0	45.0	134.5000	1.0	1.052745	4.901564	1.921960
1033	0	20.0	7.8542	0.0	-0.684669	2.061048	-0.997760
1034	1	45.0	14.1083	1.0	1.052745	2.646763	-0.395713
1035	1	12.0	15.7500	1.0	-1.240641	2.756840	-0.282567
1036	0	64.0	75.2500	0.0	2.373180	4.320816	1.325018
1037	0	34.0	13.0000	1.0	0.288283	2.564949	-0.479808
1038	1	30.0	12.3500	1.0	0.010297	2.513656	-0.532532
1039	0	54.0	77.2875	0.0	1.678215	4.347532	1.352479
1040	1	21.0	7.6500	1.0	-0.615172	2.034706	-1.024837
1041	0	22.0	9.0000	0.0	-0.545675	2.197225	-0.857787
1042	1	20.0	9.8250	0.0	-0.684669	2.284930	-0.767635
1043	0	74.0	7.7750	0.0	3.068146	2.050913	-1.008177
1044	0	41.0	14.1083	0.0	0.774759	2.646763	-0.395713