notebook.community

Edit and run



In [3]:

    
# Title: Titanic- Machine Learning Through Disaster 
# Objective: Prediction of Survival on the Titanic 
# Model 1: Using a Simple Model based on Gender only

# Imports

# pandas, numpy
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# csv, matplotlib, seaborn
import csv
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB



In [4]:

    
# For .read_csv, we always use header=0 when we know row 0 is the header row
df = pd.read_csv('Desktop/titanic/train.csv', header=0)
df.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB



In [5]:

    
# Type of object: 
type(df)









    Out[5]:





pandas.core.frame.DataFrame



In [6]:

    
df









    Out[6]:






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      0
      1
      0
      3
      Braund, Mr. Owen Harris
      male
      22.0
      1
      0
      A/5 21171
      7.2500
      NaN
      S
    
    
      1
      2
      1
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38.0
      1
      0
      PC 17599
      71.2833
      C85
      C
    
    
      2
      3
      1
      3
      Heikkinen, Miss. Laina
      female
      26.0
      0
      0
      STON/O2. 3101282
      7.9250
      NaN
      S
    
    
      3
      4
      1
      1
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      female
      35.0
      1
      0
      113803
      53.1000
      C123
      S
    
    
      4
      5
      0
      3
      Allen, Mr. William Henry
      male
      35.0
      0
      0
      373450
      8.0500
      NaN
      S
    
    
      5
      6
      0
      3
      Moran, Mr. James
      male
      NaN
      0
      0
      330877
      8.4583
      NaN
      Q
    
    
      6
      7
      0
      1
      McCarthy, Mr. Timothy J
      male
      54.0
      0
      0
      17463
      51.8625
      E46
      S
    
    
      7
      8
      0
      3
      Palsson, Master. Gosta Leonard
      male
      2.0
      3
      1
      349909
      21.0750
      NaN
      S
    
    
      8
      9
      1
      3
      Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)
      female
      27.0
      0
      2
      347742
      11.1333
      NaN
      S
    
    
      9
      10
      1
      2
      Nasser, Mrs. Nicholas (Adele Achem)
      female
      14.0
      1
      0
      237736
      30.0708
      NaN
      C
    
    
      10
      11
      1
      3
      Sandstrom, Miss. Marguerite Rut
      female
      4.0
      1
      1
      PP 9549
      16.7000
      G6
      S
    
    
      11
      12
      1
      1
      Bonnell, Miss. Elizabeth
      female
      58.0
      0
      0
      113783
      26.5500
      C103
      S
    
    
      12
      13
      0
      3
      Saundercock, Mr. William Henry
      male
      20.0
      0
      0
      A/5. 2151
      8.0500
      NaN
      S
    
    
      13
      14
      0
      3
      Andersson, Mr. Anders Johan
      male
      39.0
      1
      5
      347082
      31.2750
      NaN
      S
    
    
      14
      15
      0
      3
      Vestrom, Miss. Hulda Amanda Adolfina
      female
      14.0
      0
      0
      350406
      7.8542
      NaN
      S
    
    
      15
      16
      1
      2
      Hewlett, Mrs. (Mary D Kingcome)
      female
      55.0
      0
      0
      248706
      16.0000
      NaN
      S
    
    
      16
      17
      0
      3
      Rice, Master. Eugene
      male
      2.0
      4
      1
      382652
      29.1250
      NaN
      Q
    
    
      17
      18
      1
      2
      Williams, Mr. Charles Eugene
      male
      NaN
      0
      0
      244373
      13.0000
      NaN
      S
    
    
      18
      19
      0
      3
      Vander Planke, Mrs. Julius (Emelia Maria Vande...
      female
      31.0
      1
      0
      345763
      18.0000
      NaN
      S
    
    
      19
      20
      1
      3
      Masselmani, Mrs. Fatima
      female
      NaN
      0
      0
      2649
      7.2250
      NaN
      C
    
    
      20
      21
      0
      2
      Fynney, Mr. Joseph J
      male
      35.0
      0
      0
      239865
      26.0000
      NaN
      S
    
    
      21
      22
      1
      2
      Beesley, Mr. Lawrence
      male
      34.0
      0
      0
      248698
      13.0000
      D56
      S
    
    
      22
      23
      1
      3
      McGowan, Miss. Anna "Annie"
      female
      15.0
      0
      0
      330923
      8.0292
      NaN
      Q
    
    
      23
      24
      1
      1
      Sloper, Mr. William Thompson
      male
      28.0
      0
      0
      113788
      35.5000
      A6
      S
    
    
      24
      25
      0
      3
      Palsson, Miss. Torborg Danira
      female
      8.0
      3
      1
      349909
      21.0750
      NaN
      S
    
    
      25
      26
      1
      3
      Asplund, Mrs. Carl Oscar (Selma Augusta Emilia...
      female
      38.0
      1
      5
      347077
      31.3875
      NaN
      S
    
    
      26
      27
      0
      3
      Emir, Mr. Farred Chehab
      male
      NaN
      0
      0
      2631
      7.2250
      NaN
      C
    
    
      27
      28
      0
      1
      Fortune, Mr. Charles Alexander
      male
      19.0
      3
      2
      19950
      263.0000
      C23 C25 C27
      S
    
    
      28
      29
      1
      3
      O'Dwyer, Miss. Ellen "Nellie"
      female
      NaN
      0
      0
      330959
      7.8792
      NaN
      Q
    
    
      29
      30
      0
      3
      Todoroff, Mr. Lalio
      male
      NaN
      0
      0
      349216
      7.8958
      NaN
      S
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      861
      862
      0
      2
      Giles, Mr. Frederick Edward
      male
      21.0
      1
      0
      28134
      11.5000
      NaN
      S
    
    
      862
      863
      1
      1
      Swift, Mrs. Frederick Joel (Margaret Welles Ba...
      female
      48.0
      0
      0
      17466
      25.9292
      D17
      S
    
    
      863
      864
      0
      3
      Sage, Miss. Dorothy Edith "Dolly"
      female
      NaN
      8
      2
      CA. 2343
      69.5500
      NaN
      S
    
    
      864
      865
      0
      2
      Gill, Mr. John William
      male
      24.0
      0
      0
      233866
      13.0000
      NaN
      S
    
    
      865
      866
      1
      2
      Bystrom, Mrs. (Karolina)
      female
      42.0
      0
      0
      236852
      13.0000
      NaN
      S
    
    
      866
      867
      1
      2
      Duran y More, Miss. Asuncion
      female
      27.0
      1
      0
      SC/PARIS 2149
      13.8583
      NaN
      C
    
    
      867
      868
      0
      1
      Roebling, Mr. Washington Augustus II
      male
      31.0
      0
      0
      PC 17590
      50.4958
      A24
      S
    
    
      868
      869
      0
      3
      van Melkebeke, Mr. Philemon
      male
      NaN
      0
      0
      345777
      9.5000
      NaN
      S
    
    
      869
      870
      1
      3
      Johnson, Master. Harold Theodor
      male
      4.0
      1
      1
      347742
      11.1333
      NaN
      S
    
    
      870
      871
      0
      3
      Balkic, Mr. Cerin
      male
      26.0
      0
      0
      349248
      7.8958
      NaN
      S
    
    
      871
      872
      1
      1
      Beckwith, Mrs. Richard Leonard (Sallie Monypeny)
      female
      47.0
      1
      1
      11751
      52.5542
      D35
      S
    
    
      872
      873
      0
      1
      Carlsson, Mr. Frans Olof
      male
      33.0
      0
      0
      695
      5.0000
      B51 B53 B55
      S
    
    
      873
      874
      0
      3
      Vander Cruyssen, Mr. Victor
      male
      47.0
      0
      0
      345765
      9.0000
      NaN
      S
    
    
      874
      875
      1
      2
      Abelson, Mrs. Samuel (Hannah Wizosky)
      female
      28.0
      1
      0
      P/PP 3381
      24.0000
      NaN
      C
    
    
      875
      876
      1
      3
      Najib, Miss. Adele Kiamie "Jane"
      female
      15.0
      0
      0
      2667
      7.2250
      NaN
      C
    
    
      876
      877
      0
      3
      Gustafsson, Mr. Alfred Ossian
      male
      20.0
      0
      0
      7534
      9.8458
      NaN
      S
    
    
      877
      878
      0
      3
      Petroff, Mr. Nedelio
      male
      19.0
      0
      0
      349212
      7.8958
      NaN
      S
    
    
      878
      879
      0
      3
      Laleff, Mr. Kristo
      male
      NaN
      0
      0
      349217
      7.8958
      NaN
      S
    
    
      879
      880
      1
      1
      Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)
      female
      56.0
      0
      1
      11767
      83.1583
      C50
      C
    
    
      880
      881
      1
      2
      Shelley, Mrs. William (Imanita Parrish Hall)
      female
      25.0
      0
      1
      230433
      26.0000
      NaN
      S
    
    
      881
      882
      0
      3
      Markun, Mr. Johann
      male
      33.0
      0
      0
      349257
      7.8958
      NaN
      S
    
    
      882
      883
      0
      3
      Dahlberg, Miss. Gerda Ulrika
      female
      22.0
      0
      0
      7552
      10.5167
      NaN
      S
    
    
      883
      884
      0
      2
      Banfield, Mr. Frederick James
      male
      28.0
      0
      0
      C.A./SOTON 34068
      10.5000
      NaN
      S
    
    
      884
      885
      0
      3
      Sutehall, Mr. Henry Jr
      male
      25.0
      0
      0
      SOTON/OQ 392076
      7.0500
      NaN
      S
    
    
      885
      886
      0
      3
      Rice, Mrs. William (Margaret Norton)
      female
      39.0
      0
      5
      382652
      29.1250
      NaN
      Q
    
    
      886
      887
      0
      2
      Montvila, Rev. Juozas
      male
      27.0
      0
      0
      211536
      13.0000
      NaN
      S
    
    
      887
      888
      1
      1
      Graham, Miss. Margaret Edith
      female
      19.0
      0
      0
      112053
      30.0000
      B42
      S
    
    
      888
      889
      0
      3
      Johnston, Miss. Catherine Helen "Carrie"
      female
      NaN
      1
      2
      W./C. 6607
      23.4500
      NaN
      S
    
    
      889
      890
      1
      1
      Behr, Mr. Karl Howell
      male
      26.0
      0
      0
      111369
      30.0000
      C148
      C
    
    
      890
      891
      0
      3
      Dooley, Mr. Patrick
      male
      32.0
      0
      0
      370376
      7.7500
      NaN
      Q
    
  

891 rows × 12 columns



In [7]:

    
# Data Types interpreted by Pandas csv reader
df.dtypes









    Out[7]:





PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object



In [8]:

    
# Filter out columns whose data type is an object
df.dtypes[df.dtypes.map(lambda x: x=='object')]









    Out[8]:





Name        object
Sex         object
Ticket      object
Cabin       object
Embarked    object
dtype: object



In [9]:

    
# Displays the first few data frames (By default, 5 rows)
df.head()









    Out[9]:






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      0
      1
      0
      3
      Braund, Mr. Owen Harris
      male
      22.0
      1
      0
      A/5 21171
      7.2500
      NaN
      S
    
    
      1
      2
      1
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38.0
      1
      0
      PC 17599
      71.2833
      C85
      C
    
    
      2
      3
      1
      3
      Heikkinen, Miss. Laina
      female
      26.0
      0
      0
      STON/O2. 3101282
      7.9250
      NaN
      S
    
    
      3
      4
      1
      1
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      female
      35.0
      1
      0
      113803
      53.1000
      C123
      S
    
    
      4
      5
      0
      3
      Allen, Mr. William Henry
      male
      35.0
      0
      0
      373450
      8.0500
      NaN
      S



In [10]:

    
# Displays the last few data frames (By default, 5 rows)
df.tail()









    Out[10]:






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      886
      887
      0
      2
      Montvila, Rev. Juozas
      male
      27.0
      0
      0
      211536
      13.00
      NaN
      S
    
    
      887
      888
      1
      1
      Graham, Miss. Margaret Edith
      female
      19.0
      0
      0
      112053
      30.00
      B42
      S
    
    
      888
      889
      0
      3
      Johnston, Miss. Catherine Helen "Carrie"
      female
      NaN
      1
      2
      W./C. 6607
      23.45
      NaN
      S
    
    
      889
      890
      1
      1
      Behr, Mr. Karl Howell
      male
      26.0
      0
      0
      111369
      30.00
      C148
      C
    
    
      890
      891
      0
      3
      Dooley, Mr. Patrick
      male
      32.0
      0
      0
      370376
      7.75
      NaN
      Q



In [11]:

    
# Mathematical Description of Data Frame (in terms of count, mean, etc.)
df.describe()









    Out[11]:






  
    
      
      PassengerId
      Survived
      Pclass
      Age
      SibSp
      Parch
      Fare
    
  
  
    
      count
      891.000000
      891.000000
      891.000000
      714.000000
      891.000000
      891.000000
      891.000000
    
    
      mean
      446.000000
      0.383838
      2.308642
      29.699118
      0.523008
      0.381594
      32.204208
    
    
      std
      257.353842
      0.486592
      0.836071
      14.526497
      1.102743
      0.806057
      49.693429
    
    
      min
      1.000000
      0.000000
      1.000000
      0.420000
      0.000000
      0.000000
      0.000000
    
    
      25%
      223.500000
      0.000000
      2.000000
      20.125000
      0.000000
      0.000000
      7.910400
    
    
      50%
      446.000000
      0.000000
      3.000000
      28.000000
      0.000000
      0.000000
      14.454200
    
    
      75%
      668.500000
      1.000000
      3.000000
      38.000000
      1.000000
      0.000000
      31.000000
    
    
      max
      891.000000
      1.000000
      3.000000
      80.000000
      8.000000
      6.000000
      512.329200



In [12]:

    
# Display the first 10 rows of the Age column
df['Age'][0:10]









    Out[12]:





0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64



In [13]:

    
#Type of Age object
type(df['Age'])









    Out[13]:





pandas.core.series.Series



In [14]:

    
# Mean Value for the Age Column
df['Age'].mean()









    Out[14]:





29.69911764705882



In [15]:

    
# Selection of subsets of the dataframe
df[ ['Sex', 'Pclass', 'Age'] ].head()



In [16]:

    
# Filtering out passengers with Age > 60
df[df['Age'] > 60].head()









    Out[16]:






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      33
      34
      0
      2
      Wheadon, Mr. Edward H
      male
      66.0
      0
      0
      C.A. 24579
      10.5000
      NaN
      S
    
    
      54
      55
      0
      1
      Ostby, Mr. Engelhart Cornelius
      male
      65.0
      0
      1
      113509
      61.9792
      B30
      C
    
    
      96
      97
      0
      1
      Goldschmidt, Mr. George B
      male
      71.0
      0
      0
      PC 17754
      34.6542
      A5
      C
    
    
      116
      117
      0
      3
      Connors, Mr. Patrick
      male
      70.5
      0
      0
      370369
      7.7500
      NaN
      Q
    
    
      170
      171
      0
      1
      Van der hoef, Mr. Wyckoff
      male
      61.0
      0
      0
      111240
      33.5000
      B19
      S



In [17]:

    
# Combination of above two scripts
df[df['Age'] > 60][['Sex', 'Pclass', 'Age', 'Survived']].head()



In [18]:

    
# Passengers whose age is unavailable
df[df['Age'].isnull()][['Sex', 'Pclass', 'Age']].head()



In [19]:

    
# Use inside loop for condition checking
for i in range(1,4):
    print i, len(df[ (df['Sex'] == 'male') & (df['Pclass'] == i) ])



In [20]:

    
# Visualizing data as a histogram
import pylab as P
df['Age'].hist()
P.show()



In [21]:

    
# Mentioning the Age group size and other necessary constraints
df['Age'].dropna().hist(bins=16, range = (0,80), alpha = 0.5)
P.show()



In [22]:

    
# Adding a new Column named Gender to the Dataframe
# lambda x is an built-in function of python for generating an anonymous function in the moment, at runtime.
df['Gender'] = df['Sex'].map( lambda x: x[0].upper() )
df.head()









    Out[22]:






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
      Gender
    
  
  
    
      0
      1
      0
      3
      Braund, Mr. Owen Harris
      male
      22.0
      1
      0
      A/5 21171
      7.2500
      NaN
      S
      M
    
    
      1
      2
      1
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38.0
      1
      0
      PC 17599
      71.2833
      C85
      C
      F
    
    
      2
      3
      1
      3
      Heikkinen, Miss. Laina
      female
      26.0
      0
      0
      STON/O2. 3101282
      7.9250
      NaN
      S
      F
    
    
      3
      4
      1
      1
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      female
      35.0
      1
      0
      113803
      53.1000
      C123
      S
      F
    
    
      4
      5
      0
      3
      Allen, Mr. William Henry
      male
      35.0
      0
      0
      373450
      8.0500
      NaN
      S
      M



In [23]:

    
# Overwriting the Gender column with binary values
df['Gender'] = df['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
df.head()









    Out[23]:






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
      Gender
    
  
  
    
      0
      1
      0
      3
      Braund, Mr. Owen Harris
      male
      22.0
      1
      0
      A/5 21171
      7.2500
      NaN
      S
      1
    
    
      1
      2
      1
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38.0
      1
      0
      PC 17599
      71.2833
      C85
      C
      0
    
    
      2
      3
      1
      3
      Heikkinen, Miss. Laina
      female
      26.0
      0
      0
      STON/O2. 3101282
      7.9250
      NaN
      S
      0
    
    
      3
      4
      1
      1
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      female
      35.0
      1
      0
      113803
      53.1000
      C123
      S
      0
    
    
      4
      5
      0
      3
      Allen, Mr. William Henry
      male
      35.0
      0
      0
      373450
      8.0500
      NaN
      S
      1



In [24]:

    
df['Embarked'].unique()









    Out[24]:





array(['S', 'C', 'Q', nan], dtype=object)



In [25]:

    
df['Embarked'] = df['Embarked'].fillna('T')
df['Port'] = df['Embarked'].map({'S':1,'C':2,'Q':3,'T':0}).astype(int)
df[df['Port'] == 0]









    Out[25]:






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
      Gender
      Port
    
  
  
    
      61
      62
      1
      1
      Icard, Miss. Amelie
      female
      38.0
      0
      0
      113572
      80.0
      B28
      T
      0
      0
    
    
      829
      830
      1
      1
      Stone, Mrs. George Nelson (Martha Evelyn)
      female
      62.0
      0
      0
      113572
      80.0
      B28
      T
      0
      0



In [26]:

    
df['Embarked'].unique()









    Out[26]:





array(['S', 'C', 'Q', 'T'], dtype=object)



In [27]:

    
# Creating a new table of dimension (gender * class)
median_ages = np.zeros((2,3))
median_ages









    Out[27]:





array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])



In [28]:

    
# Computing the median of ages separately for each specific gender and class 
for i in range(0, 2):
    for j in range(0, 3):
        median_ages[i,j] = df[(df['Gender'] == i) & (df['Pclass'] == j+1)]['Age'].dropna().median()
median_ages









    Out[28]:





array([[ 35. ,  28. ,  21.5],
       [ 40. ,  30. ,  25. ]])



In [29]:

    
# To make changes to the Age Column, we create a new one,modify in accordance with the existing data and delete the former
df['AgeFill'] = df['Age']
df.head()









    Out[29]:






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
      Gender
      Port
      AgeFill
    
  
  
    
      0
      1
      0
      3
      Braund, Mr. Owen Harris
      male
      22.0
      1
      0
      A/5 21171
      7.2500
      NaN
      S
      1
      1
      22.0
    
    
      1
      2
      1
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38.0
      1
      0
      PC 17599
      71.2833
      C85
      C
      0
      2
      38.0
    
    
      2
      3
      1
      3
      Heikkinen, Miss. Laina
      female
      26.0
      0
      0
      STON/O2. 3101282
      7.9250
      NaN
      S
      0
      1
      26.0
    
    
      3
      4
      1
      1
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      female
      35.0
      1
      0
      113803
      53.1000
      C123
      S
      0
      1
      35.0
    
    
      4
      5
      0
      3
      Allen, Mr. William Henry
      male
      35.0
      0
      0
      373450
      8.0500
      NaN
      S
      1
      1
      35.0



In [30]:

    
df[ df['Age'].isnull() ][['Gender','Pclass','Age','AgeFill']].head(10)



In [31]:

    
# Replacing Nans in AgeFill column by the median values of table in accordance with the passenger's class and gender
for i in range(0, 2):
    for j in range(0, 3):
        df.loc[ (df.Age.isnull()) & (df.Gender == i) & (df.Pclass == j+1),'AgeFill'] = median_ages[i,j]
df[ df['Age'].isnull() ][['Gender','Pclass','Age','AgeFill']].head(10)



In [32]:

    
# AgeIsNull column holds binary value based on whether the Age for a particular passenger is available or not
df['AgeIsNull'] = pd.isnull(df.Age).astype(int)
df[['Gender','Pclass','Age','AgeFill','AgeIsNull']].head(10)



In [33]:

    
# Feature Engineering usage
df['FamilySize'] = df['SibSp'] + df['Parch']
df.head()









    Out[33]:






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
      Gender
      Port
      AgeFill
      AgeIsNull
      FamilySize
    
  
  
    
      0
      1
      0
      3
      Braund, Mr. Owen Harris
      male
      22.0
      1
      0
      A/5 21171
      7.2500
      NaN
      S
      1
      1
      22.0
      0
      1
    
    
      1
      2
      1
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38.0
      1
      0
      PC 17599
      71.2833
      C85
      C
      0
      2
      38.0
      0
      1
    
    
      2
      3
      1
      3
      Heikkinen, Miss. Laina
      female
      26.0
      0
      0
      STON/O2. 3101282
      7.9250
      NaN
      S
      0
      1
      26.0
      0
      0
    
    
      3
      4
      1
      1
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      female
      35.0
      1
      0
      113803
      53.1000
      C123
      S
      0
      1
      35.0
      0
      1
    
    
      4
      5
      0
      3
      Allen, Mr. William Henry
      male
      35.0
      0
      0
      373450
      8.0500
      NaN
      S
      1
      1
      35.0
      0
      0



In [34]:

    
# Creating a combination of 2 columns
df['Age*Class'] = df.AgeFill * df.Pclass
df.head()









    Out[34]:






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
      Gender
      Port
      AgeFill
      AgeIsNull
      FamilySize
      Age*Class
    
  
  
    
      0
      1
      0
      3
      Braund, Mr. Owen Harris
      male
      22.0
      1
      0
      A/5 21171
      7.2500
      NaN
      S
      1
      1
      22.0
      0
      1
      66.0
    
    
      1
      2
      1
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38.0
      1
      0
      PC 17599
      71.2833
      C85
      C
      0
      2
      38.0
      0
      1
      38.0
    
    
      2
      3
      1
      3
      Heikkinen, Miss. Laina
      female
      26.0
      0
      0
      STON/O2. 3101282
      7.9250
      NaN
      S
      0
      1
      26.0
      0
      0
      78.0
    
    
      3
      4
      1
      1
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      female
      35.0
      1
      0
      113803
      53.1000
      C123
      S
      0
      1
      35.0
      0
      1
      35.0
    
    
      4
      5
      0
      3
      Allen, Mr. William Henry
      male
      35.0
      0
      0
      373450
      8.0500
      NaN
      S
      1
      1
      35.0
      0
      0
      105.0



In [35]:

    
# Dropping columns not in use
df = df.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1) 
df.head()









    Out[35]:






  
    
      
      PassengerId
      Survived
      Pclass
      Age
      SibSp
      Parch
      Fare
      Gender
      Port
      AgeFill
      AgeIsNull
      FamilySize
      Age*Class
    
  
  
    
      0
      1
      0
      3
      22.0
      1
      0
      7.2500
      1
      1
      22.0
      0
      1
      66.0
    
    
      1
      2
      1
      1
      38.0
      1
      0
      71.2833
      0
      2
      38.0
      0
      1
      38.0
    
    
      2
      3
      1
      3
      26.0
      0
      0
      7.9250
      0
      1
      26.0
      0
      0
      78.0
    
    
      3
      4
      1
      1
      35.0
      1
      0
      53.1000
      0
      1
      35.0
      0
      1
      35.0
    
    
      4
      5
      0
      3
      35.0
      0
      0
      8.0500
      1
      1
      35.0
      0
      0
      105.0



In [36]:

    
# Removing Entire Age Column 
df = df.drop(['Age'], axis=1)
#Alternatively for removing columns still having Nans, use :- df = df.dropna()
df = df.dropna()
df.head()









    Out[36]:






  
    
      
      PassengerId
      Survived
      Pclass
      SibSp
      Parch
      Fare
      Gender
      Port
      AgeFill
      AgeIsNull
      FamilySize
      Age*Class
    
  
  
    
      0
      1
      0
      3
      1
      0
      7.2500
      1
      1
      22.0
      0
      1
      66.0
    
    
      1
      2
      1
      1
      1
      0
      71.2833
      0
      2
      38.0
      0
      1
      38.0
    
    
      2
      3
      1
      3
      0
      0
      7.9250
      0
      1
      26.0
      0
      0
      78.0
    
    
      3
      4
      1
      1
      1
      0
      53.1000
      0
      1
      35.0
      0
      1
      35.0
    
    
      4
      5
      0
      3
      0
      0
      8.0500
      1
      1
      35.0
      0
      0
      105.0



In [37]:

    
# Converting the dataframe to a numpy array for usage
train_data = df.values
train_data









    Out[37]:





array([[   1. ,    0. ,    3. , ...,    0. ,    1. ,   66. ],
       [   2. ,    1. ,    1. , ...,    0. ,    1. ,   38. ],
       [   3. ,    1. ,    3. , ...,    0. ,    0. ,   78. ],
       ..., 
       [ 889. ,    0. ,    3. , ...,    1. ,    3. ,   64.5],
       [ 890. ,    1. ,    1. , ...,    0. ,    0. ,   26. ],
       [ 891. ,    0. ,    3. , ...,    0. ,    0. ,   96. ]])



In [38]:

    
#Comparing to the old data,
training_file = open('Desktop/titanic/train.csv','rb')
training_data = csv.reader(training_file)
header = training_data.next()

data=[]
for row in training_data:
    data.append(row)
data = np.array(data)
print data









    



[['1' '0' '3' ..., '7.25' '' 'S']
 ['2' '1' '1' ..., '71.2833' 'C85' 'C']
 ['3' '1' '3' ..., '7.925' '' 'S']
 ..., 
 ['889' '0' '3' ..., '23.45' '' 'S']
 ['890' '1' '1' ..., '30' 'C148' 'C']
 ['891' '0' '3' ..., '7.75' '' 'Q']]



In [39]:

    
#Changing the test data
df_test = pd.read_csv('Desktop/titanic/test.csv')
df_test.info()
df_test.describe()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB






    Out[39]:






  
    
      
      PassengerId
      Pclass
      Age
      SibSp
      Parch
      Fare
    
  
  
    
      count
      418.000000
      418.000000
      332.000000
      418.000000
      418.000000
      417.000000
    
    
      mean
      1100.500000
      2.265550
      30.272590
      0.447368
      0.392344
      35.627188
    
    
      std
      120.810458
      0.841838
      14.181209
      0.896760
      0.981429
      55.907576
    
    
      min
      892.000000
      1.000000
      0.170000
      0.000000
      0.000000
      0.000000
    
    
      25%
      996.250000
      1.000000
      21.000000
      0.000000
      0.000000
      7.895800
    
    
      50%
      1100.500000
      3.000000
      27.000000
      0.000000
      0.000000
      14.454200
    
    
      75%
      1204.750000
      3.000000
      39.000000
      1.000000
      0.000000
      31.500000
    
    
      max
      1309.000000
      3.000000
      76.000000
      8.000000
      9.000000
      512.329200



In [40]:

    
# Adding a Gender Column
df_test['Gender'] = df_test['Sex'].map({'male':1,'female':0}).astype(int)
df_test['Gender'].head()









    Out[40]:





0    1
1    0
2    1
3    1
4    0
Name: Gender, dtype: int64



In [41]:

    
# Filling up the Age using the median age method
median = np.zeros((2,3))
for i in xrange(2):
    for j in xrange(3):
        median[i,j] = df_test[(df_test['Gender'] == i) & (df_test['Pclass'] == j+1)]['Age'].dropna().median() 
print median









    



[[ 41.  24.  22.]
 [ 42.  28.  24.]]



In [42]:

    
df_test['AgeisNull'] = pd.isnull(df_test['Age']).astype(int)
df_test['AgeisNull'].head()









    Out[42]:





0    0
1    0
2    0
3    0
4    0
Name: AgeisNull, dtype: int64



In [43]:

    
#Using the above found median ages to fill in the null ages in the data
for i in xrange(2):
    for j in xrange(3):
        df_test.loc[(df_test['Gender'] == i) & (df_test['Pclass'] == j + 1) & df_test['Age'].isnull(),'Age'] = median[i,j]
pd.isnull(df_test['Age']).head()









    Out[43]:





0    False
1    False
2    False
3    False
4    False
Name: Age, dtype: bool



In [44]:

    
df_test['Embarked'].unique()









    Out[44]:





array(['Q', 'S', 'C'], dtype=object)



In [45]:

    
df_test['Port'] = df_test['Embarked'].map({'S':1,'C':2,'Q':3}).astype(int)



In [46]:

    
df_test.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 14 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
Gender         418 non-null int64
AgeisNull      418 non-null int64
Port           418 non-null int64
dtypes: float64(2), int64(7), object(5)
memory usage: 45.8+ KB



In [47]:

    
# Adding New Features as we added in the training data
df_test['AgeFill'] = df_test['Age']
df_test['FamilySize'] = df_test['Parch'] + df_test['SibSp']
df_test['Age*Class'] = df_test['Age']*df_test['Pclass']
df_test['Age*Class'].hist()
plt.show()
# Adding mean value in the missing fares
df_test.loc[df_test['Fare'].isnull(),'Fare'] = df_test['Fare'].mean()



In [48]:

    
df_test = df_test.drop(['Name','Sex','Ticket','Embarked','Cabin'],axis = 1)
df_test.head()









    Out[48]:






  
    
      
      PassengerId
      Pclass
      Age
      SibSp
      Parch
      Fare
      Gender
      AgeisNull
      Port
      AgeFill
      FamilySize
      Age*Class
    
  
  
    
      0
      892
      3
      34.5
      0
      0
      7.8292
      1
      0
      3
      34.5
      0
      103.5
    
    
      1
      893
      3
      47.0
      1
      0
      7.0000
      0
      0
      1
      47.0
      1
      141.0
    
    
      2
      894
      2
      62.0
      0
      0
      9.6875
      1
      0
      3
      62.0
      0
      124.0
    
    
      3
      895
      3
      27.0
      0
      0
      8.6625
      1
      0
      1
      27.0
      0
      81.0
    
    
      4
      896
      3
      22.0
      1
      1
      12.2875
      0
      0
      1
      22.0
      2
      66.0



In [49]:

    
test_data = df_test.values
print np.shape(test_data)
test_data = test_data[:,1:]
print test_data









    



(418, 12)
[[   3.    34.5    0.  ...,   34.5    0.   103.5]
 [   3.    47.     1.  ...,   47.     1.   141. ]
 [   2.    62.     0.  ...,   62.     0.   124. ]
 ..., 
 [   3.    38.5    0.  ...,   38.5    0.   115.5]
 [   3.    24.     0.  ...,   24.     0.    72. ]
 [   3.    24.     1.  ...,   24.     2.    72. ]]



In [50]:

    
from sklearn.ensemble import RandomForestClassifier



In [51]:

    
forest = RandomForestClassifier(n_estimators = 100)
print np.shape(train_data)



In [52]:

    
forest = forest.fit(train_data[0::,1::],train_data[0::,0])



In [53]:

    
output = forest.predict(test_data)
print output,len(output)









    



[ 108.   62.  445.  445.   62.  221.   62.   62.  368.   62.  108.   62.
   62.   62.   62.   62.  445.  108.   62.   62.   62.   62.  778.  752.
   62.   62.   62.  108.   62.   49.   62.   62.   62.  889.   62.  108.
  778.   62.  445.  644.   62.  166.  108.   62.   62.  108.   62.  108.
   62.   62.   62.   62.   62.   89.  445.   62.  108.  108.  640.   62.
  108.   62.  108.   62.   62.  778.   62.   62.   62.   89.   62.  108.
   62.   62.   62.   62.  445.  331.   62.   62.  349.   62.   62.  108.
  445.  640.   62.   62.  142.  341.  432.  108.   62.  445.   62.  108.
   62.  108.   62.  445.   62.   62.  108.  108.   62.   62.  108.  108.
  445.   62.  445.  555.   62.  368.   62.   62.  108.   62.   62.  432.
  778.  108.   62.  108.  108.   62.  108.  331.  445.  445.  445.   62.
  534.  108.  108.  108.  108.   62.   62.  679.  234.   62.   62.   62.
   62.   86.  644.   62.  166.   62.   62.  108.   62.   62.  262.  108.
   62.   62.   62.  534.  142.  349.   62.  108.  445.  129.   62.   62.
  778.   62.  108.  108.   62.  108.   26.   62.   62.   62.   62.   62.
   62.   62.   62.  108.   62.  445.   62.   62.  864.  445.   62.  221.
  490.   62.   62.  108.   62.   62.   62.  778.  778.   79.   62.   62.
   62.  221.   62.  221.   62.  108.   62.  108.   62.   62.   69.   62.
  555.   62.   62.  445.  432.   62.   62.  108.   62.  534.  108.  142.
   62.  445.   62.  778.   62.  108.   62.  108.   62.  108.   62.   62.
   62.   62.   62.  108.  784.   62.   62.   62.  432.  432.   62.  108.
   62.  445.  445.  108.  108.  445.   62.  108.   62.  108.  534.  173.
   62.  108.  303.  108.  778.  221.   62.  108.   62.  432.  108.   62.
   62.   62.   62.   62.   62.   62.  142.   62.   62.  108.  108.   62.
  108.  445.  693.   20.  108.   62.  445.  108.   62.   49.   62.  108.
  108.  445.   62.  445.  142.  129.   62.   62.   62.   62.  205.  108.
  108.   62.   62.   62.   62.   62.  108.   62.  108.  108.   62.   62.
   62.  108.   62.   62.   62.   62.   62.   62.  108.   62.  108.   62.
   62.   62.   62.  108.   62.  108.   26.  259.   66.   62.   62.  368.
  221.   62.   62.   62.   62.  784.  306.   62.  331.  108.  108.   62.
   62.   62.   62.  445.   62.   26.  640.   62.   62.   62.   62.   62.
  303.  445.   62.   62.   62.   62.   62.  262.  108.  108.  778.  432.
  445.   62.  108.   62.  108.    8.   62.   62.  166.  445.  177.   62.
  108.   62.  108.  108.   62.  452.   62.   62.   62.   62.   62.   62.
  142.   62.  142.   62.   62.  445.   62.  108.  445.  129.] 418



In [54]:

    
# Using Keras Library to Predict data
from keras.layers import Activation, Dense, Dropout
from keras.models import Sequential



In [55]:

    
# Seperating the data and the labels
X = train_data[:500,1:]
X_cv = train_data[500:,1:]
labels= train_data[:,0]
y = np.zeros((500,2))
y_cv = np.zeros((391,2))



In [56]:

    
for i in range(500):
    if(labels[i] == 1):
        y[i][1] = 1
    else:
        y[i][0] = 1
for i in range(391):
    if(labels[500 + i] == 1):
        y_cv[i][1] = 1
    else:
        y_cv[i][0] = 1



In [57]:

    
model = Sequential([
        Dense(32,input_dim=11),
        Activation('sigmoid'),
        Dropout(0.25),
        Dense(32),
        Activation('sigmoid'),
        Dropout(0.25),
        Dense(32),
        Activation('sigmoid'),
        Dropout(0.25),
        Dense(2),
        Activation('sigmoid'),
    ])



In [58]:

    
model.compile(optimizer = 'adadelta',
             loss = 'categorical_crossentropy',
             metrics = ['accuracy'])



In [63]:

    
model.fit(X,y,nb_epoch = 500,batch_size = 32,verbose = 0)









    Out[63]:





<keras.callbacks.History at 0x7f18449c76d0>



In [64]:

    
score = model.evaluate(X_cv,y_cv,batch_size = 32, verbose = 0)
print score









    



[0.00082918573596664816, 1.0]



In [65]:

    
yPred = model.predict_classes(test_data,verbose = 1)









    



418/418 [==============================] - 0s



In [66]:

    
print yPred









    



[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0]



In [67]:

    
file_handle = open('output_keras.csv',"w")
prediction_handle = csv.writer(file_handle)
prediction_handle.writerow(['PassengerId','Survived'])
i = 892
for pred in yPred:
    prediction_handle.writerow([i,int(pred)])
    i += 1
file_handle.close()



In [ ]:

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	0	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	0	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	0	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	0	373450	8.0500	NaN	S
5	6	0	3	Moran, Mr. James	male	NaN	0	0	330877	8.4583	NaN	Q
6	7	0	1	McCarthy, Mr. Timothy J	male	54.0	0	0	17463	51.8625	E46	S
7	8	0	3	Palsson, Master. Gosta Leonard	male	2.0	3	1	349909	21.0750	NaN	S
8	9	1	3	Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)	female	27.0	0	2	347742	11.1333	NaN	S
9	10	1	2	Nasser, Mrs. Nicholas (Adele Achem)	female	14.0	1	0	237736	30.0708	NaN	C
10	11	1	3	Sandstrom, Miss. Marguerite Rut	female	4.0	1	1	PP 9549	16.7000	G6	S
11	12	1	1	Bonnell, Miss. Elizabeth	female	58.0	0	0	113783	26.5500	C103	S
12	13	0	3	Saundercock, Mr. William Henry	male	20.0	0	0	A/5. 2151	8.0500	NaN	S
13	14	0	3	Andersson, Mr. Anders Johan	male	39.0	1	5	347082	31.2750	NaN	S
14	15	0	3	Vestrom, Miss. Hulda Amanda Adolfina	female	14.0	0	0	350406	7.8542	NaN	S
15	16	1	2	Hewlett, Mrs. (Mary D Kingcome)	female	55.0	0	0	248706	16.0000	NaN	S
16	17	0	3	Rice, Master. Eugene	male	2.0	4	1	382652	29.1250	NaN	Q
17	18	1	2	Williams, Mr. Charles Eugene	male	NaN	0	0	244373	13.0000	NaN	S
18	19	0	3	Vander Planke, Mrs. Julius (Emelia Maria Vande...	female	31.0	1	0	345763	18.0000	NaN	S
19	20	1	3	Masselmani, Mrs. Fatima	female	NaN	0	0	2649	7.2250	NaN	C
20	21	0	2	Fynney, Mr. Joseph J	male	35.0	0	0	239865	26.0000	NaN	S
21	22	1	2	Beesley, Mr. Lawrence	male	34.0	0	0	248698	13.0000	D56	S
22	23	1	3	McGowan, Miss. Anna "Annie"	female	15.0	0	0	330923	8.0292	NaN	Q
23	24	1	1	Sloper, Mr. William Thompson	male	28.0	0	0	113788	35.5000	A6	S
24	25	0	3	Palsson, Miss. Torborg Danira	female	8.0	3	1	349909	21.0750	NaN	S
25	26	1	3	Asplund, Mrs. Carl Oscar (Selma Augusta Emilia...	female	38.0	1	5	347077	31.3875	NaN	S
26	27	0	3	Emir, Mr. Farred Chehab	male	NaN	0	0	2631	7.2250	NaN	C
27	28	0	1	Fortune, Mr. Charles Alexander	male	19.0	3	2	19950	263.0000	C23 C25 C27	S
28	29	1	3	O'Dwyer, Miss. Ellen "Nellie"	female	NaN	0	0	330959	7.8792	NaN	Q
29	30	0	3	Todoroff, Mr. Lalio	male	NaN	0	0	349216	7.8958	NaN	S
...	...	...	...	...	...	...	...	...	...	...	...	...
861	862	0	2	Giles, Mr. Frederick Edward	male	21.0	1	0	28134	11.5000	NaN	S
862	863	1	1	Swift, Mrs. Frederick Joel (Margaret Welles Ba...	female	48.0	0	0	17466	25.9292	D17	S
863	864	0	3	Sage, Miss. Dorothy Edith "Dolly"	female	NaN	8	2	CA. 2343	69.5500	NaN	S
864	865	0	2	Gill, Mr. John William	male	24.0	0	0	233866	13.0000	NaN	S
865	866	1	2	Bystrom, Mrs. (Karolina)	female	42.0	0	0	236852	13.0000	NaN	S
866	867	1	2	Duran y More, Miss. Asuncion	female	27.0	1	0	SC/PARIS 2149	13.8583	NaN	C
867	868	0	1	Roebling, Mr. Washington Augustus II	male	31.0	0	0	PC 17590	50.4958	A24	S
868	869	0	3	van Melkebeke, Mr. Philemon	male	NaN	0	0	345777	9.5000	NaN	S
869	870	1	3	Johnson, Master. Harold Theodor	male	4.0	1	1	347742	11.1333	NaN	S
870	871	0	3	Balkic, Mr. Cerin	male	26.0	0	0	349248	7.8958	NaN	S
871	872	1	1	Beckwith, Mrs. Richard Leonard (Sallie Monypeny)	female	47.0	1	1	11751	52.5542	D35	S
872	873	0	1	Carlsson, Mr. Frans Olof	male	33.0	0	0	695	5.0000	B51 B53 B55	S
873	874	0	3	Vander Cruyssen, Mr. Victor	male	47.0	0	0	345765	9.0000	NaN	S
874	875	1	2	Abelson, Mrs. Samuel (Hannah Wizosky)	female	28.0	1	0	P/PP 3381	24.0000	NaN	C
875	876	1	3	Najib, Miss. Adele Kiamie "Jane"	female	15.0	0	0	2667	7.2250	NaN	C
876	877	0	3	Gustafsson, Mr. Alfred Ossian	male	20.0	0	0	7534	9.8458	NaN	S
877	878	0	3	Petroff, Mr. Nedelio	male	19.0	0	0	349212	7.8958	NaN	S
878	879	0	3	Laleff, Mr. Kristo	male	NaN	0	0	349217	7.8958	NaN	S
879	880	1	1	Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)	female	56.0	0	1	11767	83.1583	C50	C
880	881	1	2	Shelley, Mrs. William (Imanita Parrish Hall)	female	25.0	0	1	230433	26.0000	NaN	S
881	882	0	3	Markun, Mr. Johann	male	33.0	0	0	349257	7.8958	NaN	S
882	883	0	3	Dahlberg, Miss. Gerda Ulrika	female	22.0	0	0	7552	10.5167	NaN	S
883	884	0	2	Banfield, Mr. Frederick James	male	28.0	0	0	C.A./SOTON 34068	10.5000	NaN	S
884	885	0	3	Sutehall, Mr. Henry Jr	male	25.0	0	0	SOTON/OQ 392076	7.0500	NaN	S
885	886	0	3	Rice, Mrs. William (Margaret Norton)	female	39.0	0	5	382652	29.1250	NaN	Q
886	887	0	2	Montvila, Rev. Juozas	male	27.0	0	0	211536	13.0000	NaN	S
887	888	1	1	Graham, Miss. Margaret Edith	female	19.0	0	0	112053	30.0000	B42	S
888	889	0	3	Johnston, Miss. Catherine Helen "Carrie"	female	NaN	1	2	W./C. 6607	23.4500	NaN	S
889	890	1	1	Behr, Mr. Karl Howell	male	26.0	0	0	111369	30.0000	C148	C
890	891	0	3	Dooley, Mr. Patrick	male	32.0	0	0	370376	7.7500	NaN	Q

	PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare
count	891.000000	891.000000	891.000000	714.000000	891.000000	891.000000	891.000000
mean	446.000000	0.383838	2.308642	29.699118	0.523008	0.381594	32.204208
std	257.353842	0.486592	0.836071	14.526497	1.102743	0.806057	49.693429
min	1.000000	0.000000	1.000000	0.420000	0.000000	0.000000	0.000000
25%	223.500000	0.000000	2.000000	20.125000	0.000000	0.000000	7.910400
50%	446.000000	0.000000	3.000000	28.000000	0.000000	0.000000	14.454200
75%	668.500000	1.000000	3.000000	38.000000	1.000000	0.000000	31.000000
max	891.000000	1.000000	3.000000	80.000000	8.000000	6.000000	512.329200

	PassengerId	Pclass	Name	Sex	Age	Parch	Ticket	Fare	Cabin	Embarked
33	34	2	Wheadon, Mr. Edward H	male	66.0	0	C.A. 24579	10.5000	NaN	S
54	55	1	Ostby, Mr. Engelhart Cornelius	male	65.0	1	113509	61.9792	B30	C
96	97	1	Goldschmidt, Mr. George B	male	71.0	0	PC 17754	34.6542	A5	C
116	117	3	Connors, Mr. Patrick	male	70.5	0	370369	7.7500	NaN	Q
170	171	1	Van der hoef, Mr. Wyckoff	male	61.0	0	111240	33.5000	B19	S

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked	Gender	Port
61	62	1	1	Icard, Miss. Amelie	female	38.0	0	0	113572	80.0	B28	T	0	0
829	830	1	1	Stone, Mrs. George Nelson (Martha Evelyn)	female	62.0	0	0	113572	80.0	B28	T	0	0

	Gender	Pclass	Age	AgeFill
5	1	3	NaN	NaN
17	1	2	NaN	NaN
19	0	3	NaN	NaN
26	1	3	NaN	NaN
28	0	3	NaN	NaN
29	1	3	NaN	NaN
31	0	1	NaN	NaN
32	0	3	NaN	NaN
36	1	3	NaN	NaN
42	1	3	NaN	NaN

	Gender	Pclass	Age	AgeFill
5	1	3	NaN	25.0
17	1	2	NaN	30.0
19	0	3	NaN	21.5
26	1	3	NaN	25.0
28	0	3	NaN	21.5
29	1	3	NaN	25.0
31	0	1	NaN	35.0
32	0	3	NaN	21.5
36	1	3	NaN	25.0
42	1	3	NaN	25.0

	PassengerId	Pclass	Age	SibSp	Parch	Fare
count	418.000000	418.000000	332.000000	418.000000	418.000000	417.000000
mean	1100.500000	2.265550	30.272590	0.447368	0.392344	35.627188
std	120.810458	0.841838	14.181209	0.896760	0.981429	55.907576
min	892.000000	1.000000	0.170000	0.000000	0.000000	0.000000
25%	996.250000	1.000000	21.000000	0.000000	0.000000	7.895800
50%	1100.500000	3.000000	27.000000	0.000000	0.000000	14.454200
75%	1204.750000	3.000000	39.000000	1.000000	0.000000	31.500000
max	1309.000000	3.000000	76.000000	8.000000	9.000000	512.329200

	PassengerId	Pclass	Age	SibSp	Parch	Fare	Gender	Port	AgeFill	FamilySize	Age*Class
0	892	3	34.5	0	0	7.8292	1	3	34.5	0	103.5
1	893	3	47.0	1	0	7.0000	0	1	47.0	1	141.0
2	894	2	62.0	0	0	9.6875	1	3	62.0	0	124.0
3	895	3	27.0	0	0	8.6625	1	1	27.0	0	81.0
4	896	3	22.0	1	1	12.2875	0	1	22.0	2	66.0

	Gender	Pclass	Age	AgeFill
5	1	3	NaN	NaN
17	1	2	NaN	NaN
19	0	3	NaN	NaN
26	1	3	NaN	NaN
28	0	3	NaN	NaN
29	1	3	NaN	NaN
31	0	1	NaN	NaN
32	0	3	NaN	NaN
36	1	3	NaN	NaN
42	1	3	NaN	NaN

	Gender	Pclass	Age	AgeFill
5	1	3	NaN	NaN
17	1	2	NaN	NaN
19	0	3	NaN	NaN
26	1	3	NaN	NaN
28	0	3	NaN	NaN
29	1	3	NaN	NaN
31	0	1	NaN	NaN
32	0	3	NaN	NaN
36	1	3	NaN	NaN
42	1	3	NaN	NaN