notebook.community

Edit and run



In [1]:

    
# import libaries
import numpy as np
import pandas as pd
import matplotlib.pyplot as pyplt
from IPython.display import Image

# Will first not import Seaborn to show how seaborn makes nice plots
# import seaborn as sns



In [2]:

    
# Plot in ipython notebook
%matplotlib inline



In [3]:

    
# Nothing less than the latest :-)
!python --version









    



Python 3.4.1



In [4]:

    
print('Numpy Version: ', np.__version__)
print('Pandas Version: ', pd.__version__)









    



Numpy Version:  1.8.1
Pandas Version:  0.14.0



In [9]:

    
# Set Display Options
pd.options.display.max_rows = 15

Kaggle Titanic Dataset:

Source: http://www.kaggle.com/c/titanic-gettingStarted/data
Let's take a look at the training data and try to make sense of it



In [6]:

    
# Reading Data - Titanic Dataset
# read_csv, read_frame
train_df = pd.read_csv('kaggle_titanic_data/train.csv')



In [8]:

    
train_df









    Out[8]:






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      0  
         1
       0
       3
                                 Braund, Mr. Owen Harris
         male
       22
       1
       0
       A/5 21171
        7.2500
        NaN
       S
    
    
      1  
         2
       1
       1
       Cumings, Mrs. John Bradley (Florence Briggs Th...
       female
       38
       1
       0
        PC 17599
       71.2833
        C85
       C
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      889
       890
       1
       1
                                   Behr, Mr. Karl Howell
         male
       26
       0
       0
          111369
       30.0000
       C148
       C
    
    
      890
       891
       0
       3
                                     Dooley, Mr. Patrick
         male
       32
       0
       0
          370376
        7.7500
        NaN
       Q
    
  

891 rows × 12 columns



In [8]:

    
# What is a Pandas DataFrame
Image(filename='images/dataframe.png')









    Out[8]:



In [10]:

    
train_df.columns









    Out[10]:





Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype='object')



In [10]:

    
# Describe summary of data
train_df.describe()









    Out[10]:






  
    
      
      PassengerId
      Survived
      Pclass
      Age
      SibSp
      Parch
      Fare
    
  
  
    
      count
       891.000000
       891.000000
       891.000000
       714.000000
       891.000000
       891.000000
       891.000000
    
    
      mean
       446.000000
         0.383838
         2.308642
        29.699118
         0.523008
         0.381594
        32.204208
    
    
      std
       257.353842
         0.486592
         0.836071
        14.526497
         1.102743
         0.806057
        49.693429
    
    
      min
         1.000000
         0.000000
         1.000000
         0.420000
         0.000000
         0.000000
         0.000000
    
    
      25%
       223.500000
         0.000000
         2.000000
        20.125000
         0.000000
         0.000000
         7.910400
    
    
      50%
       446.000000
         0.000000
         3.000000
        28.000000
         0.000000
         0.000000
        14.454200
    
    
      75%
       668.500000
         1.000000
         3.000000
        38.000000
         1.000000
         0.000000
        31.000000
    
    
      max
       891.000000
         1.000000
         3.000000
        80.000000
         8.000000
         6.000000
       512.329200



In [11]:

    
# Display Column Types
train_df.dtypes









    Out[11]:





PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object



In [12]:

    
# Change columns to lowercase
train_df.columns = [col.lower() for col in train_df.columns]



In [13]:

    
train_df.columns









    Out[13]:





Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked'], dtype='object')



In [16]:

    
# Rename sex to gender
train_df.rename(columns={'sex': 'gender'}, inplace=True)



In [19]:

    
# select a Column
train_df[['gender', 'pclass']]









    Out[19]:






  
    
      
      gender
      pclass
    
  
  
    
      0  
         male
       3
    
    
      1  
       female
       1
    
    
      2  
       female
       3
    
    
      3  
       female
       1
    
    
      4  
         male
       3
    
    
      5  
         male
       3
    
    
      6  
         male
       1
    
    
      ...
      ...
      ...
    
    
      884
         male
       3
    
    
      885
       female
       3
    
    
      886
         male
       2
    
    
      887
       female
       1
    
    
      888
       female
       3
    
    
      889
         male
       1
    
    
      890
         male
       3
    
  

891 rows × 2 columns



In [29]:

    
# select a row
train_df.ix[0:6][['name':'survived']]









    



  File "<ipython-input-29-4751a92576db>", line 2
    train_df.ix[0:6][['name':'survived']]
                            ^
SyntaxError: invalid syntax



In [30]:

    
# show only that survived
train_df[train_df.survived == 1]









    Out[30]:






  
    
      
      passengerid
      survived
      pclass
      name
      gender
      age
      sibsp
      parch
      ticket
      fare
      cabin
      embarked
    
  
  
    
      1  
         2
       1
       1
       Cumings, Mrs. John Bradley (Florence Briggs Th...
       female
       38
       1
       0
               PC 17599
       71.2833
        C85
       C
    
    
      2  
         3
       1
       3
                                  Heikkinen, Miss. Laina
       female
       26
       0
       0
       STON/O2. 3101282
        7.9250
        NaN
       S
    
    
      3  
         4
       1
       1
            Futrelle, Mrs. Jacques Heath (Lily May Peel)
       female
       35
       1
       0
                 113803
       53.1000
       C123
       S
    
    
      8  
         9
       1
       3
       Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)
       female
       27
       0
       2
                 347742
       11.1333
        NaN
       S
    
    
      9  
        10
       1
       2
                     Nasser, Mrs. Nicholas (Adele Achem)
       female
       14
       1
       0
                 237736
       30.0708
        NaN
       C
    
    
      10 
        11
       1
       3
                         Sandstrom, Miss. Marguerite Rut
       female
        4
       1
       1
                PP 9549
       16.7000
         G6
       S
    
    
      11 
        12
       1
       1
                                Bonnell, Miss. Elizabeth
       female
       58
       0
       0
                 113783
       26.5500
       C103
       S
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      871
       872
       1
       1
        Beckwith, Mrs. Richard Leonard (Sallie Monypeny)
       female
       47
       1
       1
                  11751
       52.5542
        D35
       S
    
    
      874
       875
       1
       2
                   Abelson, Mrs. Samuel (Hannah Wizosky)
       female
       28
       1
       0
              P/PP 3381
       24.0000
        NaN
       C
    
    
      875
       876
       1
       3
                        Najib, Miss. Adele Kiamie "Jane"
       female
       15
       0
       0
                   2667
        7.2250
        NaN
       C
    
    
      879
       880
       1
       1
           Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)
       female
       56
       0
       1
                  11767
       83.1583
        C50
       C
    
    
      880
       881
       1
       2
            Shelley, Mrs. William (Imanita Parrish Hall)
       female
       25
       0
       1
                 230433
       26.0000
        NaN
       S
    
    
      887
       888
       1
       1
                            Graham, Miss. Margaret Edith
       female
       19
       0
       0
                 112053
       30.0000
        B42
       S
    
    
      889
       890
       1
       1
                                   Behr, Mr. Karl Howell
         male
       26
       0
       0
                 111369
       30.0000
       C148
       C
    
  

342 rows × 12 columns



In [31]:

    
# unique values
train_df.gender.unique()









    Out[31]:





array(['male', 'female'], dtype=object)



In [32]:

    
# How many Survived?
train_df['survived'].value_counts()









    Out[32]:





0    549
1    342
dtype: int64



In [33]:

    
# Distribution of Age
ax = train_df['age'].hist()  # bins
ax.set_title('Histogram of Fares')
ax.set_xlabel('x Label')









    Out[33]:





<matplotlib.text.Text at 0x10c74ebe0>



In [34]:

    
# Now set Pandas options to make plot pretty
pd.set_option('display.mpl_style', 'default') # Make the graphs a bit prettier (not necessary if you use seaborn)



In [36]:

    
# Distribution of Age
ax = train_df['age'].hist(bins=30)  # bins
ax.set_title('Histogram of Fares')
ax.set_xlabel('x Label')









    Out[36]:





<matplotlib.text.Text at 0x10c8e16d8>



In [37]:

    
import seaborn as sns



In [38]:

    
# Distribution of Age
ax = train_df['age'].hist(bins=30)  # bins
ax.set_title('Histogram of Fares')
ax.set_xlabel('x Label')









    Out[38]:





<matplotlib.text.Text at 0x10e913198>



In [39]:

    
train_df.age[pd.isnull(train_df.age)]









    Out[39]:





5    NaN
17   NaN
19   NaN
26   NaN
28   NaN
...
849   NaN
859   NaN
863   NaN
868   NaN
878   NaN
888   NaN
Name: age, Length: 177, dtype: float64



In [40]:

    
# Age - fix null values
# Method 1
train_df.age[pd.isnull(train_df.age)] = train_df.age.mean()

# Method 2 - Pandas Convenience Functions
train_df.age.fillna(train_df.age.mean())









    Out[40]:





0    22
1    38
2    26
3    35
4    35
...
885    39.000000
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: age, Length: 891, dtype: float64



In [41]:

    
# Did your gender make a difference in survival
train_df.gender.value_counts()









    Out[41]:





male      577
female    314
dtype: int64



In [42]:

    
train_df.groupby(['pclass', 'gender'])['survived'].agg(['sum', 'count'])



In [43]:

    
Image(filename='images/wesm_book_groupby.png')









    Out[43]:



In [45]:

    
# Show how many survivied by gender and class
class_gender_group = train_df.groupby(['pclass', 'gender'])



In [46]:

    
for k, group in class_gender_group:
    print(k)









    



(1, 'female')
(1, 'male')
(2, 'female')
(2, 'male')
(3, 'female')
(3, 'male')



In [47]:

    
# Plot how many survived, by gender and class
grid_plot = sns.FacetGrid(train_df, row='gender', col='pclass')
grid_plot.map(sns.regplot, 'survived', 'age',color='.3', fit_reg=False, x_jitter=.1)









    Out[47]:





<seaborn.axisgrid.FacetGrid at 0x10e9d8240>



In [48]:

    
ax = sns.boxplot(train_df.age, groupby=train_df.pclass)
ax.set_title('Age Distribution by class')









    Out[48]:





<matplotlib.text.Text at 0x10f421208>



In [49]:

    
# Who paid the highest Fare in Titanic.  Did they survive?
train_df.sort_index(by='fare', ascending=False)









    Out[49]:






  
    
      
      passengerid
      survived
      pclass
      name
      gender
      age
      sibsp
      parch
      ticket
      fare
      cabin
      embarked
    
  
  
    
      258
       259
       1
       1
                         Ward, Miss. Anna
       female
       35
       0
       0
       PC 17755
       512.3292
               NaN
       C
    
    
      737
       738
       1
       1
                   Lesurer, Mr. Gustave J
         male
       35
       0
       0
       PC 17755
       512.3292
              B101
       C
    
    
      679
       680
       1
       1
       Cardeza, Mr. Thomas Drake Martinez
         male
       36
       0
       1
       PC 17755
       512.3292
       B51 B53 B55
       C
    
    
      88 
        89
       1
       1
               Fortune, Miss. Mabel Helen
       female
       23
       3
       2
          19950
       263.0000
       C23 C25 C27
       S
    
    
      27 
        28
       0
       1
           Fortune, Mr. Charles Alexander
         male
       19
       3
       2
          19950
       263.0000
       C23 C25 C27
       S



In [50]:

    
# Highest Paid ticket by Class.
def topn(group, field, n=5):
    return group.sort_index(by=field, ascending=False)[:n]

train_df.groupby('pclass').apply(topn, 'fare', 2)









    Out[50]:






  
    
      
      
      passengerid
      survived
      pclass
      name
      gender
      age
      sibsp
      parch
      ticket
      fare
      cabin
      embarked
    
    
      pclass
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1
      258
       259
       1
       1
               Ward, Miss. Anna
       female
       35.000000
       0
       0
           PC 17755
       512.3292
        NaN
       C
    
    
      737
       738
       1
       1
         Lesurer, Mr. Gustave J
         male
       35.000000
       0
       0
           PC 17755
       512.3292
       B101
       C
    
    
      2
      665
       666
       0
       2
             Hickman, Mr. Lewis
         male
       32.000000
       2
       0
       S.O.C. 14879
        73.5000
        NaN
       S
    
    
      72 
        73
       0
       2
           Hood, Mr. Ambrose Jr
         male
       21.000000
       0
       0
       S.O.C. 14879
        73.5000
        NaN
       S
    
    
      3
      324
       325
       0
       3
       Sage, Mr. George John Jr
         male
       29.699118
       8
       2
           CA. 2343
        69.5500
        NaN
       S
    
    
      792
       793
       0
       3
        Sage, Miss. Stella Anna
       female
       29.699118
       8
       2
           CA. 2343
        69.5500
        NaN
       S



In [51]:

    
# Youngenst 2 by Class.
def botm(group, field, n=5):
    return group.sort_index(by=field)[:n]

train_df.groupby('pclass').apply(botm, 'age', 2)









    Out[51]:






  
    
      
      
      passengerid
      survived
      pclass
      name
      gender
      age
      sibsp
      parch
      ticket
      fare
      cabin
      embarked
    
    
      pclass
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1
      305
       306
       1
       1
        Allison, Master. Hudson Trevor
         male
       0.92
       1
       2
       113781
       151.5500
       C22 C26
       S
    
    
      297
       298
       0
       1
          Allison, Miss. Helen Loraine
       female
       2.00
       1
       2
       113781
       151.5500
       C22 C26
       S
    
    
      2
      755
       756
       1
       2
             Hamalainen, Master. Viljo
         male
       0.67
       1
       1
       250649
        14.5000
           NaN
       S
    
    
      78 
        79
       1
       2
         Caldwell, Master. Alden Gates
         male
       0.83
       0
       2
       248738
        29.0000
           NaN
       S
    
    
      3
      803
       804
       1
       3
       Thomas, Master. Assad Alexander
         male
       0.42
       0
       1
         2625
         8.5167
           NaN
       C
    
    
      469
       470
       1
       3
         Baclini, Miss. Helene Barbara
       female
       0.75
       2
       1
         2666
        19.2583
           NaN
       C



In [ ]:

    
# Write back the changes
# train_df.to_csv('kaggle_titanic_data/train_modified.csv')



In [52]:

    
# Custom Translation of Values and creating new Columns
def gender_map(val):
    if val == 'male':
        return 1
    return 0
train_df['gender_val'] = train_df.gender.map(gender_map)



In [53]:

    
train_df









    Out[53]:






  
    
      
      passengerid
      survived
      pclass
      name
      gender
      age
      sibsp
      parch
      ticket
      fare
      cabin
      embarked
      gender_val
    
  
  
    
      0  
         1
       0
       3
                                 Braund, Mr. Owen Harris
         male
       22.000000
       1
       0
              A/5 21171
        7.2500
        NaN
       S
       1
    
    
      1  
         2
       1
       1
       Cumings, Mrs. John Bradley (Florence Briggs Th...
       female
       38.000000
       1
       0
               PC 17599
       71.2833
        C85
       C
       0
    
    
      2  
         3
       1
       3
                                  Heikkinen, Miss. Laina
       female
       26.000000
       0
       0
       STON/O2. 3101282
        7.9250
        NaN
       S
       0
    
    
      3  
         4
       1
       1
            Futrelle, Mrs. Jacques Heath (Lily May Peel)
       female
       35.000000
       1
       0
                 113803
       53.1000
       C123
       S
       0
    
    
      4  
         5
       0
       3
                                Allen, Mr. William Henry
         male
       35.000000
       0
       0
                 373450
        8.0500
        NaN
       S
       1
    
    
      5  
         6
       0
       3
                                        Moran, Mr. James
         male
       29.699118
       0
       0
                 330877
        8.4583
        NaN
       Q
       1
    
    
      6  
         7
       0
       1
                                 McCarthy, Mr. Timothy J
         male
       54.000000
       0
       0
                  17463
       51.8625
        E46
       S
       1
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      884
       885
       0
       3
                                  Sutehall, Mr. Henry Jr
         male
       25.000000
       0
       0
        SOTON/OQ 392076
        7.0500
        NaN
       S
       1
    
    
      885
       886
       0
       3
                    Rice, Mrs. William (Margaret Norton)
       female
       39.000000
       0
       5
                 382652
       29.1250
        NaN
       Q
       0
    
    
      886
       887
       0
       2
                                   Montvila, Rev. Juozas
         male
       27.000000
       0
       0
                 211536
       13.0000
        NaN
       S
       1
    
    
      887
       888
       1
       1
                            Graham, Miss. Margaret Edith
       female
       19.000000
       0
       0
                 112053
       30.0000
        B42
       S
       0
    
    
      888
       889
       0
       3
                Johnston, Miss. Catherine Helen "Carrie"
       female
       29.699118
       1
       2
             W./C. 6607
       23.4500
        NaN
       S
       0
    
    
      889
       890
       1
       1
                                   Behr, Mr. Karl Howell
         male
       26.000000
       0
       0
                 111369
       30.0000
       C148
       C
       1
    
    
      890
       891
       0
       3
                                     Dooley, Mr. Patrick
         male
       32.000000
       0
       0
                 370376
        7.7500
        NaN
       Q
       1
    
  

891 rows × 13 columns



In [54]:

    
train_df.drop('gender_val', axis=1, inplace=True)



In [55]:

    
for row in train_df:
    print(row)









    



passengerid
survived
pclass
name
gender
age
sibsp
parch
ticket
fare
cabin
embarked



In [56]:

    
# Some mistakes I made when working with Pandas
# Don't use Loop to update Dataframe
for k, row in train_df.iterrows():
    if row.gender == 'male':
        train_df.ix[k, 'gender'] = 1
    else:
        train_df.ix[k, 'gender'] = 0



In [ ]:

    
train_df



In [59]:

    
# Filtering Gotchas
train_df[((train_df.survived == 1) & (train_df.pclass == 1))]









    Out[59]:






  
    
      
      passengerid
      survived
      pclass
      name
      gender
      age
      sibsp
      parch
      ticket
      fare
      cabin
      embarked
    
  
  
    
      1  
         2
       1
       1
       Cumings, Mrs. John Bradley (Florence Briggs Th...
       0
       38.000000
       1
       0
       PC 17599
        71.2833
        C85
       C
    
    
      3  
         4
       1
       1
            Futrelle, Mrs. Jacques Heath (Lily May Peel)
       0
       35.000000
       1
       0
         113803
        53.1000
       C123
       S
    
    
      11 
        12
       1
       1
                                Bonnell, Miss. Elizabeth
       0
       58.000000
       0
       0
         113783
        26.5500
       C103
       S
    
    
      23 
        24
       1
       1
                            Sloper, Mr. William Thompson
       1
       28.000000
       0
       0
         113788
        35.5000
         A6
       S
    
    
      31 
        32
       1
       1
          Spencer, Mrs. William Augustus (Marie Eugenie)
       0
       29.699118
       1
       0
       PC 17569
       146.5208
        B78
       C
    
    
      52 
        53
       1
       1
                Harper, Mrs. Henry Sleeper (Myna Haxtun)
       0
       49.000000
       1
       0
       PC 17572
        76.7292
        D33
       C
    
    
      55 
        56
       1
       1
                                       Woolner, Mr. Hugh
       1
       29.699118
       0
       0
          19947
        35.5000
        C52
       S
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      856
       857
       1
       1
              Wick, Mrs. George Dennick (Mary Hitchcock)
       0
       45.000000
       1
       1
          36928
       164.8667
        NaN
       S
    
    
      857
       858
       1
       1
                                  Daly, Mr. Peter Denis 
       1
       51.000000
       0
       0
         113055
        26.5500
        E17
       S
    
    
      862
       863
       1
       1
       Swift, Mrs. Frederick Joel (Margaret Welles Ba...
       0
       48.000000
       0
       0
          17466
        25.9292
        D17
       S
    
    
      871
       872
       1
       1
        Beckwith, Mrs. Richard Leonard (Sallie Monypeny)
       0
       47.000000
       1
       1
          11751
        52.5542
        D35
       S
    
    
      879
       880
       1
       1
           Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)
       0
       56.000000
       0
       1
          11767
        83.1583
        C50
       C
    
    
      887
       888
       1
       1
                            Graham, Miss. Margaret Edith
       0
       19.000000
       0
       0
         112053
        30.0000
        B42
       S
    
    
      889
       890
       1
       1
                                   Behr, Mr. Karl Howell
       1
       26.000000
       0
       0
         111369
        30.0000
       C148
       C
    
  

136 rows × 12 columns



In [ ]:



In [ ]:



In [ ]:

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22	1	0	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38	1	0	PC 17599	71.2833	C85	C
...	...	...	...	...	...	...	...	...	...	...	...	...
889	890	1	1	Behr, Mr. Karl Howell	male	26	0	0	111369	30.0000	C148	C
890	891	0	3	Dooley, Mr. Patrick	male	32	0	0	370376	7.7500	NaN	Q

	PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare
count	891.000000	891.000000	891.000000	714.000000	891.000000	891.000000	891.000000
mean	446.000000	0.383838	2.308642	29.699118	0.523008	0.381594	32.204208
std	257.353842	0.486592	0.836071	14.526497	1.102743	0.806057	49.693429
min	1.000000	0.000000	1.000000	0.420000	0.000000	0.000000	0.000000
25%	223.500000	0.000000	2.000000	20.125000	0.000000	0.000000	7.910400
50%	446.000000	0.000000	3.000000	28.000000	0.000000	0.000000	14.454200
75%	668.500000	1.000000	3.000000	38.000000	1.000000	0.000000	31.000000
max	891.000000	1.000000	3.000000	80.000000	8.000000	6.000000	512.329200

		sum	count
pclass	gender
1	female	91	94
1	male	45	122
2	female	70	76
2	male	17	108
3	female	72	144
3	male	47	347

	passengerid	survived	pclass	name	gender	age	sibsp	parch	ticket	fare	cabin	embarked
258	259	1	1	Ward, Miss. Anna	female	35	0	0	PC 17755	512.3292	NaN	C
737	738	1	1	Lesurer, Mr. Gustave J	male	35	0	0	PC 17755	512.3292	B101	C
679	680	1	1	Cardeza, Mr. Thomas Drake Martinez	male	36	0	1	PC 17755	512.3292	B51 B53 B55	C
88	89	1	1	Fortune, Miss. Mabel Helen	female	23	3	2	19950	263.0000	C23 C25 C27	S
27	28	0	1	Fortune, Mr. Charles Alexander	male	19	3	2	19950	263.0000	C23 C25 C27	S

		passengerid	survived	pclass	name	gender	age	sibsp	parch	ticket	fare	cabin	embarked
pclass
1	305	306	1	1	Allison, Master. Hudson Trevor	male	0.92	1	2	113781	151.5500	C22 C26	S
1	297	298	0	1	Allison, Miss. Helen Loraine	female	2.00	1	2	113781	151.5500	C22 C26	S
2	755	756	1	2	Hamalainen, Master. Viljo	male	0.67	1	1	250649	14.5000	NaN	S
2	78	79	1	2	Caldwell, Master. Alden Gates	male	0.83	0	2	248738	29.0000	NaN	S
3	803	804	1	3	Thomas, Master. Assad Alexander	male	0.42	0	1	2625	8.5167	NaN	C
3	469	470	1	3	Baclini, Miss. Helene Barbara	female	0.75	2	1	2666	19.2583	NaN	C

	passengerid	survived	pclass	name	gender	age	sibsp	parch	ticket	fare	cabin	embarked	gender_val
0	1	0	3	Braund, Mr. Owen Harris	male	22.000000	1	0	A/5 21171	7.2500	NaN	S	1
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.000000	1	0	PC 17599	71.2833	C85	C	0
2	3	1	3	Heikkinen, Miss. Laina	female	26.000000	0	0	STON/O2. 3101282	7.9250	NaN	S	0
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.000000	1	0	113803	53.1000	C123	S	0
4	5	0	3	Allen, Mr. William Henry	male	35.000000	0	0	373450	8.0500	NaN	S	1
5	6	0	3	Moran, Mr. James	male	29.699118	0	0	330877	8.4583	NaN	Q	1
6	7	0	1	McCarthy, Mr. Timothy J	male	54.000000	0	0	17463	51.8625	E46	S	1
...	...	...	...	...	...	...	...	...	...	...	...	...	...
884	885	0	3	Sutehall, Mr. Henry Jr	male	25.000000	0	0	SOTON/OQ 392076	7.0500	NaN	S	1
885	886	0	3	Rice, Mrs. William (Margaret Norton)	female	39.000000	0	5	382652	29.1250	NaN	Q	0
886	887	0	2	Montvila, Rev. Juozas	male	27.000000	0	0	211536	13.0000	NaN	S	1
887	888	1	1	Graham, Miss. Margaret Edith	female	19.000000	0	0	112053	30.0000	B42	S	0
888	889	0	3	Johnston, Miss. Catherine Helen "Carrie"	female	29.699118	1	2	W./C. 6607	23.4500	NaN	S	0
889	890	1	1	Behr, Mr. Karl Howell	male	26.000000	0	0	111369	30.0000	C148	C	1
890	891	0	3	Dooley, Mr. Patrick	male	32.000000	0	0	370376	7.7500	NaN	Q	1