In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Directory that holds the Kaggle Titanic CSV files. Set the TITANIC_DATA_DIR
# environment variable to point somewhere else; the default is the location
# this notebook was originally run from.
DATA_DIR = os.environ.get(
    'TITANIC_DATA_DIR',
    '/Users/williamgray/Library/Mobile Documents/com~apple~CloudDocs/Developer/KaggleTitanic',
)

# Load the labelled training set and the unlabelled test set.
train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

# Both frames in one list so later cells can apply the same transformation to each.
combine = [train_df, test_df]

In [3]:
# Peek at the first five training rows.
train_df.head(n=5)


Out[3]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

In [4]:
# Peek at the first five test rows (the recorded output has no Survived column).
test_df.head(n=5)


Out[4]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S

In [5]:
# Column means for the training set. numeric_only=True makes the restriction
# to numeric columns explicit: the frame also holds text columns (Name, Sex,
# Ticket, Cabin, Embarked), and pandas >= 2.0 raises TypeError on them instead
# of silently skipping them.
train_df.mean(numeric_only=True)


Out[5]:
PassengerId    446.000000
Survived         0.383838
Pclass           2.308642
Age             29.699118
SibSp            0.523008
Parch            0.381594
Fare            32.204208
dtype: float64

In [6]:
# Column means for the test set. numeric_only=True makes the restriction to
# numeric columns explicit: the frame also holds text columns (Name, Sex,
# Ticket, Cabin, Embarked), and pandas >= 2.0 raises TypeError on them instead
# of silently skipping them.
test_df.mean(numeric_only=True)


Out[6]:
PassengerId    1100.500000
Pclass            2.265550
Age              30.272590
SibSp             0.447368
Parch             0.392344
Fare             35.627188
dtype: float64

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the training set.
train_df.describe()


Out[7]:
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200

In [8]:
# Rows of the training set for passengers who survived (Survived equals 1).
train_df.loc[train_df['Survived'].eq(1)]


Out[8]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.00 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.00 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.00 1 0 113803 53.1000 C123 S
8 9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.00 0 2 347742 11.1333 NaN S
9 10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14.00 1 0 237736 30.0708 NaN C
10 11 1 3 Sandstrom, Miss. Marguerite Rut female 4.00 1 1 PP 9549 16.7000 G6 S
11 12 1 1 Bonnell, Miss. Elizabeth female 58.00 0 0 113783 26.5500 C103 S
15 16 1 2 Hewlett, Mrs. (Mary D Kingcome) female 55.00 0 0 248706 16.0000 NaN S
17 18 1 2 Williams, Mr. Charles Eugene male NaN 0 0 244373 13.0000 NaN S
19 20 1 3 Masselmani, Mrs. Fatima female NaN 0 0 2649 7.2250 NaN C
21 22 1 2 Beesley, Mr. Lawrence male 34.00 0 0 248698 13.0000 D56 S
22 23 1 3 McGowan, Miss. Anna "Annie" female 15.00 0 0 330923 8.0292 NaN Q
23 24 1 1 Sloper, Mr. William Thompson male 28.00 0 0 113788 35.5000 A6 S
25 26 1 3 Asplund, Mrs. Carl Oscar (Selma Augusta Emilia... female 38.00 1 5 347077 31.3875 NaN S
28 29 1 3 O'Dwyer, Miss. Ellen "Nellie" female NaN 0 0 330959 7.8792 NaN Q
31 32 1 1 Spencer, Mrs. William Augustus (Marie Eugenie) female NaN 1 0 PC 17569 146.5208 B78 C
32 33 1 3 Glynn, Miss. Mary Agatha female NaN 0 0 335677 7.7500 NaN Q
36 37 1 3 Mamee, Mr. Hanna male NaN 0 0 2677 7.2292 NaN C
39 40 1 3 Nicola-Yarred, Miss. Jamila female 14.00 1 0 2651 11.2417 NaN C
43 44 1 2 Laroche, Miss. Simonne Marie Anne Andree female 3.00 1 2 SC/Paris 2123 41.5792 NaN C
44 45 1 3 Devaney, Miss. Margaret Delia female 19.00 0 0 330958 7.8792 NaN Q
47 48 1 3 O'Driscoll, Miss. Bridget female NaN 0 0 14311 7.7500 NaN Q
52 53 1 1 Harper, Mrs. Henry Sleeper (Myna Haxtun) female 49.00 1 0 PC 17572 76.7292 D33 C
53 54 1 2 Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkin... female 29.00 1 0 2926 26.0000 NaN S
55 56 1 1 Woolner, Mr. Hugh male NaN 0 0 19947 35.5000 C52 S
56 57 1 2 Rugg, Miss. Emily female 21.00 0 0 C.A. 31026 10.5000 NaN S
58 59 1 2 West, Miss. Constance Mirium female 5.00 1 2 C.A. 34651 27.7500 NaN S
61 62 1 1 Icard, Miss. Amelie female 38.00 0 0 113572 80.0000 B28 NaN
65 66 1 3 Moubarek, Master. Gerios male NaN 1 1 2661 15.2458 NaN C
66 67 1 2 Nye, Mrs. (Elizabeth Ramell) female 29.00 0 0 C.A. 29395 10.5000 F33 S
... ... ... ... ... ... ... ... ... ... ... ... ...
809 810 1 1 Chambers, Mrs. Norman Campbell (Bertha Griggs) female 33.00 1 0 113806 53.1000 E8 S
820 821 1 1 Hays, Mrs. Charles Melville (Clara Jennings Gr... female 52.00 1 1 12749 93.5000 B69 S
821 822 1 3 Lulic, Mr. Nikola male 27.00 0 0 315098 8.6625 NaN S
823 824 1 3 Moor, Mrs. (Beila) female 27.00 0 1 392096 12.4750 E121 S
827 828 1 2 Mallet, Master. Andre male 1.00 0 2 S.C./PARIS 2079 37.0042 NaN C
828 829 1 3 McCormack, Mr. Thomas Joseph male NaN 0 0 367228 7.7500 NaN Q
829 830 1 1 Stone, Mrs. George Nelson (Martha Evelyn) female 62.00 0 0 113572 80.0000 B28 NaN
830 831 1 3 Yasbeck, Mrs. Antoni (Selini Alexander) female 15.00 1 0 2659 14.4542 NaN C
831 832 1 2 Richards, Master. George Sibley male 0.83 1 1 29106 18.7500 NaN S
835 836 1 1 Compton, Miss. Sara Rebecca female 39.00 1 1 PC 17756 83.1583 E49 C
838 839 1 3 Chip, Mr. Chang male 32.00 0 0 1601 56.4958 NaN S
839 840 1 1 Marechal, Mr. Pierre male NaN 0 0 11774 29.7000 C47 C
842 843 1 1 Serepeca, Miss. Augusta female 30.00 0 0 113798 31.0000 NaN C
849 850 1 1 Goldenberg, Mrs. Samuel L (Edwiga Grabowska) female NaN 1 0 17453 89.1042 C92 C
853 854 1 1 Lines, Miss. Mary Conover female 16.00 0 1 PC 17592 39.4000 D28 S
855 856 1 3 Aks, Mrs. Sam (Leah Rosen) female 18.00 0 1 392091 9.3500 NaN S
856 857 1 1 Wick, Mrs. George Dennick (Mary Hitchcock) female 45.00 1 1 36928 164.8667 NaN S
857 858 1 1 Daly, Mr. Peter Denis male 51.00 0 0 113055 26.5500 E17 S
858 859 1 3 Baclini, Mrs. Solomon (Latifa Qurban) female 24.00 0 3 2666 19.2583 NaN C
862 863 1 1 Swift, Mrs. Frederick Joel (Margaret Welles Ba... female 48.00 0 0 17466 25.9292 D17 S
865 866 1 2 Bystrom, Mrs. (Karolina) female 42.00 0 0 236852 13.0000 NaN S
866 867 1 2 Duran y More, Miss. Asuncion female 27.00 1 0 SC/PARIS 2149 13.8583 NaN C
869 870 1 3 Johnson, Master. Harold Theodor male 4.00 1 1 347742 11.1333 NaN S
871 872 1 1 Beckwith, Mrs. Richard Leonard (Sallie Monypeny) female 47.00 1 1 11751 52.5542 D35 S
874 875 1 2 Abelson, Mrs. Samuel (Hannah Wizosky) female 28.00 1 0 P/PP 3381 24.0000 NaN C
875 876 1 3 Najib, Miss. Adele Kiamie "Jane" female 15.00 0 0 2667 7.2250 NaN C
879 880 1 1 Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) female 56.00 0 1 11767 83.1583 C50 C
880 881 1 2 Shelley, Mrs. William (Imanita Parrish Hall) female 25.00 0 1 230433 26.0000 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19.00 0 0 112053 30.0000 B42 S
889 890 1 1 Behr, Mr. Karl Howell male 26.00 0 0 111369 30.0000 C148 C

342 rows × 12 columns


In [9]:
# Summary (count, unique, top, freq) of the object-dtype columns only.
train_df.describe(include=[object])


Out[9]:
Name Sex Ticket Cabin Embarked
count 891 891 891 204 889
unique 891 2 681 147 3
top Graham, Mr. George Edward male CA. 2343 C23 C25 C27 S
freq 1 577 7 4 644

In [17]:
# Mean of Survived (i.e. survival rate) per passenger class, highest first.
(
    train_df.loc[:, ['Pclass', 'Survived']]
    .groupby('Pclass', as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)


Out[17]:
Pclass Survived
0 1 0.629630
1 2 0.472826
2 3 0.242363

In [18]:
# Mean of Survived (i.e. survival rate) per sex, highest first.
(
    train_df.loc[:, ['Sex', 'Survived']]
    .groupby('Sex', as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)


Out[18]:
Sex Survived
0 female 0.742038
1 male 0.188908

In [19]:
import matplotlib.pyplot as plt

In [20]:
%matplotlib inline

In [21]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [28]:
# Age histograms (20 bins), one panel per value of Survived.
g = sns.FacetGrid(data=train_df, col='Survived')
g.map(plt.hist, 'Age', bins=20)


Out[28]:
<seaborn.axisgrid.FacetGrid at 0x118182d10>

In [31]:
# Age histograms (20 bins, half-transparent) on a Pclass-by-Survived grid of panels.
# NOTE(review): newer seaborn releases name the panel-size argument `height`
# rather than `size` - confirm against the installed version.
grid = sns.FacetGrid(data=train_df, row='Pclass', col='Survived', aspect=1.6, size=2.2)
grid.map(plt.hist, 'Age', bins=20, alpha=0.5)


Out[31]:
<seaborn.axisgrid.FacetGrid at 0x1194f05d0>
# NOTE(review): nothing is mapped onto this grid, and the following cell
# rebinds `grid` with explicit size/aspect - this looks like a leftover;
# confirm before removing.
grid = sns.FacetGrid(train_df, row='Embarked')

In [38]:
# Survival rate by passenger class and sex, one row of plots per embarkation port.
grid = sns.FacetGrid(train_df, row='Embarked', size=3, aspect=3)

# FacetGrid.map calls pointplot once per facet on that facet's subset of rows.
# Without explicit `order` / `hue_order` each call infers the category order
# from its own subset, so the same colour or x position can stand for
# different categories in different facets (and disagree with the legend).
# Pinning both orders keeps every facet consistent.
grid.map(
    sns.pointplot, 'Pclass', 'Survived', 'Sex',
    palette='deep',
    order=[1, 2, 3],
    hue_order=['female', 'male'],
)
grid.add_legend();



In [41]:
# Mean fare by sex on an Embarked-by-Survived grid of panels.
grid = sns.FacetGrid(train_df, row='Embarked', col='Survived', size=3, aspect=2)

# FacetGrid.map calls barplot once per facet on that facet's subset of rows.
# Without an explicit `order` each call infers the bar order from its own
# subset, so bars can swap places between facets that share an x axis.
# Pinning the order keeps the bars aligned with the shared tick labels.
grid.map(sns.barplot, 'Sex', 'Fare', alpha=.5, ci=None, order=['female', 'male'])
grid.add_legend();



In [44]:
# The shapes recorded for this cell (10 and 9 columns) and the frame displayed
# in the next cell have no Ticket or Cabin column, yet no cell in the notebook
# removes them - the step only existed in kernel state. Doing the drop here
# lets a fresh "Restart & Run All" reproduce the recorded shapes.
# errors='ignore' keeps the cell re-runnable once the columns are gone.
train_df = train_df.drop(['Ticket', 'Cabin'], axis=1, errors='ignore')
test_df = test_df.drop(['Ticket', 'Cabin'], axis=1, errors='ignore')

# drop() returns new frames, so rebuild the list that later cells iterate over.
combine = [train_df, test_df]

print('After ', train_df.shape, test_df.shape, combine[0].shape, combine[1].shape)


('After ', (891, 10), (418, 9), (891, 10), (418, 9))

In [45]:
# Display the training frame in its current state (the recorded output is the
# truncated 891-row view).
# NOTE(review): consider .head() here to keep the saved output short.
train_df


Out[45]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Fare Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 7.2500 S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 71.2833 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 7.9250 S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 53.1000 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 8.0500 S
5 6 0 3 Moran, Mr. James male NaN 0 0 8.4583 Q
6 7 0 1 McCarthy, Mr. Timothy J male 54.0 0 0 51.8625 S
7 8 0 3 Palsson, Master. Gosta Leonard male 2.0 3 1 21.0750 S
8 9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 2 11.1333 S
9 10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 0 30.0708 C
10 11 1 3 Sandstrom, Miss. Marguerite Rut female 4.0 1 1 16.7000 S
11 12 1 1 Bonnell, Miss. Elizabeth female 58.0 0 0 26.5500 S
12 13 0 3 Saundercock, Mr. William Henry male 20.0 0 0 8.0500 S
13 14 0 3 Andersson, Mr. Anders Johan male 39.0 1 5 31.2750 S
14 15 0 3 Vestrom, Miss. Hulda Amanda Adolfina female 14.0 0 0 7.8542 S
15 16 1 2 Hewlett, Mrs. (Mary D Kingcome) female 55.0 0 0 16.0000 S
16 17 0 3 Rice, Master. Eugene male 2.0 4 1 29.1250 Q
17 18 1 2 Williams, Mr. Charles Eugene male NaN 0 0 13.0000 S
18 19 0 3 Vander Planke, Mrs. Julius (Emelia Maria Vande... female 31.0 1 0 18.0000 S
19 20 1 3 Masselmani, Mrs. Fatima female NaN 0 0 7.2250 C
20 21 0 2 Fynney, Mr. Joseph J male 35.0 0 0 26.0000 S
21 22 1 2 Beesley, Mr. Lawrence male 34.0 0 0 13.0000 S
22 23 1 3 McGowan, Miss. Anna "Annie" female 15.0 0 0 8.0292 Q
23 24 1 1 Sloper, Mr. William Thompson male 28.0 0 0 35.5000 S
24 25 0 3 Palsson, Miss. Torborg Danira female 8.0 3 1 21.0750 S
25 26 1 3 Asplund, Mrs. Carl Oscar (Selma Augusta Emilia... female 38.0 1 5 31.3875 S
26 27 0 3 Emir, Mr. Farred Chehab male NaN 0 0 7.2250 C
27 28 0 1 Fortune, Mr. Charles Alexander male 19.0 3 2 263.0000 S
28 29 1 3 O'Dwyer, Miss. Ellen "Nellie" female NaN 0 0 7.8792 Q
29 30 0 3 Todoroff, Mr. Lalio male NaN 0 0 7.8958 S
... ... ... ... ... ... ... ... ... ... ...
861 862 0 2 Giles, Mr. Frederick Edward male 21.0 1 0 11.5000 S
862 863 1 1 Swift, Mrs. Frederick Joel (Margaret Welles Ba... female 48.0 0 0 25.9292 S
863 864 0 3 Sage, Miss. Dorothy Edith "Dolly" female NaN 8 2 69.5500 S
864 865 0 2 Gill, Mr. John William male 24.0 0 0 13.0000 S
865 866 1 2 Bystrom, Mrs. (Karolina) female 42.0 0 0 13.0000 S
866 867 1 2 Duran y More, Miss. Asuncion female 27.0 1 0 13.8583 C
867 868 0 1 Roebling, Mr. Washington Augustus II male 31.0 0 0 50.4958 S
868 869 0 3 van Melkebeke, Mr. Philemon male NaN 0 0 9.5000 S
869 870 1 3 Johnson, Master. Harold Theodor male 4.0 1 1 11.1333 S
870 871 0 3 Balkic, Mr. Cerin male 26.0 0 0 7.8958 S
871 872 1 1 Beckwith, Mrs. Richard Leonard (Sallie Monypeny) female 47.0 1 1 52.5542 S
872 873 0 1 Carlsson, Mr. Frans Olof male 33.0 0 0 5.0000 S
873 874 0 3 Vander Cruyssen, Mr. Victor male 47.0 0 0 9.0000 S
874 875 1 2 Abelson, Mrs. Samuel (Hannah Wizosky) female 28.0 1 0 24.0000 C
875 876 1 3 Najib, Miss. Adele Kiamie "Jane" female 15.0 0 0 7.2250 C
876 877 0 3 Gustafsson, Mr. Alfred Ossian male 20.0 0 0 9.8458 S
877 878 0 3 Petroff, Mr. Nedelio male 19.0 0 0 7.8958 S
878 879 0 3 Laleff, Mr. Kristo male NaN 0 0 7.8958 S
879 880 1 1 Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) female 56.0 0 1 83.1583 C
880 881 1 2 Shelley, Mrs. William (Imanita Parrish Hall) female 25.0 0 1 26.0000 S
881 882 0 3 Markun, Mr. Johann male 33.0 0 0 7.8958 S
882 883 0 3 Dahlberg, Miss. Gerda Ulrika female 22.0 0 0 10.5167 S
883 884 0 2 Banfield, Mr. Frederick James male 28.0 0 0 10.5000 S
884 885 0 3 Sutehall, Mr. Henry Jr male 25.0 0 0 7.0500 S
885 886 0 3 Rice, Mrs. William (Margaret Norton) female 39.0 0 5 29.1250 Q
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 13.0000 S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 30.0000 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 23.4500 S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 30.0000 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 7.7500 Q

891 rows × 10 columns


In [71]:
# Derive a Title column from Name: the run of letters that follows a space and
# is terminated by a period, e.g. "Braund, Mr. Owen Harris" -> "Mr".
for dataset in combine:
    # Raw string literal: `\.` is not a valid Python string escape, so in a
    # plain literal it triggers an invalid-escape warning on modern Python.
    # The pattern the regex engine receives is unchanged.
    dataset['Title'] = dataset.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)

# Count each extracted title per sex.
pd.crosstab(train_df['Title'], train_df['Sex'])


Out[71]:
Sex female male
Title
Capt 0 1
Col 0 2
Countess 1 0
Don 0 1
Dr 1 6
Jonkheer 0 1
Lady 1 0
Major 0 2
Master 0 40
Miss 182 0
Mlle 2 0
Mme 1 0
Mr 0 517
Mrs 125 0
Ms 1 0
Rev 0 6
Sir 0 1

In [72]:
# Titles that are folded into a single 'Rare' bucket.
rare_titles = ['Lady', 'Countess','Capt', 'Col',
               'Don', 'Dr', 'Major', 'Rev', 'Sir',
               'Jonkheer', 'Dona']

# (old value(s), replacement) pairs, applied in this order to every frame.
title_merges = [
    (rare_titles, 'Rare'),
    ('Mlle', 'Miss'),
    ('Mme', 'Mrs'),
    ('Ms', 'Miss'),
]

for dataset in combine:
    titles = dataset['Title']
    for old, new in title_merges:
        titles = titles.replace(old, new)
    dataset['Title'] = titles

# Mean of Survived per consolidated title, highest first.
(
    train_df.loc[:, ['Title', 'Survived']]
    .groupby('Title', as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)


Out[72]:
Title Survived
3 Mrs 0.793651
1 Miss 0.702703
0 Master 0.575000
4 Rare 0.347826
2 Mr 0.156673

In [73]:
# Display the Title column of the training frame.
train_df.loc[:, 'Title']


Out[73]:
0          Mr
1         Mrs
2        Miss
3         Mrs
4          Mr
5          Mr
6          Mr
7      Master
8         Mrs
9         Mrs
10       Miss
11       Miss
12         Mr
13         Mr
14       Miss
15        Mrs
16     Master
17         Mr
18        Mrs
19        Mrs
20         Mr
21         Mr
22       Miss
23         Mr
24       Miss
25        Mrs
26         Mr
27         Mr
28       Miss
29         Mr
        ...  
861        Mr
862       Mrs
863      Miss
864        Mr
865       Mrs
866      Miss
867        Mr
868        Mr
869    Master
870        Mr
871       Mrs
872        Mr
873        Mr
874       Mrs
875      Miss
876        Mr
877        Mr
878        Mr
879       Mrs
880       Mrs
881        Mr
882      Miss
883        Mr
884        Mr
885       Mrs
886      Rare
887      Miss
888      Miss
889        Mr
890        Mr
Name: Title, dtype: object

In [74]:
# Integer code for each title group; 0 is used for anything not in the mapping.
title_mapping = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 5}
_title_codes = set(title_mapping.values())


def _encode_title(title):
    """Return the integer code for ``title``.

    Labels present in ``title_mapping`` are translated to their code, values
    that already are one of the codes are returned unchanged, and everything
    else (unknown labels, missing values) becomes 0.
    """
    if title in title_mapping:
        return title_mapping[title]
    return title if title in _title_codes else 0


for dataset in combine:
    # Letting already-encoded values pass through keeps the cell safe to
    # re-run: mapping the column through the dict a second time would turn
    # every existing code into NaN, and filling those with 0 would silently
    # zero the whole column.
    dataset['Title'] = dataset['Title'].map(_encode_title)

train_df.head()


Out[74]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Fare Embarked Title
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 7.2500 S 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 71.2833 C 3
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 7.9250 S 2
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 53.1000 S 3
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 8.0500 S 1

In [75]:
# Name is dropped from both frames; PassengerId is dropped from the training
# frame only and stays in the test frame.
train_df = train_df.drop(labels=['PassengerId', 'Name'], axis='columns')
test_df = test_df.drop(labels='Name', axis='columns')

# drop() returns new frames, so rebuild the list that later cells iterate over.
combine = [train_df, test_df]
print(train_df.shape, test_df.shape)


((891, 9), (418, 9))

In [76]:
# Binary-encode Sex in both frames: female -> 1, male -> 0.
gender = dict(female=1, male=0)
for dataset in combine:
    sex_codes = dataset['Sex'].map(gender)
    dataset['Sex'] = sex_codes.astype(int)

train_df.head(n=5)


Out[76]:
Survived Pclass Sex Age SibSp Parch Fare Embarked Title
0 0 3 0 22.0 1 0 7.2500 S 1
1 1 1 1 38.0 1 0 71.2833 C 3
2 1 3 1 26.0 0 0 7.9250 S 2
3 1 1 1 35.0 1 0 53.1000 S 3
4 0 3 0 35.0 0 0 8.0500 S 1

In [ ]: