TITANIC DATA SET DISCOVERY

Is the story a young poor guy

and a rich girl.

And he died because she said there was no room left on the floating piece of wood (a.ka. selfish).

Fortunately, we have the ship's manifest of nearly every single passenger. But using standard python libraries to peak and poke that dataset is kinda of a miserable experience. Fortunately, we have pandas.

http://pandas.pydata.org/

Pandas provides an abstraction to load in data, manipulate it, and export your changes.



In [1]:

    
from __future__ import division



In [2]:

    
import math



In [3]:

    
from matplotlib import pyplot as plt



In [4]:

    
plt.style.use('ggplot')



In [7]:

    
import pandas as pd



In [8]:

    
import numpy as np



In [9]:

    
%matplotlib inline



In [10]:

    
df = pd.read_csv("data/train.csv")



In [11]:

    
df.head()









    Out[11]:






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      0
      1
      0
      3
      Braund, Mr. Owen Harris
      male
      22.0
      1
      0
      A/5 21171
      7.2500
      NaN
      S
    
    
      1
      2
      1
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38.0
      1
      0
      PC 17599
      71.2833
      C85
      C
    
    
      2
      3
      1
      3
      Heikkinen, Miss. Laina
      female
      26.0
      0
      0
      STON/O2. 3101282
      7.9250
      NaN
      S
    
    
      3
      4
      1
      1
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      female
      35.0
      1
      0
      113803
      53.1000
      C123
      S
    
    
      4
      5
      0
      3
      Allen, Mr. William Henry
      male
      35.0
      0
      0
      373450
      8.0500
      NaN
      S



In [12]:

    
df.tail()









    Out[12]:






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      886
      887
      0
      2
      Montvila, Rev. Juozas
      male
      27.0
      0
      0
      211536
      13.00
      NaN
      S
    
    
      887
      888
      1
      1
      Graham, Miss. Margaret Edith
      female
      19.0
      0
      0
      112053
      30.00
      B42
      S
    
    
      888
      889
      0
      3
      Johnston, Miss. Catherine Helen "Carrie"
      female
      NaN
      1
      2
      W./C. 6607
      23.45
      NaN
      S
    
    
      889
      890
      1
      1
      Behr, Mr. Karl Howell
      male
      26.0
      0
      0
      111369
      30.00
      C148
      C
    
    
      890
      891
      0
      3
      Dooley, Mr. Patrick
      male
      32.0
      0
      0
      370376
      7.75
      NaN
      Q



In [13]:

    
#Lets see the types that were imported on our behalf
#df is an object and dtypes is an attribute
df.dtypes

#info is a function
df.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB



In [14]:

    
df.describe()









    Out[14]:






  
    
      
      PassengerId
      Survived
      Pclass
      Age
      SibSp
      Parch
      Fare
    
  
  
    
      count
      891.000000
      891.000000
      891.000000
      714.000000
      891.000000
      891.000000
      891.000000
    
    
      mean
      446.000000
      0.383838
      2.308642
      29.699118
      0.523008
      0.381594
      32.204208
    
    
      std
      257.353842
      0.486592
      0.836071
      14.526497
      1.102743
      0.806057
      49.693429
    
    
      min
      1.000000
      0.000000
      1.000000
      0.420000
      0.000000
      0.000000
      0.000000
    
    
      25%
      223.500000
      0.000000
      2.000000
      20.125000
      0.000000
      0.000000
      7.910400
    
    
      50%
      446.000000
      0.000000
      3.000000
      28.000000
      0.000000
      0.000000
      14.454200
    
    
      75%
      668.500000
      1.000000
      3.000000
      38.000000
      1.000000
      0.000000
      31.000000
    
    
      max
      891.000000
      1.000000
      3.000000
      80.000000
      8.000000
      6.000000
      512.329200



In [15]:

    
df.shape









    Out[15]:





(891, 12)



In [16]:

    
df.columns









    Out[16]:





Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')



In [17]:

    
df['Name']









    Out[17]:





0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
5                                       Moran, Mr. James
6                                McCarthy, Mr. Timothy J
7                         Palsson, Master. Gosta Leonard
8      Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)
9                    Nasser, Mrs. Nicholas (Adele Achem)
10                       Sandstrom, Miss. Marguerite Rut
11                              Bonnell, Miss. Elizabeth
12                        Saundercock, Mr. William Henry
13                           Andersson, Mr. Anders Johan
14                  Vestrom, Miss. Hulda Amanda Adolfina
15                      Hewlett, Mrs. (Mary D Kingcome) 
16                                  Rice, Master. Eugene
17                          Williams, Mr. Charles Eugene
18     Vander Planke, Mrs. Julius (Emelia Maria Vande...
19                               Masselmani, Mrs. Fatima
20                                  Fynney, Mr. Joseph J
21                                 Beesley, Mr. Lawrence
22                           McGowan, Miss. Anna "Annie"
23                          Sloper, Mr. William Thompson
24                         Palsson, Miss. Torborg Danira
25     Asplund, Mrs. Carl Oscar (Selma Augusta Emilia...
26                               Emir, Mr. Farred Chehab
27                        Fortune, Mr. Charles Alexander
28                         O'Dwyer, Miss. Ellen "Nellie"
29                                   Todoroff, Mr. Lalio
                             ...                        
861                          Giles, Mr. Frederick Edward
862    Swift, Mrs. Frederick Joel (Margaret Welles Ba...
863                    Sage, Miss. Dorothy Edith "Dolly"
864                               Gill, Mr. John William
865                             Bystrom, Mrs. (Karolina)
866                         Duran y More, Miss. Asuncion
867                 Roebling, Mr. Washington Augustus II
868                          van Melkebeke, Mr. Philemon
869                      Johnson, Master. Harold Theodor
870                                    Balkic, Mr. Cerin
871     Beckwith, Mrs. Richard Leonard (Sallie Monypeny)
872                             Carlsson, Mr. Frans Olof
873                          Vander Cruyssen, Mr. Victor
874                Abelson, Mrs. Samuel (Hannah Wizosky)
875                     Najib, Miss. Adele Kiamie "Jane"
876                        Gustafsson, Mr. Alfred Ossian
877                                 Petroff, Mr. Nedelio
878                                   Laleff, Mr. Kristo
879        Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)
880         Shelley, Mrs. William (Imanita Parrish Hall)
881                                   Markun, Mr. Johann
882                         Dahlberg, Miss. Gerda Ulrika
883                        Banfield, Mr. Frederick James
884                               Sutehall, Mr. Henry Jr
885                 Rice, Mrs. William (Margaret Norton)
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, dtype: object



In [20]:

    
my_famous_passenger = df[df["Name"] == "Guggenheim, Mr. Benjamin"]



In [23]:

    
print(my_famous_passenger)









    



     PassengerId  Survived  Pclass                      Name   Sex   Age  \
789          790         0       1  Guggenheim, Mr. Benjamin  male  46.0   

     SibSp  Parch    Ticket  Fare    Cabin Embarked  
789      0      0  PC 17593  79.2  B82 B84        C



In [24]:

    
df["Age"].mean()









    Out[24]:





29.69911764705882



In [30]:

    
df["Fare"].describe









    Out[30]:





512.32920000000001



In [33]:

    
my_rich_passenger = df[df["Fare"] == df["Fare"].max()]
print(my_rich_passenger)









    



     PassengerId  Survived  Pclass                                Name  \
258          259         1       1                    Ward, Miss. Anna   
679          680         1       1  Cardeza, Mr. Thomas Drake Martinez   
737          738         1       1              Lesurer, Mr. Gustave J   

        Sex   Age  SibSp  Parch    Ticket      Fare        Cabin Embarked  
258  female  35.0      0      0  PC 17755  512.3292          NaN        C  
679    male  36.0      0      1  PC 17755  512.3292  B51 B53 B55        C  
737    male  35.0      0      0  PC 17755  512.3292         B101        C



In [35]:

    
cols = list(df.columns.values)
print(cols)









    



['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']



In [37]:

    
cols[-2]









    Out[37]:





'Cabin'



In [48]:

    
df_of_women = df[df["Sex"] == "female"]
df_of_men = df[df["Sex"] == "male"]



In [79]:

    
# Creatre three data frames. Capture them by passenger calss "PClass".
class_type = df["Pclass"].unique()
print(class_type)
df_class_3 = df[df["Pclass"] == class_type[1]]
print(df_class_3.shape[0])



In [74]:

    
df["FamilySize"] = df["SibSp"] + df["Parch"]
df["FamilySize"].describe()









    Out[74]:





count    891.000000
mean       0.904602
std        1.613459
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max       10.000000
Name: FamilySize, dtype: float64



In [81]:

    
df["Age"].hist(bins=16, range=(0,80))









    Out[81]:





<matplotlib.axes._subplots.AxesSubplot at 0x1b44f41ba58>



In [82]:

    
df["Age"].dropna().hist(bins=16, range=(0,80))









    Out[82]:





<matplotlib.axes._subplots.AxesSubplot at 0x1b450bb6860>



In [83]:

    
plt.scatter(df["Fare"],df["Survived"])









    Out[83]:





<matplotlib.collections.PathCollection at 0x1b450cb3358>



In [84]:

    
d = {'one':np.random.rand(10),
    'two':np.random.rand(10)}



In [85]:

    
print(d)









    



{'one': array([ 0.964214  ,  0.68799973,  0.12177282,  0.86190168,  0.92925301,
        0.12294288,  0.95088551,  0.5980512 ,  0.44215978,  0.95522712]), 'two': array([ 0.01942783,  0.61809851,  0.56103999,  0.33145496,  0.47450658,
        0.12462589,  0.40896764,  0.55335105,  0.68795514,  0.42165856])}



In [93]:

    
df_scrap = pd.DataFrame(d)
df_scrap.describe()



In [92]:

    
df_scrap.plot(style=['ro','bx'])









    Out[92]:





<matplotlib.axes._subplots.AxesSubplot at 0x1b450c77d68>



In [96]:

    
import statsmodels.api as sm
import pylab as pl



In [112]:

    
new_df = df



In [113]:

    
def gender_to_numeric(gender):
    if gender == "male":
        return 0
    else:
        return 1



In [114]:

    
new_df["Sex"] = new_df["Sex"].apply(gender_to_numeric)
new_df = new_df[["Survived","Age","Sex","Pclass"]]



In [115]:

    
new_df = new_df.dropna()
train_cols = new_df.columns[1:]
print(train_cols)









    



Index(['Age', 'Sex', 'Pclass'], dtype='object')



In [119]:

    
logit = sm.Logit(new_df["Survived"], new_df[train_cols])
result = logit.fit()









    



Optimization terminated successfully.
         Current function value: 0.579432
         Iterations 5



In [120]:

    
print(result.summary())









    



                           Logit Regression Results                           
==============================================================================
Dep. Variable:               Survived   No. Observations:                  714
Model:                          Logit   Df Residuals:                      711
Method:                           MLE   Df Model:                            2
Date:                Sun, 12 Jun 2016   Pseudo R-squ.:                  0.1421
Time:                        10:45:57   Log-Likelihood:                -413.71
converged:                       True   LL-Null:                       -482.26
                                        LLR p-value:                 1.706e-30
==============================================================================
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Age           -0.0420      0.007     -6.246      0.000        -0.055    -0.029
Sex            3.5854      0.407      8.815      0.000         2.788     4.383
Pclass        -1.2439      0.119    -10.447      0.000        -1.477    -1.010
==============================================================================



In [121]:









    Out[121]:





(714, 4)



In [127]:

    
new_df.iloc[[1,7,10],[1,3]]



In [ ]:

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
886	887	0	2	Montvila, Rev. Juozas	male	27.0	0	0	211536	13.00	NaN	S
887	888	1	1	Graham, Miss. Margaret Edith	female	19.0	0	0	112053	30.00	B42	S
888	889	0	3	Johnston, Miss. Catherine Helen "Carrie"	female	NaN	1	2	W./C. 6607	23.45	NaN	S
889	890	1	1	Behr, Mr. Karl Howell	male	26.0	0	0	111369	30.00	C148	C
890	891	0	3	Dooley, Mr. Patrick	male	32.0	0	0	370376	7.75	NaN	Q

	PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare
count	891.000000	891.000000	891.000000	714.000000	891.000000	891.000000	891.000000
mean	446.000000	0.383838	2.308642	29.699118	0.523008	0.381594	32.204208
std	257.353842	0.486592	0.836071	14.526497	1.102743	0.806057	49.693429
min	1.000000	0.000000	1.000000	0.420000	0.000000	0.000000	0.000000
25%	223.500000	0.000000	2.000000	20.125000	0.000000	0.000000	7.910400
50%	446.000000	0.000000	3.000000	28.000000	0.000000	0.000000	14.454200
75%	668.500000	1.000000	3.000000	38.000000	1.000000	0.000000	31.000000
max	891.000000	1.000000	3.000000	80.000000	8.000000	6.000000	512.329200

	one	two
count	10.000000	10.000000
mean	0.663441	0.420109
std	0.334607	0.212748
min	0.121773	0.019428
25%	0.481133	0.350833
50%	0.774951	0.448083
75%	0.945477	0.559118
max	0.964214	0.687955