notebook.community

Edit and run



In [28]:

    
import pandas as pd

import matplotlib as plt
%matplotlib inline



In [2]:

    
# Load the training feature table. Judging by the outputs further down, columns such as
# Embarked, Sex, Title and Fare hold small integer codes rather than raw values.
df = pd.read_csv('../titanicdeath/static/X_train.csv')



In [3]:

    
df.head()



In [4]:

    
# Tally how many rows carry each Embarked code (value_counts orders by descending count).
embarked_dist = df['Embarked'].value_counts()



In [5]:

    
embarked_dist









    Out[5]:





0    646
1    168
2     77
Name: Embarked, dtype: int64



In [6]:

    
# Fraction of all rows whose Embarked code is 0.
embarked_dist.loc[0] / embarked_dist.sum()









    Out[6]:





0.72502805836139173



In [7]:

    
# Fraction of all rows whose Embarked code is 1.
embarked_dist.loc[1] / embarked_dist.sum()









    Out[7]:





0.18855218855218855



In [8]:

    
# Fraction of all rows whose Embarked code is 2.
embarked_dist.loc[2] / embarked_dist.sum()









    Out[8]:





0.086419753086419748



In [9]:

    
# Draw one embarkation-port code at random from the observed column, so each code
# comes up in proportion to how often it occurs in the existing data.
df['Embarked'].sample(n=1).iloc[0]









    Out[9]:





1



In [10]:

    
# Cross-tabulate Title against Sex: one row per title code, one column per sex code,
# each cell holding the number of rows with that combination.
cross = pd.crosstab(index=df['Title'], columns=df['Sex'])
cross



In [11]:

    
# Draw one Title value at random from the rows where Sex == 0.
df.loc[df['Sex'] == 0, 'Title'].sample()









    Out[11]:





287    1
Name: Title, dtype: int64



In [12]:

    
cross[0]









    Out[12]:





Title
1    517
2      0
3      0
4     40
5     20
Name: 0, dtype: int64



In [13]:

    
cross[1]









    Out[13]:





Title
1      0
2    185
3    126
4      0
5      3
Name: 1, dtype: int64



In [14]:

    
# Picks one row of the Sex == 0 count column. No weights are passed, so every title
# code is equally likely regardless of its count; the value shown is that title's count.
cross[0].sample(n=1)









    Out[14]:





Title
1    517
Name: 0, dtype: int64



In [15]:

    
# Picks one entry of the Sex == 1 count column uniformly at random and unwraps it.
# NOTE: the result is a row count from the crosstab, not a title code.
cross[1].sample(n=1).iloc[0]









    Out[15]:





185



In [16]:

    
# Cross-tabulate passenger class against the Fare code: rows are Pclass values,
# columns are Fare codes, cells are row counts.
cross_fare = pd.crosstab(index=df['Pclass'], columns=df['Fare'])
cross_fare



In [17]:

    
cross_fare[0]









    Out[17]:





Pclass
1      6
2      6
3    211
Name: 0, dtype: int64



In [18]:

    
cross_fare[1]









    Out[18]:





Pclass
1      0
2     86
3    131
Name: 1, dtype: int64



In [19]:

    
cross_fare[2]









    Out[19]:





Pclass
1     51
2     70
3    108
Name: 2, dtype: int64



In [20]:

    
cross_fare[3]









    Out[20]:





Pclass
1    159
2     22
3     41
Name: 3, dtype: int64



In [21]:

    
# Draw one Fare code at random from the rows where Pclass == 1.
df.loc[df['Pclass'] == 1, 'Fare'].sample(n=1)









    Out[21]:





194    2
Name: Fare, dtype: int64



In [23]:

    
# Goal: look at the distribution of the Parch column (parents/children).
# Load the original train and test CSVs for that.

orig_train = pd.read_csv('../titanicdeath/static/train.csv')
orig_test = pd.read_csv('../titanicdeath/static/test.csv')



In [25]:

    
orig_train.head()









    Out[25]:






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      0
      1
      0
      3
      Braund, Mr. Owen Harris
      male
      22.0
      1
      0
      A/5 21171
      7.2500
      NaN
      S
    
    
      1
      2
      1
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38.0
      1
      0
      PC 17599
      71.2833
      C85
      C
    
    
      2
      3
      1
      3
      Heikkinen, Miss. Laina
      female
      26.0
      0
      0
      STON/O2. 3101282
      7.9250
      NaN
      S
    
    
      3
      4
      1
      1
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      female
      35.0
      1
      0
      113803
      53.1000
      C123
      S
    
    
      4
      5
      0
      3
      Allen, Mr. William Henry
      male
      35.0
      0
      0
      373450
      8.0500
      NaN
      S



In [31]:

    
# number of parents and children
orig_train['Parch'].describe()









    Out[31]:





count    891.000000
mean       0.381594
std        0.806057
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        6.000000
Name: Parch, dtype: float64



In [ ]:

    
# so the mean Parch value (parents/children aboard) is ~0.4, i.e. less than half a person
# so what we want to do in our code is sample from this distribution.

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S