In [28]:
import pandas as pd

import matplotlib as plt
%matplotlib inline

In [2]:
df = pd.read_csv('../titanicdeath/static/X_train.csv')

In [3]:
df.head()


Out[3]:
Pclass Sex Age Fare Embarked Title IsAlone Age*Class
0 3 0 1 0 0 1 0 3
1 1 1 2 3 1 3 0 2
2 3 1 1 1 0 2 1 3
3 1 1 2 3 0 3 0 2
4 3 0 2 1 0 1 1 6

In [4]:
embarked_dist = df['Embarked'].value_counts()

In [5]:
embarked_dist


Out[5]:
0    646
1    168
2     77
Name: Embarked, dtype: int64

In [6]:
embarked_dist[0]/sum(embarked_dist)


Out[6]:
0.72502805836139173

In [7]:
embarked_dist[1]/sum(embarked_dist)


Out[7]:
0.18855218855218855

In [8]:
embarked_dist[2]/sum(embarked_dist)


Out[8]:
0.086419753086419748

In [9]:
# this is the code we should use to sample from the exsiting distribution of embarkation ports
df['Embarked'].sample(1).values[0]


Out[9]:
1

In [10]:
# graph the titles and sexes of those titles
cross = pd.crosstab(df['Title'], df['Sex'])
cross


Out[10]:
Sex 0 1
Title
1 517 0
2 0 185
3 0 126
4 40 0
5 20 3

In [11]:
df['Title'][df['Sex'] == 0].sample()


Out[11]:
287    1
Name: Title, dtype: int64

In [12]:
cross[0]


Out[12]:
Title
1    517
2      0
3      0
4     40
5     20
Name: 0, dtype: int64

In [13]:
cross[1]


Out[13]:
Title
1      0
2    185
3    126
4      0
5      3
Name: 1, dtype: int64

In [14]:
cross[0].sample(1)


Out[14]:
Title
1    517
Name: 0, dtype: int64

In [15]:
cross[1].sample(1).values[0]


Out[15]:
185

In [16]:
cross_fare = pd.crosstab(df['Pclass'], df['Fare'])
cross_fare


Out[16]:
Fare 0 1 2 3
Pclass
1 6 0 51 159
2 6 86 70 22
3 211 131 108 41

In [17]:
cross_fare[0]


Out[17]:
Pclass
1      6
2      6
3    211
Name: 0, dtype: int64

In [18]:
cross_fare[1]


Out[18]:
Pclass
1      0
2     86
3    131
Name: 1, dtype: int64

In [19]:
cross_fare[2]


Out[19]:
Pclass
1     51
2     70
3    108
Name: 2, dtype: int64

In [20]:
cross_fare[3]


Out[20]:
Pclass
1    159
2     22
3     41
Name: 3, dtype: int64

In [21]:
df['Fare'][df['Pclass'] == 1].sample(1)


Out[21]:
194    2
Name: Fare, dtype: int64

In [23]:
# get a distribution of parents/grandparents
# load in original data

orig_train = pd.read_csv('../titanicdeath/static/train.csv')
orig_test = pd.read_csv('../titanicdeath/static/test.csv')

In [25]:
orig_train.head()


Out[25]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

In [31]:
# number of parents and children
orig_train['Parch'].describe()


Out[31]:
count    891.000000
mean       0.381594
std        0.806057
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        6.000000
Name: Parch, dtype: float64

In [ ]:
# so the mean number of spouses is ~ 0.4 or half a person
# so what we want to do in our code is sample from this dist.