In [28]:
import pandas as pd
import matplotlib as plt
%matplotlib inline
In [2]:
# Read the X_train table that the rest of this notebook explores.
df = pd.read_csv(filepath_or_buffer='../titanicdeath/static/X_train.csv')
In [3]:
# Peek at the first five rows to see which columns are available.
df.head(n=5)
Out[3]:
In [4]:
# Count how many rows carry each distinct Embarked value (most frequent first).
embarked_dist = df.loc[:, 'Embarked'].value_counts()
In [5]:
# Display the per-value counts of the Embarked column.
embarked_dist
Out[5]:
In [6]:
# Fraction of all counted rows that fall under entry `[0]` of the counts.
# NOTE(review): `[0]` is a label lookup if Embarked is integer-encoded and a
# positional one if it holds strings — confirm which is intended.
embarked_dist[0] / embarked_dist.sum()
Out[6]:
In [7]:
# Fraction of all counted rows that fall under entry `[1]` of the counts.
# NOTE(review): `[1]` is a label lookup if Embarked is integer-encoded and a
# positional one if it holds strings — confirm which is intended.
embarked_dist[1] / embarked_dist.sum()
Out[7]:
In [8]:
# Fraction of all counted rows that fall under entry `[2]` of the counts.
# NOTE(review): `[2]` is a label lookup if Embarked is integer-encoded and a
# positional one if it holds strings — confirm which is intended.
embarked_dist[2] / embarked_dist.sum()
Out[8]:
In [9]:
# This is the code we should use to sample from the existing distribution of
# embarkation ports: drawing one row at random from the column reproduces the
# observed frequencies without computing the proportions by hand.
df['Embarked'].sample(n=1).iloc[0]
Out[9]:
In [10]:
# Cross-tabulate Title (rows) against Sex (columns) to see which titles occur
# for which sex, and how often.
cross = pd.crosstab(index=df['Title'], columns=df['Sex'])
cross
Out[10]:
In [11]:
# Draw one Title at random from the rows where Sex == 0; because it samples the
# rows themselves, frequent titles are proportionally more likely.
df.loc[df['Sex'] == 0, 'Title'].sample(n=1)
Out[11]:
In [12]:
# Per-title counts for the Sex == 0 column of the crosstab.
cross.loc[:, 0]
Out[12]:
In [13]:
# Per-title counts for the Sex == 1 column of the crosstab.
cross.loc[:, 1]
Out[13]:
In [14]:
# Pick one row of the Sex == 0 count column at random.
# NOTE(review): every title row is equally likely here (no `weights=`), and the
# value returned is that title's count — confirm this is the intended draw.
cross[0].sample(n=1)
Out[14]:
In [15]:
# Pick one row of the Sex == 1 count column at random and unwrap the scalar.
# NOTE(review): this yields a count drawn uniformly over title rows, not a
# title — confirm this is the intended draw.
cross[1].sample(n=1).iloc[0]
Out[15]:
In [16]:
# Cross-tabulate Pclass (rows) against Fare (columns).
cross_fare = pd.crosstab(index=df['Pclass'], columns=df['Fare'])
cross_fare
Out[16]:
In [17]:
# Per-class counts for the Fare == 0 column.
cross_fare.loc[:, 0]
Out[17]:
In [18]:
# Per-class counts for the Fare == 1 column.
cross_fare.loc[:, 1]
Out[18]:
In [19]:
# Per-class counts for the Fare == 2 column.
cross_fare.loc[:, 2]
Out[19]:
In [20]:
# Per-class counts for the Fare == 3 column.
cross_fare.loc[:, 3]
Out[20]:
In [21]:
# Draw one Fare at random from the rows where Pclass == 1.
df.loc[df['Pclass'] == 1, 'Fare'].sample(n=1)
Out[21]:
In [23]:
# Load the original train/test CSVs so the Parch (parents/children) counts can
# be inspected.
orig_train, orig_test = (
    pd.read_csv('../titanicdeath/static/train.csv'),
    pd.read_csv('../titanicdeath/static/test.csv'),
)
In [25]:
# Peek at the first five rows of the original training data.
orig_train.head(n=5)
Out[25]:
In [31]:
# Summary statistics (count, mean, std, quartiles, ...) for Parch, the number
# of parents and children.
orig_train.loc[:, 'Parch'].describe()
Out[31]:
In [ ]:
# so the mean number of parents/children (Parch) is ~ 0.4, i.e. under half a person per passenger
# so what we want to do in our code is sample from this distribution.