In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier as dt
from sklearn.cross_validation import ShuffleSplit, train_test_split
from sklearn.metrics import make_scorer,accuracy_score
from sklearn.grid_search import GridSearchCV

In [2]:
# Read the passenger table; the path is relative to the working directory.
t_data = pd.read_csv('titanic_data.csv')

# Keep a separate handle on the Survived column. It is taken before the drop
# below, and Survived is not among the dropped columns, so it also stays in
# t_data.
l_data = t_data['Survived']

# Drop these five columns; drop() returns a new frame, so the name is rebound.
t_data = t_data.drop(
    ['Name', 'PassengerId', 'Ticket', 'Cabin', 'Embarked'],
    axis=1,
)
def sexter(data):
    """Map a sex label to an integer flag.

    Returns 1 when ``data`` compares equal to the exact string 'female'
    (case-sensitive), and 0 for anything else.
    """
    if data == 'female':
        return 1
    return 0
# Encode the Sex column as an integer flag via sexter (1 for 'female', 0 otherwise).
t_data['Sex'] = t_data['Sex'].apply(sexter)

# Replace every remaining missing value, in any column, with 0.
# Rebinding instead of inplace=True keeps the step a plain "new frame from old
# frame" assignment. The former mid-cell `t_data.head()` was removed: only the
# last expression of a cell is rendered, so its result was silently discarded.
# NOTE(review): if Age contains missing values they become age 0 here, which
# would skew anything computed from Age -- confirm this imputation is intended.
t_data = t_data.fillna(0)

In [3]:
# Show the first five rows of t_data as this cell's output.
t_data.head()


Out[3]:
Survived Pclass Sex Age SibSp Parch Fare
0 0 3 0 22 1 0 7.2500
1 1 1 1 38 1 0 71.2833
2 1 3 1 26 0 0 7.9250
3 1 1 1 35 1 0 53.1000
4 0 3 0 35 0 0 8.0500

In [30]:
# Survival breakdown for every (Pclass, Sex) combination present in t_data.
#
# dead_p / alive_p are the group's share of ALL deaths / ALL survivors in
# t_data -- they are not the death / survival rate within the group.

# Dataset-wide totals do not change between iterations, so compute them once.
total_dead = (t_data['Survived'] == 0).sum()
total_alive = (t_data['Survived'] == 1).sum()

for group, data in t_data.groupby(['Pclass', 'Sex']):
    # Count with a boolean mask rather than value_counts()[label]: the mask
    # yields 0 when a group has nobody with that outcome, whereas the label
    # lookup raises KeyError.
    dead = (data['Survived'] == 0).sum()
    alive = (data['Survived'] == 1).sum()

    # float() keeps the division fractional under Python 2 as well.
    dead_p = float(dead) / total_dead
    alive_p = float(alive) / total_alive

    # NOTE(review): the label reads "Sex, <n> dead" (comma before the value);
    # the format strings are left untouched so the recorded output stays valid.
    print('Pclass %i, Sex, %i dead = %i, dead_p = %0.2f' % (group[0], group[1], dead, dead_p))
    print('Pclass %i, Sex, %i alive = %i, alive_p = %0.2f' % (group[0], group[1], alive, alive_p))


Pclass 1, Sex, 0 dead = 77, dead_p = 0.14
Pclass 1, Sex, 0 alive = 45, alive_p = 0.13
Pclass 1, Sex, 1 dead = 3, dead_p = 0.01
Pclass 1, Sex, 1 alive = 91, alive_p = 0.27
Pclass 2, Sex, 0 dead = 91, dead_p = 0.17
Pclass 2, Sex, 0 alive = 17, alive_p = 0.05
Pclass 2, Sex, 1 dead = 6, dead_p = 0.01
Pclass 2, Sex, 1 alive = 70, alive_p = 0.20
Pclass 3, Sex, 0 dead = 300, dead_p = 0.55
Pclass 3, Sex, 0 alive = 47, alive_p = 0.14
Pclass 3, Sex, 1 dead = 72, dead_p = 0.13
Pclass 3, Sex, 1 alive = 72, alive_p = 0.21

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: