In [1]:
import pandas as pd
from pandas import Series, DataFrame

In [2]:
titanic_df = pd.read_csv('train.csv')

In [4]:
titanic_df.head()


Out[4]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

In [5]:
titanic_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB

In [6]:
import numpy as np

In [7]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [9]:
sns.factorplot('Sex',data=titanic_df,
kind='count')


Out[9]:
<seaborn.axisgrid.FacetGrid at 0x11a3da550>

In [11]:
sns.factorplot('Sex',data=titanic_df,hue='Pclass',kind='count')


Out[11]:
<seaborn.axisgrid.FacetGrid at 0x11a4dcb38>

In [13]:
sns.factorplot('Pclass',data=titanic_df,hue='Sex',kind='count')


Out[13]:
<seaborn.axisgrid.FacetGrid at 0x11a4ca208>

In [15]:
def male_female_child(passenger):
    age,sex = passenger
    
    if age < 16:
        return 'child'
    else:
        return sex

In [16]:
titanic_df['person'] = titanic_df[['Age', 'Sex']].apply(male_female_child,axis=1)

In [17]:
titanic_df[0:10]


Out[17]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked person
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S male
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C female
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S female
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S female
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S male
5 6 0 3 Moran, Mr. James male NaN 0 0 330877 8.4583 NaN Q male
6 7 0 1 McCarthy, Mr. Timothy J male 54.0 0 0 17463 51.8625 E46 S male
7 8 0 3 Palsson, Master. Gosta Leonard male 2.0 3 1 349909 21.0750 NaN S child
8 9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 2 347742 11.1333 NaN S female
9 10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 0 237736 30.0708 NaN C child

In [18]:
sns.factorplot('Pclass',data=titanic_df,hue='person',kind='count')


Out[18]:
<seaborn.axisgrid.FacetGrid at 0x11a89cd30>

In [19]:
titanic_df['Age'].hist(bins=70)


Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x11abfa9b0>

In [20]:
titanic_df['Age'].mean()


Out[20]:
29.69911764705882

In [25]:
titanic_df['person'].value_counts()


Out[25]:
male      537
female    271
child      83
Name: person, dtype: int64

In [26]:
titanic_df['Sex'].value_counts()


Out[26]:
male      577
female    314
Name: Sex, dtype: int64

In [27]:
fig = sns.FacetGrid(titanic_df,hue='Sex',aspect=4)
fig.map(sns.kdeplot,'Age',shade=True)

oldest = titanic_df['Age'].max()

fig.set(xlim=(0,oldest))

fig.add_legend()


//anaconda/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[27]:
<seaborn.axisgrid.FacetGrid at 0x11adb8d68>

In [28]:
fig = sns.FacetGrid(titanic_df,hue='person',aspect=4)
fig.map(sns.kdeplot,'Age',shade=True)

oldest = titanic_df['Age'].max()

fig.set(xlim=(0,oldest))

fig.add_legend()


//anaconda/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[28]:
<seaborn.axisgrid.FacetGrid at 0x11aeaacc0>

In [29]:
fig = sns.FacetGrid(titanic_df,hue='Pclass',aspect=4)
fig.map(sns.kdeplot,'Age',shade=True)

oldest = titanic_df['Age'].max()

fig.set(xlim=(0,oldest))

fig.add_legend()


//anaconda/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[29]:
<seaborn.axisgrid.FacetGrid at 0x11b15a5c0>

In [30]:
titanic_df.head()


Out[30]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked person
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S male
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C female
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S female
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S female
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S male

In [31]:
deck = titanic_df['Cabin'].dropna()

In [33]:
deck.head()


Out[33]:
1      C85
3     C123
6      E46
10      G6
11    C103
Name: Cabin, dtype: object

In [35]:
levels = []

for level in deck:
    levels.append(level[0])

cabin_df = DataFrame(levels)
cabin_df.columns = ['Cabin']
sns.factorplot('Cabin',data=cabin_df,palette='winter_d',kind='count')


Out[35]:
<seaborn.axisgrid.FacetGrid at 0x11b564438>

In [38]:
cabin_df = cabin_df[cabin_df.Cabin !='T']
sns.factorplot('Cabin',data=cabin_df,palette='summer',kind='count')


Out[38]:
<seaborn.axisgrid.FacetGrid at 0x11b5d4240>

In [39]:
titanic_df.head()


Out[39]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked person
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S male
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C female
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S female
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S female
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S male

In [45]:
sns.factorplot('Embarked',data=titanic_df,hue='Pclass',kind='count',x_order=['C','Q','S'])


//anaconda/lib/python3.5/site-packages/seaborn/categorical.py:3367: UserWarning: The `x_order` parameter has been renamed `order`
  UserWarning)
Out[45]:
<seaborn.axisgrid.FacetGrid at 0x11ac21f28>

In [46]:
titanic_df.head()


Out[46]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked person
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S male
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C female
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S female
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S female
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S male

In [47]:
titanic_df['Alone'] = titanic_df.SibSp + titanic_df.Parch

In [48]:
titanic_df['Alone']


Out[48]:
0       1
1       1
2       0
3       1
4       0
5       0
6       0
7       4
8       2
9       1
10      2
11      0
12      0
13      6
14      0
15      0
16      5
17      0
18      1
19      0
20      0
21      0
22      0
23      0
24      4
25      6
26      0
27      5
28      0
29      0
       ..
861     1
862     0
863    10
864     0
865     0
866     1
867     0
868     0
869     2
870     0
871     2
872     0
873     0
874     1
875     0
876     0
877     0
878     0
879     1
880     1
881     0
882     0
883     0
884     0
885     5
886     0
887     0
888     3
889     0
890     0
Name: Alone, dtype: int64

In [50]:
titanic_df['Alone'].loc[titanic_df['Alone'] >0] = 'With family' 

titanic_df['Alone'].loc[titanic_df['Alone'] == 0] = 'Alone'


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-50-3a24223ab913> in <module>()
----> 1 titanic_df['Alone'].loc[titanic_df['Alone'] >0] = 'With family'
      2 
      3 titanic_df['Alone'].loc[titanic_df['Alone'] == 0] = 'Alone'

//anaconda/lib/python3.5/site-packages/pandas/core/ops.py in wrapper(self, other, axis)
    761                 other = np.asarray(other)
    762 
--> 763             res = na_op(values, other)
    764             if isscalar(res):
    765                 raise TypeError('Could not compare %s type with Series' %

//anaconda/lib/python3.5/site-packages/pandas/core/ops.py in na_op(x, y)
    681                     result = lib.vec_compare(x, y, op)
    682             else:
--> 683                 result = lib.scalar_compare(x, y, op)
    684         else:
    685 

pandas/lib.pyx in pandas.lib.scalar_compare (pandas/lib.c:14261)()

TypeError: unorderable types: str() > int()

In [52]:
titanic_df['Survivor'] = titanic_df.Survived.map({0:'no',1:'yes'})

sns.factorplot('Survivor', data=titanic_df,kind='count')


Out[52]:
<seaborn.axisgrid.FacetGrid at 0x11b92b9b0>

In [54]:
sns.factorplot('Pclass','Survived',hue='person',data=titanic_df)


Out[54]:
<seaborn.axisgrid.FacetGrid at 0x11bc86240>

In [56]:
sns.lmplot('Age','Survived',data=titanic_df)


Out[56]:
<seaborn.axisgrid.FacetGrid at 0x11bc0dd30>

In [58]:
sns.lmplot('Age','Survived',hue='Pclass'
,data=titanic_df)


Out[58]:
<seaborn.axisgrid.FacetGrid at 0x11c07cc18>

In [ ]: