In [1]:
# import libaries
import numpy as np
import pandas as pd
import matplotlib.pyplot as pyplt
from IPython.display import Image

# Will first not import Seaborn to show how seaborn makes nice plots
# import seaborn as sns

In [2]:
# Plot in ipython notebook
%matplotlib inline

In [3]:
# Nothing less than the latest :-)
!python --version


Python 3.4.1

In [4]:
print('Numpy Version: ', np.__version__)
print('Pandas Version: ', pd.__version__)


Numpy Version:  1.8.1
Pandas Version:  0.14.0

In [9]:
# Set Display Options
pd.options.display.max_rows = 15

Kaggle Titanic Dataset:


In [6]:
# Reading Data - Titanic Dataset
# read_csv, read_frame
train_df = pd.read_csv('kaggle_titanic_data/train.csv')

In [8]:
train_df


Out[8]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38 1 0 PC 17599 71.2833 C85 C
... ... ... ... ... ... ... ... ... ... ... ... ...
889 890 1 1 Behr, Mr. Karl Howell male 26 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32 0 0 370376 7.7500 NaN Q

891 rows × 12 columns


In [8]:
# What is a Pandas DataFrame
Image(filename='images/dataframe.png')


Out[8]:

In [10]:
train_df.columns


Out[10]:
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype='object')

In [10]:
# Describe summary of data
train_df.describe()


Out[10]:
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200

In [11]:
# Display Column Types
train_df.dtypes


Out[11]:
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [12]:
# Change columns to lowercase
train_df.columns = [col.lower() for col in train_df.columns]

In [13]:
train_df.columns


Out[13]:
Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked'], dtype='object')

In [16]:
# Rename sex to gender
train_df.rename(columns={'sex': 'gender'}, inplace=True)

In [19]:
# select a Column
train_df[['gender', 'pclass']]


Out[19]:
gender pclass
0 male 3
1 female 1
2 female 3
3 female 1
4 male 3
5 male 3
6 male 1
... ... ...
884 male 3
885 female 3
886 male 2
887 female 1
888 female 3
889 male 1
890 male 3

891 rows × 2 columns


In [29]:
# select a row
train_df.ix[0:6][['name':'survived']]


  File "<ipython-input-29-4751a92576db>", line 2
    train_df.ix[0:6][['name':'survived']]
                            ^
SyntaxError: invalid syntax

In [30]:
# show only that survived
train_df[train_df.survived == 1]


Out[30]:
passengerid survived pclass name gender age sibsp parch ticket fare cabin embarked
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0 113803 53.1000 C123 S
8 9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27 0 2 347742 11.1333 NaN S
9 10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14 1 0 237736 30.0708 NaN C
10 11 1 3 Sandstrom, Miss. Marguerite Rut female 4 1 1 PP 9549 16.7000 G6 S
11 12 1 1 Bonnell, Miss. Elizabeth female 58 0 0 113783 26.5500 C103 S
... ... ... ... ... ... ... ... ... ... ... ... ...
871 872 1 1 Beckwith, Mrs. Richard Leonard (Sallie Monypeny) female 47 1 1 11751 52.5542 D35 S
874 875 1 2 Abelson, Mrs. Samuel (Hannah Wizosky) female 28 1 0 P/PP 3381 24.0000 NaN C
875 876 1 3 Najib, Miss. Adele Kiamie "Jane" female 15 0 0 2667 7.2250 NaN C
879 880 1 1 Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) female 56 0 1 11767 83.1583 C50 C
880 881 1 2 Shelley, Mrs. William (Imanita Parrish Hall) female 25 0 1 230433 26.0000 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19 0 0 112053 30.0000 B42 S
889 890 1 1 Behr, Mr. Karl Howell male 26 0 0 111369 30.0000 C148 C

342 rows × 12 columns


In [31]:
# unique values
train_df.gender.unique()


Out[31]:
array(['male', 'female'], dtype=object)

In [32]:
# How many Survived?
train_df['survived'].value_counts()


Out[32]:
0    549
1    342
dtype: int64

In [33]:
# Distribution of Age
ax = train_df['age'].hist()  # bins
ax.set_title('Histogram of Fares')
ax.set_xlabel('x Label')


Out[33]:
<matplotlib.text.Text at 0x10c74ebe0>

In [34]:
# Now set Pandas options to make plot pretty
pd.set_option('display.mpl_style', 'default') # Make the graphs a bit prettier (not necessary if you use seaborn)

In [36]:
# Distribution of Age
ax = train_df['age'].hist(bins=30)  # bins
ax.set_title('Histogram of Fares')
ax.set_xlabel('x Label')


Out[36]:
<matplotlib.text.Text at 0x10c8e16d8>

In [37]:
import seaborn as sns

In [38]:
# Distribution of Age
ax = train_df['age'].hist(bins=30)  # bins
ax.set_title('Histogram of Fares')
ax.set_xlabel('x Label')


Out[38]:
<matplotlib.text.Text at 0x10e913198>

In [39]:
train_df.age[pd.isnull(train_df.age)]


Out[39]:
5    NaN
17   NaN
19   NaN
26   NaN
28   NaN
...
849   NaN
859   NaN
863   NaN
868   NaN
878   NaN
888   NaN
Name: age, Length: 177, dtype: float64

In [40]:
# Age - fix null values
# Method 1
train_df.age[pd.isnull(train_df.age)] = train_df.age.mean()

# Method 2 - Pandas Convenience Functions
train_df.age.fillna(train_df.age.mean())


Out[40]:
0    22
1    38
2    26
3    35
4    35
...
885    39.000000
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: age, Length: 891, dtype: float64

In [41]:
# Did your gender make a difference in survival
train_df.gender.value_counts()


Out[41]:
male      577
female    314
dtype: int64

In [42]:
train_df.groupby(['pclass', 'gender'])['survived'].agg(['sum', 'count'])


Out[42]:
sum count
pclass gender
1 female 91 94
male 45 122
2 female 70 76
male 17 108
3 female 72 144
male 47 347

In [43]:
Image(filename='images/wesm_book_groupby.png')


Out[43]:

In [45]:
# Show how many survivied by gender and class
class_gender_group = train_df.groupby(['pclass', 'gender'])

In [46]:
for k, group in class_gender_group:
    print(k)


(1, 'female')
(1, 'male')
(2, 'female')
(2, 'male')
(3, 'female')
(3, 'male')

In [47]:
# Plot how many survived, by gender and class
grid_plot = sns.FacetGrid(train_df, row='gender', col='pclass')
grid_plot.map(sns.regplot, 'survived', 'age',color='.3', fit_reg=False, x_jitter=.1)


Out[47]:
<seaborn.axisgrid.FacetGrid at 0x10e9d8240>

In [48]:
ax = sns.boxplot(train_df.age, groupby=train_df.pclass)
ax.set_title('Age Distribution by class')


Out[48]:
<matplotlib.text.Text at 0x10f421208>

In [49]:
# Who paid the highest Fare in Titanic.  Did they survive?
train_df.sort_index(by='fare', ascending=False)


Out[49]:
passengerid survived pclass name gender age sibsp parch ticket fare cabin embarked
258 259 1 1 Ward, Miss. Anna female 35 0 0 PC 17755 512.3292 NaN C
737 738 1 1 Lesurer, Mr. Gustave J male 35 0 0 PC 17755 512.3292 B101 C
679 680 1 1 Cardeza, Mr. Thomas Drake Martinez male 36 0 1 PC 17755 512.3292 B51 B53 B55 C
88 89 1 1 Fortune, Miss. Mabel Helen female 23 3 2 19950 263.0000 C23 C25 C27 S
27 28 0 1 Fortune, Mr. Charles Alexander male 19 3 2 19950 263.0000 C23 C25 C27 S

In [50]:
# Highest Paid ticket by Class.
def topn(group, field, n=5):
    return group.sort_index(by=field, ascending=False)[:n]

train_df.groupby('pclass').apply(topn, 'fare', 2)


Out[50]:
passengerid survived pclass name gender age sibsp parch ticket fare cabin embarked
pclass
1 258 259 1 1 Ward, Miss. Anna female 35.000000 0 0 PC 17755 512.3292 NaN C
737 738 1 1 Lesurer, Mr. Gustave J male 35.000000 0 0 PC 17755 512.3292 B101 C
2 665 666 0 2 Hickman, Mr. Lewis male 32.000000 2 0 S.O.C. 14879 73.5000 NaN S
72 73 0 2 Hood, Mr. Ambrose Jr male 21.000000 0 0 S.O.C. 14879 73.5000 NaN S
3 324 325 0 3 Sage, Mr. George John Jr male 29.699118 8 2 CA. 2343 69.5500 NaN S
792 793 0 3 Sage, Miss. Stella Anna female 29.699118 8 2 CA. 2343 69.5500 NaN S

In [51]:
# Youngenst 2 by Class.
def botm(group, field, n=5):
    return group.sort_index(by=field)[:n]

train_df.groupby('pclass').apply(botm, 'age', 2)


Out[51]:
passengerid survived pclass name gender age sibsp parch ticket fare cabin embarked
pclass
1 305 306 1 1 Allison, Master. Hudson Trevor male 0.92 1 2 113781 151.5500 C22 C26 S
297 298 0 1 Allison, Miss. Helen Loraine female 2.00 1 2 113781 151.5500 C22 C26 S
2 755 756 1 2 Hamalainen, Master. Viljo male 0.67 1 1 250649 14.5000 NaN S
78 79 1 2 Caldwell, Master. Alden Gates male 0.83 0 2 248738 29.0000 NaN S
3 803 804 1 3 Thomas, Master. Assad Alexander male 0.42 0 1 2625 8.5167 NaN C
469 470 1 3 Baclini, Miss. Helene Barbara female 0.75 2 1 2666 19.2583 NaN C

In [ ]:
# Write back the changes
# train_df.to_csv('kaggle_titanic_data/train_modified.csv')

In [52]:
# Custom Translation of Values and creating new Columns
def gender_map(val):
    if val == 'male':
        return 1
    return 0
train_df['gender_val'] = train_df.gender.map(gender_map)

In [53]:
train_df


Out[53]:
passengerid survived pclass name gender age sibsp parch ticket fare cabin embarked gender_val
0 1 0 3 Braund, Mr. Owen Harris male 22.000000 1 0 A/5 21171 7.2500 NaN S 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.000000 1 0 PC 17599 71.2833 C85 C 0
2 3 1 3 Heikkinen, Miss. Laina female 26.000000 0 0 STON/O2. 3101282 7.9250 NaN S 0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.000000 1 0 113803 53.1000 C123 S 0
4 5 0 3 Allen, Mr. William Henry male 35.000000 0 0 373450 8.0500 NaN S 1
5 6 0 3 Moran, Mr. James male 29.699118 0 0 330877 8.4583 NaN Q 1
6 7 0 1 McCarthy, Mr. Timothy J male 54.000000 0 0 17463 51.8625 E46 S 1
... ... ... ... ... ... ... ... ... ... ... ... ... ...
884 885 0 3 Sutehall, Mr. Henry Jr male 25.000000 0 0 SOTON/OQ 392076 7.0500 NaN S 1
885 886 0 3 Rice, Mrs. William (Margaret Norton) female 39.000000 0 5 382652 29.1250 NaN Q 0
886 887 0 2 Montvila, Rev. Juozas male 27.000000 0 0 211536 13.0000 NaN S 1
887 888 1 1 Graham, Miss. Margaret Edith female 19.000000 0 0 112053 30.0000 B42 S 0
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 29.699118 1 2 W./C. 6607 23.4500 NaN S 0
889 890 1 1 Behr, Mr. Karl Howell male 26.000000 0 0 111369 30.0000 C148 C 1
890 891 0 3 Dooley, Mr. Patrick male 32.000000 0 0 370376 7.7500 NaN Q 1

891 rows × 13 columns


In [54]:
train_df.drop('gender_val', axis=1, inplace=True)

In [55]:
for row in train_df:
    print(row)


passengerid
survived
pclass
name
gender
age
sibsp
parch
ticket
fare
cabin
embarked

In [56]:
# Some mistakes I made when working with Pandas
# Don't use Loop to update Dataframe
for k, row in train_df.iterrows():
    if row.gender == 'male':
        train_df.ix[k, 'gender'] = 1
    else:
        train_df.ix[k, 'gender'] = 0

In [ ]:
train_df

In [59]:
# Filtering Gotchas
train_df[((train_df.survived == 1) & (train_df.pclass == 1))]


Out[59]:
passengerid survived pclass name gender age sibsp parch ticket fare cabin embarked
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 0 38.000000 1 0 PC 17599 71.2833 C85 C
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 0 35.000000 1 0 113803 53.1000 C123 S
11 12 1 1 Bonnell, Miss. Elizabeth 0 58.000000 0 0 113783 26.5500 C103 S
23 24 1 1 Sloper, Mr. William Thompson 1 28.000000 0 0 113788 35.5000 A6 S
31 32 1 1 Spencer, Mrs. William Augustus (Marie Eugenie) 0 29.699118 1 0 PC 17569 146.5208 B78 C
52 53 1 1 Harper, Mrs. Henry Sleeper (Myna Haxtun) 0 49.000000 1 0 PC 17572 76.7292 D33 C
55 56 1 1 Woolner, Mr. Hugh 1 29.699118 0 0 19947 35.5000 C52 S
... ... ... ... ... ... ... ... ... ... ... ... ...
856 857 1 1 Wick, Mrs. George Dennick (Mary Hitchcock) 0 45.000000 1 1 36928 164.8667 NaN S
857 858 1 1 Daly, Mr. Peter Denis 1 51.000000 0 0 113055 26.5500 E17 S
862 863 1 1 Swift, Mrs. Frederick Joel (Margaret Welles Ba... 0 48.000000 0 0 17466 25.9292 D17 S
871 872 1 1 Beckwith, Mrs. Richard Leonard (Sallie Monypeny) 0 47.000000 1 1 11751 52.5542 D35 S
879 880 1 1 Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) 0 56.000000 0 1 11767 83.1583 C50 C
887 888 1 1 Graham, Miss. Margaret Edith 0 19.000000 0 0 112053 30.0000 B42 S
889 890 1 1 Behr, Mr. Karl Howell 1 26.000000 0 0 111369 30.0000 C148 C

136 rows × 12 columns


In [ ]:


In [ ]:


In [ ]: