In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as pyplt
from IPython.display import Image
# Seaborn is deliberately not imported yet, so the first plots show matplotlib's default look
# import seaborn as sns
In [2]:
# Plot in ipython notebook
%matplotlib inline
In [3]:
# Nothing less than the latest :-)
!python --version
In [4]:
# Print the NumPy and pandas versions this notebook is running against
print('Numpy Version: ', np.__version__)
print('Pandas Version: ', pd.__version__)
In [9]:
# Display options: show at most 15 rows when a frame is rendered
pd.set_option('display.max_rows', 15)
In [6]:
# Reading Data - Titanic Dataset
# read_csv parses a delimited text file into a DataFrame
# (read_frame was the legacy SQL reader; current pandas provides read_sql instead)
train_df = pd.read_csv('kaggle_titanic_data/train.csv')
In [8]:
# Inspect the loaded DataFrame
train_df
Out[8]:
In [8]:
# What is a Pandas DataFrame
# (renders the image file images/dataframe.png inline)
Image(filename='images/dataframe.png')
Out[8]:
In [10]:
# List the column labels of the frame
train_df.columns
Out[10]:
In [10]:
# Describe summary of data: count, mean, std, min, quartiles and max
# (by default describe() reports on the numeric columns of a mixed-type frame)
train_df.describe()
Out[10]:
In [11]:
# Display Column Types (one dtype per column)
train_df.dtypes
Out[11]:
In [12]:
# Normalise every column label to lowercase using the vectorised string accessor
train_df.columns = train_df.columns.str.lower()
In [13]:
# Check the column labels after lower-casing
train_df.columns
Out[13]:
In [16]:
# Rename sex to gender
# Reassign the result instead of using inplace=True: rename() returns a new
# frame, which keeps the step chainable and avoids silent in-place mutation.
train_df = train_df.rename(columns={'sex': 'gender'})
In [19]:
# Select columns: indexing with a list of labels returns a DataFrame holding just those columns
train_df[['gender', 'pclass']]
Out[19]:
In [29]:
# Select rows (and columns) by label with .loc
# - `.ix` was removed from pandas; `.loc` is the label-based replacement.
# - A slice such as 'name':'survived' is not valid inside a list literal, so
#   the two wanted columns are listed explicitly.
# - Label slices are inclusive at both ends: 0:6 returns the rows labelled 0..6.
train_df.loc[0:6, ['name', 'survived']]
In [30]:
# Keep only the rows whose 'survived' flag equals 1 (boolean-mask row selection)
train_df.loc[train_df['survived'] == 1]
Out[30]:
In [31]:
# Distinct values present in the 'gender' column
train_df['gender'].unique()
Out[31]:
In [32]:
# How many Survived? (row count for each distinct value of 'survived')
train_df['survived'].value_counts()
Out[32]:
In [33]:
# Distribution of Age, drawn with the default plotting style
# (Series.hist uses 10 bins unless `bins` is passed)
ax = train_df['age'].hist()
# The plotted column is age, so the title and axis labels say so
ax.set_title('Histogram of Age')
ax.set_xlabel('Age')
ax.set_ylabel('Number of passengers');  # trailing ';' hides the Text repr
Out[33]:
In [34]:
# Make the graphs a bit prettier (not necessary if you use seaborn).
# The pandas option 'display.mpl_style' was removed and now raises an
# OptionError; matplotlib's own style sheets are the replacement.
pyplt.style.use('ggplot')
In [36]:
# Distribution of Age, re-drawn with 30 bins after the plotting style change
ax = train_df['age'].hist(bins=30)
# The plotted column is age, so the title and axis labels say so
ax.set_title('Histogram of Age')
ax.set_xlabel('Age')
ax.set_ylabel('Number of passengers');  # trailing ';' hides the Text repr
Out[36]:
In [37]:
import seaborn as sns
In [38]:
# Distribution of Age, re-drawn with seaborn styling.
# Since seaborn 0.8 a bare `import seaborn` no longer restyles matplotlib,
# so the seaborn defaults are applied explicitly before plotting.
sns.set()
ax = train_df['age'].hist(bins=30)
# The plotted column is age, so the title and axis labels say so
ax.set_title('Histogram of Age')
ax.set_xlabel('Age')
ax.set_ylabel('Number of passengers');  # trailing ';' hides the Text repr
Out[38]:
In [39]:
# Rows whose age is missing: one .loc call with a boolean row mask and the column label
train_df.loc[train_df['age'].isnull(), 'age']
Out[39]:
In [40]:
# Age - fix null values by replacing them with the mean age.
# Either method below is sufficient on its own; both are shown for comparison.

# Method 1 - boolean mask with .loc
# A single .loc[rows, column] assignment writes into train_df itself. The
# chained form `train_df.age[mask] = ...` may assign to a temporary copy
# (SettingWithCopyWarning) and leave the frame unchanged.
train_df.loc[train_df['age'].isnull(), 'age'] = train_df['age'].mean()

# Method 2 - Pandas Convenience Functions
# fillna() returns a NEW Series; without assigning it back the fix is lost.
train_df['age'] = train_df['age'].fillna(train_df['age'].mean())
Out[40]:
In [41]:
# Did your gender make a difference in survival?
# Start with how many passengers there are of each gender.
train_df['gender'].value_counts()
Out[41]:
In [42]:
# Per (pclass, gender) group: the sum of 'survived' and the number of rows
(
    train_df
    .groupby(['pclass', 'gender'])['survived']
    .agg(['sum', 'count'])
)
Out[42]:
In [43]:
# Render the groupby illustration image inline
Image(filename='images/wesm_book_groupby.png')
Out[43]:
In [45]:
# Group passengers by class and gender; this only builds the GroupBy object,
# nothing is computed or displayed until it is iterated or aggregated
class_gender_group = train_df.groupby(['pclass', 'gender'])
In [46]:
# Iterating a GroupBy yields (key, sub-frame) pairs; with two grouping
# columns each key is a (pclass, gender) tuple. Only the keys are printed.
for group_key, _group_frame in class_gender_group:
    print(group_key)
In [47]:
# One panel per (gender, pclass) pair; each panel scatters age against the
# survived flag, with horizontal jitter so overlapping points stay visible
grid_plot = sns.FacetGrid(data=train_df, row='gender', col='pclass')
grid_plot.map(
    sns.regplot,
    'survived',
    'age',
    color='.3',
    fit_reg=False,
    x_jitter=.1,
)
Out[47]:
In [48]:
# Age distribution per passenger class.
# The old call style (values passed positionally plus `groupby=`) was removed
# from seaborn; the long-form x / y / data interface is the replacement.
ax = sns.boxplot(x='pclass', y='age', data=train_df)
ax.set_title('Age Distribution by class')
Out[48]:
In [49]:
# Who paid the highest Fare in Titanic. Did they survive?
# sort_index(by=...) was removed from pandas; sorting by column values is
# done with sort_values. Highest fares come first.
train_df.sort_values(by='fare', ascending=False)
Out[49]:
In [50]:
# Highest Paid ticket by Class.
def topn(group, field, n=5):
    """Return the ``n`` rows of ``group`` with the largest values in ``field``.

    Parameters
    ----------
    group : DataFrame
        Frame to rank (one sub-frame per group when used with groupby.apply).
    field : str
        Column label to sort by.
    n : int, default 5
        Number of rows to keep.

    Returns
    -------
    DataFrame
        At most ``n`` rows, sorted by ``field`` in descending order.
    """
    # sort_index(by=...) was removed from pandas; sort_values is the replacement
    return group.sort_values(by=field, ascending=False).head(n)
# Two highest fares within each passenger class; extra keyword arguments are forwarded to topn
train_df.groupby('pclass').apply(topn, field='fare', n=2)
Out[50]:
In [51]:
# Youngest 2 by Class.
def botm(group, field, n=5):
    """Return the ``n`` rows of ``group`` with the smallest values in ``field``.

    Parameters
    ----------
    group : DataFrame
        Frame to rank (one sub-frame per group when used with groupby.apply).
    field : str
        Column label to sort by.
    n : int, default 5
        Number of rows to keep.

    Returns
    -------
    DataFrame
        At most ``n`` rows, sorted by ``field`` in ascending order. Missing
        values sort last, so they are only returned when fewer than ``n``
        non-missing rows exist.
    """
    # sort_index(by=...) was removed from pandas; sort_values is the replacement
    return group.sort_values(by=field).head(n)
# Two lowest ages within each passenger class; extra keyword arguments are forwarded to botm
train_df.groupby('pclass').apply(botm, field='age', n=2)
Out[51]:
In [ ]:
# Write back the changes
# train_df.to_csv('kaggle_titanic_data/train_modified.csv')
In [52]:
# Custom Translation of Values and creating new Columns
def gender_map(val):
    """Encode a gender label as an integer flag.

    Returns 1 when ``val`` equals the string 'male' and 0 for every other
    value (any other label, or a missing value).
    """
    return 1 if val == 'male' else 0
# Apply gender_map element-wise and store the result as a new column
train_df['gender_val'] = train_df['gender'].map(gender_map)
In [53]:
# Inspect the frame, which now carries the extra gender_val column
train_df
Out[53]:
In [54]:
# Remove the helper column again. drop() returns a new frame, so the result
# is reassigned rather than mutating the existing object with inplace=True.
train_df = train_df.drop(columns='gender_val')
In [55]:
# Gotcha: iterating a DataFrame directly yields its column labels, not its
# rows (use iterrows()/itertuples() when rows are really needed)
for column_label in train_df:
    print(column_label)
In [56]:
# Some mistakes I made when working with Pandas
# Don't use a loop to update a DataFrame: row-by-row writes are slow.
# The vectorised form of this cell is
#     train_df['gender'] = (train_df['gender'] == 'male').astype(int)
# The loop is kept only to illustrate the anti-pattern. `.ix` was removed
# from pandas, so the scalar write uses the label-based `.at` accessor.
# NOTE: not idempotent - a second run finds no 'male' strings and writes 0
# into every row.
for row_label, row in train_df.iterrows():
    train_df.at[row_label, 'gender'] = 1 if row.gender == 'male' else 0
In [ ]:
# Inspect the frame after the loop rewrote the 'gender' column as 1/0
train_df
In [59]:
# Filtering Gotchas
# Each comparison needs its own parentheses because `&` binds more tightly
# than `==`; the conditions are combined element-wise with `&`, not `and`.
train_df[(train_df['survived'] == 1) & (train_df['pclass'] == 1)]
Out[59]:
In [ ]:
In [ ]:
In [ ]: