In [6]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('precision',4)
pd.set_option('display.width',120)
titanic_df = pd.read_csv("./data/titanic_train.csv")
del titanic_df['Name']
#del titanic_df['PassengerId']
del titanic_df['Ticket']
del titanic_df['Cabin']
#titanic_df['Embarked_int'] = pd.Series([titanic_df['Embarked']=='S')
numsex = {"male":1 ,"female" :2}
titanic_df['Sex'] = titanic_df['Sex'].replace(numsex)
titanic_df['Sex'] = titanic_df['Sex'].convert_objects(convert_numeric=True)
numembark = {"S":1 ,"C" :2, "Q":3}
titanic_df['Embarked'] = titanic_df['Embarked'].replace(numembark)
titanic_df['Embarked'] = titanic_df['Embarked'].convert_objects(convert_numeric=True)
titanic_df['Embarked'].fillna(1)
titanic_df['Embarked'].fillna(titanic_df['Age'].mean())
titanic_df.head()
Out[6]:
In [7]:
# Summary statistics for titanic_df; as the last expression in the cell,
# the resulting table is what the notebook displays.
titanic_df.describe()
Out[7]:
In [8]:
# Box plots of titanic_df drawn on shared axes. No `column` is given, so pandas
# chooses the columns; return_type='axes' asks for the matplotlib Axes back.
boxplots = titanic_df.boxplot(return_type='axes')
In [9]:
# Box plot of the 'Age' column, with one box per distinct value of 'Survived'.
# Note: this rebinds `boxplots`, replacing the value assigned in the previous cell.
boxplots = titanic_df.boxplot(column='Age',by='Survived',return_type='axes')
In [10]:
# Plots titanic_df with DataFrame.plot() defaults. Despite the variable name,
# this is not a density plot: the kind='density' argument is commented out.
densityplot = titanic_df.plot() #kind='density'
In [11]:
# Colour each row by its 'Survived' value: 0 -> blue, 1 -> green.
colors_palette = {0: "blue", 1: "green"}

# `groups` is kept as a plain list because a later cell reuses it.
groups = titanic_df.Survived.tolist()

colors = []
for survived_flag in groups:
    colors.append(colors_palette[survived_flag])

# x and y are integer column selectors; which columns they resolve to depends
# on the column order of titanic_df at this point.
simple_scatterplot = titanic_df.plot(
    kind='scatter',
    x=2,
    y=3,
    c=colors,
)
In [12]:
# Hexagonal-binning plot with a 10-cell grid. x=0 and y=3 are integer column
# selectors -- presumably the first and fourth columns of titanic_df as left by
# the cleaning cell; TODO confirm which columns these are.
hexbin = titanic_df.plot(kind='hexbin',x=0,y=3,gridsize=10)
In [13]:
# scatter_matrix lives in pandas.plotting; the old pandas.tools.plotting
# module has been removed from pandas.
from pandas.plotting import scatter_matrix

# One colour per row, keyed on the 'Survived' value (0 -> blue, 1 -> green).
# The colours are derived from the frame directly, so this cell does not
# depend on a `groups` list left behind by an earlier cell.
colors_palette = {0: "blue", 1: "green"}
colors = [colors_palette[c] for c in titanic_df['Survived']]

# Pairwise scatter plots of titanic_df with kernel-density estimates on the
# diagonal.
matrix_of_scatterplots = scatter_matrix(
    titanic_df,
    alpha=0.2,
    figsize=(14, 14),
    color=colors,
    diagonal='kde',
)
In [14]:
# parallel_coordinates lives in pandas.plotting; the old pandas.tools.plotting
# module has been removed from pandas.
from pandas.plotting import parallel_coordinates

# Parallel-coordinates plot of titanic_df, using 'Survived' as the class
# column that groups (and colours) the lines.
pl1 = parallel_coordinates(titanic_df, 'Survived')
In [15]:
# Percentage of missing values per column: the mean of the boolean null mask
# is the fraction of NaN entries in that column.
missing_perc = titanic_df.isnull().mean() * 100

# Series.order() was removed from pandas; sort_values() is its replacement.
sorted_missing_perc = missing_perc.sort_values(ascending=False)
sorted_missing_perc
Out[15]:
In [16]:
import random

# Re-read the raw CSV: the bar charts below match on the original string
# values ('male', 'S', ...). Note that this rebinds `titanic_df`.
titanic_df = pd.read_csv("./data/titanic_train.csv")

# For each categorical column: display label -> value stored in the column.
categories_map = {
    'Pclass': {'First': 1, 'Second': 2, 'Third': 3},
    'Sex': {'Female': 'female', 'Male': 'male'},
    "Survived": {'Perished': 0, 'Survived': 1},
    'Embarked': {'Cherbourg': 'C', 'Queenstown': 'Q', 'Southampton': 'S'},
    'SibSp': {str(x): x for x in [0, 1, 2, 3, 4, 5, 8]},
    'Parch': {str(x): x for x in range(7)},
}
# Top-to-bottom order of the bar-chart panels.
keyorder = ['Survived', 'Sex', 'Pclass', 'Embarked', 'SibSp', 'Parch']
# Continuous columns shown as histograms below the bar charts.
hist_columns = ['Age', 'Fare']
# Fixed palette. The original passed np.random.rand(3,1), which is not a valid
# colour shape for current matplotlib and differed on every run.
colors = ['red', 'green', 'blue', 'yellow', 'magenta', 'orange']

# One panel per categorical column plus one per histogram column.
fig, ax = plt.subplots(len(keyorder) + len(hist_columns), figsize=(10, 12))

cIdx = 0
for category_key in keyorder:
    category_items = categories_map[category_key]
    xlabels = sorted(category_items)
    for idx, cat_name in enumerate(xlabels):
        # Number of rows whose value equals this category's stored value.
        count = int((titanic_df[category_key] == category_items[cat_name]).sum())
        ax[cIdx].bar(idx, count, align='center', label=cat_name,
                     color=colors[idx % len(colors)])
    ax[cIdx].set_title('%s Breakdown' % category_key)
    # Bars are explicitly centred on their index, so the ticks sit there too.
    ax[cIdx].set_xticks(np.arange(len(xlabels)))
    ax[cIdx].set_xticklabels(xlabels)
    ax[cIdx].set_ylabel('Count')
    cIdx += 1

for hcat in hist_columns:
    # Missing values are dropped before binning.
    ax[cIdx].hist(titanic_df[hcat].dropna(), color=colors[cIdx % len(colors)])
    ax[cIdx].set_title('%s Breakdown' % hcat)
    ax[cIdx].set_ylabel('Frequency')
    cIdx += 1

# Extra vertical spacing so panel titles do not overlap the axes above them.
fig.subplots_adjust(hspace=0.8)
plt.show()
In [ ]: