In [6]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Handle table-like data and matrices
import numpy as np
import pandas as pd

# Modelling Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier

# Modelling Helpers
from sklearn.preprocessing import Normalizer , scale
from sklearn.impute import SimpleImputer                                   # Imputer was removed from sklearn.preprocessing; SimpleImputer is the current equivalent
from sklearn.model_selection import train_test_split , StratifiedKFold    # cross_validation was renamed to model_selection
from sklearn.feature_selection import RFECV

# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# Configure visualisations
%matplotlib inline
mpl.style.use( 'ggplot' )
sns.set_style( 'white' )
pylab.rcParams[ 'figure.figsize' ] = 8 , 6

In [7]:
def plot_histograms( df , variables , n_rows , n_cols ):
    fig = plt.figure( figsize = ( 16 , 12 ) )
    for i, var_name in enumerate( variables ):
        ax=fig.add_subplot( n_rows , n_cols , i+1 )
        df[ var_name ].hist( bins=10 , ax=ax )
        ax.set_title( 'Skew: ' + str( round( float( df[ var_name ].skew() ) , 2 ) ) )
        ax.set_xticklabels( [] , visible=False )
        ax.set_yticklabels( [] , visible=False )
    fig.tight_layout()  # Improves appearance a bit.
    plt.show()

def plot_distribution( df , var , target , **kwargs ):
    row = kwargs.get( 'row' , None )
    col = kwargs.get( 'col' , None )
    facet = sns.FacetGrid( df , hue=target , aspect=4 , row = row , col = col )
    facet.map( sns.kdeplot , var , fill = True )   # 'shade' was renamed to 'fill' in newer seaborn
    facet.set( xlim=( 0 , df[ var ].max() ) )
    facet.add_legend()

def plot_categories( df , cat , target , **kwargs ):
    row = kwargs.get( 'row' , None )
    col = kwargs.get( 'col' , None )
    facet = sns.FacetGrid( df , row = row , col = col )
    facet.map( sns.barplot , cat , target )
    facet.add_legend()

def plot_correlation_map( df ):
    corr = df.corr( numeric_only = True )   # use the frame passed in (not the global), numeric columns only
    _ , ax = plt.subplots( figsize =( 12 , 10 ) )
    cmap = sns.diverging_palette( 220 , 10 , as_cmap = True )
    _ = sns.heatmap(
        corr, 
        cmap = cmap,
        square=True, 
        cbar_kws={ 'shrink' : .9 }, 
        ax=ax, 
        annot = True, 
        annot_kws = { 'fontsize' : 12 }
    )

def describe_more( df ):
    var = [] ; levels = [] ; types = []
    for x in df:
        var.append( x )
        levels.append( df[ x ].nunique() )   # number of distinct values (levels) in the column
        types.append( df[ x ].dtype )
    summary = pd.DataFrame( { 'Variable' : var , 'Levels' : levels , 'Datatype' : types } )
    summary.sort_values( by = 'Levels' , inplace = True )
    return summary

def plot_variable_importance( X , y ):
    tree = DecisionTreeClassifier( random_state = 99 )
    tree.fit( X , y )
    plot_model_var_imp( tree , X , y )
    
def plot_model_var_imp( model , X , y ):
    imp = pd.DataFrame(
        model.feature_importances_ ,
        columns = [ 'Importance' ] ,
        index = X.columns
    )
    # Keep the 10 most important features; ascending order puts the largest bar on top in barh.
    imp = imp.sort_values( [ 'Importance' ] , ascending = True )[ -10: ]
    imp.plot( kind = 'barh' )
    print (model.score( X , y ))
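
These helpers are only defined here; as a quick illustration of how they are meant to be called (a sketch, assuming the titanic DataFrame built in the next cell and its standard column names Age, Sex, Embarked and Survived), typical usage looks like:

# Example usage -- run after the data-loading cell below
plot_distribution( titanic , var = 'Age' , target = 'Survived' , row = 'Sex' )   # one KDE panel per Sex, coloured by Survived
plot_categories( titanic , cat = 'Embarked' , target = 'Survived' )              # mean survival rate per embarkation port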

In [9]:
# get titanic & test csv files as a DataFrame
train = pd.read_csv( "train.csv" )
test  = pd.read_csv( "test.csv" )

# DataFrame.append was removed in pandas 2.0; pd.concat does the same row-wise stacking.
full = pd.concat( [ train , test ] , ignore_index = True , sort = True )   # sort=True keeps the columns in alphabetical order
titanic = full[ :891 ]

del train , test

print ('Datasets:' , 'full:' , full.shape , 'titanic:' , titanic.shape)


Datasets: full: (1309, 12) titanic: (891, 12)
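
Stacking the train and test files into one full frame means that any later feature engineering is applied to both sets identically; the first 891 rows (the labelled portion) are then sliced back out as titanic. A sketch of how that split is typically recovered for modelling (full_X is a hypothetical feature matrix derived from full, not something defined above):

# Sketch only: full_X stands for whatever numeric feature matrix gets built from `full` later on.
full_X  = pd.get_dummies( full.drop( [ 'Survived' , 'Name' , 'Ticket' , 'Cabin' ] , axis = 1 ) ).fillna( 0 )
train_X = full_X[ :891 ]      # rows with known labels
test_X  = full_X[ 891: ]      # rows to predict (the Kaggle test set)
train_y = titanic[ 'Survived' ]
train_X , valid_X , train_y , valid_y = train_test_split( train_X , train_y , train_size = .7 , random_state = 0 )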

In [10]:
titanic.head()


Out[10]:
Age Cabin Embarked Fare Name Parch PassengerId Pclass Sex SibSp Survived Ticket
0 22.0 NaN S 7.2500 Braund, Mr. Owen Harris 0 1 3 male 1 0.0 A/5 21171
1 38.0 C85 C 71.2833 Cumings, Mrs. John Bradley (Florence Briggs Th... 0 2 1 female 1 1.0 PC 17599
2 26.0 NaN S 7.9250 Heikkinen, Miss. Laina 0 3 3 female 0 1.0 STON/O2. 3101282
3 35.0 C123 S 53.1000 Futrelle, Mrs. Jacques Heath (Lily May Peel) 0 4 1 female 1 1.0 113803
4 35.0 NaN S 8.0500 Allen, Mr. William Henry 0 5 3 male 0 0.0 373450
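
Note that Survived appears as a float (0.0 / 1.0): the concatenated test rows have no label, so the column contains NaN there and pandas upcasts it. A quick sketch for checking missing values across the combined frame:

full.isnull().sum()   # missing-value counts per column (Age, Cabin, Embarked, Fare and the test-set Survived)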

In [11]:
titanic.describe()


Out[11]:
Age Fare Parch PassengerId Pclass SibSp Survived
count 714.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000
mean 29.699118 32.204208 0.381594 446.000000 2.308642 0.523008 0.383838
std 14.526497 49.693429 0.806057 257.353842 0.836071 1.102743 0.486592
min 0.420000 0.000000 0.000000 1.000000 1.000000 0.000000 0.000000
25% 20.125000 7.910400 0.000000 223.500000 2.000000 0.000000 0.000000
50% 28.000000 14.454200 0.000000 446.000000 3.000000 0.000000 0.000000
75% 38.000000 31.000000 0.000000 668.500000 3.000000 1.000000 1.000000
max 80.000000 512.329200 6.000000 891.000000 3.000000 8.000000 1.000000
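
The describe_more helper defined earlier complements describe() by listing, for every column, how many distinct values (levels) it has and its dtype; a one-line sketch of calling it on the same frame:

describe_more( titanic )   # returns a DataFrame with columns Variable, Levels and Datatype, sorted by Levels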

In [12]:
plot_correlation_map( titanic )
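
The heatmap gives a visual overview; the same information can be read off numerically by sorting the Survived column of the correlation matrix (a sketch, restricted to the numeric columns):

# numeric_only requires pandas >= 1.5; older pandas skips non-numeric columns automatically
titanic.corr( numeric_only = True )[ 'Survived' ].sort_values( ascending = False )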


