In [6]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')
# Handle table-like data and matrices
import numpy as np
import pandas as pd
# Modelling Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# Modelling Helpers
from sklearn.impute import SimpleImputer  # Imputer moved from sklearn.preprocessing to sklearn.impute
from sklearn.preprocessing import Normalizer, scale
from sklearn.model_selection import train_test_split, StratifiedKFold  # sklearn.cross_validation was renamed model_selection
from sklearn.feature_selection import RFECV
# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# Configure visualisations
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
mpl.rcParams['figure.figsize'] = 8, 6  # pylab.rcParams is the same object, so the pylab import is unnecessary
In [7]:
def plot_histograms(df, variables, n_rows, n_cols):
    """Grid of histograms, one per variable, each titled with its skew."""
    fig = plt.figure(figsize=(16, 12))
    for i, var_name in enumerate(variables):
        ax = fig.add_subplot(n_rows, n_cols, i + 1)
        df[var_name].hist(bins=10, ax=ax)
        ax.set_title('Skew: ' + str(round(float(df[var_name].skew()), 2)))
        ax.set_xticklabels([], visible=False)
        ax.set_yticklabels([], visible=False)
    fig.tight_layout()  # improves spacing between subplots
    plt.show()

def plot_distribution(df, var, target, **kwargs):
    """Faceted KDE plot of `var`, coloured by `target`."""
    row = kwargs.get('row', None)
    col = kwargs.get('col', None)
    facet = sns.FacetGrid(df, hue=target, aspect=4, row=row, col=col)
    facet.map(sns.kdeplot, var, fill=True)  # `shade` was renamed `fill` in recent seaborn
    facet.set(xlim=(0, df[var].max()))
    facet.add_legend()

def plot_categories(df, cat, target, **kwargs):
    """Faceted bar plot of the mean of `target` for each level of `cat`."""
    row = kwargs.get('row', None)
    col = kwargs.get('col', None)
    facet = sns.FacetGrid(df, row=row, col=col)
    facet.map(sns.barplot, cat, target)
    facet.add_legend()

def plot_correlation_map(df):
    """Annotated heatmap of pairwise correlations between numeric columns."""
    corr = df.corr(numeric_only=True)  # was `titanic.corr()`, which ignored the argument
    _, ax = plt.subplots(figsize=(12, 10))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    _ = sns.heatmap(
        corr,
        cmap=cmap,
        square=True,
        cbar_kws={'shrink': .9},
        ax=ax,
        annot=True,
        annot_kws={'fontsize': 12}
    )

def describe_more(df):
    """One row per column: number of distinct levels and dtype, sorted by levels."""
    var, l, t = [], [], []
    for x in df:
        var.append(x)
        l.append(df[x].nunique())  # number of distinct non-null values
        t.append(df[x].dtypes)
    levels = pd.DataFrame({'Variable': var, 'Levels': l, 'Datatype': t})
    levels.sort_values(by='Levels', inplace=True)
    return levels

def plot_variable_importance(X, y):
    """Fit a quick decision tree and plot its feature importances."""
    tree = DecisionTreeClassifier(random_state=99)
    tree.fit(X, y)
    plot_model_var_imp(tree, X, y)

def plot_model_var_imp(model, X, y):
    """Horizontal bar chart of the most important features of a fitted model."""
    imp = pd.DataFrame(
        model.feature_importances_,
        columns=['Importance'],
        index=X.columns
    )
    imp = imp.sort_values(['Importance'], ascending=True)
    imp[-10:].plot(kind='barh')  # the ten largest importances, biggest at the top (was imp[:10], the ten smallest)
    print(model.score(X, y))
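As a minimal, illustrative smoke test (a sketch on synthetic data; `demo` and its columns are made up here), plot_histograms can be run end to end before the real data is loaded:
In [ ]:
# Illustrative smoke test for plot_histograms on random data
demo = pd.DataFrame({'a': np.random.randn(100), 'b': np.random.rand(100)})
plot_histograms(demo, ['a', 'b'], n_rows=1, n_cols=2)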
In [9]:
# Get the Titanic train & test csv files as DataFrames
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
full = pd.concat([train, test], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
titanic = full[:891]  # the labelled training portion (train.csv has 891 rows)
del train, test
print('Datasets:', 'full:', full.shape, 'titanic:', titanic.shape)
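Since test.csv carries no Survived column, its 418 rows get NaN labels in `full`; a quick sanity check on the split (an illustrative sketch):
In [ ]:
# Sanity check: only the 891 training rows carry Survived labels
print(full['Survived'].isnull().sum())      # 418, the unlabelled test rows
print(titanic['Survived'].notnull().all())  # True: every training row is labelled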
In [10]:
titanic.head()
Out[10]: (table output: the first five rows of titanic)
In [11]:
titanic.describe()
Out[11]: (table output: summary statistics for the numeric columns of titanic)
In [12]:
plot_correlation_map( titanic )
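The other plotting helpers can be exercised the same way; an illustrative sketch (not run above) using columns that exist in train.csv:
In [ ]:
# Illustrative calls for the remaining helpers defined earlier
plot_distribution(titanic, var='Age', target='Survived', row='Sex')  # KDE of Age by survival, one facet row per sex
plot_categories(titanic, cat='Embarked', target='Survived')          # mean survival rate per embarkation port
describe_more(titanic)                                               # distinct-level counts and dtypes per column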