In [6]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')
# Handle table-like data and matrices
import numpy as np
import pandas as pd
# Modelling Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# Modelling Helpers
from sklearn.impute import SimpleImputer  # Imputer moved from sklearn.preprocessing to sklearn.impute
from sklearn.preprocessing import Normalizer, scale
from sklearn.model_selection import train_test_split, StratifiedKFold  # sklearn.cross_validation was renamed model_selection
from sklearn.feature_selection import RFECV
# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# Configure visualisations
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
mpl.rcParams['figure.figsize'] = 8, 6  # pylab.rcParams is the same object, so the pylab import is unnecessary
In [7]:
def plot_histograms(df, variables, n_rows, n_cols):
    """Grid of histograms, one per variable, each titled with its skew."""
    fig = plt.figure(figsize=(16, 12))
    for i, var_name in enumerate(variables):
        ax = fig.add_subplot(n_rows, n_cols, i + 1)
        df[var_name].hist(bins=10, ax=ax)
        ax.set_title('Skew: ' + str(round(float(df[var_name].skew()), 2)))
        ax.set_xticklabels([], visible=False)
        ax.set_yticklabels([], visible=False)
    fig.tight_layout()  # improves spacing between subplots
    plt.show()

def plot_distribution(df, var, target, **kwargs):
    """Faceted KDE plot of `var`, coloured by `target`."""
    row = kwargs.get('row', None)
    col = kwargs.get('col', None)
    facet = sns.FacetGrid(df, hue=target, aspect=4, row=row, col=col)
    facet.map(sns.kdeplot, var, fill=True)  # `shade` was renamed `fill` in recent seaborn
    facet.set(xlim=(0, df[var].max()))
    facet.add_legend()

def plot_categories(df, cat, target, **kwargs):
    """Faceted bar plot of the mean of `target` for each level of `cat`."""
    row = kwargs.get('row', None)
    col = kwargs.get('col', None)
    facet = sns.FacetGrid(df, row=row, col=col)
    facet.map(sns.barplot, cat, target)
    facet.add_legend()

def plot_correlation_map(df):
    """Annotated heatmap of pairwise correlations between numeric columns."""
    corr = df.corr(numeric_only=True)  # was `titanic.corr()`, which ignored the argument
    _, ax = plt.subplots(figsize=(12, 10))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    _ = sns.heatmap(
        corr,
        cmap=cmap,
        square=True,
        cbar_kws={'shrink': .9},
        ax=ax,
        annot=True,
        annot_kws={'fontsize': 12}
    )

def describe_more(df):
    """One row per column: number of distinct levels and dtype, sorted by levels."""
    var, l, t = [], [], []
    for x in df:
        var.append(x)
        l.append(df[x].nunique())  # number of distinct non-null values
        t.append(df[x].dtypes)
    levels = pd.DataFrame({'Variable': var, 'Levels': l, 'Datatype': t})
    levels.sort_values(by='Levels', inplace=True)
    return levels

def plot_variable_importance(X, y):
    """Fit a quick decision tree and plot its feature importances."""
    tree = DecisionTreeClassifier(random_state=99)
    tree.fit(X, y)
    plot_model_var_imp(tree, X, y)

def plot_model_var_imp(model, X, y):
    """Horizontal bar chart of the most important features of a fitted model."""
    imp = pd.DataFrame(
        model.feature_importances_,
        columns=['Importance'],
        index=X.columns
    )
    imp = imp.sort_values(['Importance'], ascending=True)
    imp[-10:].plot(kind='barh')  # the ten largest importances, biggest at the top (was imp[:10], the ten smallest)
    print(model.score(X, y))
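As a minimal, illustrative smoke test (a sketch on synthetic data; `demo` and its columns are made up here), plot_histograms can be run end to end before the real data is loaded:
In [ ]:
# Illustrative smoke test for plot_histograms on random data
demo = pd.DataFrame({'a': np.random.randn(100), 'b': np.random.rand(100)})
plot_histograms(demo, ['a', 'b'], n_rows=1, n_cols=2)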
In [9]:
# Get the Titanic train & test csv files as DataFrames
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
full = pd.concat([train, test], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
titanic = full[:891]  # the labelled training portion (train.csv has 891 rows)
del train, test
print('Datasets:', 'full:', full.shape, 'titanic:', titanic.shape)
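Since test.csv carries no Survived column, its 418 rows get NaN labels in `full`; a quick sanity check on the split (an illustrative sketch):
In [ ]:
# Sanity check: only the 891 training rows carry Survived labels
print(full['Survived'].isnull().sum())      # 418, the unlabelled test rows
print(titanic['Survived'].notnull().all())  # True: every training row is labelled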
In [10]:
titanic.head()
Out[10]: (table output: the first five rows of titanic)
In [11]:
titanic.describe()
Out[11]: (table output: summary statistics for the numeric columns of titanic)
In [12]:
plot_correlation_map( titanic )
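The other plotting helpers can be exercised the same way; an illustrative sketch (not run above) using columns that exist in train.csv:
In [ ]:
# Illustrative calls for the remaining helpers defined earlier
plot_distribution(titanic, var='Age', target='Survived', row='Sex')  # KDE of Age by survival, one facet row per sex
plot_categories(titanic, cat='Embarked', target='Survived')          # mean survival rate per embarkation port
describe_more(titanic)                                               # distinct-level counts and dtypes per column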