In [1]:
##################################################################################
############################### LOADING DATA #####################################
##################################################################################

In [2]:
import pandas as pd
import numpy as np

In [3]:
datadir = "~/data/kaggle/titanic/"
testfile = datadir + "test.csv"
trainfile = datadir + "train.csv"

test  = pd.read_csv(testfile)
train = pd.read_csv(trainfile)

In [4]:
train.head()


Out[4]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

In [5]:
train.sample(5)


Out[5]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
766 767 0 1 Brewe, Dr. Arthur Jackson male NaN 0 0 112379 39.6000 NaN C
235 236 0 3 Harknett, Miss. Alice Phoebe female NaN 0 0 W./C. 6609 7.5500 NaN S
773 774 0 3 Elias, Mr. Dibo male NaN 0 0 2674 7.2250 NaN C
199 200 0 2 Yrois, Miss. Henriette ("Mrs Harbeck") female 24.0 0 0 248747 13.0000 NaN S
388 389 0 3 Sadlier, Mr. Matthew male NaN 0 0 367655 7.7292 NaN Q

In [6]:
test.head()


Out[6]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S

In [7]:
##################################################################################
############################### DATA ANALYSIS ####################################
##################################################################################

In [8]:
# RULE OF THUMB: avoid using validation or test data to make decisions -- always use your training data.
# Using the other sets pollutes your model with information it shouldn't have and eventually renders
# the test set useless as an estimate of real-world performance.

In [9]:
# Step 0 

# understand the columns
# https://www.kaggle.com/c/titanic/data

#survival   Survival                  Categorical - 0 = No, 1 = Yes
#pclass     Ticket class              Categorical - 1 = 1st, 2 = 2nd, 3 = 3rd
#sex        Sex                       Categorical - male, female
#embarked   Port of Embarkation       Categorical - C = Cherbourg, Q = Queenstown, S = Southampton

#Age        Age in years              Numerical (noisy: some values are missing or estimated)    
#                                     - If less than 1, Age is fractional  
#                                     - If the age is estimated, it is in the form of xx.5
#sibsp      siblings/spouses onboard  Numerical
#parch      parents/children onboard  Numerical - (guardians such as nannies are not counted) 
#fare       Passenger fare            Numerical

#name       Passenger name            Raw text
#ticket     Ticket number             Raw text
#cabin      Cabin number              Raw text

In [10]:
train.dtypes


Out[10]:
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [11]:
# Determine if there is missing data 
pd.isnull(train).sum() > 0


Out[11]:
PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool

In [12]:
pd.isnull(test).sum() > 0


Out[12]:
PassengerId    False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare            True
Cabin           True
Embarked       False
dtype: bool

In [13]:
# NOTE -- the Fare column has missing values in the test set (but not in the training set); see the sketch below
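# a quick sketch (not an executed cell) for inspecting the affected rows:
test[test["Fare"].isnull()]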

In [14]:
# Step 1 - Understand the fundamentals of each column

# Compute mean and variance 
# Compute percentiles and quartiles of the data

In [15]:
train.describe()


Out[15]:
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200

In [16]:
train.describe(percentiles=np.arange(10)/10.0)


Out[16]:
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
0% 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
10% 90.000000 0.000000 1.000000 14.000000 0.000000 0.000000 7.550000
20% 179.000000 0.000000 1.000000 19.000000 0.000000 0.000000 7.854200
30% 268.000000 0.000000 2.000000 22.000000 0.000000 0.000000 8.050000
40% 357.000000 0.000000 2.000000 25.000000 0.000000 0.000000 10.500000
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
60% 535.000000 0.000000 3.000000 31.800000 0.000000 0.000000 21.679200
70% 624.000000 1.000000 3.000000 36.000000 1.000000 0.000000 27.000000
80% 713.000000 1.000000 3.000000 41.000000 1.000000 1.000000 39.687500
90% 802.000000 1.000000 3.000000 50.000000 1.000000 2.000000 77.958300
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200

In [17]:
train["Sex"].value_counts()


Out[17]:
male      577
female    314
Name: Sex, dtype: int64

In [18]:
(train["Sex"] != "female").sum()


Out[18]:
577

In [19]:
#  Step 2 - Understand the correlations between columns 
#  -- especially with the output column (Survived)

# Pivot tables
# Rendering Histograms
# Plotting boxplots

In [20]:
train.pivot_table(values=["Survived"], index=["Sex"], aggfunc=np.mean)


Out[20]:
Survived
Sex
female 0.742038
male 0.188908

In [21]:
# NOTE the give-me here: 
# if we simply predict that every female survived (74% did) and every male died (81% did),
# we already classify roughly 78-79% of the training set correctly.
# Any model that scores below this baseline is complete garbage.
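# a minimal sketch (not an executed cell) that checks this gender-only baseline on the training data
gender_pred = (train["Sex"] == "female").astype(int)       # 1 = predict survived
print (gender_pred == train["Survived"]).mean()            # roughly 0.79 given the rates above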

In [22]:
train.pivot_table(values=["Survived"], index=["Pclass"], aggfunc=np.mean)


Out[22]:
Survived
Pclass
1 0.629630
2 0.472826
3 0.242363

In [23]:
# NOTE the give-me here: 
# wealthier passengers (higher ticket class) survived at a much higher rate.

In [24]:
train.pivot_table(values=["Survived"], index=["Pclass","Sex"], aggfunc=np.mean)


Out[24]:
Survived
Pclass Sex
1 female 0.968085
male 0.368852
2 female 0.921053
male 0.157407
3 female 0.500000
male 0.135447

In [25]:
train.pivot_table(values=["Survived"], index=["SibSp"], aggfunc=np.mean)


Out[25]:
Survived
SibSp
0 0.345395
1 0.535885
2 0.464286
3 0.250000
4 0.166667
5 0.000000
8 0.000000

In [26]:
train.pivot_table(values=["Survived"], index=["Parch"], aggfunc=np.mean)


Out[26]:
Survived
Parch
0 0.343658
1 0.550847
2 0.500000
3 0.600000
4 0.000000
5 0.200000
6 0.000000

In [27]:
train.pivot_table(values=["Survived"],index=["Pclass","Sex"], aggfunc=np.sum)
#train.pivot_table(values=["Pclass"], index=["Sex"], aggfunc=np.sum)


Out[27]:
Survived
Pclass Sex
1 female 91
male 45
2 female 70
male 17
3 female 72
male 47

In [28]:
train.groupby(["Pclass","Sex"])["PassengerId"].count()


Out[28]:
Pclass  Sex   
1       female     94
        male      122
2       female     76
        male      108
3       female    144
        male      347
Name: PassengerId, dtype: int64

In [29]:
import matplotlib.pyplot as plt
%matplotlib inline

In [30]:
# TODO: log scale (a sketch follows the output below)
train.boxplot(column="Fare",by="Survived")


Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6b340b7f50>
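# a sketch of the log-scale TODO above (the "LogFare" helper column is hypothetical, not part of this run);
# log1p handles the zero fares
train["LogFare"] = np.log1p(train["Fare"])
train.boxplot(column="LogFare", by="Survived")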

In [31]:
train.hist(column="Fare",by="Survived",bins=30)


Out[31]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f6b340b7590>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x7f6b2e4aae90>], dtype=object)

In [32]:
train.boxplot(column="Age",by="Survived")


Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6b2e3662d0>

In [33]:
train.hist(column="Age",by="Survived",bins=30)


Out[33]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7f6b2e2260d0>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x7f6b2e0e39d0>], dtype=object)

In [34]:
##################################################################################
############################### DATA CLEAN UP ####################################
##################################################################################

In [35]:
# Step 0 
# save ourselves some time and merge the train and test sets together

alldata = pd.concat([train,test], axis=0)
alldata.head()


Out[35]:
Age Cabin Embarked Fare Name Parch PassengerId Pclass Sex SibSp Survived Ticket
0 22.0 NaN S 7.2500 Braund, Mr. Owen Harris 0 1 3 male 1 0.0 A/5 21171
1 38.0 C85 C 71.2833 Cumings, Mrs. John Bradley (Florence Briggs Th... 0 2 1 female 1 1.0 PC 17599
2 26.0 NaN S 7.9250 Heikkinen, Miss. Laina 0 3 3 female 0 1.0 STON/O2. 3101282
3 35.0 C123 S 53.1000 Futrelle, Mrs. Jacques Heath (Lily May Peel) 0 4 1 female 1 1.0 113803
4 35.0 NaN S 8.0500 Allen, Mr. William Henry 0 5 3 male 0 0.0 373450

In [36]:
# Step 1 
# drop the columns we will not use as features (free-text fields and the row id)

alldata = alldata.drop(["Name", "Ticket", "Cabin","PassengerId"], 1)
alldata.head()


Out[36]:
Age Embarked Fare Parch Pclass Sex SibSp Survived
0 22.0 S 7.2500 0 3 male 1 0.0
1 38.0 C 71.2833 0 1 female 1 1.0
2 26.0 S 7.9250 0 3 female 0 1.0
3 35.0 S 53.1000 0 1 female 1 1.0
4 35.0 S 8.0500 0 3 male 0 0.0

In [37]:
# Step 2 
# Expand the categorical columns into boolean indicators of presence/absence.
# This removes the need for the model to learn the "meaning" of each encoded value and keeps things simple.
# (A sketch that prefixes the dummy columns with their source column names follows the output below.)

# dummy cols: convert each categorical value into its own true/false attribute
dummy_cols=["Embarked","Sex","Pclass"]
for column in dummy_cols:
    dummies = pd.get_dummies(alldata[column])
    alldata[dummies.columns] = dummies
alldata = alldata.drop(dummy_cols, 1)
# drop 'male': it is just the inverse of 'female'
alldata = alldata.drop(["male"], 1)
alldata.head()


Out[37]:
Age Fare Parch SibSp Survived C Q S female 1 2 3
0 22.0 7.2500 0 1 0.0 0 0 1 0 0 0 1
1 38.0 71.2833 0 1 1.0 1 0 0 1 1 0 0
2 26.0 7.9250 0 0 1.0 0 0 1 1 0 0 1
3 35.0 53.1000 0 1 1.0 0 0 1 1 1 0 0
4 35.0 8.0500 0 0 0.0 0 0 1 0 0 0 1
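# as referenced above, a sketch (assumption, not what this notebook ran) that keeps the origin of each
# dummy column obvious by prefixing it with the source column name, e.g. Embarked_C, Sex_female, Pclass_3;
# "alldata_before_dummies" is a hypothetical copy of the frame taken before the manual encoding above
encoded = pd.get_dummies(alldata_before_dummies, columns=["Embarked", "Sex", "Pclass"])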

In [38]:
# Step 3 
# Handle missing data.
# Several options exist; here we go with the simple approach of replacing NaNs with the column mean
# (a group-wise alternative is sketched below).
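# a sketch of one alternative (assumption: applied before Pclass and Sex were dummy-encoded,
# while those columns still exist in the frame): fill missing ages with the median age of
# passengers in the same class and sex, which distorts the distribution less than a global mean
age_fill = alldata.groupby(["Pclass", "Sex"])["Age"].transform("median")
alldata["Age"] = alldata["Age"].fillna(age_fill)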

In [39]:
#check for NaN(bad) data
pd.isnull(alldata).sum() > 0


Out[39]:
Age          True
Fare         True
Parch       False
SibSp       False
Survived     True
C           False
Q           False
S           False
female      False
1           False
2           False
3           False
dtype: bool

In [40]:
pd.isnull(alldata)[1000:1010]


Out[40]:
Age Fare Parch SibSp Survived C Q S female 1 2 3
109 False False False False True False False False False False False False
110 False False False False True False False False False False False False
111 True False False False True False False False False False False False
112 False False False False True False False False False False False False
113 False False False False True False False False False False False False
114 False False False False True False False False False False False False
115 False False False False True False False False False False False False
116 True False False False True False False False False False False False
117 False False False False True False False False False False False False
118 False False False False True False False False False False False False

In [41]:
# check the statistics of the data -- take care not to skew them too much when imputing
alldata.describe()


Out[41]:
Age Fare Parch SibSp Survived C Q S female 1 2 3
count 1046.000000 1308.000000 1309.000000 1309.000000 891.000000 1309.000000 1309.000000 1309.000000 1309.000000 1309.000000 1309.000000 1309.000000
mean 29.881138 33.295479 0.385027 0.498854 0.383838 0.206264 0.093965 0.698243 0.355997 0.246753 0.211612 0.541635
std 14.413493 51.758668 0.865560 1.041658 0.486592 0.404777 0.291891 0.459196 0.478997 0.431287 0.408607 0.498454
min 0.170000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 21.000000 7.895800 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 28.000000 14.454200 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000
75% 39.000000 31.275000 0.000000 1.000000 1.000000 0.000000 0.000000 1.000000 1.000000 0.000000 0.000000 1.000000
max 80.000000 512.329200 9.000000 8.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

In [42]:
# fill NaN values in Age and Fare with the column mean
nan_cols = ["Age","Fare"]
for column in nan_cols:
    coldata = alldata[column]
    coldata = coldata.fillna(coldata.mean())
    alldata[column] = coldata
alldata.head()


Out[42]:
Age Fare Parch SibSp Survived C Q S female 1 2 3
0 22.0 7.2500 0 1 0.0 0 0 1 0 0 0 1
1 38.0 71.2833 0 1 1.0 1 0 0 1 1 0 0
2 26.0 7.9250 0 0 1.0 0 0 1 1 0 0 1
3 35.0 53.1000 0 1 1.0 0 0 1 1 1 0 0
4 35.0 8.0500 0 0 0.0 0 0 1 0 0 0 1

In [43]:
#confirm clean up 
pd.isnull(alldata).sum() > 0


Out[43]:
Age         False
Fare        False
Parch       False
SibSp       False
Survived     True
C           False
Q           False
S           False
female      False
1           False
2           False
3           False
dtype: bool

In [44]:
# double check that the statistics didn't shift too much
# (the Age std fell from 14.4 to 12.9 and the median now sits at the imputed mean)
alldata.describe()


Out[44]:
Age Fare Parch SibSp Survived C Q S female 1 2 3
count 1309.000000 1309.000000 1309.000000 1309.000000 891.000000 1309.000000 1309.000000 1309.000000 1309.000000 1309.000000 1309.000000 1309.000000
mean 29.881138 33.295479 0.385027 0.498854 0.383838 0.206264 0.093965 0.698243 0.355997 0.246753 0.211612 0.541635
std 12.883193 51.738879 0.865560 1.041658 0.486592 0.404777 0.291891 0.459196 0.478997 0.431287 0.408607 0.498454
min 0.170000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 22.000000 7.895800 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 29.881138 14.454200 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000
75% 35.000000 31.275000 0.000000 1.000000 1.000000 0.000000 0.000000 1.000000 1.000000 0.000000 0.000000 1.000000
max 80.000000 512.329200 9.000000 8.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

In [45]:
# slice the data apart again
out_cols = ["Survived"] 

xtrain = alldata[0:len(train)]
ytrain = xtrain[out_cols] 
xtrain = xtrain.drop(out_cols, 1)

xtest  = alldata[len(train):]
ytest  = xtest[out_cols] 
xtest  = xtest.drop(out_cols, 1)

In [46]:
xtrain.head()


Out[46]:
Age Fare Parch SibSp C Q S female 1 2 3
0 22.0 7.2500 0 1 0 0 1 0 0 0 1
1 38.0 71.2833 0 1 1 0 0 1 1 0 0
2 26.0 7.9250 0 0 0 0 1 1 0 0 1
3 35.0 53.1000 0 1 0 0 1 1 1 0 0
4 35.0 8.0500 0 0 0 0 1 0 0 0 1

In [47]:
xtest.head()


Out[47]:
Age Fare Parch SibSp C Q S female 1 2 3
0 34.5 7.8292 0 0 0 1 0 0 0 0 1
1 47.0 7.0000 0 1 0 0 1 1 0 0 1
2 62.0 9.6875 0 0 0 1 0 0 0 1 0
3 27.0 8.6625 0 0 0 0 1 0 0 0 1
4 22.0 12.2875 1 1 0 0 1 1 0 0 1

In [48]:
ytrain.head()


Out[48]:
Survived
0 0.0
1 1.0
2 1.0
3 1.0
4 0.0

In [49]:
# for now, reuse the training set as the "test" set, so the accuracy numbers below are optimistic
# (a proper holdout split is sketched after this cell)

x_train, y_train = xtrain, ytrain
x_test,  y_test  = xtrain, ytrain
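# a sketch of a proper holdout split instead (assumption: an 80/20 split;
# the rest of this notebook keeps the reuse above)
from sklearn.model_selection import train_test_split

x_tr, x_val, y_tr, y_val = train_test_split(xtrain, ytrain, test_size=0.2, random_state=0)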

In [50]:
print x_train.shape, y_train.shape, x_test.shape, y_test.shape


(891, 11) (891, 1) (891, 11) (891, 1)

In [51]:
##################################################################################
################################# MODELLING ######################################
##################################################################################

In [52]:
def plotboundary(inputs, outputs, x1, x2, predict):
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = inputs[x1].min(), inputs[x1].max()
    y_min, y_max = inputs[x2].min(), inputs[x2].max()
    x_step = (x_max - x_min)/30.0
    y_step = (y_max - y_min)/30.0

    # take the min/25%/50%/75%/max rows of describe() as "basis" points:
    # all other features are held fixed at these values while x1 and x2 vary
    basis_idx = [3, 4, 5, 6, 7]
    basis = inputs.describe()
        
    xx, yy = np.meshgrid(np.arange(x_min-x_step, x_max+x_step, x_step), 
                         np.arange(y_min-y_step, y_max+y_step, y_step))
    
    plt.rcParams['figure.figsize'] = (16, 4)
    #plt.figure(figsize=(20,9))
    #plt.subplots_adjust(hspace=.7)
    f, ax = plt.subplots(1, 6)
    fig = 0
    
    # Plot also the training points
    ax[fig].scatter(inputs[x1], inputs[x2], c=outputs, edgecolors='k', cmap=plt.cm.Paired)
    
    ax[fig].set_xlim(xx.min(), xx.max())
    ax[fig].set_ylim(yy.min(), yy.max())
    ax[fig].set_xticks(())
    ax[fig].set_yticks(())
        
    ax[fig].set_xlabel(x1)
    ax[fig].set_ylabel(x2)
    fig += 1
    
    for idx in basis_idx:
        base = basis.iloc[idx]
        tag  = basis.index[idx]
        # build a mock input: one row per mesh point, every feature fixed at the
        # chosen basis row; x1 and x2 are then overwritten with the mesh coordinates
        mockin = pd.concat([base] * xx.ravel().shape[0], axis=1).transpose()

        mockin[x1] = xx.ravel()
        mockin[x2] = yy.ravel()
        
        Z = predict(mockin)
        Z = Z.reshape(xx.shape)
        ax[fig].pcolormesh(xx, yy, Z, cmap='RdBu')

        ax[fig].set_xlim(xx.min(), xx.max())
        ax[fig].set_ylim(yy.min(), yy.max())
        ax[fig].set_xticks(())
        ax[fig].set_yticks(())

        ax[fig].set_xlabel(tag)

        fig += 1


    plt.show()

In [53]:
##################################################################################
################# MODEL1 XGBOOST - GRADIENT BOOSTED TREES #########################
##################################################################################

In [54]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


/usr/local/lib/python2.7/dist-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [55]:
model_xgb = xgb.XGBClassifier()
model_xgb.fit(x_train, y_train)


/usr/local/lib/python2.7/dist-packages/sklearn/preprocessing/label.py:112: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
/usr/local/lib/python2.7/dist-packages/sklearn/preprocessing/label.py:147: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
Out[55]:
XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [56]:
# make predictions for test data
y_pred = model_xgb.predict(x_test)
predictions = [round(value) for value in y_pred]

# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))


Accuracy: 87.54%
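# a sketch (assumption, including the output filename) of turning the real Kaggle test-set
# predictions into a submission file, pairing them with their PassengerIds
test_pred = model_xgb.predict(xtest)
submission = pd.DataFrame({"PassengerId": test["PassengerId"],
                           "Survived": test_pred.astype(int)})
submission.to_csv("submission.csv", index=False)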

In [67]:
model_xgb.predict(x_train.iloc[1:4])


Out[67]:
array([ 1.,  1.,  1.])

In [66]:
print np.arange(10)
print np.arange(10).reshape((2,5))


[0 1 2 3 4 5 6 7 8 9]
[[0 1 2 3 4]
 [5 6 7 8 9]]

In [89]:
#xgb.plot_tree(model_xgb)

In [87]:
xgb.plot_importance(model_xgb)


Out[87]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fdff2f0c050>

In [98]:
plotboundary(x_train, y_train["Survived"], "female", "Fare", 
             lambda x: model_xgb.predict(x))



In [109]:
##################################################################################
################### MODEL2 sklearn - Logistic regression ##########################
##################################################################################

In [111]:
from sklearn import linear_model

logreg = linear_model.LogisticRegression(C=1e5)
logreg.fit(x_train, y_train)


/usr/local/lib/python2.7/dist-packages/sklearn/utils/validation.py:526: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
Out[111]:
LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [113]:
# make predictions for test data
y_pred = logreg.predict(x_test)
predictions = [round(value) for value in y_pred]

# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))


Accuracy: 80.47%
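# a sketch (assumption) that standardizes the features before the logistic regression,
# since Fare and Age sit on much larger scales than the dummy columns
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

logreg_scaled = make_pipeline(StandardScaler(), LogisticRegression(C=1.0))
logreg_scaled.fit(x_train, y_train["Survived"])
print logreg_scaled.score(x_test, y_test["Survived"])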

In [312]:
plotboundary(x_train, y_train["Survived"], "Fare", "Age", 
             lambda x: logreg.predict(x))



In [266]:
##################################################################################
################### MODEL3 Keras - Deep Logistic regression #######################
##################################################################################

In [275]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.callbacks import EarlyStopping
#from keras.optimizers import SGD, Nadam
import datetime

In [280]:
y_train2 = y_train.copy()
y_train2["Died"] = 1 - y_train["Survived"]
y_test2 = y_test.copy()
y_test2["Died"] = 1 - y_test["Survived"]

print x_train.shape, y_train2.shape, x_test.shape, y_test2.shape


(891, 11) (891, 2) (891, 11) (891, 2)
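# an equivalent way to build the two-column target (a sketch, assuming this Keras version exposes np_utils;
# note the column order is the reverse of y_train2 above, where column 0 is Survived)
from keras.utils import np_utils

y_onehot = np_utils.to_categorical(y_train["Survived"].values, 2)   # column 0 = died, column 1 = survived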

In [286]:
model_deep = Sequential()
model_deep.add(Dense(128, input_shape=(11,)))
model_deep.add(Activation('relu'))
print model_deep.output_shape
model_deep.add(Dense(64))
model_deep.add(Activation('relu'))
print model_deep.output_shape
model_deep.add(Dense(2))
model_deep.add(Activation('softmax'))
print model_deep.output_shape


(None, 128)
(None, 64)
(None, 2)

In [291]:
model_deep.compile(loss='categorical_crossentropy',
              optimizer="adam",
              metrics=['accuracy'])

stopper = EarlyStopping(monitor='val_loss', patience=10, verbose=0)

callback_list = [stopper]

print  datetime.datetime.now()
stats = model_deep.fit(x_train.values, y_train2.values, 
                       batch_size=128, nb_epoch=60,
                       verbose=1, callbacks=callback_list, 
                       validation_data=(x_test.values, y_test2.values))
print  datetime.datetime.now()


2017-03-10 21:58:21.931460
Train on 891 samples, validate on 891 samples
Epoch 1/60
891/891 [==============================] - 0s - loss: 0.4690 - acc: 0.7800 - val_loss: 0.5037 - val_acc: 0.7924
Epoch 2/60
891/891 [==============================] - 0s - loss: 0.4558 - acc: 0.8036 - val_loss: 0.4055 - val_acc: 0.8260
Epoch 3/60
891/891 [==============================] - 0s - loss: 0.4288 - acc: 0.8114 - val_loss: 0.4099 - val_acc: 0.8193
Epoch 4/60
891/891 [==============================] - 0s - loss: 0.4013 - acc: 0.8193 - val_loss: 0.3970 - val_acc: 0.8260
Epoch 5/60
891/891 [==============================] - 0s - loss: 0.3950 - acc: 0.8283 - val_loss: 0.3923 - val_acc: 0.8238
Epoch 6/60
891/891 [==============================] - 0s - loss: 0.3971 - acc: 0.8227 - val_loss: 0.3895 - val_acc: 0.8283
Epoch 7/60
891/891 [==============================] - 0s - loss: 0.3915 - acc: 0.8249 - val_loss: 0.3888 - val_acc: 0.8238
Epoch 8/60
891/891 [==============================] - 0s - loss: 0.3967 - acc: 0.8204 - val_loss: 0.3907 - val_acc: 0.8193
Epoch 9/60
891/891 [==============================] - 0s - loss: 0.3912 - acc: 0.8204 - val_loss: 0.3965 - val_acc: 0.8215
Epoch 10/60
891/891 [==============================] - 0s - loss: 0.3932 - acc: 0.8238 - val_loss: 0.3881 - val_acc: 0.8249
Epoch 11/60
891/891 [==============================] - 0s - loss: 0.4033 - acc: 0.8159 - val_loss: 0.3877 - val_acc: 0.8260
Epoch 12/60
891/891 [==============================] - 0s - loss: 0.4182 - acc: 0.8070 - val_loss: 0.4058 - val_acc: 0.8238
Epoch 13/60
891/891 [==============================] - 0s - loss: 0.4185 - acc: 0.8081 - val_loss: 0.4202 - val_acc: 0.8350
Epoch 14/60
891/891 [==============================] - 0s - loss: 0.4114 - acc: 0.8171 - val_loss: 0.3960 - val_acc: 0.8339
Epoch 15/60
891/891 [==============================] - 0s - loss: 0.3935 - acc: 0.8227 - val_loss: 0.3888 - val_acc: 0.8283
Epoch 16/60
891/891 [==============================] - 0s - loss: 0.3887 - acc: 0.8227 - val_loss: 0.3861 - val_acc: 0.8294
Epoch 17/60
891/891 [==============================] - 0s - loss: 0.3929 - acc: 0.8238 - val_loss: 0.3841 - val_acc: 0.8260
Epoch 18/60
891/891 [==============================] - 0s - loss: 0.4008 - acc: 0.8193 - val_loss: 0.4040 - val_acc: 0.8339
Epoch 19/60
891/891 [==============================] - 0s - loss: 0.4040 - acc: 0.8204 - val_loss: 0.3929 - val_acc: 0.8339
Epoch 20/60
891/891 [==============================] - 0s - loss: 0.4004 - acc: 0.8238 - val_loss: 0.3899 - val_acc: 0.8204
Epoch 21/60
891/891 [==============================] - 0s - loss: 0.4100 - acc: 0.8171 - val_loss: 0.4516 - val_acc: 0.7969
Epoch 22/60
891/891 [==============================] - 0s - loss: 0.4112 - acc: 0.8316 - val_loss: 0.3980 - val_acc: 0.8182
Epoch 23/60
891/891 [==============================] - 0s - loss: 0.3936 - acc: 0.8215 - val_loss: 0.4088 - val_acc: 0.8137
Epoch 24/60
891/891 [==============================] - 0s - loss: 0.4322 - acc: 0.8137 - val_loss: 0.4425 - val_acc: 0.7991
Epoch 25/60
891/891 [==============================] - 0s - loss: 0.4230 - acc: 0.8126 - val_loss: 0.4225 - val_acc: 0.8013
Epoch 26/60
891/891 [==============================] - 0s - loss: 0.4075 - acc: 0.8215 - val_loss: 0.4723 - val_acc: 0.7991
Epoch 27/60
891/891 [==============================] - 0s - loss: 0.4550 - acc: 0.8126 - val_loss: 0.4000 - val_acc: 0.8227
Epoch 28/60
891/891 [==============================] - 0s - loss: 0.4286 - acc: 0.8260 - val_loss: 0.3890 - val_acc: 0.8328
2017-03-10 21:58:23.365642

In [295]:
score = model_deep.evaluate(x_test.values, y_test2.values, verbose=0)
print('Test accuracy:', score[1]*100)


('Test accuracy:', 83.277216630618838)
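# a sketch of plotting the learning curves that fit() recorded in `stats`
# (assumes the matplotlib setup from earlier in the notebook)
plt.plot(stats.history["loss"], label="train loss")
plt.plot(stats.history["val_loss"], label="validation loss")
plt.xlabel("epoch")
plt.legend()
plt.show()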

In [315]:
plotboundary(x_train, y_train["Survived"], "Fare", "Age", 
             lambda x: model_deep.predict(x.values)[:,0])



In [ ]: