In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from logistic import Logistic_Regression, hypothesis

In [2]:
train_df = pd.read_csv('data/train.csv')

In [3]:
train_df.head()


Out[3]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

In [4]:
sns.heatmap(train_df.isnull(),yticklabels=False,cmap='viridis',cbar=False)


Out[4]:
<matplotlib.axes._subplots.AxesSubplot at 0x10fce7b70>
Clean null values

In [5]:
pclass_1_mean_age = train_df.groupby(['Pclass']).mean().iloc[0]['Age']
pclass_2_mean_age = train_df.groupby(['Pclass']).mean().iloc[1]['Age']
pclass_3_mean_age = train_df.groupby(['Pclass']).mean().iloc[2]['Age']

In [6]:
def impute_age(cols):
    Age = cols[0]
    Pclass = cols[1]
    
    if pd.isnull(Age):
        if Pclass == 1:
            return pclass_1_mean_age
        elif Pclass == 2:
            return pclass_2_mean_age
        else:
            return pclass_3_mean_age
    else:
        return Age

In [7]:
train_df['Age'] = train_df[['Age','Pclass']].apply(impute_age,axis=1)

In [8]:
sns.heatmap(train_df.isnull(),yticklabels=False,cmap='viridis',cbar=False)


Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x10fe420f0>

In [9]:
train_df.drop('Cabin',axis=1,inplace=True)
train_df.dropna(inplace=True)

In [10]:
sns.heatmap(train_df.isnull(),yticklabels=False,cmap='viridis',cbar=False)


Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x10fee02b0>
Encoded non-numeric features & drop non-relevant columns

In [11]:
sex = pd.get_dummies(train_df['Sex'],drop_first=True)
embark = pd.get_dummies(train_df['Embarked'],drop_first=True)
train_df = pd.concat([train_df,sex,embark],axis=1)
train_df.drop(['Sex','Embarked','Name','Ticket','PassengerId'],axis=1,inplace=True)
train_df.head()


Out[11]:
Survived Pclass Age SibSp Parch Fare male Q S
0 0 3 22.0 1 0 7.2500 1 0 1
1 1 1 38.0 1 0 71.2833 0 0 0
2 1 3 26.0 0 0 7.9250 0 0 1
3 1 1 35.0 1 0 53.1000 0 0 1
4 0 3 35.0 0 0 8.0500 1 0 1
Set-up Model

In [12]:
y = train_df['Survived'].copy().as_matrix()

In [13]:
X = train_df.drop('Survived',axis=1).as_matrix()

In [14]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.4)

In [15]:
initial_theta = [0,0,0,0,0,0,0,0]
alpha = 0.1
iterations = 10000

In [16]:
final_theta = Logistic_Regression(X_train,y_train,alpha,initial_theta,iterations)

In [17]:
final_theta


Out[17]:
[3.3551008916371257,
 0.083981247695455563,
 -8.0577085272903552,
 -2.1257755774497347,
 0.48251739700268836,
 -32.51989230713253,
 -0.97823056431704292,
 3.1311103533613962]
Score Model

In [18]:
def score(theta):
    model_score = 0
    length = len(X_test)
    for i in range(length):
        y_prediction = round(hypothesis(theta,X_test[i]))
        y_real = y_test[i]
        if y_prediction == y_real:
            model_score += 1
    score = model_score / length
    return float(score)

In [19]:
print("The model was correct " + str(round((score(final_theta) * 100),3)) + "% of the time.")


The model was correct 75.843% of the time.