notebook.community

Edit and run



In [1]:

    
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from logistic import Logistic_Regression, hypothesis



In [2]:

    
train_df = pd.read_csv('data/train.csv')



In [3]:

    
train_df.head()









    Out[3]:






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      0
      1
      0
      3
      Braund, Mr. Owen Harris
      male
      22.0
      1
      0
      A/5 21171
      7.2500
      NaN
      S
    
    
      1
      2
      1
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38.0
      1
      0
      PC 17599
      71.2833
      C85
      C
    
    
      2
      3
      1
      3
      Heikkinen, Miss. Laina
      female
      26.0
      0
      0
      STON/O2. 3101282
      7.9250
      NaN
      S
    
    
      3
      4
      1
      1
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      female
      35.0
      1
      0
      113803
      53.1000
      C123
      S
    
    
      4
      5
      0
      3
      Allen, Mr. William Henry
      male
      35.0
      0
      0
      373450
      8.0500
      NaN
      S



In [4]:

    
sns.heatmap(train_df.isnull(),yticklabels=False,cmap='viridis',cbar=False)









    Out[4]:





<matplotlib.axes._subplots.AxesSubplot at 0x10fce7b70>

Clean null values



In [5]:

    
pclass_1_mean_age = train_df.groupby(['Pclass']).mean().iloc[0]['Age']
pclass_2_mean_age = train_df.groupby(['Pclass']).mean().iloc[1]['Age']
pclass_3_mean_age = train_df.groupby(['Pclass']).mean().iloc[2]['Age']



In [6]:

    
def impute_age(cols):
    Age = cols[0]
    Pclass = cols[1]
    
    if pd.isnull(Age):
        if Pclass == 1:
            return pclass_1_mean_age
        elif Pclass == 2:
            return pclass_2_mean_age
        else:
            return pclass_3_mean_age
    else:
        return Age



In [7]:

    
train_df['Age'] = train_df[['Age','Pclass']].apply(impute_age,axis=1)



In [8]:

    
sns.heatmap(train_df.isnull(),yticklabels=False,cmap='viridis',cbar=False)









    Out[8]:





<matplotlib.axes._subplots.AxesSubplot at 0x10fe420f0>



In [9]:

    
train_df.drop('Cabin',axis=1,inplace=True)
train_df.dropna(inplace=True)



In [10]:

    
sns.heatmap(train_df.isnull(),yticklabels=False,cmap='viridis',cbar=False)









    Out[10]:





<matplotlib.axes._subplots.AxesSubplot at 0x10fee02b0>

Encoded non-numeric features & drop non-relevant columns



In [11]:

    
sex = pd.get_dummies(train_df['Sex'],drop_first=True)
embark = pd.get_dummies(train_df['Embarked'],drop_first=True)
train_df = pd.concat([train_df,sex,embark],axis=1)
train_df.drop(['Sex','Embarked','Name','Ticket','PassengerId'],axis=1,inplace=True)
train_df.head()

Set-up Model



In [12]:

    
y = train_df['Survived'].copy().as_matrix()



In [13]:

    
X = train_df.drop('Survived',axis=1).as_matrix()



In [14]:

    
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.4)



In [15]:

    
initial_theta = [0,0,0,0,0,0,0,0]
alpha = 0.1
iterations = 10000



In [16]:

    
final_theta = Logistic_Regression(X_train,y_train,alpha,initial_theta,iterations)



In [17]:

    
final_theta









    Out[17]:





[3.3551008916371257,
 0.083981247695455563,
 -8.0577085272903552,
 -2.1257755774497347,
 0.48251739700268836,
 -32.51989230713253,
 -0.97823056431704292,
 3.1311103533613962]

Score Model



In [18]:

    
def score(theta):
    model_score = 0
    length = len(X_test)
    for i in range(length):
        y_prediction = round(hypothesis(theta,X_test[i]))
        y_real = y_test[i]
        if y_prediction == y_real:
            model_score += 1
    score = model_score / length
    return float(score)



In [19]:

    
print("The model was correct " + str(round((score(final_theta) * 100),3)) + "% of the time.")









    



The model was correct 75.843% of the time.

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S