In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from logistic import Logistic_Regression, hypothesis
In [2]:
# Load the training data from the project-relative CSV into a DataFrame.
train_df = pd.read_csv(filepath_or_buffer='data/train.csv')
In [3]:
# Preview the first five rows to sanity-check the loaded columns.
train_df.head(n=5)
Out[3]:
In [4]:
# Plot the boolean null mask of the raw frame so columns with missing
# values stand out; row labels and the colour bar are switched off.
sns.heatmap(
    train_df.isnull(),
    yticklabels=False,
    cmap='viridis',
    cbar=False,
)
Out[4]:
In [5]:
# Mean age per passenger class, used later to fill in missing ages.
#
# Only the 'Age' column is aggregated: calling .mean() on the whole grouped
# frame averages every column, which raises on non-numeric columns in
# pandas >= 2.0 and is wasted work in any version. The grouping is also
# computed once instead of three times.
mean_age_by_pclass = train_df.groupby('Pclass')['Age'].mean()

# groupby sorts its keys, so positions 0/1/2 are the three classes in
# ascending Pclass order (same positional lookup as before).
pclass_1_mean_age = mean_age_by_pclass.iloc[0]
pclass_2_mean_age = mean_age_by_pclass.iloc[1]
pclass_3_mean_age = mean_age_by_pclass.iloc[2]
In [6]:
def impute_age(cols):
    """Return the age to use for one passenger, filling in a missing value.

    Parameters
    ----------
    cols : pandas.Series or sequence
        Two values in the order (Age, Pclass), e.g. one row of
        ``train_df[['Age', 'Pclass']]``.

    Returns
    -------
    The original age when it is present; otherwise the mean age for the
    passenger's class: ``pclass_1_mean_age`` for class 1,
    ``pclass_2_mean_age`` for class 2 and ``pclass_3_mean_age`` for any
    other value.
    """
    # Read both fields by position. On a label-indexed Series, plain
    # ``cols[0]`` depends on pandas' deprecated integer-key fallback, so use
    # ``.iloc`` there and keep ordinary indexing for lists/tuples/arrays.
    if isinstance(cols, pd.Series):
        age, pclass = cols.iloc[0], cols.iloc[1]
    else:
        age, pclass = cols[0], cols[1]

    if not pd.isnull(age):
        return age
    if pclass == 1:
        return pclass_1_mean_age
    if pclass == 2:
        return pclass_2_mean_age
    return pclass_3_mean_age
In [7]:
# Replace each missing age with its class mean by running impute_age over
# every (Age, Pclass) row.
train_df['Age'] = (
    train_df[['Age', 'Pclass']]
    .apply(impute_age, axis=1)
)
In [8]:
# Re-plot the null mask after the age imputation step to see which columns
# still contain missing values.
sns.heatmap(
    train_df.isnull(),
    yticklabels=False,
    cmap='viridis',
    cbar=False,
)
Out[8]:
In [9]:
# Remove the 'Cabin' column entirely, then drop any rows that still contain
# a missing value.
#
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell safe to re-run: the in-place version raises KeyError on a second
# execution because 'Cabin' is already gone.
train_df = train_df.drop(columns='Cabin', errors='ignore').dropna()
In [10]:
# Final check of the null mask after dropping 'Cabin' and incomplete rows.
sns.heatmap(
    train_df.isnull(),
    yticklabels=False,
    cmap='viridis',
    cbar=False,
)
Out[10]:
In [11]:
# One-hot encode the two categorical columns, dropping the first level of
# each so the indicator columns are not redundant.
#
# dtype=int is pinned explicitly: pandas >= 2.0 defaults get_dummies to bool,
# and mixing bool with numeric columns produces an object-dtype array when the
# frame is later converted to a matrix.
sex = pd.get_dummies(train_df['Sex'], drop_first=True, dtype=int)
embark = pd.get_dummies(train_df['Embarked'], drop_first=True, dtype=int)

# Append the indicator columns, then remove the original categorical columns
# and the remaining columns that are not used as model features.
train_df = pd.concat([train_df, sex, embark], axis=1)
train_df = train_df.drop(
    columns=['Sex', 'Embarked', 'Name', 'Ticket', 'PassengerId']
)
train_df.head()
Out[11]:
In [12]:
# Target vector as a NumPy array. `.as_matrix()` was removed in pandas 1.0;
# `.to_numpy(copy=True)` is the supported replacement and still returns an
# array that does not share memory with the DataFrame.
y = train_df['Survived'].to_numpy(copy=True)
In [13]:
# Feature matrix: every column except the target. `.as_matrix()` was removed
# in pandas 1.0; `.to_numpy(dtype=float)` replaces it and guarantees a purely
# numeric float array regardless of the individual column dtypes.
X = train_df.drop(columns='Survived').to_numpy(dtype=float)
In [14]:
# Hold out 40% of the rows for evaluation. A fixed random_state makes the
# split (and therefore the reported accuracy) reproducible across
# Restart & Run All; without it every run shuffles differently.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)
In [15]:
# Start with one zero-valued parameter per feature column. Deriving the length
# from X_train keeps this in sync if the set of feature columns changes
# (it was previously hard-coded to eight zeros).
initial_theta = [0] * X_train.shape[1]

# Passed straight through to Logistic_Regression below.
# NOTE(review): presumably the learning rate and the number of gradient
# steps — confirm against logistic.py.
alpha = 0.1
iterations = 10000
In [16]:
# Fit the model on the training split using the settings defined above.
final_theta = Logistic_Regression(
    X_train,
    y_train,
    alpha,
    initial_theta,
    iterations,
)
In [17]:
# Display the value returned by Logistic_Regression for inspection.
final_theta
Out[17]:
In [18]:
def score(theta, X=None, y=None):
    """Return the fraction of samples that ``theta`` classifies correctly.

    Parameters
    ----------
    theta : sequence
        Model parameters, passed unchanged to ``hypothesis``.
    X : sequence of feature rows, optional
        Samples to evaluate. Defaults to the notebook's ``X_test``.
    y : sequence of labels, optional
        True labels aligned with ``X``. Defaults to the notebook's ``y_test``.

    Returns
    -------
    float
        Accuracy in the range [0.0, 1.0].

    Raises
    ------
    ZeroDivisionError
        If there are no samples to evaluate.
    """
    features = X_test if X is None else X
    labels = y_test if y is None else y

    total = len(features)
    correct = 0
    for i in range(total):
        # Explicit decision threshold: round() behaves differently at exactly
        # 0.5 between Python versions (half-to-even gives 0 on Python 3).
        predicted = 1 if hypothesis(theta, features[i]) >= 0.5 else 0
        if predicted == labels[i]:
            correct += 1

    # Convert before dividing so the ratio is never truncated by integer
    # division; the result no longer reuses the function's own name.
    return float(correct) / total
In [19]:
# Report test-set accuracy as a percentage rounded to three decimals.
accuracy_pct = round(score(final_theta) * 100, 3)
print("The model was correct {}% of the time.".format(accuracy_pct))