In [49]:
# Title: Titanic - Machine Learning from Disaster
# Objective: Prediction of Survival on the Titanic
# Model 1: Using a Simple Model based on Gender only
# Imports
# pandas, numpy
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
# csv, matplotlib, seaborn
import csv
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
In [55]:
# Load train.csv into a pandas DataFrame (row 0 of the CSV supplies the column names)
train_data = pd.read_csv('Desktop/titanic/train.csv', header=0)
# Summary of columns, non-null counts and dtypes
train_data.info()
# Preview the first rows of the frame loaded above (it is named `train_data`)
train_data.head()
Out[55]:
In [53]:
# Convert the DataFrame to a NumPy array for positional (row, column) indexing.
# `.values` is used rather than `DataFrame.as_matrix()`: the latter is deprecated
# and absent from newer pandas releases, while `.values` yields the same array.
data = train_data.values
print(data)
In [5]:
# Read train.csv with the csv module into a NumPy array of its string fields.
# The `with` block guarantees the file handle is closed once the rows are loaded.
# 'rb' is kept because this notebook is Python 2 code, whose csv module expects
# files opened in binary mode.
with open('Desktop/titanic/train.csv', 'rb') as training_file:
    training_data = csv.reader(training_file)
    # The first row holds the column names; keep it out of the data
    header = next(training_data)
    # Every remaining row is one record (a list of strings)
    data = np.array([row for row in training_data])
print(data)
In [18]:
# Overall survival rate in the training data.
# Column 1 is read as the survival flag; convert it to float once and reuse it.
survived_flags = data[:, 1].astype(float)
number_passengers = np.size(survived_flags)   # number of rows
number_survived = np.sum(survived_flags)      # sum of the flags
proportion_survivors = number_survived / number_passengers
print("Number of Passengers onboard : %s " % number_passengers)
print("Survived Passengers : %s " % number_survived)
print("Proportion of Passengers Survived : %s " % proportion_survivors)
In [19]:
# Split the survival flags by gender.
# Boolean row masks built from column 4 (compared against the gender labels)
gender_column = data[:, 4]
women_only_stats = gender_column == "female"
men_only_stats = gender_column == "male"
# Survival flags (column 1) of each group, converted to floats
women_onboard = data[women_only_stats, 1].astype(float)
men_onboard = data[men_only_stats, 1].astype(float)
# Survivors / group size for each gender
women_summary = (np.sum(women_onboard), np.size(women_onboard))
men_summary = (np.sum(men_onboard), np.size(men_onboard))
print("No. of Women Survived : %s / %s" % women_summary)
print("No. of Men Survived : %s / %s" % men_summary)
In [21]:
# Survival rate within each gender: sum of the flags divided by the group size
women_survivors = np.sum(women_onboard)
men_survivors = np.sum(men_onboard)
proportion_women_survived = women_survivors / np.size(women_onboard)
proportion_men_survived = men_survivors / np.size(men_onboard)
print('Proportion of women who survived : %s' % proportion_women_survived)
print('Proportion of men who survived : %s' % proportion_men_survived)
In [22]:
# Open test.csv and wrap it in a csv reader.
# The handle is left open here: a later cell iterates `test_data` and closes the file.
test_file = open('Desktop/titanic/test.csv', 'rb')
test_data = csv.reader(test_file)
# Consume the header row so iteration starts at the first data row
header = next(test_data)
In [25]:
# Create the output file for the gender-only predictions and a csv writer bound to it.
# The handle stays open; the cell that writes the rows closes it.
prediction_file = open("Desktop/titanic/genderbasedmodel.csv", "wb")
prediction_data = csv.writer(prediction_file) # csv writer object for that file
In [26]:
# Gender-only model: rows whose column 3 is "female" are written as survived (1),
# every other row as not survived (0).
# Header row in the required output format
prediction_data.writerow(["PassengerId", "Survived"])
for row in test_data:
    # Column 0 is written out as the PassengerId
    predicted = '1' if row[3] == "female" else '0'
    prediction_data.writerow([row[0], predicted])
# All rows written: close both csv files
test_file.close()
prediction_file.close()
In [35]:
# Model 2: Using a more refined Model based on Gender, Fare Price and Passenger Class
# Defining fare bracket size and an upper bound to Fare
fare_bracket_size = 10
fare_ceiling = 40
# Equating all fares greater than or equal to this bound to max limit in last price bracket
# (note: this rewrites column 9 of `data` in place)
data[data[:, 9].astype(float) >= fare_ceiling, 9] = fare_ceiling - 1.0
# Determining number of price brackets and classes based on given data.
# `//` keeps the bracket count an integer, which np.zeros requires for a dimension.
number_of_price_brackets = fare_ceiling // fare_bracket_size
number_of_classes = len(np.unique(data[:, 2]))
# Cells further down refer to the `number_of_*` names, so those are the primary
# definitions; the shorter names are kept as aliases.
price_brackets = number_of_price_brackets
classes = number_of_classes
# Prepare a 3-D survival table having gender, number_of_classes, number_of_price_brackets as its dimensions
survival_table = np.zeros((2, number_of_classes, number_of_price_brackets))
print("Initializing Survival table of Dimensions (2 * %s * %s)" % (classes, price_brackets))
print(survival_table)
In [37]:
# Fill the survival table with the mean survival flag of every
# (gender, passenger class, fare bracket) group found in `data`.
# The loop bounds are read from the table itself, so this cell does not depend
# on counters defined in some other cell.
number_of_classes, number_of_price_brackets = survival_table.shape[1:]

# These column views/conversions do not change inside the loops, so build them once
is_female = data[:, 4] == "female"
is_male = data[:, 4] == "male"
passenger_class = data[:, 2].astype(float)
fare = data[:, 9].astype(float)

for i in range(number_of_classes):  # loop through each class
    for j in range(number_of_price_brackets):  # loop through each price range
        # Rows of class i+1 whose fare lies in [j*size, (j+1)*size)
        in_group = ((passenger_class == i + 1) &
                    (fare >= j * fare_bracket_size) &
                    (fare < (j + 1) * fare_bracket_size))
        # Survival flags (column 1) of the women / men in this group
        women_only_stats = data[is_female & in_group, 1]
        men_only_stats = data[is_male & in_group, 1]
        # np.mean of an empty selection yields NaN; the next cell resets NaNs to 0
        survival_table[0, i, j] = np.mean(women_only_stats.astype(float))
        survival_table[1, i, j] = np.mean(men_only_stats.astype(float))
print(survival_table)
In [38]:
# Replace every NaN entry of the survival table (e.g. the mean of an empty group) with 0
survival_table[np.isnan(survival_table)] = 0
print(survival_table)
In [39]:
# Binarise the table at a threshold of 0.5: below -> 0, at or above -> 1.
# Both masks are taken up front; an entry set to 0 can never satisfy the
# >= 0.5 test, so the two assignments do not interfere with each other.
below_threshold = survival_table < 0.5
at_or_above_threshold = survival_table >= 0.5
survival_table[below_threshold] = 0
survival_table[at_or_above_threshold] = 1
print(survival_table)
In [40]:
# Re-open test.csv for the second model and consume its header row
test_file = open('Desktop/titanic/test.csv', 'rb')
test_data = csv.reader(test_file)
header = next(test_data)
# Create the output file for this model and write the header row;
# the data rows are written, and both files closed, in later cells
predictions_file = open("Desktop/titanic/genderclassmodel.csv", "wb")
pred_data = csv.writer(predictions_file)
pred_data.writerow(["PassengerId", "Survived"])
In [47]:
# Populating the newly created file with values on the basis of the survival table.
# The bracket count is read from the table itself, so this cell does not rely on
# a counter defined in some other cell.
number_of_price_brackets = survival_table.shape[2]


def _fare_bin(raw_fare, passenger_class):
    """Return the integer fare-bracket index used to look up survival_table.

    raw_fare        -- fare field exactly as read from the CSV row (may be empty)
    passenger_class -- passenger-class field exactly as read from the CSV row

    A fare that cannot be parsed as a number falls back to ``3 - class``.
    Fares at or above ``fare_ceiling`` go to the last bracket, the same ``>=``
    rule used when the training fares were capped. The result is always an
    int, since a float is not a valid NumPy index.
    """
    try:
        fare = float(raw_fare)
    except ValueError:
        # Missing / unparseable fare: derive the bracket from the class instead
        return int(3 - float(passenger_class))
    last_bin = number_of_price_brackets - 1
    if fare >= fare_ceiling:
        return last_bin
    # Clamp to the table's range so an odd fare can never index outside it
    return max(0, min(int(fare // fare_bracket_size), last_bin))


for row in test_data:
    bin_fare = _fare_bin(row[8], row[1])
    # First axis of the table: 0 for rows marked 'female', 1 for all others
    gender_index = 0 if row[3] == 'female' else 1
    prediction = survival_table[gender_index, int(row[1]) - 1, bin_fare]
    pred_data.writerow([row[0], "%d" % int(prediction)])
In [48]:
# Both csv files are finished with: close the input first, then the output.
for _handle in (test_file, predictions_file):
    _handle.close()