In [49]:
# Title: Titanic - Machine Learning from Disaster
# Objective: Prediction of Survival on the Titanic 
# Model 1: Using a Simple Model based on Gender only

# Imports

# pandas, numpy
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# csv, matplotlib, seaborn
import csv
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [55]:
# Opening the train.csv file in a dataframe (using Pandas); the first line of the file is the header
train_data = pd.read_csv('Desktop/titanic/train.csv', header=0)
# Summary of column dtypes and non-null counts
train_data.info()
# Preview the first rows of the frame loaded above (last expression renders as the cell output)
train_data.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
Out[55]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

In [53]:
# Converting dataframe to a Matrix (NumPy ndarray) format for usage.
# `.values` yields the same array as the deprecated `as_matrix()` and is
# available on every pandas version.
data = train_data.values
print(data)


[[1 0 3 ..., 7.25 nan 'S']
 [2 1 1 ..., 71.2833 'C85' 'C']
 [3 1 3 ..., 7.925 nan 'S']
 ..., 
 [889 0 3 ..., 23.45 nan 'S']
 [890 1 1 ..., 30.0 'C148' 'C']
 [891 0 3 ..., 7.75 nan 'Q']]

In [5]:
# Opening up train.csv file in a python object.
# The `with` block guarantees the file handle is closed once every row is read.
with open('Desktop/titanic/train.csv', 'rb') as training_file:
    training_data = csv.reader(training_file)
    # First row is the column header; keep it separate from the data rows
    header = next(training_data)
    # Remaining rows become a 2-D array of strings (csv.reader yields text fields)
    data = np.array(list(training_data))
print(data)


[['1' '0' '3' ..., '7.25' '' 'S']
 ['2' '1' '1' ..., '71.2833' 'C85' 'C']
 ['3' '1' '3' ..., '7.925' '' 'S']
 ..., 
 ['889' '0' '3' ..., '23.45' '' 'S']
 ['890' '1' '1' ..., '30' 'C148' 'C']
 ['891' '0' '3' ..., '7.75' '' 'Q']]

In [18]:
# Overall survival rate from train.csv: column 1 holds the 0/1 survival flag,
# so its size is the passenger count and its sum is the survivor count
survived_flags = data[0::, 1].astype(float)
number_passengers = np.size(survived_flags)
number_survived = np.sum(survived_flags)
proportion_survivors = number_survived / number_passengers
print("Number of Passengers onboard : %s " % number_passengers)
print("Survived Passengers : %s " % number_survived)
print("Proportion of Passengers Survived : %s " % proportion_survivors)


Number of Passengers onboard : 891 
Survived Passengers : 342.0 
Proportion of Passengers Survived : 0.383838383838 

In [19]:
# Boolean row masks built from the gender column (index 4)
sex_column = data[0::, 4]
women_only_stats = sex_column == "female"
men_only_stats = sex_column == "male"

# Survival flags (column 1) of each group; sum = survivors, size = group total
survived_column = data[0::, 1]
women_onboard = survived_column[women_only_stats].astype(float)
men_onboard = survived_column[men_only_stats].astype(float)
print("No. of Women Survived : %s / %s" % (np.sum(women_onboard), np.size(women_onboard)))
print("No. of Men Survived : %s / %s" % (np.sum(men_onboard), np.size(men_onboard)))


No. of Women Survived : 233.0 / 314
No. of Men Survived : 109.0 / 577

In [21]:
# Survival rate within each gender: survivors divided by group size
proportion_women_survived = women_onboard.sum() / women_onboard.size
proportion_men_survived = men_onboard.sum() / men_onboard.size
print('Proportion of women who survived : %s' % proportion_women_survived)
print('Proportion of men who survived : %s' % proportion_men_survived)


Proportion of women who survived : 0.742038216561
Proportion of men who survived : 0.188908145581

In [22]:
# Read test.csv through a csv.reader; the handle stays open for the prediction loop
test_file = open('Desktop/titanic/test.csv', 'rb')
test_data = csv.reader(test_file)
# Consume the header row so that iteration starts at the first passenger
header = next(test_data)

In [25]:
# Output file for the gender-only model, wrapped in a csv.writer
prediction_file = open("Desktop/titanic/genderbasedmodel.csv", "wb")
prediction_data = csv.writer(
    prediction_file
)

In [26]:
# Header row required by the submission format: PassengerId and Survived
prediction_data.writerow(["PassengerId", "Survived"])

# Gender-only rule: predict survival ('1') for females, '0' for everyone else.
# row[0] is written out as the passenger id; row[3] is compared against "female".
for row in test_data:
    survived = '1' if row[3] == "female" else '0'
    prediction_data.writerow([row[0], survived])

# Release both file handles now that all rows are written
for handle in (test_file, prediction_file):
    handle.close()

In [35]:
# Model 2: Using a more refined Model based on Gender, Fare Price and Passenger Class

# Defining fare bracket size and an upper bound to Fare
fare_bracket_size = 10
fare_ceiling = 40

# Every fare at or above the ceiling is overwritten (in place, in `data`) with
# ceiling - 1 so that it falls inside the last price bracket
data[data[0::, 9].astype(float) >= fare_ceiling, 9] = fare_ceiling - 1.0

# Number of price brackets and passenger classes.
# Floor division keeps the bracket count an int (np.zeros needs integer
# dimensions) regardless of whether true division is in effect.
price_brackets = fare_ceiling // fare_bracket_size
classes = len(np.unique(data[0::, 2]))

# 3-D survival table indexed as [gender, class, price bracket], initialised to zeros
survival_table = np.zeros((2, classes, price_brackets))
print("Initializing Survival table of Dimensions (2 * %s * %s)" % (classes, price_brackets))
print(survival_table)


Initializing Survival table of Dimensions (2 * 3 * 4)
[[[ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]]

 [[ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]]]

In [37]:
# Adding Men/Women only Stats to the survival table based on the discussed parameters.
# survival_table[g, i, j] = mean survival flag of gender g (0 = female, 1 = male),
# class i + 1, fare in [j * fare_bracket_size, (j + 1) * fare_bracket_size).

# Convert the string columns once instead of on every loop iteration
passenger_class = data[0::, 2].astype(float)
passenger_fare = data[0::, 9].astype(float)
is_female = data[0::, 4] == "female"
is_male = data[0::, 4] == "male"

for i in range(classes):                # loop through each class
    in_class = passenger_class == i + 1
    for j in range(price_brackets):     # loop through each price range
        in_bracket = ((passenger_fare >= j * fare_bracket_size) &
                      (passenger_fare < (j + 1) * fare_bracket_size))

        women_only_stats = data[is_female & in_class & in_bracket, 1]
        men_only_stats = data[is_male & in_class & in_bracket, 1]

        # An empty selection makes np.mean return nan; those entries are
        # zeroed afterwards when the table is cleaned up
        survival_table[0, i, j] = np.mean(women_only_stats.astype(float))
        survival_table[1, i, j] = np.mean(men_only_stats.astype(float))

print(survival_table)


[[[        nan         nan  0.83333333  0.97727273]
  [        nan  0.91428571  0.9         1.        ]
  [ 0.59375     0.58139535  0.33333333  0.125     ]]

 [[ 0.                 nan  0.4         0.38372093]
  [ 0.          0.15873016  0.16        0.21428571]
  [ 0.11153846  0.23684211  0.125       0.24      ]]]

In [38]:
# Replace every NaN entry of the survival table with 0
survival_table[np.isnan(survival_table)] = 0
print(survival_table)


[[[ 0.          0.          0.83333333  0.97727273]
  [ 0.          0.91428571  0.9         1.        ]
  [ 0.59375     0.58139535  0.33333333  0.125     ]]

 [[ 0.          0.          0.4         0.38372093]
  [ 0.          0.15873016  0.16        0.21428571]
  [ 0.11153846  0.23684211  0.125       0.24      ]]]

In [39]:
# Binarise the table around a 0.5 threshold: below -> 0, at or above -> 1.
# Both masks are taken before any write; entries matching neither are left as they are.
predicted_dead = survival_table < 0.5
predicted_alive = survival_table >= 0.5
survival_table[predicted_dead] = 0
survival_table[predicted_alive] = 1
print(survival_table)


[[[ 0.  0.  1.  1.]
  [ 0.  1.  1.  1.]
  [ 1.  1.  0.  0.]]

 [[ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]]]

In [40]:
# Re-open test.csv and step past its header row
test_file = open('Desktop/titanic/test.csv', 'rb')
test_data = csv.reader(test_file)
header = next(test_data)

# Create the output file for this model and write the submission header first
predictions_file = open("Desktop/titanic/genderclassmodel.csv", "wb")
pred_data = csv.writer(predictions_file)
submission_header = ["PassengerId", "Survived"]
pred_data.writerow(submission_header)

In [47]:
# Populating the newly created file with values on the basis of the survival table.
# Exactly one prediction row is written per test passenger.
# Fields read from each test row: row[0] is written out as the passenger id,
# row[1] is used as the passenger class, row[3] as the gender, row[8] as the fare.
for row in test_data:
    passenger_class = int(row[1])

    try:
        fare = float(row[8])
    except ValueError:
        # Fare is blank or not a number: pick the bracket from the class instead
        # (class 1 -> 2, class 2 -> 1, class 3 -> 0). Kept as an int because it
        # is used as an array index.
        bin_fare = 3 - passenger_class
    else:
        if fare >= fare_ceiling:
            # Same rule as the training data: fares at or above the ceiling
            # belong to the last bracket
            bin_fare = price_brackets - 1
        else:
            # Bracket j covers [j * fare_bracket_size, (j + 1) * fare_bracket_size)
            bin_fare = int(fare // fare_bracket_size)

    # Keep the index inside the table even for unexpected values (e.g. a negative fare)
    bin_fare = min(max(bin_fare, 0), price_brackets - 1)

    # First axis of the table: 0 = female, 1 = everyone else
    gender_index = 0 if row[3] == 'female' else 1
    prediction = int(survival_table[gender_index, passenger_class - 1, bin_fare])
    pred_data.writerow([row[0], "%d" % prediction])

In [48]:
# Release the input and output file handles; closing flushes the written predictions
for handle in (test_file, predictions_file):
    handle.close()