notebook.community

Edit and run



In [49]:

    
# Title: Titanic - Machine Learning from Disaster
# Objective: Prediction of Survival on the Titanic 
# Model 1: Using a Simple Model based on Gender only

# Imports

# pandas, numpy
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# csv, matplotlib, seaborn
import csv
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB



In [55]:

    
# Load the training set into a pandas DataFrame; the first CSV row holds the column names.
train_data = pd.read_csv('Desktop/titanic/train.csv', header=0)

# Summarise column dtypes and non-null counts (info() prints directly).
train_data.info()

# Preview the first rows. This used to call `df.head()`, but no `df` is defined
# anywhere in the notebook, so the cell raised NameError on a fresh kernel.
train_data.head()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB






    Out[55]:






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      0
      1
      0
      3
      Braund, Mr. Owen Harris
      male
      22.0
      1
      0
      A/5 21171
      7.2500
      NaN
      S
    
    
      1
      2
      1
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38.0
      1
      0
      PC 17599
      71.2833
      C85
      C
    
    
      2
      3
      1
      3
      Heikkinen, Miss. Laina
      female
      26.0
      0
      0
      STON/O2. 3101282
      7.9250
      NaN
      S
    
    
      3
      4
      1
      1
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      female
      35.0
      1
      0
      113803
      53.1000
      C123
      S
    
    
      4
      5
      0
      3
      Allen, Mr. William Henry
      male
      35.0
      0
      0
      373450
      8.0500
      NaN
      S



In [53]:

    
# Convert the DataFrame into a plain NumPy array for positional indexing.
# `.values` returns the same array that `as_matrix()` did, but `as_matrix()` was
# deprecated and later removed from pandas, so `.values` keeps this cell working
# on both old and current releases.
data = train_data.values

# Parenthesised single-argument print behaves the same under Python 2 and 3.
print(data)









    



[[1 0 3 ..., 7.25 nan 'S']
 [2 1 1 ..., 71.2833 'C85' 'C']
 [3 1 3 ..., 7.925 nan 'S']
 ..., 
 [889 0 3 ..., 23.45 nan 'S']
 [890 1 1 ..., 30.0 'C148' 'C']
 [891 0 3 ..., 7.75 nan 'Q']]



In [5]:

    
# Read train.csv with the csv module into a 2-D NumPy array of strings.
# The `with` block guarantees the file handle is closed even if reading fails;
# previously `training_file` was opened and never closed.
# NOTE(review): 'rb' matches the Python 2 csv module this notebook is written for;
# under Python 3 the file would need text mode with newline='' -- confirm target version.
with open('Desktop/titanic/train.csv', 'rb') as training_file:
    training_data = csv.reader(training_file)

    # First row is the column header; keep it separate from the data rows.
    header = next(training_data)

    # Materialise the remaining rows while the file is still open.
    data = np.array(list(training_data))

print(data)









    



[['1' '0' '3' ..., '7.25' '' 'S']
 ['2' '1' '1' ..., '71.2833' 'C85' 'C']
 ['3' '1' '3' ..., '7.925' '' 'S']
 ..., 
 ['889' '0' '3' ..., '23.45' '' 'S']
 ['890' '1' '1' ..., '30' 'C148' 'C']
 ['891' '0' '3' ..., '7.75' '' 'Q']]



In [18]:

    
# Overall survival rate in the training data.
# Column 1 holds the Survived flag as text, so convert it to floats once and reuse it.
survived_flags = data[:, 1].astype(float)

number_passengers = survived_flags.size      # one flag per passenger
number_survived = survived_flags.sum()       # flags are 0/1, so the sum counts survivors
proportion_survivors = number_survived / number_passengers

print("Number of Passengers onboard : %s " % number_passengers)
print("Survived Passengers : %s " % number_survived)
print("Proportion of Passengers Survived : %s " % proportion_survivors)









    



Number of Passengers onboard : 891 
Survived Passengers : 342.0 
Proportion of Passengers Survived : 0.383838383838



In [19]:

    
# Split the passengers by gender (column 4 is Sex) using boolean masks.
sex_column = data[:, 4]
women_only_stats = sex_column == "female"
men_only_stats = sex_column == "male"

# Survived flags (column 1) for each gender, converted from text to floats.
women_onboard = data[women_only_stats, 1].astype(float)
men_onboard = data[men_only_stats, 1].astype(float)

# Sum of the 0/1 flags = survivors; size of the array = passengers of that gender.
print("No. of Women Survived : %s / %s" % (women_onboard.sum(), women_onboard.size))
print("No. of Men Survived : %s / %s" % (men_onboard.sum(), men_onboard.size))









    



No. of Women Survived : 233.0 / 314
No. of Men Survived : 109.0 / 577



In [21]:

    
# Survival rate per gender: survivors divided by head count for that gender.
women_survivors, women_total = women_onboard.sum(), women_onboard.size
men_survivors, men_total = men_onboard.sum(), men_onboard.size

proportion_women_survived = women_survivors / women_total
proportion_men_survived = men_survivors / men_total

print('Proportion of women who survived : %s' % proportion_women_survived)
print('Proportion of men who survived : %s' % proportion_men_survived)









    



Proportion of women who survived : 0.742038216561
Proportion of men who survived : 0.188908145581



In [22]:

    
# Open test.csv and wrap it in a csv reader.
# The handle is left open on purpose: a later cell iterates over the reader
# and closes the file once the predictions have been written.
test_file = open('Desktop/titanic/test.csv', 'rb')
test_data = csv.reader(test_file)

# Consume the header row so that iteration starts at the first passenger.
header = next(test_data)



In [25]:

    
# Create the output CSV for the gender-only model and wrap it in a csv writer.
# The handle stays open here; it is closed after the predictions are written.
prediction_file = open('Desktop/titanic/genderbasedmodel.csv', 'wb')
prediction_data = csv.writer(prediction_file)



In [26]:

    
# Emit the two-column output: a header row, then one prediction per test passenger.
prediction_data.writerow(["PassengerId", "Survived"])

for passenger in test_data:
    # Column 3 of a test row is compared against "female"; the gender-only
    # model predicts survival ('1') for women and '0' for everyone else.
    predicted = '1' if passenger[3] == "female" else '0'
    prediction_data.writerow([passenger[0], predicted])

# Release both file handles now that all rows have been written.
test_file.close()
prediction_file.close()



In [35]:

    
# Model 2: Using a more refined Model based on Gender, Fare Price and Passenger Class

# Width of each fare bracket and the upper bound applied to fares
fare_bracket_size = 10
fare_ceiling = 40

# Clamp every fare at or above the ceiling into the last bracket by rewriting it
# to just below the ceiling (column 9 is Fare, stored as text in `data`).
data[data[:, 9].astype(float) >= fare_ceiling, 9] = fare_ceiling - 1.0

# Number of fare brackets and passenger classes.
# `//` keeps the bracket count an integer under both Python 2 and true division
# (np.zeros rejects a float dimension).
number_of_price_brackets = fare_ceiling // fare_bracket_size
number_of_classes = len(np.unique(data[:, 2]))

# Later cells in this notebook refer to `number_of_classes` and
# `number_of_price_brackets`; the shorter names this cell originally defined
# are kept as aliases so nothing that used them breaks.
price_brackets = number_of_price_brackets
classes = number_of_classes

# 3-D survival table indexed by [gender, class, fare bracket]
survival_table = np.zeros((2, number_of_classes, number_of_price_brackets))
print("Initializing Survival table of Dimensions (2 * %s * %s)" % (number_of_classes, number_of_price_brackets))
print(survival_table)









    



Initializing Survival table of Dimensions (2 * 3 * 4)
[[[ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]]

 [[ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]]]



In [37]:

    
# Fill the survival table with the mean survival rate of women (index 0) and
# men (index 1) for every (class, fare bracket) combination.

# Take the loop bounds from the table itself. The loop previously used
# `number_of_classes` / `number_of_price_brackets`, which no visible cell
# defines, so it raised NameError on a fresh kernel.
_, class_count, bracket_count = survival_table.shape

# Convert the text columns once instead of on every loop iteration.
is_female = data[:, 4] == "female"
is_male = data[:, 4] == "male"
pclass_values = data[:, 2].astype(float)
fare_values = data[:, 9].astype(float)
survived_values = data[:, 1].astype(float)

for i in range(class_count):                 # loop through each class (1-based in the data)
    in_class = pclass_values == i + 1
    for j in range(bracket_count):           # loop through each fare bracket
        # Bracket j covers fares in [j * size, (j + 1) * size)
        in_bracket = (fare_values >= j * fare_bracket_size) & \
                     (fare_values < (j + 1) * fare_bracket_size)

        women_only_stats = survived_values[is_female & in_class & in_bracket]
        men_only_stats = survived_values[is_male & in_class & in_bracket]

        # The mean of an empty selection is NaN; those cells are left as NaN
        # here so the table shows which combinations had no passengers.
        survival_table[0, i, j] = np.mean(women_only_stats)
        survival_table[1, i, j] = np.mean(men_only_stats)

print(survival_table)









    



[[[        nan         nan  0.83333333  0.97727273]
  [        nan  0.91428571  0.9         1.        ]
  [ 0.59375     0.58139535  0.33333333  0.125     ]]

 [[ 0.                 nan  0.4         0.38372093]
  [ 0.          0.15873016  0.16        0.21428571]
  [ 0.11153846  0.23684211  0.125       0.24      ]]]



In [38]:

    
# Replace every NaN cell in the survival table with 0 (in place).
survival_table[np.isnan(survival_table)] = 0
print(survival_table)









    



[[[ 0.          0.          0.83333333  0.97727273]
  [ 0.          0.91428571  0.9         1.        ]
  [ 0.59375     0.58139535  0.33333333  0.125     ]]

 [[ 0.          0.          0.4         0.38372093]
  [ 0.          0.15873016  0.16        0.21428571]
  [ 0.11153846  0.23684211  0.125       0.24      ]]]



In [39]:

    
# Binarise the survival table in place: rates below 0.5 become 0, rates of 0.5 or more become 1.
# Both masks are computed before any cell is overwritten; the outcome matches
# applying the two assignments one after the other.
below_half = survival_table < 0.5
at_least_half = survival_table >= 0.5
survival_table[below_half] = 0
survival_table[at_least_half] = 1
print(survival_table)









    



[[[ 0.  0.  1.  1.]
  [ 0.  1.  1.  1.]
  [ 1.  1.  0.  0.]]

 [[ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]
  [ 0.  0.  0.  0.]]]



In [40]:

    
# Re-open test.csv for a fresh pass and skip its header row.
test_file = open('Desktop/titanic/test.csv', 'rb')
test_data = csv.reader(test_file)
header = next(test_data)

# Create the output CSV for this model and write the required header row.
# Both handles stay open; they are closed in a later cell.
predictions_file = open('Desktop/titanic/genderclassmodel.csv', 'wb')
pred_data = csv.writer(predictions_file)
pred_data.writerow(['PassengerId', 'Survived'])



In [47]:

    
# Write one prediction per test passenger by looking up the survival table
# with (gender, class, fare bracket).
#
# Fixes relative to the previous version of this cell:
#   * the writerow calls were nested inside the bracket-search loop, after the
#     `break` statements, so a row whose bracket was found was never written and
#     other rows were written with a stale `bin_fare`;
#   * a fare exactly equal to `fare_ceiling` matched no bracket (`>` here vs
#     `>=` where the training fares are capped);
#   * the missing-fare fallback produced a float, which is not a valid index;
#   * the bracket count came from a name that no visible cell defines.

# Index of the last fare bracket, taken from the table's own shape.
last_bracket = survival_table.shape[2] - 1

for row in test_data:
    try:
        fare_value = float(row[8])
    except ValueError:
        # Fare is missing or not numeric: fall back to a bracket derived from
        # the passenger class (class 1 -> 2, class 2 -> 1, class 3 -> 0).
        bin_fare = 3 - int(row[1])
    else:
        if fare_value >= fare_ceiling:
            # Everything at or above the ceiling shares the last bracket.
            bin_fare = last_bracket
        else:
            # Bracket j covers [j * size, (j + 1) * size); clamp to valid indices.
            bin_fare = int(fare_value // fare_bracket_size)
            bin_fare = max(0, min(bin_fare, last_bracket))

    # Table axis 0: 0 = female, 1 = everyone else.
    gender_index = 0 if row[3] == 'female' else 1
    class_index = int(row[1]) - 1
    pred_data.writerow([row[0], "%d" % int(survival_table[gender_index, class_index, bin_fare])])



In [48]:

    
# Close the input and output CSV handles, input first.
for open_handle in (test_file, predictions_file):
    open_handle.close()

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S