In [3]:
# Title: Titanic- Machine Learning Through Disaster
# Objective: Prediction of Survival on the Titanic
# Models: a Random Forest and a Keras neural network trained on engineered passenger features
# Imports
# pandas, numpy
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
# csv, matplotlib, seaborn
import csv
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
In [4]:
# Load the training set; header=0 tells read_csv that row 0 holds the column names.
train_csv_path = 'Desktop/titanic/train.csv'
df = pd.read_csv(train_csv_path, header=0)
# Summarise the columns of the loaded frame.
df.info()
In [5]:
# Check the Python type of the object returned by read_csv
type(df)
Out[5]:
In [6]:
# Display the DataFrame itself as the cell output
df
Out[6]:
In [7]:
# Data type that the pandas CSV reader assigned to each column
df.dtypes
Out[7]:
In [8]:
# Keep only the columns whose dtype is 'object'
object_mask = [dtype == 'object' for dtype in df.dtypes]
df.dtypes[object_mask]
Out[8]:
In [9]:
# Display the first few rows of the DataFrame (5 rows by default)
df.head()
Out[9]:
In [10]:
# Display the last few rows of the DataFrame (5 rows by default)
df.tail()
Out[10]:
In [11]:
# Summary statistics of the DataFrame (count, mean, etc.)
df.describe()
Out[11]:
In [12]:
# Show the first 10 entries of the Age column (positional slice)
df['Age'].iloc[0:10]
Out[12]:
In [13]:
# Python type of the object obtained by selecting the Age column
type(df['Age'])
Out[13]:
In [14]:
# Mean value of the Age column
df['Age'].mean()
Out[14]:
In [15]:
# Select a subset of columns by passing a list of column names
selected_columns = ['Sex', 'Pclass', 'Age']
df[selected_columns].head()
Out[15]:
In [16]:
# Keep only the passengers whose Age is greater than 60
older_than_60 = df['Age'] > 60
df[older_than_60].head()
Out[16]:
In [17]:
# Row filter (Age > 60) and column selection combined in a single .loc lookup
df.loc[df['Age'] > 60, ['Sex', 'Pclass', 'Age', 'Survived']].head()
Out[17]:
In [18]:
# Passengers for whom no Age was recorded
age_missing = df['Age'].isnull()
df.loc[age_missing, ['Sex', 'Pclass', 'Age']].head()
Out[18]:
In [19]:
# Count the male passengers in each passenger class (1, 2 and 3)
for pclass in (1, 2, 3):
    is_male_in_class = (df['Sex'] == 'male') & (df['Pclass'] == pclass)
    print('%d %d' % (pclass, len(df[is_male_in_class])))
In [20]:
# Histogram of the Age column with the default binning
import pylab as P
ages = df['Age']
ages.hist()
P.show()
In [21]:
# Same histogram with missing ages removed, 16 bins over the range 0-80 and half-transparent bars
known_ages = df['Age'].dropna()
known_ages.hist(bins=16, range=(0, 80), alpha=0.5)
P.show()
In [22]:
# Add a Gender column holding the upper-cased first letter of Sex.
# `lambda` creates a small anonymous function inline; map() applies it to every value.
df['Gender'] = df['Sex'].map(lambda sex: sex[0].upper())
df.head()
Out[22]:
In [23]:
# Overwrite Gender with an integer code: female -> 0, male -> 1
gender_codes = {'female': 0, 'male': 1}
df['Gender'] = df['Sex'].map(gender_codes).astype(int)
df.head()
Out[23]:
In [24]:
# Distinct values present in the Embarked column
df['Embarked'].unique()
Out[24]:
In [25]:
# Replace missing Embarked values with the placeholder 'T', then encode each
# port letter as an integer in a new Port column (the placeholder maps to 0).
df['Embarked'] = df['Embarked'].fillna('T')
port_codes = {'S': 1, 'C': 2, 'Q': 3, 'T': 0}
df['Port'] = df['Embarked'].map(port_codes).astype(int)
# Show the rows that received the placeholder
df[df['Port'] == 0]
Out[25]:
In [26]:
# Distinct Embarked values again, after the missing entries were filled
df['Embarked'].unique()
Out[26]:
In [27]:
# Allocate a zero-filled (gender x class) table for the median ages
n_genders, n_classes = 2, 3
median_ages = np.zeros((n_genders, n_classes))
median_ages
Out[27]:
In [28]:
# Median of the known ages for every (gender, class) combination;
# row index = Gender code, column index = Pclass - 1
for gender in (0, 1):
    for pclass in (1, 2, 3):
        in_group = (df['Gender'] == gender) & (df['Pclass'] == pclass)
        median_ages[gender, pclass - 1] = df.loc[in_group, 'Age'].dropna().median()
median_ages
Out[28]:
In [29]:
# Create AgeFill as a copy of the Age column; missing values are imputed in AgeFill
# rather than in Age itself
df['AgeFill'] = df['Age']
df.head()
Out[29]:
In [30]:
# Rows with a missing Age, before AgeFill is imputed
age_missing = df['Age'].isnull()
df.loc[age_missing, ['Gender', 'Pclass', 'Age', 'AgeFill']].head(10)
Out[30]:
In [31]:
# Replace the NaNs in AgeFill with the median age of the passenger's (gender, class) group
for gender in (0, 1):
    for pclass in (1, 2, 3):
        needs_fill = df['Age'].isnull() & (df['Gender'] == gender) & (df['Pclass'] == pclass)
        df.loc[needs_fill, 'AgeFill'] = median_ages[gender, pclass - 1]
# Same rows as inspected earlier, now with AgeFill populated
df.loc[df['Age'].isnull(), ['Gender', 'Pclass', 'Age', 'AgeFill']].head(10)
Out[31]:
In [32]:
# AgeIsNull is 1 when the passenger's Age was missing in the raw data, 0 otherwise
df['AgeIsNull'] = df['Age'].isnull().astype(int)
inspect_columns = ['Gender', 'Pclass', 'Age', 'AgeFill', 'AgeIsNull']
df[inspect_columns].head(10)
Out[32]:
In [33]:
# Engineered feature: FamilySize is the sum of the SibSp and Parch columns
df['FamilySize'] = df['SibSp'].add(df['Parch'])
df.head()
Out[33]:
In [34]:
# Interaction feature: imputed age multiplied by passenger class
df['Age*Class'] = df['AgeFill'] * df['Pclass']
df.head()
Out[34]:
In [35]:
# Remove the columns that are no longer used
unused_columns = ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
df = df.drop(unused_columns, axis=1)
df.head()
Out[35]:
In [36]:
# Drop the original Age column, then discard any rows that still contain NaNs
df = df.drop('Age', axis=1).dropna()
df.head()
Out[36]:
In [37]:
# Convert the DataFrame to a numpy array, selecting the columns by NAME.
# Later cells treat column 0 as the label and columns 1: as the features
# (train_data[:, 0] / train_data[:, 1:]) and feed test_data[:, 1:] to the same models.
# A bare df.values keeps the frame's own column order, which for the usual Kaggle
# train.csv puts PassengerId in column 0 and Survived among the features
# (verify against the file) -- the models would then learn to predict PassengerId.
# The 11 feature columns mirror the layout of test_data[:, 1:]: the test frame carries
# both an in-place filled 'Age' and an identical 'AgeFill', so AgeFill is listed twice.
label_column = ['Survived']
feature_columns = ['Pclass', 'AgeFill', 'SibSp', 'Parch', 'Fare', 'Gender',
                   'AgeIsNull', 'Port', 'AgeFill', 'FamilySize', 'Age*Class']
train_data = df[label_column + feature_columns].values
train_data
Out[37]:
In [38]:
# For comparison, read the same file with the csv module into a plain numpy array.
# The with-block guarantees the file handle is closed (the handle was previously
# left open); 'rb' is the mode the Python 2 csv module expects.
with open('Desktop/titanic/train.csv', 'rb') as training_file:
    training_data = csv.reader(training_file)
    header = next(training_data)  # first row holds the column names
    data = np.array(list(training_data))
print(data)
In [39]:
# Load the test set and inspect it
test_csv_path = 'Desktop/titanic/test.csv'
df_test = pd.read_csv(test_csv_path)
df_test.info()
df_test.describe()
Out[39]:
In [40]:
# Add the integer Gender column to the test set (female -> 0, male -> 1)
test_gender_codes = {'male': 1, 'female': 0}
df_test['Gender'] = df_test['Sex'].map(test_gender_codes).astype(int)
df_test['Gender'].head()
Out[40]:
In [41]:
# Median known age per (gender, class) group, computed on the test set itself
median = np.zeros((2, 3))
for gender in (0, 1):
    for pclass in (1, 2, 3):
        in_group = (df_test['Gender'] == gender) & (df_test['Pclass'] == pclass)
        median[gender, pclass - 1] = df_test.loc[in_group, 'Age'].dropna().median()
print(median)
In [42]:
# Flag (1/0) the test rows whose Age is missing
df_test['AgeisNull'] = df_test['Age'].isnull().astype(int)
df_test['AgeisNull'].head()
Out[42]:
In [43]:
# Fill the missing test ages in place with the median of the matching (gender, class) group
for gender in (0, 1):
    for pclass in (1, 2, 3):
        needs_fill = (df_test['Gender'] == gender) & (df_test['Pclass'] == pclass) & df_test['Age'].isnull()
        df_test.loc[needs_fill, 'Age'] = median[gender, pclass - 1]
df_test['Age'].isnull().head()
Out[43]:
In [44]:
# Distinct values present in the test set's Embarked column
df_test['Embarked'].unique()
Out[44]:
In [45]:
# Encode the embarkation port of each test row as an integer
test_port_codes = {'S': 1, 'C': 2, 'Q': 3}
df_test['Port'] = df_test['Embarked'].map(test_port_codes).astype(int)
In [46]:
# Re-inspect the test frame after the new columns were added
df_test.info()
In [47]:
# Add the engineered features to the test set.
# The columns are created in this order on purpose: the array built from df_test
# later depends on column positions.
df_test['AgeFill'] = df_test['Age']
df_test['FamilySize'] = df_test['Parch'] + df_test['SibSp']
df_test['Age*Class'] = df_test['Age'] * df_test['Pclass']
df_test['Age*Class'].hist()
plt.show()
# Replace missing fares with the mean fare of the test set
fare_missing = df_test['Fare'].isnull()
df_test.loc[fare_missing, 'Fare'] = df_test['Fare'].mean()
In [48]:
# Remove the columns that are not used from the test set
unused_test_columns = ['Name', 'Sex', 'Ticket', 'Embarked', 'Cabin']
df_test = df_test.drop(unused_test_columns, axis=1)
df_test.head()
Out[48]:
In [49]:
# Convert the test frame to a numpy array and strip its first column
all_test_columns = df_test.values
print(all_test_columns.shape)
test_data = all_test_columns[:, 1:]
print(test_data)
In [50]:
from sklearn.ensemble import RandomForestClassifier
In [51]:
# Random forest with 100 trees. random_state is fixed so that re-running the
# notebook produces the same forest and therefore the same predictions.
forest = RandomForestClassifier(n_estimators=100, random_state=0)
print(np.shape(train_data))
In [52]:
# Fit the forest: column 0 of train_data is the target, the remaining columns are the inputs
train_features = train_data[:, 1:]
train_target = train_data[:, 0]
forest = forest.fit(train_features, train_target)
In [53]:
# Predict on the test array and show the predictions together with their count
output = forest.predict(test_data)
print('%s %s' % (output, len(output)))
In [54]:
# Using Keras Library to Predict data
from keras.layers import Activation, Dense, Dropout
from keras.models import Sequential
In [55]:
# Separate the inputs from the labels and split into a 500-row training part
# and a 391-row hold-out part; y / y_cv are zero-filled two-column targets
n_train, n_cv = 500, 391
X = train_data[:n_train, 1:]
X_cv = train_data[n_train:, 1:]
labels = train_data[:, 0]
y = np.zeros((n_train, 2))
y_cv = np.zeros((n_cv, 2))
In [56]:
# One-hot encode the labels: column 1 is set where the label equals 1, column 0 otherwise
train_labels = labels[:500]
cv_labels = labels[500:500 + 391]
y[train_labels == 1, 1] = 1
y[train_labels != 1, 0] = 1
y_cv[cv_labels == 1, 1] = 1
y_cv[cv_labels != 1, 0] = 1
In [57]:
# Feed-forward network: three hidden blocks (32 sigmoid units + 25% dropout each)
# on the 11 input features, followed by a 2-unit output layer.
# The output uses softmax so the two class scores form a probability distribution,
# which is what the categorical cross-entropy loss on one-hot targets expects
# (a sigmoid output does not make the two scores sum to 1).
model = Sequential([
    Dense(32, input_dim=11),
    Activation('sigmoid'),
    Dropout(0.25),
    Dense(32),
    Activation('sigmoid'),
    Dropout(0.25),
    Dense(32),
    Activation('sigmoid'),
    Dropout(0.25),
    Dense(2),
    Activation('softmax'),
])
In [58]:
# Configure training: Adadelta optimizer, categorical cross-entropy loss, accuracy reported as metric
model.compile(optimizer = 'adadelta',
loss = 'categorical_crossentropy',
metrics = ['accuracy'])
In [63]:
# Train on the (X, y) split with nb_epoch=500 and batches of 32; verbose=0 silences the progress output
model.fit(X,y,nb_epoch = 500,batch_size = 32,verbose = 0)
Out[63]:
In [64]:
# Evaluate the trained network on the hold-out split and show the result
score = model.evaluate(X_cv, y_cv, batch_size=32, verbose=0)
print(score)
In [65]:
# Predict a class for every row of the test array
yPred = model.predict_classes(test_data,verbose = 1)
In [66]:
# Show the predicted classes
print(yPred)
In [67]:
# Write the Keras predictions to a submission CSV with a PassengerId/Survived header.
# The with-block closes the file even if a write fails part-way through.
# NOTE(review): ids are assigned consecutively starting at 892 -- this assumes the
# test rows are in that order; confirm against test.csv.
with open('output_keras.csv', "w") as file_handle:
    prediction_handle = csv.writer(file_handle)
    prediction_handle.writerow(['PassengerId', 'Survived'])
    for passenger_id, pred in enumerate(yPred, 892):
        prediction_handle.writerow([passenger_id, int(pred)])
In [ ]: