Predict the survival of Titanic passengers using a K-Means algorithm.
In [1]:
import pandas
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.cluster import KMeans
from pprint import pprint
TITANIC_TRAIN = 'train.csv'
TITANIC_TEST = 'test.csv'
# t_df refers to titanic_dataframe
t_df = pandas.read_csv(TITANIC_TRAIN, header=0)
In [2]:
t_df.drop(['Name', 'Ticket', 'Cabin', 'Embarked', 'Sex'], axis=1, inplace=True)
t_df.info()
t_df.head(1)
Out[2]:
In [3]:
t_df.Age.fillna(np.mean(t_df.Age), inplace=True)
t_df.info()
F1 score to be used to evaluate algoritm results.
In [4]:
def precision(tp, fp):
"""Determtine The Precision of Algorithm"""
return tp / (tp + fp)
def recall(tp, fn):
"""Determine The Recall of Algorithm"""
return tp / (tp + fn)
def f1_score(tp, fn, fp):
"""Return the F1 score of a algorithm"""
pre = precision(tp, fp)
rec = recall(tp, fn)
return (2 * ((pre * rec) / (pre + rec)))
In [5]:
train, test = train_test_split(t_df, test_size = 0.2)
y = np.array(train['Survived'])
x = np.array(train[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']])
train_fares = []
for i in range(len(x)):
train_fares.append(x[i][-1])
In [6]:
k = 2
kmeans = KMeans(n_clusters=k)
results = kmeans.fit_predict(x)
The K-Means split up the passengers into two groups 0 and 1 but it's not clear which of these represents Surivived and Non-Survived. The assumption is made that whichever group has the higher mean fare is the survival group. Depending on which group is the survival group the True Positives/False Positives calculations are slighty different.
In [7]:
tp = 0
fp = 0
fn = 0
one_fare = []
zero_fare = []
for i in range(len(results)):
if results[i] == 1:
one_fare.append(train_fares[i])
elif results[i] == 0:
zero_fare.append(train_fares[i])
one_mean_fare = np.mean(one_fare)
print("Mean Fare of Group One: {}".format(one_mean_fare))
zero_mean_fare = np.mean(zero_fare)
print("Mean Fare of Group Zero: {}".format(zero_mean_fare))
if one_mean_fare > zero_mean_fare:
for i in range(len(results)):
diff = y[i] - results[i]
if diff == 1:
fp += 1
elif diff == 0:
tp += 1
else:
fn += 1
else:
for i in range(len(results)):
diff = y[i] - results[i]
if diff == 1:
fn += 1
elif diff == 0:
tp += 1
else:
fp += 1
print("True Positives: " + str(tp))
print("False Positives: " + str(fp))
print("False Negative: " + str(fn))
In [8]:
f1 = f1_score(tp, fn, fp)
print("F1 Score: " + str(f1))
K-Means algoritm predicts ~78% correct results.
In [19]:
test_df = pandas.read_csv(TITANIC_TEST, header=0)
test_df.drop(['Name', 'Ticket', 'Cabin', 'Embarked', 'Sex'], axis=1, inplace=True)
test_df.Age.fillna(np.mean(test_df.Age), inplace=True)
test_df.Fare.fillna(np.mean(test_df.Fare), inplace=True)
x = np.array(test_df[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']])
kmeans = KMeans(n_clusters=k)
results = kmeans.fit_predict(x)
s1 = pandas.Series(np.array(test_df.PassengerId), name='PassengerId')
s2 = pandas.Series(results, name='Survived')
kaggle_result = pandas.concat([s1,s2], axis=1)
kaggle_result.to_csv('titanic_day2.csv', index=False)
In [ ]: