In [1]:
##################################################################################
############################### LOADING DATA #####################################
##################################################################################
In [2]:
import pandas as pd
import numpy as np
In [3]:
datadir = "~/data/kaggle/titanic/"
testfile = datadir + "test.csv"
trainfile = datadir + "train.csv"
test = pd.read_csv(testfile)
train = pd.read_csv(trainfile)
In [4]:
train.head()
Out[4]:
In [5]:
train.sample(5)
Out[5]:
In [6]:
test.head()
Out[6]:
In [7]:
##################################################################################
############################### DATA ANALYSIS ####################################
##################################################################################
In [8]:
# RULE OF THUMB.. avoid using validation or test data to make decisions.. always use your training data
# using the other sets pollutes your model with information it shouldn't have and eventually renders
# the test sets useless as an unbiased measure of performance
In [9]:
# Step 0
# understand the columns
# https://www.kaggle.com/c/titanic/data
#survival Survival Categorical - 0 = No, 1 = Yes
#pclass Ticket class Categorical - 1 = 1st, 2 = 2nd, 3 = 3rd
#sex Sex Categorical - male, female
#embarked Port of Embarkation Categorical - C = Cherbourg, Q = Queenstown, S = Southampton
#Age Age in years Numerical (partly missing / estimated)
# - if less than 1, Age is fractional
# - if the age is estimated, it is in the form xx.5
#sibsp siblings/spouses onboard Numerical
#parch parents/children onboard Numerical - (guardians such as nannies are not counted)
#fare Passenger fare Numerical
#name Passenger name Raw text
#ticket Ticket number Raw text
#cabin Cabin number Raw text
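# A quick sanity check of the Age encoding described above (a sketch, just for
# illustration): infants are stored as fractional ages, estimated ages end in .5
print(train.loc[train["Age"] < 1, "Age"].values)
print(train.loc[train["Age"] % 1 == 0.5, "Age"].values)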
In [10]:
train.dtypes
Out[10]:
In [11]:
# Determine if there is missing data
pd.isnull(train).sum() > 0
Out[11]:
In [12]:
pd.isnull(test).sum() > 0
Out[12]:
In [13]:
# NOTE -- the test set has missing Fare values, which the training set does not
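# Locate the rows with a missing Fare (a quick check, just for illustration):
test[pd.isnull(test["Fare"])]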
In [14]:
# Step 1 - Understand the fundamentals of the columns
# Computing mean, variance
# Computing percentiles and quartiles of the data
In [15]:
train.describe()
Out[15]:
In [16]:
train.describe(percentiles=np.arange(10)/10.0)
Out[16]:
In [17]:
train["Sex"].value_counts()
Out[17]:
In [18]:
(train["Sex"] != "female").sum()
Out[18]:
In [19]:
# Step 2 - Understand the correlations between columns
# -- especially with the output column..
# Pivot tables
# Rendering Histograms
# Plotting boxplots
In [20]:
train.pivot_table(values=["Survived"], index=["Sex"], aggfunc=np.mean)
Out[20]:
In [21]:
# NOTE the give-me here..
# If we simply predict that every female survives (~74% did) and every male dies (~81% did),
# weighting by the actual sex split (314 female / 577 male) we already get roughly 78% correct!
# Any model that scores below this baseline is complete garbage.
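# Sanity-check that baseline (a sketch): predict "survived" exactly when the passenger is female.
gender_baseline = (train["Survived"] == (train["Sex"] == "female").astype(int)).mean()
print("gender-only baseline accuracy: %.1f%%" % (gender_baseline * 100))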
In [22]:
train.pivot_table(values=["Survived"], index=["Pclass"], aggfunc=np.mean)
Out[22]:
In [23]:
# NOTE the give-me here..
# richer passengers (higher classes) mostly survived; third class mostly did not.
In [24]:
train.pivot_table(values=["Survived"], index=["Pclass","Sex"], aggfunc=np.mean)
Out[24]:
In [25]:
train.pivot_table(values=["Survived"], index=["SibSp"], aggfunc=np.mean)
Out[25]:
In [26]:
train.pivot_table(values=["Survived"], index=["Parch"], aggfunc=np.mean)
Out[26]:
In [27]:
train.pivot_table(values=["Survived"],index=["Pclass","Sex"], aggfunc=np.sum)
#train.pivot_table(values=["Pclass"], index=["Sex"], aggfunc=np.sum)
Out[27]:
In [28]:
train.groupby(["Pclass","Sex"])["PassengerId"].count()
Out[28]:
In [29]:
import matplotlib.pyplot as plt
%matplotlib inline
In [30]:
# TODO Log scale
train.boxplot(column="Fare",by="Survived")
Out[30]:
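# One possible way to address the log-scale TODO above (a sketch, not from the
# original notebook): plot log1p(Fare) so zero fares are kept and the heavy
# right tail is compressed.
train.assign(LogFare=np.log1p(train["Fare"])).boxplot(column="LogFare", by="Survived")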
In [31]:
train.hist(column="Fare",by="Survived",bins=30)
Out[31]:
In [32]:
train.boxplot(column="Age",by="Survived")
Out[32]:
In [33]:
train.hist(column="Age",by="Survived",bins=30)
Out[33]:
In [34]:
##################################################################################
############################### DATA CLEAN UP ####################################
##################################################################################
In [35]:
# Step 0
# save ourselves some time and merge it all together
alldata = pd.concat([train,test], axis=0)
alldata.head()
Out[35]:
In [36]:
# Step 1
# delete columns we will not use (free text / identifiers)
alldata = alldata.drop(["Name", "Ticket", "Cabin", "PassengerId"], axis=1)
alldata.head()
Out[36]:
In [37]:
# Step 2
# Expand the categorical data into boolean indicators of presence/absence
# This removes the need for the model to learn the "meaning" of each value and keeps things simple
# dummy cols.. convert the "class" values into true/false attributes
dummy_cols = ["Embarked", "Sex", "Pclass"]
for column in dummy_cols:
    dummies = pd.get_dummies(alldata[column])
    alldata[dummies.columns] = dummies
alldata = alldata.drop(dummy_cols, axis=1)
# delete "male" -- it is just the inverse of "female"
alldata = alldata.drop(["male"], axis=1)
alldata.head()
Out[37]:
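# Roughly equivalent one-liner (a sketch): pd.get_dummies can expand several columns
# at once; note it prefixes the new column names (e.g. "Sex_female"), unlike the loop
# above, so the later code that refers to "female" would need updating.
# alldata = pd.get_dummies(alldata, columns=dummy_cols)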
In [38]:
# Step 3
# Handling missing data
# several options exist; we will go with simple replacement by the column mean
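# An alternative sketch (not used here): fill with the median instead of the mean,
# which is less sensitive to outliers such as the long Fare tail, e.g.
#   alldata["Fare"] = alldata["Fare"].fillna(alldata["Fare"].median())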
In [39]:
#check for NaN(bad) data
pd.isnull(alldata).sum() > 0
Out[39]:
In [40]:
pd.isnull(alldata).iloc[1000:1010]
Out[40]:
In [41]:
# check the statistics of the data -- take care not to skew it too much
alldata.describe()
Out[41]:
In [42]:
# fill in NaN data with the column mean
# clean up the NaN (bad) data
# (note: this mean is computed over train+test combined; strictly, the rule of thumb
#  above says to use the training rows only)
nan_cols = ["Age", "Fare"]
for column in nan_cols:
    coldata = alldata[column]
    coldata = coldata.fillna(coldata.mean())
    alldata[column] = coldata
alldata.head()
Out[42]:
In [43]:
#confirm clean up
pd.isnull(alldata).sum() > 0
Out[43]:
In [44]:
# double check that the statistics didn't move too much
alldata.describe()
Out[44]:
In [45]:
# slice the data apart again
out_cols = ["Survived"]
xtrain = alldata.iloc[0:len(train)]
ytrain = xtrain[out_cols]
xtrain = xtrain.drop(out_cols, axis=1)
xtest = alldata.iloc[len(train):]
ytest = xtest[out_cols]
xtest = xtest.drop(out_cols, axis=1)
In [46]:
xtrain.head()
Out[46]:
In [47]:
xtest.head()
Out[47]:
In [48]:
ytrain.head()
Out[48]:
In [49]:
# for now, evaluate on the training data itself (no separate validation split yet)..
x_train, y_train = xtrain, ytrain
x_test, y_test = xtrain, ytrain
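# A sketch of the more honest setup (kept commented out so the cells below are
# unchanged): carve a validation set out of the training data only, per the
# rule of thumb above.
# from sklearn.model_selection import train_test_split
# x_train, x_test, y_train, y_test = train_test_split(xtrain, ytrain,
#                                                     test_size=0.2, random_state=42)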
In [50]:
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)
In [51]:
##################################################################################
################################# MODELLING ######################################
##################################################################################
In [52]:
def plotboundary(inputs, outputs, x1, x2, predict):
    """Plot the training points and the model's decision surface over (x1, x2).

    All other features are pinned to their min/25%/50%/75%/max values (one panel each).
    """
    # Build a roughly 30x30 mesh spanning the observed range of x1 and x2.
    x_min, x_max = inputs[x1].min(), inputs[x1].max()
    y_min, y_max = inputs[x2].min(), inputs[x2].max()
    x_step = (x_max - x_min) / 30.0
    y_step = (y_max - y_min) / 30.0
    # Rows 3..7 of describe() are min, 25%, 50%, 75% and max -- the values at
    # which every feature other than x1/x2 will be held fixed.
    basis_idx = [3, 4, 5, 6, 7]
    basis = inputs.describe()
    xx, yy = np.meshgrid(np.arange(x_min - x_step, x_max + x_step, x_step),
                         np.arange(y_min - y_step, y_max + y_step, y_step))
    plt.rcParams['figure.figsize'] = (16, 4)
    f, ax = plt.subplots(1, 6)
    fig = 0
    # First panel: the training points themselves.
    ax[fig].scatter(inputs[x1], inputs[x2], c=outputs, edgecolors='k', cmap=plt.cm.Paired)
    ax[fig].set_xlim(xx.min(), xx.max())
    ax[fig].set_ylim(yy.min(), yy.max())
    ax[fig].set_xticks(())
    ax[fig].set_yticks(())
    ax[fig].set_xlabel(x1)
    ax[fig].set_ylabel(x2)
    fig += 1
    # Remaining panels: predictions over the mesh with the other features pinned
    # at each basis row.
    for idx in basis_idx:
        base = basis.iloc[idx]
        tag = basis.index[idx]
        # Repeat the basis row once per mesh point, then overwrite x1 and x2.
        mockin = pd.concat([base] * xx.ravel().shape[0], axis=1).transpose()
        mockin[x1] = xx.ravel()
        mockin[x2] = yy.ravel()
        Z = predict(mockin)
        Z = Z.reshape(xx.shape)
        ax[fig].pcolormesh(xx, yy, Z, cmap='RdBu')
        ax[fig].set_xlim(xx.min(), xx.max())
        ax[fig].set_ylim(yy.min(), yy.max())
        ax[fig].set_xticks(())
        ax[fig].set_yticks(())
        ax[fig].set_xlabel(tag)
        fig += 1
    plt.show()
In [53]:
##################################################################################
################ MODEL 1: XGBOOST - GRADIENT-BOOSTED TREES ######################
##################################################################################
In [54]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
In [55]:
model_xgb = xgb.XGBClassifier()
model_xgb.fit(x_train, y_train["Survived"])
Out[55]:
In [56]:
# make predictions for test data
y_pred = model_xgb.predict(x_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
In [67]:
model_xgb.predict(x_train.iloc[1:4])
Out[67]:
In [66]:
print(np.arange(10))
print(np.arange(10).reshape((2, 5)))
In [89]:
#xgb.plot_tree(model_xgb)
In [87]:
xgb.plot_importance(model_xgb)
Out[87]:
In [98]:
plotboundary(x_train, y_train["Survived"], "female", "Fare",
lambda x: model_xgb.predict(x))
In [109]:
##################################################################################
################ MODEL 2: SCIKIT-LEARN - LOGISTIC REGRESSION ####################
##################################################################################
In [111]:
from sklearn import linear_model
logreg = linear_model.LogisticRegression(C=1e5)
logreg.fit(x_train, y_train["Survived"])
Out[111]:
In [113]:
# make predictions for test data
y_pred = logreg.predict(x_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
In [312]:
plotboundary(x_train, y_train["Survived"], "Fare", "Age",
lambda x: logreg.predict(x))
In [266]:
##################################################################################
################ MODEL 3: KERAS - DEEP LOGISTIC REGRESSION ######################
##################################################################################
In [275]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.callbacks import EarlyStopping
#from keras.optimizers import SGD, Nadam
import datetime
In [280]:
y_train2 = y_train.copy()
y_train2["Died"] = 1 - y_train["Survived"]
y_test2 = y_test.copy()
y_test2["Died"] = 1 - y_test["Survived"]
print(x_train.shape, y_train2.shape, x_test.shape, y_test2.shape)
In [286]:
model_deep = Sequential()
model_deep.add(Dense(128, input_shape=(x_train.shape[1],)))
model_deep.add(Activation('relu'))
print(model_deep.output_shape)
model_deep.add(Dense(64))
model_deep.add(Activation('relu'))
print(model_deep.output_shape)
model_deep.add(Dense(2))
model_deep.add(Activation('softmax'))
print(model_deep.output_shape)
In [291]:
model_deep.compile(loss='categorical_crossentropy',
                   optimizer="adam",
                   metrics=['accuracy'])
stopper = EarlyStopping(monitor='val_loss', patience=10, verbose=0)
callback_list = [stopper]
print(datetime.datetime.now())
stats = model_deep.fit(x_train.values, y_train2.values,
                       batch_size=128, epochs=60,
                       verbose=1, callbacks=callback_list,
                       validation_data=(x_test.values, y_test2.values))
print(datetime.datetime.now())
In [295]:
score = model_deep.evaluate(x_test.values, y_test2.values, verbose=0)
print('Test accuracy:', score[1]*100)
In [315]:
plotboundary(x_train, y_train["Survived"], "Fare", "Age",
lambda x: model_deep.predict(x.values)[:,0])
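# A possible final step (a sketch, not in the original notebook): score the real
# Kaggle test set and write a submission file using one of the fitted models.
# submission = pd.DataFrame({"PassengerId": test["PassengerId"],
#                            "Survived": model_xgb.predict(xtest).astype(int)})
# submission.to_csv("submission.csv", index=False)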
In [ ]: