In [30]:
# import all the needed packages
import numpy as np
import scipy as sp
import re
import pandas as pd
import sklearn
from sklearn.cross_validation import train_test_split,cross_val_score  # in scikit-learn >= 0.18 these live in sklearn.model_selection
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.svm import SVC
Let's load and examine the Titanic data with pandas first.
In [61]:
data = pd.read_csv('data/train.csv')
print data.head()
In [62]:
# our target is the survived column
y= data['Survived']
In [63]:
print data.shape
So we have 891 training examples with 10 information columns given. Of course it is not straightforward to use all of them at this point. In this example, we will just explore two simple SVM models that each use only two features: one built on Pclass and Sex_male, and one built on Age and Sex_male.
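To get a quick sense of the column types and which columns have missing values, pandas' info method is handy (just a quick inspection, not required for the rest of the notebook):
# dtypes and non-null counts for every column
data.info()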
Recall how we generated features from categories last session. We use the same method to generate an additional feature called Sex_male.
In [34]:
# add a numeric Sex_male feature (0 = female, 1 = male)
data['Sex_male']=data.Sex.map({'female':0,'male':1})
data.head()
Out[34]:
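As an aside, the same 0/1 column can be produced with pandas' one-hot encoding; a minimal sketch (the choice to keep only the male indicator is ours, so that it matches Sex_male above):
# one-hot encode the Sex column; this yields Sex_female and Sex_male columns
dummies = pd.get_dummies(data['Sex'], prefix='Sex')
# dummies['Sex_male'] is identical to the column created with map above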
In [35]:
#get the features we intend to use
feature_cols=['Pclass','Sex_male']
X=data[feature_cols]
X.head()
Out[35]:
In [36]:
#use the default SVM rbf model
model=SVC()
scores=cross_val_score(model,X,y,cv=10,scoring='accuracy')
print scores, np.mean(scores),np.std(scores)
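The default SVC uses C=1.0 and an RBF kernel. If you are curious whether other settings do better, here is a minimal grid-search sketch; the parameter grid is our own arbitrary choice, and the import path depends on your scikit-learn version:
from sklearn.grid_search import GridSearchCV  # sklearn.model_selection in scikit-learn >= 0.18
# try a few values of C and gamma with the same 10-fold cross-validation
param_grid = {'C': [0.1, 1, 10], 'gamma': [0.01, 0.1, 1]}
grid = GridSearchCV(SVC(), param_grid, cv=10, scoring='accuracy')
grid.fit(X, y)
print grid.best_params_, grid.best_score_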
Here is how we examine the decision function of the fitted model.
In [37]:
xmin,xmax=X['Pclass'].min()-0.5,X['Pclass'].max()+0.5
ymin,ymax=X['Sex_male'].min()-0.5,X['Sex_male'].max()+0.5
print xmin,xmax,ymin,ymax
xx, yy = np.meshgrid(np.linspace(xmin, xmax, 200), np.linspace(ymin, ymax, 200))
In [38]:
model.fit(X,y)
Z = model.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
In [39]:
fig=plt.figure(figsize=(20,10))
ax=fig.add_subplot(111)
ax.pcolormesh(xx, yy, -Z, cmap=plt.cm.RdBu,alpha=0.5)
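# jitter the discrete Pclass and Sex_male values slightly so overlapping passengers remain visible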
ax.scatter(X['Pclass']+np.random.randn(len(X['Pclass']))*0.1, X['Sex_male']+np.random.randn(len(X['Pclass']))*0.05, c=y,s=40, cmap=plt.cm.RdBu_r)
ax.set_xlabel("Pclass")
ax.set_ylabel("Sex_male")
ax.set_xlim([0.5,3.5])
ax.set_ylim([-0.5,1.5])
plt.show()
In [40]:
#use the isnull function to check if there is any missing value in the Age column.
pd.isnull(data['Age']).any()
Out[40]:
How many missing values are there?
In [41]:
print len(data['Age'][pd.isnull(data['Age'])])
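To check all columns for missing values at once, a quick sketch (nothing here is specific to the Age column):
# count missing values in every column
print data.isnull().sum()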
SVM cannot handle features with missing values, so what do we do?
One idea is to fill them in with a number we think is reasonable.
Let's try the average age first.
In [42]:
# fill missing ages with the mean age (use .loc to avoid chained assignment)
data.loc[pd.isnull(data['Age']), 'Age'] = data['Age'].mean()
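A common alternative (our suggestion, not used in the rest of this notebook) is the median, which is less sensitive to outliers:
# median imputation, less sensitive to outliers (left commented out so it is not run here)
# data.loc[pd.isnull(data['Age']), 'Age'] = data['Age'].median()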
In [43]:
#generate our new feature
feature_cols=['Age','Sex_male']
X=data[feature_cols]
X.head()
Out[43]:
In [44]:
#use the default SVM rbf model
scores=cross_val_score(model,X,y,cv=10,scoring='accuracy')
print scores, np.mean(scores),np.std(scores)
In [45]:
# scale Age to zero median and unit standard deviation
X = X.copy()  # avoid modifying a slice of the original DataFrame
X['Age'] = (X['Age'] - X['Age'].median()) / X['Age'].std()
#X = StandardScaler().fit_transform(X)
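The commented-out line hints at StandardScaler, which does essentially the same job; a minimal sketch of the alternative (don't run both; note that StandardScaler centres on the mean, while the line above uses the median):
scaler = StandardScaler()
# fit the scaler on the training ages; keep it around to transform the test ages later
X['Age'] = scaler.fit_transform(X[['Age']]).ravel()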
In [46]:
scores=cross_val_score(model,X,y,cv=10,scoring='accuracy')
print scores, np.mean(scores),np.std(scores)
Let's examine the decision function of this model.
In [47]:
xmin,xmax=X['Age'].min()-0.5,X['Age'].max()+0.5
ymin,ymax=X['Sex_male'].min()-0.5,X['Sex_male'].max()+0.5
print xmin,xmax,ymin,ymax
xx, yy = np.meshgrid(np.linspace(xmin, xmax, 200), np.linspace(ymin, ymax, 200))
In [48]:
model.fit(X,y)
Z = model.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
In [49]:
fig=plt.figure(figsize=(20,10))
ax=fig.add_subplot(111)
ax.pcolormesh(xx, yy, -Z, cmap=plt.cm.RdBu,alpha=0.5)
ax.scatter(X['Age'], X['Sex_male']+np.random.randn(len(X['Age']))*0.05, c=y,s=40, cmap=plt.cm.RdBu_r)
ax.set_xlabel("Normalized Age")
ax.set_ylabel("Sex_male")
ax.set_ylim([-0.5,1.5])
ax.set_xlim([-3,4.5])
plt.show()
First we want to read in the test data set and add the Sex_male feature, just as we did with the training data set.
In [50]:
test_data = pd.read_csv('data/test.csv')
#print test_data.head()
# add the Sex_male feature (0 = female, 1 = male)
test_data['Sex_male']=test_data.Sex.map({'female':0,'male':1})
We notice again that some of the Age values are missing in the test data, and we want to fill them in the same way as we did with the training data.
In [51]:
#use the isnull function to check if there is any missing value in the Age column.
pd.isnull(test_data['Age']).any()
Out[51]:
In [52]:
print len(test_data['Age'][pd.isnull(test_data['Age'])])
In [53]:
# fill missing test ages with the mean age from the training data
test_data.loc[pd.isnull(test_data['Age']), 'Age'] = data['Age'].mean()
Note that we fill the missing values with the mean age of the training data.
What are the pros and cons of doing this?
We extract the same features from the test data, and scale the Age feature the same way as we did with the training data.
In [54]:
#generate our new feature
X_test = test_data[feature_cols].copy()
# scale with the training data's median and std so the test feature is on the same scale
X_test['Age'] = (X_test['Age'] - data['Age'].median()) / data['Age'].std()
We use the model above to predict survival for the test data.
The model was already fitted on the entire training data.
In [55]:
y_pred=model.predict(X_test)
X_test.head()
Out[55]:
In [56]:
samplesubmit = pd.read_csv("data/titanic_submit_example.csv")
#samplesubmit.head()
In [57]:
samplesubmit["Survived"]=y_pred
#samplesubmit.to_csv
samplesubmit.to_csv("data/titanic_submit_gender_age.csv",index=False)
In [58]:
samplesubmit.head()
Out[58]:
In [ ]: