Use the Titanic dataset to practice
[http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt]
headers: Ordinal, Class, Survived (0=no, 1=yes), Name,Age, Port of Embarkation, Home/Destination, Room, Ticket, Boat
In [8]:
import csv
import numpy as np
with open("titanic.txt","rb") as f:
titanic_reader = csv.reader(f, delimiter=',', quotechar='"')
# header
row = titanic_reader.next()
features_names = np.array(row)
# Load dataset, and the target classes
titanic_X, titanic_y = [], []
for row in titanic_reader:
titanic_X.append(row)
titanic_y.append(row[2]) # the target value column Survived
titanic_X = np.array(titanic_X)
titanic_y = np.array(titanic_y)
In [4]:
print features_names
In [9]:
print titanic_X[1], titanic_y[1]
In [10]:
# select 1, 4, and 10 columns as features
titanic_X = titanic_X[:, [1,4,10]]
features_names = features_names[[1,4,10]]
In [11]:
print features_names
In [12]:
# we have missing value for age
print titanic_X[12]
In [13]:
# fill the missing value with average value
ages = titanic_X[:,1] # the all value from age column
mean_age = np.mean(titanic_X[ ages != 'NA', 1].astype(np.float))
titanic_X[ages == 'NA',1] = mean_age
print titanic_X[12]
In [14]:
# encode the label values to numeric values
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
label_encoder = enc.fit(titanic_X[:,2])
print "Categorical classes:", label_encoder.classes_
In [15]:
integer_classes = label_encoder.transform(label_encoder.classes_)
print integer_classes
In [16]:
t = label_encoder.transform(titanic_X[:,2])
titanic_X[:, 2] = t
In [17]:
print titanic_X[12]
In [18]:
# convert categorical feature to multiple columns
from sklearn.preprocessing import OneHotEncoder
enc = LabelEncoder()
label_encoder = enc.fit(titanic_X[:,0])
print "Categorical classes:", label_encoder.classes_
In [19]:
numeric_features = label_encoder.transform(label_encoder.classes_)
In [21]:
print numeric_features.shape
In [22]:
numeric_features = label_encoder.transform(label_encoder.classes_).reshape(3,1)
print numeric_features
In [23]:
enc = OneHotEncoder()
one_hot_encoder = enc.fit(numeric_features)
In [24]:
print one_hot_encoder
In [26]:
num_of_rows = titanic_X.shape[0]
t = label_encoder.transform(titanic_X[:,0]).reshape(num_of_rows,1)
In [29]:
# Second, create a sparse matrix with three columns, each one indicating if the instance belongs to the class
new_features = one_hot_encoder.transform(t)
titanic_X = np.concatenate([titanic_X, new_features.toarray()], axis = 1)
titanic_X = np.delete(titanic_X, [0], 1)
features_names = ['age', 'sex', 'first_class', 'second_class', 'third_class']
titanic_X = titanic_X.astype(float)
titanic_y = titanic_y.astype(float)
In [30]:
print features_names
print titanic_X[12], titanic_y[12]
In [31]:
# as usual, we split the set to train and test
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(titanic_X, titanic_y, test_size=0.25, random_state=33)
In [32]:
print len(X_train), len(X_test)
In [33]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5)
clf = clf.fit(X_train, y_train)
In [34]:
clf
Out[34]:
In [47]:
# to create the tree diagram, we need to install some Python libs
# pydoc, http://www.graphviz.org/
# sudo apt-get install graphviz
# pip uninstall pydot
# pip install pydot2
from IPython.core.display import Image
import pydot, StringIO
dot_data = StringIO.StringIO()
tree.export_graphviz(clf, out_file=dot_data, feature_names=['age','sex','1st_class','2nd_class','3d_class'])
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph.write_png('decision_tree_titanic.png')
Image(filename='decision_tree_titanic.png')
Out[47]:
In [48]:
# let implement the desicion tree and evaluate the performance
from sklearn import metrics
def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True, show_confusion_matrix=True):
y_pred = clf.predict(X)
if show_accuracy:
print "Accuracy:{0:.3f}".format(
metrics.accuracy_score(y, y_pred)), "\n"
if show_classification_report:
print "Classification Report:"
print metrics.classification_report(y, y_pred), "\n"
if show_confusion_matrix:
print "Confusion matrix:"
print metrics.confusion_matrix(y, y_pred), "\n"
# run report
measure_performance(X_train, y_train, clf, True, True, True)
In [49]:
# cross validation
# leave-one-out cross-validation
from sklearn.cross_validation import cross_val_score, LeaveOneOut
from scipy.stats import sem
def loo_cv(X_train, y_train, clf):
loo = LeaveOneOut(X_train[:].shape[0])
scores = np.zeros(X_train[:].shape[0])
for train_index, test_index in loo:
X_train_cv, X_test_cv = X_train[train_index], X_train[test_index]
y_train_cv, y_test_cv = y_train[train_index], y_train[test_index]
clf = clf.fit(X_train_cv, y_train_cv)
y_pred = clf.predict(X_test_cv)
scores[test_index] = metrics.accuracy_score(y_test_cv.astype(int), y_pred.astype(int))
print "Mean score: {0:.3f} (+/- {1:.3f})".format(np.mean(scores), sem(scores))
loo_cv(X_train, y_train, clf)
In [55]:
Out[55]:
In [ ]: