In [1]:
import numpy as np
import pandas as pd
# RMS Titanic data visualization code
from titanic_visualizations import survival_stats
from IPython.display import display
%matplotlib inline
# Read the passenger records from the CSV file into a DataFrame
in_file = 'titanic_data.csv'
full_data = pd.read_csv(in_file)
# Preview the leading rows of the RMS Titanic data
display(full_data.head(n=5))
In [2]:
# Keep the 'Survived' column as the prediction target, and build a feature
# table that no longer contains it
outcomes = full_data['Survived']
data = full_data.drop(['Survived'], axis=1)
# Preview the feature table with 'Survived' removed
display(data.head(n=5))
In [3]:
# Rebuild the feature table from the raw frame, leaving out the target and
# the columns that are not fed to the model
data = full_data.drop(
    ['Survived', 'Name', 'Ticket', 'Cabin', 'Embarked', 'PassengerId'],
    axis=1,
)
# Encode 'Sex' numerically: 1.0 for 'female', 0.0 for every other value
data['Sex'] = (data['Sex'] == 'female').astype(float)
display(data.head(n=5))
In [26]:
# Replace missing entries in each column with that column's mean
data = data.fillna(value=data.mean())
# Round every age down to the start of its decade (e.g. 37 -> 30)
data["Age"] = 10 * np.floor(data["Age"] / 10)
In [27]:
from sklearn import tree
# Decision tree limited by minimum sample counts: a node needs at least 20
# samples to be split, and every leaf must keep at least 6 samples.
# random_state is pinned so that ties between equally good splits are broken
# the same way on every run, making the fitted tree reproducible.
clf = tree.DecisionTreeClassifier(
    min_samples_split=20,
    min_samples_leaf=6,
    random_state=0,
)
In [28]:
from sklearn.metrics import accuracy_score

# Fit the classifier on the prepared feature table and the survival labels
clf = clf.fit(data, outcomes)
# NOTE: predictions are made on the same rows the tree was fitted on, so the
# score below is training accuracy, not an estimate of held-out performance.
predictions = clf.predict(data)
# Single-argument print(...) produces the same text on Python 2 and Python 3
print("Accuracy Score: {}".format(accuracy_score(outcomes, predictions)))
In [29]:
from sklearn.externals.six import StringIO
# Write the fitted tree to disk in Graphviz DOT format.
# The file handle is only passed to export_graphviz and is not rebound, so it
# stays a valid file object until the with-block closes it.
with open("data.dot", 'w') as dot_file:
    tree.export_graphviz(clf, out_file=dot_file)
In [30]:
try:
    # Older scikit-learn releases bundle six; keep using it where available
    from sklearn.externals.six import StringIO
except ImportError:
    # sklearn.externals.six is gone from newer scikit-learn releases;
    # the standard-library text buffer is the replacement there
    from io import StringIO
import pydot

# Render the fitted tree as DOT source into an in-memory text buffer
dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data)
# Parse the DOT source with pydot; the result is indexed below, so the first
# parsed graph is the one written out as a PDF
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph[0].write_pdf("data.pdf")
Out[30]:
In [37]:
In [ ]: