In [1]:
import numpy as np
import pandas as pd

# RMS Titanic data visualization code 
from titanic_visualizations import survival_stats
from IPython.display import display
%matplotlib inline

# Read the passenger records from the CSV file (path is relative to the notebook)
in_file = 'titanic_data.csv'
full_data = pd.read_csv(in_file)

# Preview the first five rows of the loaded table
display(full_data.head(5))


/home/josh/anaconda2/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

In [2]:
# Keep the 'Survived' column on its own as the prediction target ...
outcomes = full_data['Survived']
# ... and build a copy of the table that no longer contains it
data = full_data.drop(['Survived'], axis=1)

# Preview the table now that 'Survived' has been removed
display(data.head(5))


PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

In [3]:
# Rebuild the feature table from the raw frame: drop the label together with
# the Name, Ticket, Cabin, Embarked and PassengerId columns.
data = full_data.drop(['Survived','Name','Ticket','Cabin','Embarked','PassengerId'], axis = 1)
# Encode Sex as a float flag (1.0 for 'female', 0.0 for any other value) using
# a vectorized comparison instead of a per-row Python lambda.
data['Sex'] = (data['Sex'] == 'female').astype(float)
display(data.head())


Pclass Sex Age SibSp Parch Fare
0 3 0.0 22.0 1 0 7.2500
1 1 1.0 38.0 1 0 71.2833
2 3 1.0 26.0 0 0 7.9250
3 1 1.0 35.0 1 0 53.1000
4 3 0.0 35.0 0 0 8.0500

In [26]:
# Replace every missing value with the mean of its own column
column_means = data.mean()
data = data.fillna(column_means)

# Round Age down to the start of its decade (e.g. 22.0 -> 20.0, 38.0 -> 30.0)
age_in_decades = np.floor(data["Age"] / 10)
data["Age"] = age_in_decades * 10

In [27]:
from sklearn import tree
# Decision tree that refuses to split nodes with fewer than 20 samples and
# requires at least 6 samples in every leaf.
# random_state is pinned so that ties between equally good splits are broken
# the same way on every run, making the fitted tree reproducible.
clf = tree.DecisionTreeClassifier(min_samples_split=20, min_samples_leaf=6, random_state=0)

In [28]:
clf = clf.fit(data, outcomes)
predictions = clf.predict(data)
from sklearn.metrics import accuracy_score
print "Accuracy Score:", accuracy_score(outcomes, predictions)


Accuracy Score: 0.863075196409

In [29]:
from sklearn.externals.six import StringIO
# Write the fitted tree to "data.dot" in Graphviz DOT format.
# export_graphviz writes straight into the handle it is given; its return
# value is not needed, so `f` is no longer rebound inside the `with` block.
with open("data.dot", 'w') as f:
    tree.export_graphviz(clf, out_file=f)

In [30]:
import pydot
from sklearn.externals.six import StringIO

# Export the fitted tree as DOT text into an in-memory buffer, parse that text
# with pydot, and write the first resulting graph to "data.pdf".
# write_pdf invokes the Graphviz `dot` program, which must be on PATH.
dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph[0].write_pdf("data.pdf")


Out[30]:
True

In [37]:



---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
<ipython-input-37-6e025b99bcf3> in <module>()
----> 1 graph[0].write_pdf("data.pdf")

C:\ProgrameFiles\Anaconda\lib\site-packages\pydot.pyc in <lambda>(path, f, prog)
   1683                 'write_'+frmt,
   1684                 lambda path, f=frmt, prog=self.prog:
-> 1685                     self.write(path, format=f, prog=prog))
   1686 
   1687             f = self.__dict__['write_'+frmt]

C:\ProgrameFiles\Anaconda\lib\site-packages\pydot.pyc in write(self, path, prog, format)
   1766                 s = unicode(s)
   1767         else:
-> 1768             s = self.create(prog, format)
   1769             mode = 'wb'
   1770         with io.open(path, mode=mode) as f:

C:\ProgrameFiles\Anaconda\lib\site-packages\pydot.pyc in create(self, prog, format)
   1874                 raise Exception(
   1875                     '"{prog}" not found in path.'.format(
-> 1876                         prog=prog))
   1877             else:
   1878                 raise

Exception: "dot.exe" not found in path.

In [ ]: