notebook.community

Edit and run



In [1]:

    
import numpy as np
import pandas as pd

# RMS Titanic data visualization code 
from titanic_visualizations import survival_stats
from IPython.display import display
%matplotlib inline

# Location of the passenger records, relative to the notebook
in_file = 'titanic_data.csv'

# Parse the CSV into a DataFrame
full_data = pd.read_csv(filepath_or_buffer=in_file)

# Render the leading rows as a table to sanity-check the load
display(full_data.head(n=5))









    



/home/josh/anaconda2/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')






    






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      0
      1
      0
      3
      Braund, Mr. Owen Harris
      male
      22.0
      1
      0
      A/5 21171
      7.2500
      NaN
      S
    
    
      1
      2
      1
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38.0
      1
      0
      PC 17599
      71.2833
      C85
      C
    
    
      2
      3
      1
      3
      Heikkinen, Miss. Laina
      female
      26.0
      0
      0
      STON/O2. 3101282
      7.9250
      NaN
      S
    
    
      3
      4
      1
      1
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      female
      35.0
      1
      0
      113803
      53.1000
      C123
      S
    
    
      4
      5
      0
      3
      Allen, Mr. William Henry
      male
      35.0
      0
      0
      373450
      8.0500
      NaN
      S



In [2]:

    
# Separate the prediction target from the features: keep the 'Survived'
# column under its own name and build a new table that no longer has it.
outcomes = full_data['Survived']
data = full_data.drop(labels=['Survived'], axis=1)

# Preview the feature table to confirm 'Survived' is gone
display(data.head(n=5))









    






  
    
      
      PassengerId
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      0
      1
      3
      Braund, Mr. Owen Harris
      male
      22.0
      1
      0
      A/5 21171
      7.2500
      NaN
      S
    
    
      1
      2
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38.0
      1
      0
      PC 17599
      71.2833
      C85
      C
    
    
      2
      3
      3
      Heikkinen, Miss. Laina
      female
      26.0
      0
      0
      STON/O2. 3101282
      7.9250
      NaN
      S
    
    
      3
      4
      1
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      female
      35.0
      1
      0
      113803
      53.1000
      C123
      S
    
    
      4
      5
      3
      Allen, Mr. William Henry
      male
      35.0
      0
      0
      373450
      8.0500
      NaN
      S



In [3]:

    
# Rebuild the feature table from the full dataset, discarding the target
# together with the identifier and text columns listed below.
data = full_data.drop(
    labels=['Survived', 'Name', 'Ticket', 'Cabin', 'Embarked', 'PassengerId'],
    axis=1
)

# Encode 'Sex' numerically: 1.0 for 'female', 0.0 for every other value
data['Sex'] = (data['Sex'] == 'female').astype(float)

display(data.head(n=5))



In [26]:

    
# Replace every missing entry with the mean of its own column
data = data.fillna(value=data.mean())

# Round ages down to the start of their decade (e.g. 38.0 -> 30.0)
data["Age"] = np.floor(data["Age"].div(10)).mul(10)



In [27]:

    
from sklearn import tree

# Decision tree constrained so that a node needs at least 20 samples before
# it may be split and every leaf keeps at least 6 samples.
# random_state is pinned so that re-running the notebook builds the same
# tree: scikit-learn permutes the candidate features at each split, so
# without a fixed seed equally good splits may be resolved differently
# from one run to the next.
clf = tree.DecisionTreeClassifier(
    min_samples_split=20,
    min_samples_leaf=6,
    random_state=0
)



In [28]:

    
from sklearn.metrics import accuracy_score

# Fit the tree on the whole feature table, then predict on those same rows.
clf = clf.fit(data, outcomes)
predictions = clf.predict(data)

# NOTE: the score below is measured on the rows the model was trained on,
# so it is a training accuracy, not an estimate of held-out performance.
# A single pre-formatted string is passed to print() so the line is valid
# and prints the same text under both Python 2 and Python 3.
print("Accuracy Score: {0}".format(accuracy_score(outcomes, predictions)))









    



Accuracy Score: 0.863075196409



In [29]:

    
# Write the fitted tree to disk as Graphviz DOT source.
with open("data.dot", 'w') as f:
    # export_graphviz writes straight into the open handle; its return value
    # is not needed, so `f` is not rebound while the file is still open.
    tree.export_graphviz(clf, out_file=f)



In [30]:

    
import pydot
from sklearn.externals.six import StringIO

# Export the fitted tree as DOT text into an in-memory buffer
dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data)

# Parse the DOT text with pydot; the result is indexed below, and its
# first element is the graph that gets written out as a PDF.
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph[0].write_pdf("data.pdf")









    Out[30]:





True



In [37]:









    



---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
<ipython-input-37-6e025b99bcf3> in <module>()
----> 1 graph[0].write_pdf("data.pdf")

C:\ProgrameFiles\Anaconda\lib\site-packages\pydot.pyc in <lambda>(path, f, prog)
   1683                 'write_'+frmt,
   1684                 lambda path, f=frmt, prog=self.prog:
-> 1685                     self.write(path, format=f, prog=prog))
   1686 
   1687             f = self.__dict__['write_'+frmt]

C:\ProgrameFiles\Anaconda\lib\site-packages\pydot.pyc in write(self, path, prog, format)
   1766                 s = unicode(s)
   1767         else:
-> 1768             s = self.create(prog, format)
   1769             mode = 'wb'
   1770         with io.open(path, mode=mode) as f:

C:\ProgrameFiles\Anaconda\lib\site-packages\pydot.pyc in create(self, prog, format)
   1874                 raise Exception(
   1875                     '"{prog}" not found in path.'.format(
-> 1876                         prog=prog))
   1877             else:
   1878                 raise

Exception: "dot.exe" not found in path.



In [ ]:

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S