In [1]:
import pandas as pd
%matplotlib inline
from sklearn import tree
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Each row is (hair_length, weight, age, class); because the flat list mixes
# integers with the 'M'/'F' labels, np.asarray casts every value to a string.
sample_list = [0, 250, 36, 'M', 10, 150, 34, 'F', 2, 90, 10, 'M', 6, 78, 8, 'F', 4, 20, 1, 'F',
               1, 170, 70, 'M', 8, 160, 41, 'F', 10, 180, 38, 'M', 6, 200, 45, 'M']
df = pd.DataFrame(np.asarray(sample_list).reshape(9, 4),
                  index=['Homer', 'Marge', 'Bart', 'Lisa', 'Maggie', 'Abe', 'Selma', 'Otto', 'Krusty'],
                  columns=['hair_length', 'weight', 'age', 'class'])

In [3]:
df


Out[3]:
        hair_length  weight  age class
Homer             0     250   36     M
Marge            10     150   34     F
Bart              2      90   10     M
Lisa              6      78    8     F
Maggie            4      20    1     F
Abe               1     170   70     M
Selma             8     160   41     F
Otto             10     180   38     M
Krusty            6     200   45     M

In [4]:
x = df[['hair_length','weight','age']].as_matrix() # the feature matrix (as_matrix() is replaced by .values in newer pandas)
x


Out[4]:
array([['0', '250', '36'],
       ['10', '150', '34'],
       ['2', '90', '10'],
       ['6', '78', '8'],
       ['4', '20', '1'],
       ['1', '170', '70'],
       ['8', '160', '41'],
       ['10', '180', '38'],
       ['6', '200', '45']], dtype=object)
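
Note that the feature values came back as strings (dtype=object): the flat sample_list mixes integers with the 'M'/'F' labels, so np.asarray casts everything to text and scikit-learn has to convert it back to numbers at fit time. A hedged sketch of an alternative construction that keeps the features numeric (df_numeric and x_numeric are illustrative names, not used elsewhere in this notebook):

# Build each column separately so pandas keeps numeric dtypes for the features.
df_numeric = pd.DataFrame(
    {'hair_length': [0, 10, 2, 6, 4, 1, 8, 10, 6],
     'weight':      [250, 150, 90, 78, 20, 170, 160, 180, 200],
     'age':         [36, 34, 10, 8, 1, 70, 41, 38, 45],
     'class':       ['M', 'F', 'M', 'F', 'F', 'M', 'F', 'M', 'M']},
    index=['Homer', 'Marge', 'Bart', 'Lisa', 'Maggie', 'Abe', 'Selma', 'Otto', 'Krusty'],
    columns=['hair_length', 'weight', 'age', 'class'])
x_numeric = df_numeric[['hair_length', 'weight', 'age']].values  # int64 array instead of strings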

In [5]:
y = df['class'].as_matrix() # the class labels
y


Out[5]:
array(['M', 'F', 'M', 'F', 'F', 'M', 'F', 'M', 'M'], dtype=object)

In [7]:
dt = tree.DecisionTreeClassifier()

In [8]:
dt = dt.fit(x,y)

In [13]:
dt

Out[13]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
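
Since the tree is now fit in memory, we can also peek at which attributes it split on. A minimal sketch (feature_importances_ is a standard attribute of a fitted DecisionTreeClassifier; the names below are just the column order used to build x):

# Gini importance of each feature in the fitted tree
for name, importance in zip(['hair_length', 'weight', 'age'], dt.feature_importances_):
    print(name, importance)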

Let's try cross-validation before testing Comic.


In [15]:
from sklearn.cross_validation import cross_val_score

In [16]:
# http://scikit-learn.org/stable/modules/cross_validation.html#computing-cross-validated-metrics
scores = cross_val_score(dt,x,y,cv=5) # pass the estimator and data; returns one accuracy score per fold


c:\users\dongjin\envs\03stat\lib\site-packages\sklearn\cross_validation.py:516: Warning: The least populated class in y has only 4 members, which is too few. The minimum number of labels for any class cannot be less than n_folds=5.
  % (min_labels, self.n_folds)), Warning)
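
The warning appears because y contains only four 'F' labels, so a stratified 5-fold split cannot place an 'F' in every fold. In scikit-learn 0.18 and later the cross_validation module also moved to model_selection. A hedged sketch of the same step under those assumptions, using a fold count the smaller class can support:

# Newer import path; cv=4 leaves at least one 'F' sample in every stratified fold.
from sklearn.model_selection import cross_val_score
scores_cv4 = cross_val_score(dt, x, y, cv=4)  # illustrative name, separate from scores above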

In [17]:
np.mean(scores) # the mean accuracy across the folds


Out[17]:
0.80000000000000004

Now let's predict a class for Comic (hair_length=0, weight=250, age=36, the same values as Homer's row).


In [23]:
x_test = np.asarray([0,250,36]) # hair_length, weight, age

In [24]:
predicted= dt.predict(x_test)


c:\users\dongjin\envs\03stat\lib\site-packages\sklearn\utils\validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
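
The deprecation warning is telling us that a single sample should be passed as a 2-D array of shape (1, n_features). A minimal sketch of the reshaped call (x_test_2d and predicted_2d are illustrative names; the values are the same as above):

x_test_2d = np.asarray([0, 250, 36]).reshape(1, -1)  # one sample, three features -> shape (1, 3)
predicted_2d = dt.predict(x_test_2d)                 # avoids the warning, same prediction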

In [25]:
predicted


Out[25]:
array(['M'], dtype=object)
