In [1]:
import pandas as pd
%matplotlib inline
from sklearn import tree
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Each row is (hair_length, weight, age, class); because the flat list mixes
# integers with the 'M'/'F' labels, np.asarray casts every value to a string.
sample_list = [0, 250, 36, 'M', 10, 150, 34, 'F', 2, 90, 10, 'M', 6, 78, 8, 'F', 4, 20, 1, 'F',
               1, 170, 70, 'M', 8, 160, 41, 'F', 10, 180, 38, 'M', 6, 200, 45, 'M']
df = pd.DataFrame(np.asarray(sample_list).reshape(9, 4),
                  index=['Homer', 'Marge', 'Bart', 'Lisa', 'Maggie', 'Abe', 'Selma', 'Otto', 'Krusty'],
                  columns=['hair_length', 'weight', 'age', 'class'])

In [3]:
df


Out[3]:
        hair_length  weight  age class
Homer             0     250   36     M
Marge            10     150   34     F
Bart              2      90   10     M
Lisa              6      78    8     F
Maggie            4      20    1     F
Abe               1     170   70     M
Selma             8     160   41     F
Otto             10     180   38     M
Krusty            6     200   45     M

In [4]:
x = df[['hair_length','weight','age']].as_matrix() # the feature matrix (as_matrix() is replaced by .values in newer pandas)
x


Out[4]:
array([['0', '250', '36'],
       ['10', '150', '34'],
       ['2', '90', '10'],
       ['6', '78', '8'],
       ['4', '20', '1'],
       ['1', '170', '70'],
       ['8', '160', '41'],
       ['10', '180', '38'],
       ['6', '200', '45']], dtype=object)
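
Note that the feature values came back as strings (dtype=object): the flat sample_list mixes integers with the 'M'/'F' labels, so np.asarray casts everything to text and scikit-learn has to convert it back to numbers at fit time. A hedged sketch of an alternative construction that keeps the features numeric (df_numeric and x_numeric are illustrative names, not used elsewhere in this notebook):

# Build each column separately so pandas keeps numeric dtypes for the features.
df_numeric = pd.DataFrame(
    {'hair_length': [0, 10, 2, 6, 4, 1, 8, 10, 6],
     'weight':      [250, 150, 90, 78, 20, 170, 160, 180, 200],
     'age':         [36, 34, 10, 8, 1, 70, 41, 38, 45],
     'class':       ['M', 'F', 'M', 'F', 'F', 'M', 'F', 'M', 'M']},
    index=['Homer', 'Marge', 'Bart', 'Lisa', 'Maggie', 'Abe', 'Selma', 'Otto', 'Krusty'],
    columns=['hair_length', 'weight', 'age', 'class'])
x_numeric = df_numeric[['hair_length', 'weight', 'age']].values  # int64 array instead of strings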

In [5]:
y = df['class'].as_matrix() # the class labels
y


Out[5]:
array(['M', 'F', 'M', 'F', 'F', 'M', 'F', 'M', 'M'], dtype=object)

In [7]:
dt = tree.DecisionTreeClassifier()

In [8]:
dt = dt.fit(x,y)

In [13]:
dt

Out[13]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
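
Since the tree is now fit in memory, we can also peek at which attributes it split on. A minimal sketch (feature_importances_ is a standard attribute of a fitted DecisionTreeClassifier; the names below are just the column order used to build x):

# Gini importance of each feature in the fitted tree
for name, importance in zip(['hair_length', 'weight', 'age'], dt.feature_importances_):
    print(name, importance)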

Let's try cross-validation before testing Comic.


In [15]:
from sklearn.cross_validation import cross_val_score

In [16]:
# http://scikit-learn.org/stable/modules/cross_validation.html#computing-cross-validated-metrics
scores = cross_val_score(dt,x,y,cv=5) # pass the estimator and data; returns one accuracy score per fold


c:\users\dongjin\envs\03stat\lib\site-packages\sklearn\cross_validation.py:516: Warning: The least populated class in y has only 4 members, which is too few. The minimum number of labels for any class cannot be less than n_folds=5.
  % (min_labels, self.n_folds)), Warning)
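
The warning appears because y contains only four 'F' labels, so a stratified 5-fold split cannot place an 'F' in every fold. In scikit-learn 0.18 and later the cross_validation module also moved to model_selection. A hedged sketch of the same step under those assumptions, using a fold count the smaller class can support:

# Newer import path; cv=4 leaves at least one 'F' sample in every stratified fold.
from sklearn.model_selection import cross_val_score
scores_cv4 = cross_val_score(dt, x, y, cv=4)  # illustrative name, separate from scores above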

In [17]:
np.mean(scores) # the mean accuracy across the folds


Out[17]:
0.80000000000000004

Now let's predict a class for Comic (hair_length=0, weight=250, age=36, the same values as Homer's row).


In [23]:
x_test = np.asarray([0,250,36]) # hair_length, weight, age

In [24]:
predicted= dt.predict(x_test)


c:\users\dongjin\envs\03stat\lib\site-packages\sklearn\utils\validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
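
The deprecation warning is telling us that a single sample should be passed as a 2-D array of shape (1, n_features). A minimal sketch of the reshaped call (x_test_2d and predicted_2d are illustrative names; the values are the same as above):

x_test_2d = np.asarray([0, 250, 36]).reshape(1, -1)  # one sample, three features -> shape (1, 3)
predicted_2d = dt.predict(x_test_2d)                 # avoids the warning, same prediction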

In [25]:
predicted


Out[25]:
array(['M'], dtype=object)
