In [1]:
import h5py, numpy as np
from sklearn import tree

In [2]:
f = h5py.File('/scr/murg2/MachineLearning/BrocaData_new.mat','r') 
labels = np.array(f.get('labels'))
features = np.array(f.get('features'))

In [3]:
# sum correlation values to only use nodes that are highly connected as features
summedFeatures = np.sum(features, axis=0)
idx = np.argsort(summedFeatures, axis=0)
sortedFeatures = np.squeeze(np.array(np.where(np.isfinite(summedFeatures[idx]) == True)))

In [4]:
# Take top X correlation values and assess classifier with 20% testing set
for numFeatures in xrange(100,6000,100):
    x = []
    x = features[:,idx[sortedFeatures[-numFeatures:]]]
    testSet = np.shape(features)[0] * .20 # use 20% as testing set
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(x[testSet:], labels[testSet:])
    wrong = np.sum(np.abs(np.squeeze(clf.predict(x[0:testSet-1,:]) - np.transpose(labels[0:testSet-1]))))
    print ("Features = %s, %s percent correct" % (str(numFeatures), str((testSet-wrong) / testSet)))


Features = 100, 0.886964251393 percent correct
Features = 200, 0.899216798001 percent correct
Features = 300, 0.887925235441 percent correct
Features = 400, 0.907144916394 percent correct
Features = 500, 0.918076109937 percent correct
Features = 600, 0.915673649817 percent correct
Features = 700, 0.912190082645 percent correct
Features = 800, 0.901138766096 percent correct
Features = 900, 0.920598693062 percent correct
Features = 1000, 0.915073034788 percent correct
Features = 1100, 0.926244474342 percent correct
Features = 1200, 0.916034018835 percent correct
Features = 1300, 0.923361522199 percent correct
Features = 1400, 0.922040169133 percent correct
Features = 1500, 0.927565827407 percent correct
Features = 1600, 0.932490870652 percent correct
Features = 1700, 0.92600422833 percent correct
Features = 1800, 0.922400538151 percent correct
Features = 1900, 0.9354939458 percent correct
Features = 2000, 0.903060734192 percent correct
Features = 2100, 0.894652123775 percent correct
Features = 2200, 0.900538151067 percent correct
Features = 2300, 0.88083797809 percent correct
Features = 2400, 0.912190082645 percent correct
Features = 2500, 0.914592542764 percent correct
Features = 2600, 0.930328656544 percent correct
Features = 2700, 0.926364597348 percent correct
Features = 2800, 0.929367672497 percent correct
Features = 2900, 0.921319431097 percent correct
Features = 3000, 0.918316355948 percent correct
Features = 3100, 0.926845089372 percent correct
Features = 3200, 0.922280415145 percent correct
Features = 3300, 0.927685950413 percent correct
Features = 3400, 0.92984816452 percent correct
Features = 3500, 0.906304055353 percent correct
Features = 3600, 0.935253699789 percent correct
Features = 3700, 0.942220834134 percent correct
Features = 3800, 0.925643859312 percent correct
Features = 3900, 0.928406688449 percent correct
Features = 4000, 0.932851239669 percent correct
Features = 4100, 0.928286565443 percent correct
Features = 4200, 0.932731116663 percent correct
Features = 4300, 0.935734191812 percent correct
Features = 4400, 0.938136651932 percent correct
Features = 4500, 0.932370747646 percent correct
Features = 4600, 0.937415913896 percent correct
Features = 4700, 0.933932346723 percent correct
Features = 4800, 0.937776282914 percent correct
Features = 4900, 0.942100711128 percent correct
Features = 5000, 0.94234095714 percent correct
Features = 5100, 0.928166442437 percent correct
Features = 5200, 0.942220834134 percent correct
Features = 5300, 0.936214683836 percent correct
Features = 5400, 0.929728041515 percent correct
Features = 5500, 0.929728041515 percent correct
Features = 5600, 0.930088410532 percent correct
Features = 5700, 0.922520661157 percent correct
Features = 5800, 0.927565827407 percent correct
Features = 5900, 0.950869690563 percent correct

In [5]:
from sklearn.metrics import confusion_matrix
print (confusion_matrix(x[testSet:], labels[testSet:]))


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-5-95bbd40b50cb> in <module>()
      1 from sklearn.metrics import confusion_matrix
----> 2 print (confusion_matrix(x[testSet:], labels[testSet:]))

/usr/lib/python2.7/dist-packages/sklearn/metrics/metrics.pyc in confusion_matrix(y_true, y_pred, labels)
    740 
    741     """
--> 742     y_type, y_true, y_pred = _check_clf_targets(y_true, y_pred)
    743     if y_type not in ("binary", "multiclass"):
    744         raise ValueError("%s is not supported" % y_type)

/usr/lib/python2.7/dist-packages/sklearn/metrics/metrics.pyc in _check_clf_targets(y_true, y_pred)
    113     if len(y_type) > 1:
    114         raise ValueError("Can't handle mix of {0} and {1}"
--> 115                          "".format(type_true, type_pred))
    116 
    117     # We can't have more than one value on y_type => The set is no more needed

ValueError: Can't handle mix of continuous-multioutput and binary

In [7]:
# Graph decision tree:
# If pydot fails, install:
!pip install --user pydot 
from sklearn.externals.six import StringIO  
import pydot 
dot_data = StringIO() 
tree.export_graphviz(clf, out_file=dot_data) 
graph = pydot.graph_from_dot_data(dot_data.getvalue()) 
graph.write_pdf("broca.pdf")


Downloading/unpacking pydot
  Downloading pydot-1.0.28.tar.gz
  Running setup.py egg_info for package pydot
    
Requirement already satisfied (use --upgrade to upgrade): pyparsing in /usr/lib/python2.7/dist-packages (from pydot)
Requirement already satisfied (use --upgrade to upgrade): distribute in /usr/lib/python2.7/dist-packages (from pydot)
Installing collected packages: pydot
  Running setup.py install for pydot
    
Successfully installed pydot
Cleaning up...
Out[7]:
True

In [ ]: