In [1]:
import pickle
import sys
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
data_dict = pickle.load(open("../final_project/final_project_dataset.pkl", "rb"))
In [2]:
features_list = ["poi", "salary"]
data = featureFormat(data_dict, features_list)
labels, features = targetFeatureSplit(data)
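Before fitting anything, it is worth a glance at the class balance. A quick check, assuming labels is the list of 0/1 POI flags returned by targetFeatureSplit:
In [ ]:
# POIs are a small minority of the dataset, which matters for evaluation later
print "total points:", len(labels)
print "POIs:", int(sum(labels))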
In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
In [4]:
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.3, random_state=42)
In [5]:
clf = DecisionTreeClassifier()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
In [6]:
# compare the class distributions of the true and predicted labels
# ref http://stackoverflow.com/questions/10741346
import numpy as np
unique, counts = np.unique(labels_test, return_counts=True)
print "true labels"
print np.asarray((unique, counts)).T
print "predicted labels"
unique, counts = np.unique(pred, return_counts=True)
print np.asarray((unique, counts)).T
These turn out to match up very poorly: there is not a single true positive. Simply guessing 0 (non-POI) for every test point would in fact be more accurate.
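One quick way to see that is to compare the classifier's accuracy against an all-zeros baseline. A minimal check, assuming the variables from the cells above are still in scope:
In [ ]:
from sklearn.metrics import accuracy_score
# accuracy of the tree vs. a "classifier" that predicts non-POI for everyone
print "classifier accuracy:", accuracy_score(labels_test, pred)
print "all-zeros accuracy:", accuracy_score(labels_test, [0.0] * len(labels_test))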
In [7]:
print "number of true positives:",sum((labels_test==1) & (pred ==1))
In [8]:
from sklearn.metrics import precision_score, recall_score
Precision and recall confirm the bad news: with zero true positives, the numerator of both ratios is zero, so both scores come out as 0.0:
In [9]:
print "precision:",precision_score(labels_test,pred)
print "recall:",recall_score(labels_test,pred)
In [10]:
predictions = np.array([0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1])
true_labels = np.array([0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0])
print "number of true positives:",sum((true_labels==1) & (predictions==1))
print "number of false positives:",sum((true_labels==0) & (predictions==1))
print "number of true negatives:",sum((true_labels==0) & (predictions==0))
print "number of false negatives:",sum((true_labels==1) & (predictions==0))
In [11]:
print "precision:", 6/(6+3.)
print "recall:", 6/(6+2.)