In [1]:
import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess
from sklearn.tree import DecisionTreeClassifier
In [2]:
# Load the train/test features and labels from the project's email_preprocess
# helper; the four return values are unpacked in the order it returns them.
(
    features_train,
    features_test,
    labels_train,
    labels_test,
) = preprocess()
In [3]:
# Fit a decision tree on the training split. min_samples_split=40 means a node
# must contain at least 40 samples before it is allowed to split further.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split, so an unseeded tree can differ between runs and the
# score below would not be reproducible after Restart & Run All.
clf = DecisionTreeClassifier(min_samples_split=40, random_state=42)
# Kept as the last expression so the cell still displays the fitted estimator.
clf.fit(features_train, labels_train)
Out[3]:
In [4]:
# Mean accuracy of the fitted tree on the held-out test split.
clf.score(X=features_test, y=labels_test)
Out[4]:
In [5]:
#### Number of features (second entry of the training matrix's shape, shown in the next cell):
In [6]:
# Show the dimensions of the training features. features_train is what clf.fit
# received as X, which scikit-learn expects as (n_samples, n_features).
features_train.shape
Out[6]:
The second preprocessing module, `email_preprocess2`, selects a smaller subset of features (1% rather than 10%).
In [7]:
from email_preprocess2 import preprocess as preprocess2
In [8]:
# Load a second set of train/test features and labels, this time through
# preprocess2 (the preprocess function imported from email_preprocess2).
(
    features_train2,
    features_test2,
    labels_train2,
    labels_test2,
) = preprocess2()
In [9]:
# Show the dimensions of the second training set, for comparison with the
# shape of features_train displayed earlier.
features_train2.shape
Out[9]:
In [9]:
# Train a tree with the same min_samples_split as the first one, but on the
# features produced by preprocess2, so the two test scores can be compared.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split; without a seed the score can change between runs.
clf2 = DecisionTreeClassifier(min_samples_split=40, random_state=42)
clf2.fit(features_train2, labels_train2)
# Last expression of the cell, so the notebook displays the test accuracy.
clf2.score(features_test2, labels_test2)
Out[9]:
With the smaller feature set, the test accuracy is not all that much worse.