Classification trees and regression trees.
Entropy is a good criterion for building a decision tree, as you can use it to measure how well the data is split for different features. The lower the entropy after a split, the better the prediction.
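As a quick illustration (a sketch, not part of the notebook's pipeline; the entropy helper below is hypothetical), the Shannon entropy of a set of labels is -Σ p_i log2(p_i), where p_i is the fraction of samples in class i; a pure node has entropy 0:

import numpy as np

def entropy(labels):
    # Fraction of samples in each class
    _, counts = np.unique(labels, return_counts=True)
    p = counts / float(len(labels))
    # Shannon entropy in bits; 0 means the node is pure
    return -np.sum(p * np.log2(p))

print entropy(['a', 'a', 'b', 'b'])  # 1.0, the worst possible 2-class split
print entropy(['a', 'a', 'a', 'a'])  # -0.0, a pure node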
Overfitting is when the decision tree is fitted too closely to the training dataset, so that when you test with other data, the predictions are wrong.
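A common way to see (and curb) this in scikit-learn is to compare training and test accuracy of an unrestricted tree against one with a capped max_depth; a minimal sketch on noisy toy data (the data and variable names here are illustrative, not from the notebook):

import numpy as np
from sklearn import tree

rng = np.random.RandomState(0)
X = rng.rand(200, 2)
y = (X[:, 0] > 0.5).astype(int)
flip = rng.rand(200) < 0.2  # add 20% label noise
y[flip] = 1 - y[flip]
X_tr, y_tr, X_te, y_te = X[:150], y[:150], X[150:], y[150:]

deep = tree.DecisionTreeClassifier().fit(X_tr, y_tr)                # grows until it memorises the noise
shallow = tree.DecisionTreeClassifier(max_depth=3).fit(X_tr, y_tr)  # regularised

print "deep:    train %.2f, test %.2f" % (deep.score(X_tr, y_tr), deep.score(X_te, y_te))
print "shallow: train %.2f, test %.2f" % (shallow.score(X_tr, y_tr), shallow.score(X_te, y_te))

Typically the deep tree scores near 1.0 on the training data but noticeably worse on the held-out split, while the shallow tree trades some training accuracy for better generalisation.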
Instead of creating a single 'perfect' tree, you randomly create a lot of 'weak' trees, which you then combine into an ensemble learning technique: you look at the results of all the weak trees and let them 'vote' on how the data should be classified.
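scikit-learn packages this idea as RandomForestClassifier; a minimal sketch on toy data (again illustrative, not part of the notebook's pipeline):

import numpy as np
from sklearn.ensemble import RandomForestClassifier

rng = np.random.RandomState(0)
X = rng.rand(100, 2)
y = (X[:, 0] + X[:, 1] > 1).astype(int)

# 50 weak trees, each grown on a bootstrap sample of the data;
# predict() returns the majority vote across all trees
forest = RandomForestClassifier(n_estimators=50, random_state=0).fit(X, y)
print forest.predict([[0.9, 0.8], [0.1, 0.2]])  # expected: [1 0]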
In [1]:
from __future__ import division
from sklearn import tree, preprocessing
from sklearn.feature_extraction import DictVectorizer
import numpy as np
from collections import OrderedDict
import pandas as pd
import itertools
from sklearn.externals.six import StringIO
import os
import pydot
from IPython.display import Image
In [2]:
data_path = 'SFPD_Incidents_-_from_1_January_2003.csv'
data = pd.read_csv(data_path)
In [3]:
def encode_target(df, target_column):
    # Map each unique value of target_column to an integer code,
    # stored in a new <target_column>_encoded column
    df_mod = df.copy()
    targets = df_mod[target_column].unique()
    map_to_int = {name: n for n, name in enumerate(targets)}
    df_mod[target_column + "_encoded"] = df_mod[target_column].replace(map_to_int)
    return (df_mod, targets)
data, districts = encode_target(data, 'PdDistrict')
data, categories = encode_target(data, 'Category')
data, days = encode_target(data, 'DayOfWeek')
# sneak peek of data
data.head()
Out[3]:
In [4]:
# Use the first 90% of the rows for training and the last 10% for testing
training_data = data.head(int(data.Category.count() * 0.9))
test_data = data.tail(int(data.Category.count() * 0.1))
In [5]:
def train_tree(prediction, features, dataset):
    # Fit a decision tree that predicts `prediction` from `features`
    clf = tree.DecisionTreeClassifier()
    print "TRAINING WITH %d SAMPLES" % len(dataset)
    X = np.array(dataset[features])
    Y = np.array(list(itertools.chain(*dataset[[prediction]].values)))
    return clf.fit(X, Y)

def test_tree(clf, test_data, features):
    return clf.predict(test_data[features])

def convert_encoded_district_to_str(predictions):
    return map(lambda p: districts[p], predictions)

def test_prediction(clf, test_data, features):
    # Fraction of test samples whose district is predicted correctly
    corrects = 0
    predictions = test_tree(clf, test_data, features)
    for i in range(0, len(predictions)):
        if predictions[i] == test_data.iloc[i].PdDistrict_encoded:
            corrects += 1
    print "FOUND %d CORRECT PREDICTIONS" % corrects
    return corrects / len(predictions)
In [6]:
# The features we create our model from
features = ['Category_encoded']
# Train the tree; we are predicting the district
clf = train_tree('PdDistrict_encoded', features, training_data)
# test prediction accuracy
print "Prediction accuracy %f" % test_prediction(clf, test_data, features)
In [7]:
for dis in districts[:1]:
    clf = train_tree('PdDistrict_encoded', features, training_data[training_data.PdDistrict == dis])
    print "Prediction accuracy %f, trained for %s\n" % (test_prediction(clf, test_data, features), dis)
In [8]:
# We can see that the prediction can only guess SOUTHERN
len(test_data[test_data.PdDistrict == 'SOUTHERN'])
Out[8]:
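Since that tree answers SOUTHERN for everything, its accuracy is simply SOUTHERN's share of the test set; a quick added check of that baseline (reusing test_data from above):

# Accuracy of always guessing SOUTHERN = its share of the test set
print len(test_data[test_data.PdDistrict == 'SOUTHERN']) / len(test_data)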
In [12]:
# The features we create our model from
features = ['Category_encoded', 'DayOfWeek_encoded']
# Train the tree; we are predicting the district
clf = train_tree('PdDistrict_encoded', features, training_data)
# test prediction accuracy
print "Prediction accuracy %f" % test_prediction(clf, test_data, features)
In [16]:
with open("tree.dot", 'w') as f:
    tree.export_graphviz(clf, out_file=f)
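The pydot, StringIO and Image imports at the top are presumably meant for rendering the exported tree inline; one way to do that (a sketch, assuming Graphviz is installed; note that newer pydot versions return a list from graph_from_dot_data, in which case take the first element):

# Render the exported tree inline in the notebook
dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data, feature_names=features)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())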
In [ ]:
for dis in districts:
    clf2 = train_tree('PdDistrict_encoded', features, training_data[training_data.PdDistrict == dis])
    print "Prediction accuracy %f, trained for %s\n" % (test_prediction(clf2, test_data, features), dis)