In [1]:
import numpy as np
import pandas as pd
import itertools
from __future__ import division
from sklearn.tree import tree, DecisionTreeClassifier, export_graphviz
from sklearn import cluster
%matplotlib inline
# Let wide/long frames be displayed in full when inspecting them in the notebook.
# Fully-qualified option names are used on purpose: pandas resolves abbreviated
# names by pattern matching and rejects a pattern that matches more than one
# option, which the bare "max_rows" can do on newer pandas versions.
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 1000)
In [2]:
# Load the collision records (joined with weather data) from disk.
filePath = 'datasets/NYPD_Motor_Vehicle_Collisions_weather4.csv'
colls = pd.read_csv(filePath)

# Optional filters, currently disabled: keep only collisions involving a taxi,
# and drop rows whose primary contributing factor is unspecified.
# colls = colls[(colls['VEHICLE TYPE CODE 1'] == 'TAXI') | \
# (colls['VEHICLE TYPE CODE 2'] == 'TAXI') | \
# (colls['VEHICLE TYPE CODE 3'] == 'TAXI') | \
# (colls['VEHICLE TYPE CODE 4'] == 'TAXI') | \
# (colls['VEHICLE TYPE CODE 5'] == 'TAXI')]
# colls = colls[colls['CONTRIBUTING FACTOR VEHICLE 1'] != 'Unspecified']

# Row count before any filtering.
print(len(colls.index))

# Drop rows without a location. The explicit copy makes the result an
# independent frame, so the column assignments below do not write into a
# slice of the original (SettingWithCopyWarning).
colls = colls[pd.notnull(colls['LOCATION'])].copy()

# HOUR is the text before the first ':' in TIME, MONTH the text before the
# first '/' in DATE. Both are converted to numbers because HOUR is later fed
# to the classifier as a feature; values that cannot be parsed become NaN.
colls['HOUR'] = pd.to_numeric(colls.TIME.str.split(':').str.get(0), errors='coerce')
colls['MONTH'] = pd.to_numeric(colls.DATE.str.split('/').str.get(0), errors='coerce')

# Drop rows without a usable hour, then store HOUR as a plain integer.
colls = colls[pd.notnull(colls['HOUR'])].copy()
colls['HOUR'] = colls['HOUR'].astype(int)

colls.head()
Out[2]:
In [3]:
from sklearn.ensemble import RandomForestClassifier
def encode_column(df, target_column):
    """Add an integer-encoded version of ``target_column`` to a copy of ``df``.

    Every distinct value of the column is assigned an integer code in order of
    first appearance (0, 1, 2, ...). The codes are stored in a new column named
    ``<target_column>_encoded``.

    Args:
        df: Source DataFrame. It is copied, not modified.
        target_column: Name of the column to encode.

    Returns:
        A tuple ``(df_mod, targets)`` where ``df_mod`` is the copy of ``df``
        with the extra encoded column, and ``targets`` is a Series of the
        distinct values such that ``targets[code]`` is the original value.
    """
    df_mod = df.copy()
    targets = pd.Series(df_mod[target_column].unique())
    map_to_int = {name: n for n, name in enumerate(targets)}
    # ``map`` performs one lookup per value. The previous dict-based
    # ``replace`` is far slower on large frames, and its result dtype depends
    # on the pandas version's downcasting rules, whereas ``map`` yields the
    # integer codes directly.
    df_mod[target_column + "_encoded"] = df_mod[target_column].map(map_to_int)
    return (df_mod, targets)
def train_tree(prediction, features, dataset, n_estimators=50, n_jobs=4, random_state=None):
    """Fit a random forest classifier on ``dataset``.

    Args:
        prediction: Name of the column holding the label to predict.
        features: List of column names used as model inputs.
        dataset: DataFrame containing the feature and label columns.
        n_estimators: Number of trees in the forest (default 50).
        n_jobs: Number of parallel jobs used by the forest (default 4).
        random_state: Optional seed passed to the classifier so a run can be
            reproduced; ``None`` keeps the unseeded behaviour.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    clf = RandomForestClassifier(n_estimators=n_estimators, n_jobs=n_jobs,
                                 random_state=random_state)
    print("TRAINING WITH %d SAMPLES" % len(dataset))
    X = np.asarray(dataset[features])
    # Take the label column directly as a 1-D array instead of star-unpacking
    # every row of a one-column frame into itertools.chain.
    Y = dataset[prediction].values
    return clf.fit(X, Y)
# Test forest
def test_tree(clf, test_data, features):
return clf.predict(test_data[features])
# Tests test_data on the forest classifier clf with features and target_label.
# encoded_map is a lookup table of the encoded values (numeric) to actual string values
def test_prediction(target_label, clf, test_data, features, encoded_map):
corrects = 0
predictions = test_tree(clf, test_data[features], features)
for i in range(0, len(predictions)):
if predictions[i] == test_data.iloc[i][target_label]:
corrects += 1
print "FOUND %d CORRECT PREDICTIONS" % corrects
# Return model accuracy [%]
return corrects / len(predictions)
def convert_encoded_district_to_str(preditions, lookup=None):
    """Translate encoded predictions back to their original values.

    Args:
        preditions: Iterable of encoded values used as keys/indices into the
            lookup table.
        lookup: Optional table (anything indexable by the encoded value). When
            omitted, the notebook-level ``districts`` variable is used, which
            must then be defined before this function is called.

    Returns:
        A list with ``lookup[p]`` for every ``p`` in ``preditions``.
    """
    if lookup is None:
        lookup = districts
    # A list (not a lazy ``map`` object) so the result behaves the same on
    # Python 2 and Python 3.
    return [lookup[p] for p in preditions]
def kmeans(k, dataset, colums, random_state=None):
    """Cluster the rows of ``dataset`` with k-means on the given columns.

    Args:
        k: Number of clusters.
        dataset: DataFrame holding the data to cluster.
        colums: List of column names used as clustering coordinates.
        random_state: Optional seed passed to ``KMeans`` so the clustering can
            be reproduced; ``None`` keeps the unseeded behaviour.

    Returns:
        A tuple ``(labels, centers)``: the predicted cluster of every row and
        the fitted model's ``cluster_centers_``.
    """
    # Select the coordinate columns once and reuse them for fit and predict.
    points = dataset[colums]
    md = cluster.KMeans(n_clusters=k, random_state=random_state).fit(points)
    return md.predict(points), md.cluster_centers_
In [47]:
target_label = 'ZIP CODE'
# Number of k-means clusters used to turn coordinates into a location feature.
N_CLUSTERS = 200

# Keep only rows where the target is known.
data = colls[pd.notnull(colls[target_label])]

# Encode the target/label column; `target` maps the integer codes back to the
# original values.
mdata, target = encode_column(data, target_label)

# Encode the feature columns (other candidates are left disabled).
# mdata, _ = encode_column(mdata, 'BOROUGH')
# mdata, _ = encode_column(mdata, 'CONTRIBUTING FACTOR VEHICLE 1')
# mdata, _ = encode_column(mdata, 'VEHICLE TYPE CODE 1')
mdata, _ = encode_column(mdata, 'Conditions')

# K-means clustering (not k-nearest-neighbours) of the coordinates: each row
# gets the id of its cluster as the KMEANS feature.
s, cen = kmeans(N_CLUSTERS, mdata, ['LONGITUDE', 'LATITUDE'])
mdata['KMEANS'] = s
In [48]:
# Features for prediction
features = ['KMEANS','Conditions_encoded','HOUR']
# Split data set into training and test data
train_data = mdata.head(int(mdata.BOROUGH.count() * 0.80))
test_data = mdata.tail(int(mdata.BOROUGH.count() * 0.20))
target_label_encoded = target_label+'_encoded'
# Train the decision tree
clf = train_tree(target_label_encoded, features, train_data)
# Test the decision tree
print "Prediction accuracy %f" % test_prediction(target_label_encoded, clf, test_data, features, target)
In [49]:
# Write the coordinates and cluster id of every row to CSV, plus a second
# file containing only the first 20000 rows.
cluster_points = mdata[['LATITUDE', 'LONGITUDE', 'KMEANS']]
cluster_points.to_csv('datasets/clusters/c200.csv', sep=',')
first_rows = cluster_points.head(20000)
first_rows.to_csv('datasets/clusters/c200_small_20000.csv', sep=',')
Results for different numbers of clusters:

- 2 clusters: TRAINING WITH 467746 SAMPLES, FOUND 3053 CORRECT PREDICTIONS, prediction accuracy 0.026108
- 5 clusters: TRAINING WITH 467746 SAMPLES, FOUND 6664 CORRECT PREDICTIONS, prediction accuracy 0.056988
- 10 clusters: TRAINING WITH 467746 SAMPLES, FOUND 12126 CORRECT PREDICTIONS, prediction accuracy 0.103698
- 20 clusters: TRAINING WITH 467746 SAMPLES, FOUND 21044 CORRECT PREDICTIONS, prediction accuracy 0.179962
- 30 clusters: TRAINING WITH 467746 SAMPLES, FOUND 28535 CORRECT PREDICTIONS, prediction accuracy 0.244022
- 50 clusters: TRAINING WITH 467746 SAMPLES, FOUND 41761 CORRECT PREDICTIONS, prediction accuracy 0.357127
- 100 clusters: TRAINING WITH 467746 SAMPLES, FOUND 60735 CORRECT PREDICTIONS, prediction accuracy 0.519387
- 200 clusters: TRAINING WITH 467746 SAMPLES, FOUND 73581 CORRECT PREDICTIONS, prediction accuracy 0.629242
In [50]:
import geoplotlib as gpl
def get_spaced_colors(n):
    """Return ``n`` RGBA colours evenly spaced over the range 0 .. 255**3.

    Each colour is a list ``[r, g, b, 255]`` obtained by splitting an integer
    into its three bytes.

    Args:
        n: Number of colours wanted (an integer).

    Returns:
        A list of exactly ``n`` colours, or an empty list when ``n <= 0``.
    """
    max_value = 16581375  # 255**3
    if n <= 0:
        return []
    # Floor division keeps the step an integer whether or not true division is
    # active; the step is at least 1 so range() always gets a valid value.
    interval = max(1, max_value // n)
    # range() yields one value too many whenever max_value is not an exact
    # multiple of the step (e.g. 201 values for n=200), so cap the result at n.
    values = range(0, max_value, interval)[:n]
    # Split each 24-bit value into bytes directly instead of going through a
    # zero-padded hex string.
    return [[(v >> 16) & 0xFF, (v >> 8) & 0xFF, v & 0xFF, 255] for v in values]
# One colour per cluster id. The count is derived from the ids stored in
# mdata['KMEANS'] (ids start at 0) so it follows the clustering instead of a
# hard-coded 200 that has to be kept in sync by hand.
colormap = get_spaced_colors(int(mdata['KMEANS'].max()) + 1)
def coords(k):
    """Return ``(lat, lon)`` value arrays for the rows of ``mdata`` whose KMEANS equals ``k``."""
    in_cluster = mdata[mdata['KMEANS'] == k]
    return in_cluster.LATITUDE.values, in_cluster.LONGITUDE.values
# Draw one dot layer per cluster, each in its own colour. groupby makes a
# single pass over the frame and yields the cluster ids in ascending order, so
# no cluster count has to be hard-coded and the frame is not rescanned once
# per cluster.
for cluster_id, members in mdata.groupby('KMEANS'):
    latlong = {'lon': members.LONGITUDE.values, 'lat': members.LATITUDE.values}
    gpl.dot(latlong, color=colormap[int(cluster_id)])
gpl.inline()
In [ ]:
# cross_val_score lives in sklearn.model_selection; the old
# sklearn.cross_validation module is only a fallback for installations that
# predate it.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# 5-fold cross-validation of the classifier on the full encoded data set.
scores = cross_val_score(clf, mdata[features].values, mdata[target_label_encoded].values, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
In [ ]:
# Same cross-validation summary with three decimals: mean score and twice the
# standard deviation across folds.
mean_accuracy = scores.mean()
spread = scores.std() * 2
print("Accuracy: %0.3f (+/- %0.3f)" % (mean_accuracy, spread))
In [ ]:
from sklearn import tree
# A text buffer from the standard library replaces sklearn.externals.six,
# which newer scikit-learn releases no longer ship.
try:
    from StringIO import StringIO  # Python 2
except ImportError:
    from io import StringIO  # Python 3
import pydot
from IPython.display import Image

# export_graphviz draws a single decision tree. When clf is an ensemble (it
# exposes estimators_), export its first tree; otherwise export clf itself.
tree_to_plot = clf.estimators_[0] if hasattr(clf, 'estimators_') else clf

dot_data = StringIO()
tree.export_graphviz(tree_to_plot, out_file=dot_data, feature_names=features,
                     filled=True, rounded=True,
                     special_characters=True)

# Depending on the pydot version, graph_from_dot_data returns either one graph
# or a list of graphs; normalise to a single graph object.
graph = pydot.graph_from_dot_data(dot_data.getvalue())
if isinstance(graph, (list, tuple)):
    graph = graph[0]
# Image(graph.create_png())
In [ ]:
# The rendered PDF is binary data, so the file is opened in binary mode; text
# mode fails on Python 3 and can corrupt the file through newline translation.
with open('datasets/tree.pdf', 'wb') as f:
    f.write(graph.create_pdf())
In [ ]:
# Compare the number of rows without a temperature reading to the total.
# print() is called as a function (as the cross-validation cells already do)
# so the cell runs on both Python 2 and Python 3.
nan = colls[pd.isnull(colls.TemperatureC)]
print(len(nan))
print(len(colls))
In [ ]:
# First 1000 collisions recorded for one date, ordered by time of day.
# TIME is text, so sorting it directly is lexicographic ('10:00' sorts before
# '9:30' when hours are not zero-padded); order by numeric minute-of-day
# instead. The stable sort keeps the original order of rows with equal times.
day_colls = colls[colls['DATE'] == "03/08/2015"].head(1000)
time_parts = day_colls['TIME'].str.split(':')
minute_of_day = (pd.to_numeric(time_parts.str.get(0), errors='coerce') * 60
                 + pd.to_numeric(time_parts.str.get(1), errors='coerce'))
day_colls.iloc[np.argsort(minute_of_day.values, kind='mergesort')]
In [ ]: