In [7]:
import numpy as np
import pandas as pd
import itertools
from __future__ import division
from sklearn.tree import tree, DecisionTreeClassifier, export_graphviz
from sklearn import cluster
import geoplotlib as gpl
%matplotlib inline
pd.set_option("display.max_columns", 500)
pd.set_option("max_rows", 1000)
In [61]:
# Load the collisions+weather dataset and derive date/time breakdown columns.
filePath = 'datasets/NYPD_Motor_Vehicle_Collisions_weather4.csv'
collisions = pd.read_csv(filePath)
# DATE appears to be MM/DD/YYYY (month at index 0, year at index 2);
# split once and reuse — the original split the column twice.
date_parts = collisions.DATE.str.split('/')
collisions['YEAR'] = date_parts.str.get(2)
collisions['MONTH'] = date_parts.str.get(0)
# TIME is HH:MM; keep the hour as a string (later cells encode it anyway).
collisions['HOUR'] = collisions.TIME.str.split(':').str.get(0)
In [79]:
from sklearn.ensemble import RandomForestClassifier
def encode_column(df, target_column):
    """Return a copy of df with target_column integer-encoded.

    Each unique value (in order of first appearance) is assigned its
    enumeration index and written to a new '<target_column>_encoded'
    column.  Returns (encoded_copy, Series of unique values) so callers
    can decode predictions back to the original labels by position.
    """
    df_mod = df.copy()  # never mutate the caller's frame
    targets = pd.Series(df_mod[target_column].unique())
    map_to_int = {name: n for n, name in enumerate(targets)}
    # .map covers every value by construction (the dict was built from this
    # column's uniques); it avoids Series.replace's partial/nested-replacement
    # semantics and is the idiomatic dict-lookup encoding.
    df_mod[target_column + "_encoded"] = df_mod[target_column].map(map_to_int)
    return (df_mod, targets)
def train_tree(prediction, features, dataset):
    """Fit a 50-tree random forest predicting `prediction` from `features`.

    prediction: name of the target column in dataset.
    features:   list of feature column names.
    dataset:    pandas DataFrame holding both feature and target columns.
    Returns the fitted RandomForestClassifier.
    """
    clf = RandomForestClassifier(n_estimators=50, n_jobs=4)
    # Parenthesized print works under both Python 2 and 3 (the bare
    # print-statement form is Python-2 only).
    print("TRAINING WITH %d SAMPLES" % len(dataset))
    X = np.array(dataset[features])
    # Equivalent to the old itertools.chain flattening of the (n, 1)
    # dataset[[prediction]].values array: a 1-D vector of labels.
    Y = np.asarray(dataset[prediction])
    return clf.fit(X, Y)
# Test forest
def test_tree(clf, test_data, features):
return clf.predict(test_data[features])
# Tests test_data on the forest classifier clf with features and target_label.
# encoded_map is a lookup table of the encoded values (numeric) to actual string values
def test_prediction(target_label, clf, test_data, features, encoded_map):
corrects = 0
predictions = test_tree(clf, test_data[features], features)
for i in range(0, len(predictions)):
if predictions[i] == test_data.iloc[i][target_label]:
corrects += 1
print "FOUND %d CORRECT PREDICTIONS" % corrects
# Return model accuracy [%]
return corrects / len(predictions)
def convert_encoded_district_to_str(preditions):
    # Map encoded district ids back to their string names via the
    # module-level `districts` lookup (defined elsewhere in the notebook —
    # not visible in this chunk; TODO confirm it exists before calling).
    # NOTE(review): under Python 2 `map` returns a list; under Python 3 it
    # returns a lazy iterator — callers expecting a list should wrap it.
    # (Parameter keeps the original "preditions" spelling for compatibility.)
    return map(lambda p: districts[p], preditions)
def kmean(k, dataset, colums):
    """Cluster the selected columns of dataset into k groups with KMeans.

    Returns (per-row cluster labels, array of cluster centers).
    """
    points = dataset[colums]
    model = cluster.KMeans(n_clusters=k).fit(points)
    return model.predict(points), model.cluster_centers_
def get_spaced_colors(n):
    """Return exactly n evenly spaced RGBA colors as [R, G, B, 255] lists.

    The 24-bit color range [0, 255**3) is divided into n equal integer
    steps and each step value is split into its R, G, B bytes.
    (The old version iterated range(0, max_value, interval), which could
    yield n+1 colors because integer truncation of the interval leaves a
    remainder — e.g. n=2 produced 3 colors.)
    """
    max_value = 255 ** 3
    interval = max_value // n  # floor division: valid in Python 2 and 3
    hex_codes = [format(i * interval, '06x') for i in range(n)]
    return [[int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16), 255]
            for h in hex_codes]
def coords(k):
    """Latitude/longitude value arrays for the rows assigned to cluster k.

    Reads the module-level `data` frame and its 'KMEANS' column.
    NOTE(review): the cells above write 'KMEANS_TEMP'/'KMEANS_LOC', not
    'KMEANS' — confirm which assignment column is meant before use.
    """
    in_cluster = data[data['KMEANS'] == k]
    return in_cluster.LATITUDE.values, in_cluster.LONGITUDE.values
In [146]:
# KMeans
# Select 2015 collisions that have a location, a temperature reading and
# snowy conditions.
# NOTE(review): the 4th mask term uses the raw 'CONTRIBUTING FACTOR
# VEHICLE 1' string column directly in a boolean `&` chain; the intent
# looks like pd.notnull(...) — on modern pandas combining `&` with an
# object/str Series raises TypeError. TODO confirm against the pandas
# version this notebook ran with.
mask = ((pd.notnull(collisions.LOCATION)) &\
(collisions.YEAR == str(2015)) &\
(pd.notnull(collisions.TemperatureC)) &\
(collisions['CONTRIBUTING FACTOR VEHICLE 1']) &\
(collisions['Conditions'].str.contains('Snow')))
data = collisions.loc[mask]
print "Data size: %s" % len(data.index)
# Integer-encode the categorical columns used by the clustering/modelling
# cells below (adds *_encoded columns; originals are kept).
data, _ = encode_column(data, 'BOROUGH')
data, _ = encode_column(data, 'Conditions')
data, _ = encode_column(data, 'CONTRIBUTING FACTOR VEHICLE 1')
data, _ = encode_column(data, 'VEHICLE TYPE CODE 1')
data.TemperatureC = data.TemperatureC.astype('float64')
k = 30
colormap = get_spaced_colors(k)
# kmeans, centoid = kmean(k, data.loc[mask], ['Conditions_encoded'])
# data['KMEANS_CON'] = kmeans
# NOTE(review): `data` is already collisions.loc[mask], so data.loc[mask]
# re-applies a boolean Series indexed by the full collisions frame; this
# relied on old-pandas index alignment and raises ("unalignable boolean
# Series") on current pandas — plain `data` appears to be the intent.
# TODO confirm.
kmeans, centoid = kmean(k, data.loc[mask], ['TemperatureC'])
data['KMEANS_TEMP'] = kmeans
kmeans, centoid = kmean(k, data.loc[mask], ['LATITUDE', 'LONGITUDE'])
data['KMEANS_LOC'] = kmeans
# Plotting (disabled): draw each cluster's points in its own color via
# geoplotlib.
# for i in range(0, k):
# lat, lon = coords(i)
# locs = {'lon': lon, 'lat': lat}
# gpl.dot(locs, color=colormap[i])
# gpl.inline()
In [147]:
data.head()
Out[147]:
In [151]:
# Predict the hour of day of snowy-2015 collisions from the encoded
# weather conditions.
target_label = 'HOUR'
target_label_encoded = target_label  # '_encoded' suffix intentionally disabled
data = data[pd.notnull(data[target_label])]
data, target = encode_column(data, target_label)
# Features for prediction
features = ['Conditions_encoded']
# Split the data set into 80% training / 20% test rows.
# (Was sized by data.BOROUGH.count(), which counts only non-null BOROUGH
# values and could leave a gap of rows belonging to neither split; use the
# true row count and make the two slices partition the frame exactly.)
n_rows = len(data.index)
split = int(n_rows * 0.80)
train_data = data.head(split)
test_data = data.tail(n_rows - split)
# Train the random forest
clf = train_tree(target_label_encoded, features, train_data)
# Test it (test_prediction returns a fraction in [0, 1])
print("Prediction accuracy %f" % test_prediction(target_label_encoded, clf, test_data, features, target))
In [105]:
# Cross-validate the fitted forest on the full feature/label arrays.
# sklearn moved cross_val_score from cross_validation to model_selection
# in 0.18 and removed the old module in 0.20; support both.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:  # legacy sklearn (< 0.18)
    from sklearn.cross_validation import cross_val_score
scores = cross_val_score(clf, data[features].values, data[target_label_encoded].values, cv=5)
print("Accuracy: %0.6f (+/- %0.6f)" % (scores.mean(), scores.std() * 2))
In [ ]: