In [7]:
from __future__ import division

import itertools

import numpy as np
import pandas as pd
from sklearn import cluster
from sklearn.tree import tree, DecisionTreeClassifier, export_graphviz
import geoplotlib as gpl

%matplotlib inline
pd.set_option("display.max_columns", 500)
pd.set_option("max_rows", 1000)

In [61]:
filePath = 'datasets/NYPD_Motor_Vehicle_Collisions_weather4.csv'
collisions = pd.read_csv(filePath)
collisions['YEAR'] = collisions.DATE.str.split('/').str.get(2)
collisions['MONTH'] = collisions.DATE.str.split('/').str.get(0)
collisions['HOUR'] = collisions.TIME.str.split(':').str.get(0)

In [79]:
from sklearn.ensemble import RandomForestClassifier
def encode_column(df, target_column):
    """Return a copy of df with an integer-encoded version of target_column.

    A new column named "<target_column>_encoded" is added, holding the integer
    code assigned to each distinct value (codes follow order of first
    appearance in the column).

    Returns a tuple (encoded_df, targets) where targets is a Series whose
    positional index is the integer code and whose values are the originals,
    usable as a reverse lookup.
    """
    encoded = df.copy()
    uniques = pd.Series(encoded[target_column].unique())
    code_for = {value: code for code, value in enumerate(uniques)}
    encoded[target_column + "_encoded"] = encoded[target_column].replace(code_for)
    return (encoded, uniques)

def train_tree(prediction, features, dataset):
    """Fit a 50-tree random forest on dataset.

    prediction: name of the target column in dataset.
    features: list of feature column names.
    dataset: DataFrame containing both the feature and target columns.
    Returns the fitted RandomForestClassifier.
    """
    clf = RandomForestClassifier(n_estimators=50, n_jobs=4)
    # print(...) with a single argument behaves the same under Python 2 and 3
    # (the original used a py2-only print statement).
    print("TRAINING WITH %d SAMPLES" % len(dataset))
    X = np.array(dataset[features])
    # The original flattened a one-column frame via itertools.chain; selecting
    # the column directly yields the same 1-D array of targets.
    Y = np.array(dataset[prediction])
    return clf.fit(X, Y)

# Test forest
def test_tree(clf, test_data, features):
    return clf.predict(test_data[features])

# Tests test_data on the forest classifier clf with features and target_label.
# encoded_map is a lookup table of the encoded values (numeric) to actual string values
def test_prediction(target_label, clf, test_data, features, encoded_map):
    corrects = 0
    predictions = test_tree(clf, test_data[features], features)
    for i in range(0, len(predictions)):
        if predictions[i] == test_data.iloc[i][target_label]:
            corrects += 1
    print "FOUND %d CORRECT PREDICTIONS" % corrects
    # Return model accuracy [%]
    return corrects / len(predictions)

def convert_encoded_district_to_str(preditions):
    """Translate encoded district codes back to their string names.

    Relies on a module-level `districts` lookup (defined elsewhere — not
    visible in this file). Parameter name typo kept for caller compatibility.
    """
    return [districts[code] for code in preditions]

def kmean(k, dataset, colums):
    """Cluster dataset[colums] into k groups with KMeans.

    k: number of clusters.
    dataset: DataFrame holding the clustering features.
    colums: list of column names to cluster on (typo name kept for callers).
    Returns (labels, centers): the per-row cluster assignments and the fitted
    cluster centers.
    """
    feature_frame = dataset[colums]
    model = cluster.KMeans(n_clusters=k).fit(feature_frame)
    return model.predict(feature_frame), model.cluster_centers_

def get_spaced_colors(n):
    """Return exactly n RGBA colors evenly spaced through the 24-bit RGB cube.

    n: number of colors wanted (positive int).
    Returns a list of [r, g, b, 255] lists with components in 0-255; the first
    color is always black.
    """
    max_value = 255 ** 3  # number of 24-bit RGB codes (16581375)
    interval = int(max_value / n)
    # range() yields one extra step whenever interval * n < max_value, so the
    # original could return n + 1 colors; slice to exactly n entries.
    hex_codes = [hex(value)[2:].zfill(6) for value in range(0, max_value, interval)][:n]
    return [[int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16), 255] for h in hex_codes]

def coords(k, df=None):
    """Return (latitudes, longitudes) of the rows assigned to k-means cluster k.

    k: cluster label to select.
    df: DataFrame with KMEANS, LATITUDE and LONGITUDE columns; defaults to the
        notebook's module-level `data` frame for backward compatibility.
    Returns a pair of numpy arrays (lat, lon).
    """
    if df is None:
        df = data  # falls back to the notebook-global frame, as before
    # NOTE(review): later cells create KMEANS_TEMP / KMEANS_LOC columns, not
    # KMEANS — confirm this helper is still called with a matching frame.
    in_cluster = df[df['KMEANS'] == k]
    return in_cluster.LATITUDE.values, in_cluster.LONGITUDE.values

In [146]:
# KMeans
# Build a boolean mask over `collisions`: geolocated 2015 collisions with a
# known temperature, recorded during snowy conditions.
# NOTE(review): the bare (collisions['CONTRIBUTING FACTOR VEHICLE 1']) term is
# an object-dtype Series used inside a boolean `&` — presumably
# pd.notnull(...) was intended; confirm before relying on this filter.
mask = ((pd.notnull(collisions.LOCATION)) &\
        (collisions.YEAR == str(2015)) &\
        (pd.notnull(collisions.TemperatureC)) &\
        (collisions['CONTRIBUTING FACTOR VEHICLE 1']) &\
        (collisions['Conditions'].str.contains('Snow')))
data = collisions.loc[mask]
print "Data size: %s" % len(data.index)

# Integer-encode the categorical columns used downstream; encode_column adds
# a "<col>_encoded" column to a copy of the frame.
data, _ = encode_column(data, 'BOROUGH')
data, _ = encode_column(data, 'Conditions')
data, _ = encode_column(data, 'CONTRIBUTING FACTOR VEHICLE 1')
data, _ = encode_column(data, 'VEHICLE TYPE CODE 1')

data.TemperatureC = data.TemperatureC.astype('float64')

# Number of clusters and one display color per cluster.
k = 30
colormap = get_spaced_colors(k)

# kmeans, centoid = kmean(k, data.loc[mask], ['Conditions_encoded'])
# data['KMEANS_CON'] = kmeans

# NOTE(review): `data` already equals collisions.loc[mask]; re-indexing it
# with the collisions-indexed mask relies on label alignment and raises an
# IndexingError in newer pandas — plain `data` was presumably intended.
kmeans, centoid = kmean(k, data.loc[mask], ['TemperatureC'])
data['KMEANS_TEMP'] = kmeans

kmeans, centoid = kmean(k, data.loc[mask], ['LATITUDE', 'LONGITUDE'])
data['KMEANS_LOC'] = kmeans

# for i in range(0, k):
#     lat, lon = coords(i)

#     locs = {'lon': lon, 'lat': lat}
#     gpl.dot(locs, color=colormap[i])

# gpl.inline()


Data size: 4414

In [147]:
# Peek at the first rows of the filtered, encoded frame (wide table below).
data.head()

Out[147]:
Unnamed: 0 DATE TIME BOROUGH ZIP CODE LATITUDE LONGITUDE LOCATION ON STREET NAME CROSS STREET NAME OFF STREET NAME NUMBER OF PERSONS INJURED NUMBER OF PERSONS KILLED NUMBER OF PEDESTRIANS INJURED NUMBER OF PEDESTRIANS KILLED NUMBER OF CYCLIST INJURED NUMBER OF CYCLIST KILLED NUMBER OF MOTORIST INJURED NUMBER OF MOTORIST KILLED CONTRIBUTING FACTOR VEHICLE 1 CONTRIBUTING FACTOR VEHICLE 2 CONTRIBUTING FACTOR VEHICLE 3 CONTRIBUTING FACTOR VEHICLE 4 CONTRIBUTING FACTOR VEHICLE 5 UNIQUE KEY VEHICLE TYPE CODE 1 VEHICLE TYPE CODE 2 VEHICLE TYPE CODE 3 VEHICLE TYPE CODE 4 VEHICLE TYPE CODE 5 Conditions Precipitationmm TemperatureC VisibilityKm YEAR MONTH HOUR BOROUGH_encoded Conditions_encoded CONTRIBUTING FACTOR VEHICLE 1_encoded VEHICLE TYPE CODE 1_encoded KMEANS_TEMP KMEANS_LOC
42403 42403 12/29/2015 0:04 BRONX 10458 40.861944 -73.893727 (40.8619443, -73.8937266) EAST FORDHAM ROAD ELM PLACE NaN 0 0 0 0 0 0 0 0 Unspecified Unspecified NaN NaN NaN 3361861 PASSENGER VEHICLE PASSENGER VEHICLE NaN NaN NaN Light Snow 0.01 1 9.7 2015 12 0 0 0 0 0 7 17
42404 42404 12/29/2015 0:07 BROOKLYN 11217 40.683456 -73.975616 (40.6834565, -73.9756156) NaN NaN 625 ATLANTIC AVENUE 0 0 0 0 0 0 0 0 Unspecified Unspecified NaN NaN NaN 3362576 PASSENGER VEHICLE SPORT UTILITY / STATION WAGON NaN NaN NaN Light Snow 0.01 1 9.7 2015 12 0 1 0 0 0 7 21
42405 42405 12/29/2015 0:15 MANHATTAN 10004 40.706237 -74.014337 (40.706237, -74.014337) MORRIS STREET GREENWICH STREET NaN 0 0 0 0 0 0 0 0 Other Vehicular Other Vehicular NaN NaN NaN 3361663 SPORT UTILITY / STATION WAGON UNKNOWN NaN NaN NaN Light Snow 0.01 1 9.7 2015 12 0 2 0 1 1 7 25
42406 42406 12/29/2015 0:17 MANHATTAN 10019 40.762859 -73.989401 (40.762859, -73.9894015) 9 AVENUE WEST 49 STREET NaN 1 0 1 0 0 0 0 0 Failure to Yield Right-of-Way NaN NaN NaN NaN 3361733 SPORT UTILITY / STATION WAGON NaN NaN NaN NaN Light Snow 0.01 1 9.7 2015 12 0 2 0 2 1 7 10
42407 42407 12/29/2015 0:30 MANHATTAN 10014 40.738757 -74.008105 (40.7387575, -74.0081048) HORATIO STREET WASHINGTON STREET NaN 1 0 1 0 0 0 0 0 Driver Inattention/Distraction NaN NaN NaN NaN 3361675 TAXI NaN NaN NaN NaN Light Snow 0.01 1 9.7 2015 12 0 2 0 3 2 7 25

In [151]:
# Try to predict the collision HOUR from the weather conditions alone.
target_label = 'HOUR'
# NOTE(review): the '_encoded' suffix is commented out, so the model trains on
# the raw HOUR strings rather than the encoded column — confirm intended.
target_label_encoded = target_label#+'_encoded'
data = data[pd.notnull(data[target_label])]
data, target = encode_column(data, target_label)

# Features for prediction
features = ['Conditions_encoded']

# Split data set into training and test data
# NOTE(review): sequential head/tail split, no shuffling — rows are in date
# order, so train and test come from different time spans; rounding can also
# drop a row (80% + 20% of the count may fall one short of the full frame).
train_data = data.head(int(data.BOROUGH.count() * 0.80))
test_data = data.tail(int(data.BOROUGH.count() * 0.20))

# Train the decision tree
clf = train_tree(target_label_encoded, features, train_data)

# Test the decision tree
print "Prediction accuracy %f" % test_prediction(target_label_encoded, clf, test_data, features, target)


TRAINING WITH 3254 SAMPLES
FOUND 31 CORRECT PREDICTIONS
Prediction accuracy 0.038130

In [105]:
# 5-fold cross-validation of the forest over the full dataset.
# NOTE(review): sklearn.cross_validation was deprecated in 0.18 and removed in
# 0.20; modern code imports cross_val_score from sklearn.model_selection.
from sklearn.cross_validation import cross_val_score
scores = cross_val_score(clf, data[features].values, data[target_label_encoded].values, cv=5)
print("Accuracy: %0.6f (+/- %0.6f)" % (scores.mean(), scores.std() * 2))


Accuracy: 0.241691 (+/- 0.012609)

In [ ]: