In [1]:
import numpy as np
import pandas as pd
import itertools
from __future__ import division
from sklearn.tree import tree, DecisionTreeClassifier, export_graphviz
from sklearn import cluster

%matplotlib inline
pd.set_option("display.max_columns", 500)
pd.set_option("max_rows", 1000)

In [2]:
filePath = 'datasets/NYPD_Motor_Vehicle_Collisions_weather4.csv'
colls = pd.read_csv(filePath)
# colls = colls[(colls['VEHICLE TYPE CODE 1'] == 'TAXI') | \
#              (colls['VEHICLE TYPE CODE 2'] == 'TAXI') | \
#              (colls['VEHICLE TYPE CODE 3'] == 'TAXI') | \
#              (colls['VEHICLE TYPE CODE 4'] == 'TAXI') | \
#              (colls['VEHICLE TYPE CODE 5'] == 'TAXI')]

# colls = colls[colls['CONTRIBUTING FACTOR VEHICLE 1'] != 'Unspecified']

print len(colls.index)

colls = colls[pd.notnull(colls['LOCATION'])]

colls['HOUR'] = colls.TIME.str.split(':').str.get(0)
colls['MONTH'] = colls.DATE.str.split('/').str.get(0)

colls = colls[pd.notnull(colls['HOUR'])]

colls.head()


769054
/Users/masve/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2902: DtypeWarning: Columns (30,31) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
Out[2]:
Unnamed: 0 DATE TIME BOROUGH ZIP CODE LATITUDE LONGITUDE LOCATION ON STREET NAME CROSS STREET NAME OFF STREET NAME NUMBER OF PERSONS INJURED NUMBER OF PERSONS KILLED NUMBER OF PEDESTRIANS INJURED NUMBER OF PEDESTRIANS KILLED NUMBER OF CYCLIST INJURED NUMBER OF CYCLIST KILLED NUMBER OF MOTORIST INJURED NUMBER OF MOTORIST KILLED CONTRIBUTING FACTOR VEHICLE 1 CONTRIBUTING FACTOR VEHICLE 2 CONTRIBUTING FACTOR VEHICLE 3 CONTRIBUTING FACTOR VEHICLE 4 CONTRIBUTING FACTOR VEHICLE 5 UNIQUE KEY VEHICLE TYPE CODE 1 VEHICLE TYPE CODE 2 VEHICLE TYPE CODE 3 VEHICLE TYPE CODE 4 VEHICLE TYPE CODE 5 Conditions Precipitationmm TemperatureC VisibilityKm HOUR MONTH
0 0 03/14/2016 3:27 QUEENS 11372 40.747734 -73.882999 (40.7477341, -73.8829986) ROOSEVELT AVENUE 83 STREET NaN 1 0 1 0 0 0 0 0 Unspecified NaN NaN NaN NaN 3405169 OTHER NaN NaN NaN NaN NaN NaN NaN NaN 3 03
3 3 03/14/2016 0:45 MANHATTAN 10035 40.808279 -73.938793 (40.8082795, -73.9387929) EAST 129 STREET MADISON AVENUE NaN 0 0 0 0 0 0 0 0 Unspecified Unspecified NaN NaN NaN 3405059 PASSENGER VEHICLE PASSENGER VEHICLE NaN NaN NaN NaN NaN NaN NaN 0 03
4 4 03/13/2016 23:00 BROOKLYN 11206 40.706653 -73.950406 (40.7066527, -73.9504063) UNION AVENUE MONTROSE AVENUE NaN 0 0 0 0 0 0 0 0 Driver Inattention/Distraction Unspecified NaN NaN NaN 3405121 PASSENGER VEHICLE PASSENGER VEHICLE NaN NaN NaN NaN NaN NaN NaN 23 03
6 6 03/13/2016 9:48 BROOKLYN 11212 40.661997 -73.919593 (40.661997, -73.9195931) KINGS HIGHWAY EAST 98 STREET NaN 0 0 0 0 0 0 0 0 Passenger Distraction Unspecified NaN NaN NaN 3404744 PASSENGER VEHICLE UNKNOWN NaN NaN NaN NaN NaN NaN NaN 9 03
9 9 03/13/2016 9:46 QUEENS 11106 40.756580 -73.929752 (40.75658, -73.9297516) 36 AVENUE 31 STREET NaN 0 0 0 0 0 0 0 0 Failure to Yield Right-of-Way Unspecified NaN NaN NaN 3404995 LIVERY VEHICLE PASSENGER VEHICLE NaN NaN NaN NaN NaN NaN NaN 9 03

In [3]:
from sklearn.ensemble import RandomForestClassifier
def encode_column(df, target_column):
#     print "copying dataset"
    df_mod = df.copy()
#     print "finding uniques"
    targets = pd.Series(df_mod[target_column].unique())
#     print "mapping to ints"
    map_to_int = {name: n for n, name in enumerate(targets)}
    df_mod[target_column+"_encoded"] = df_mod[target_column].replace(map_to_int)
    return (df_mod, targets)

def train_tree(prediction, features, dataset):
    clf = RandomForestClassifier(n_estimators=50, n_jobs=4)
    print "TRAINING WITH %d SAMPLES" % len(dataset) 
    X = np.array(dataset[features])
    Y = np.array(list(itertools.chain(*dataset[[prediction]].values)))
    return clf.fit(X, Y)

# Test forest
def test_tree(clf, test_data, features):
    return clf.predict(test_data[features])

# Tests test_data on the forest classifier clf with features and target_label.
# encoded_map is a lookup table of the encoded values (numeric) to actual string values
def test_prediction(target_label, clf, test_data, features, encoded_map):
    corrects = 0
    predictions = test_tree(clf, test_data[features], features)
    for i in range(0, len(predictions)):
        if predictions[i] == test_data.iloc[i][target_label]:
            corrects += 1
    print "FOUND %d CORRECT PREDICTIONS" % corrects
    # Return model accuracy [%]
    return corrects / len(predictions)

def convert_encoded_district_to_str(preditions):
    return map(lambda p: districts[p], preditions)

def kmeans(k, dataset, colums):
    md = cluster.KMeans(n_clusters=k).fit(dataset[colums])
    return md.predict(dataset[colums]),md.cluster_centers_

In [47]:
target_label = 'ZIP CODE'

data = colls[pd.notnull(colls[target_label])]

# Encoding target/label column
# mdata, target = encode_column(data, 'ZIP CODE')
mdata, target = encode_column(data, target_label)

# Encode the feature columns
# mdata, _ = encode_column(mdata, 'BOROUGH')
# mdata, _ = encode_column(mdata, 'CONTRIBUTING FACTOR VEHICLE 1')
# mdata, _ = encode_column(mdata, 'VEHICLE TYPE CODE 1')
mdata, _ = encode_column(mdata, 'Conditions')

# KNN
s, cen = kmeans(200, mdata, ['LONGITUDE', 'LATITUDE'])
mdata['KMEANS'] = s


# Splitting date and time into month and hour
# mdata['HOUR'] = data.TIME.str.split(':').str.get(0)
# mdata['MONTH'] = data.DATE.str.split('/').str.get(1)

In [48]:
# Features for prediction
features = ['KMEANS','Conditions_encoded','HOUR']

# Split data set into training and test data
train_data = mdata.head(int(mdata.BOROUGH.count() * 0.80))
test_data = mdata.tail(int(mdata.BOROUGH.count() * 0.20))

target_label_encoded = target_label+'_encoded'

# Train the decision tree
clf = train_tree(target_label_encoded, features, train_data)

# Test the decision tree
print "Prediction accuracy %f" % test_prediction(target_label_encoded, clf, test_data, features, target)


TRAINING WITH 467746 SAMPLES
FOUND 73581 CORRECT PREDICTIONS
Prediction accuracy 0.629242

In [49]:
mdata[['LATITUDE', 'LONGITUDE', 'KMEANS']].to_csv('datasets/clusters/c200.csv', sep=',')
mdata[['LATITUDE', 'LONGITUDE', 'KMEANS']].head(20000).to_csv('datasets/clusters/c200_small_20000.csv', sep=',')

2cluster = TRAINING WITH 467746 SAMPLES FOUND 3053 CORRECT PREDICTIONS Prediction accuracy 0.026108

5cluster = TRAINING WITH 467746 SAMPLES FOUND 6664 CORRECT PREDICTIONS Prediction accuracy 0.056988

10cluster = TRAINING WITH 467746 SAMPLES FOUND 12126 CORRECT PREDICTIONS Prediction accuracy 0.103698

20cluster = TRAINING WITH 467746 SAMPLES FOUND 21044 CORRECT PREDICTIONS Prediction accuracy 0.179962

30cluster = TRAINING WITH 467746 SAMPLES FOUND 28535 CORRECT PREDICTIONS Prediction accuracy 0.244022

50cluster = TRAINING WITH 467746 SAMPLES FOUND 41761 CORRECT PREDICTIONS Prediction accuracy 0.357127

100cluster = TRAINING WITH 467746 SAMPLES FOUND 60735 CORRECT PREDICTIONS Prediction accuracy 0.519387

200cluster = TRAINING WITH 467746 SAMPLES FOUND 73581 CORRECT PREDICTIONS Prediction accuracy 0.629242


In [50]:
import geoplotlib as gpl

def get_spaced_colors(n):
    max_value = 16581375 #255**3
    interval = int(max_value / n)
    colors = [hex(I)[2:].zfill(6) for I in range(0, max_value, interval)]
    
    return [[int(i[:2], 16), int(i[2:4], 16), int(i[4:], 16), 255] for i in colors]

colormap = get_spaced_colors(200)

def coords(k):
    lat = mdata[mdata['KMEANS'] == k].LATITUDE.values
    lon = mdata[mdata['KMEANS'] == k].LONGITUDE.values
    
    return lat,lon
    
for i in range(0, 200):
    lat, lon = coords(i)

    latlong = {'lon': lon, 'lat': lat}
    gpl.dot(latlong, color=colormap[i])

gpl.inline()