In [7]:
from __future__ import division

import itertools

import numpy as np
import pandas as pd
from sklearn import cluster
from sklearn.tree import tree, DecisionTreeClassifier, export_graphviz
import geoplotlib as gpl

%matplotlib inline
pd.set_option("display.max_columns", 500)
pd.set_option("max_rows", 1000)

In [61]:
filePath = 'datasets/NYPD_Motor_Vehicle_Collisions_weather4.csv'
collisions = pd.read_csv(filePath)
collisions['YEAR'] = collisions.DATE.str.split('/').str.get(2)
collisions['MONTH'] = collisions.DATE.str.split('/').str.get(0)
collisions['HOUR'] = collisions.TIME.str.split(':').str.get(0)

In [79]:
from sklearn.ensemble import RandomForestClassifier
def encode_column(df, target_column):
    """Return a copy of df with an integer-encoded version of target_column.

    A new column named "<target_column>_encoded" is added, holding the integer
    code assigned to each distinct value (codes follow order of first
    appearance in the column).

    Returns a tuple (encoded_df, targets) where targets is a Series whose
    positional index is the integer code and whose values are the originals,
    usable as a reverse lookup.
    """
    encoded = df.copy()
    uniques = pd.Series(encoded[target_column].unique())
    code_for = {value: code for code, value in enumerate(uniques)}
    encoded[target_column + "_encoded"] = encoded[target_column].replace(code_for)
    return (encoded, uniques)

def train_tree(prediction, features, dataset):
    """Fit a 50-tree random forest on dataset.

    prediction: name of the target column in dataset.
    features: list of feature column names.
    dataset: DataFrame containing both the feature and target columns.
    Returns the fitted RandomForestClassifier.
    """
    clf = RandomForestClassifier(n_estimators=50, n_jobs=4)
    # print(...) with a single argument behaves the same under Python 2 and 3
    # (the original used a py2-only print statement).
    print("TRAINING WITH %d SAMPLES" % len(dataset))
    X = np.array(dataset[features])
    # The original flattened a one-column frame via itertools.chain; selecting
    # the column directly yields the same 1-D array of targets.
    Y = np.array(dataset[prediction])
    return clf.fit(X, Y)

# Test forest
def test_tree(clf, test_data, features):
    return clf.predict(test_data[features])

# Tests test_data on the forest classifier clf with features and target_label.
# encoded_map is a lookup table of the encoded values (numeric) to actual string values
def test_prediction(target_label, clf, test_data, features, encoded_map):
    corrects = 0
    predictions = test_tree(clf, test_data[features], features)
    for i in range(0, len(predictions)):
        if predictions[i] == test_data.iloc[i][target_label]:
            corrects += 1
    print "FOUND %d CORRECT PREDICTIONS" % corrects
    # Return model accuracy [%]
    return corrects / len(predictions)

def convert_encoded_district_to_str(preditions):
    """Translate encoded district codes back to their string names.

    Relies on a module-level `districts` lookup (defined elsewhere — not
    visible in this file). Parameter name typo kept for caller compatibility.
    """
    return [districts[code] for code in preditions]

def kmean(k, dataset, colums):
    """Cluster dataset[colums] into k groups with KMeans.

    k: number of clusters.
    dataset: DataFrame holding the clustering features.
    colums: list of column names to cluster on (typo name kept for callers).
    Returns (labels, centers): the per-row cluster assignments and the fitted
    cluster centers.
    """
    feature_frame = dataset[colums]
    model = cluster.KMeans(n_clusters=k).fit(feature_frame)
    return model.predict(feature_frame), model.cluster_centers_

def get_spaced_colors(n):
    """Return exactly n RGBA colors evenly spaced through the 24-bit RGB cube.

    n: number of colors wanted (positive int).
    Returns a list of [r, g, b, 255] lists with components in 0-255; the first
    color is always black.
    """
    max_value = 255 ** 3  # number of 24-bit RGB codes (16581375)
    interval = int(max_value / n)
    # range() yields one extra step whenever interval * n < max_value, so the
    # original could return n + 1 colors; slice to exactly n entries.
    hex_codes = [hex(value)[2:].zfill(6) for value in range(0, max_value, interval)][:n]
    return [[int(h[:2], 16), int(h[2:4], 16), int(h[4:], 16), 255] for h in hex_codes]

def coords(k, df=None):
    """Return (latitudes, longitudes) of the rows assigned to k-means cluster k.

    k: cluster label to select.
    df: DataFrame with KMEANS, LATITUDE and LONGITUDE columns; defaults to the
        notebook's module-level `data` frame for backward compatibility.
    Returns a pair of numpy arrays (lat, lon).
    """
    if df is None:
        df = data  # falls back to the notebook-global frame, as before
    # NOTE(review): later cells create KMEANS_TEMP / KMEANS_LOC columns, not
    # KMEANS — confirm this helper is still called with a matching frame.
    in_cluster = df[df['KMEANS'] == k]
    return in_cluster.LATITUDE.values, in_cluster.LONGITUDE.values

In [146]:
# KMeans
# Build a boolean mask over `collisions`: geolocated 2015 collisions with a
# known temperature, recorded during snowy conditions.
# NOTE(review): the bare (collisions['CONTRIBUTING FACTOR VEHICLE 1']) term is
# an object-dtype Series used inside a boolean `&` — presumably
# pd.notnull(...) was intended; confirm before relying on this filter.
mask = ((pd.notnull(collisions.LOCATION)) &\
        (collisions.YEAR == str(2015)) &\
        (pd.notnull(collisions.TemperatureC)) &\
        (collisions['CONTRIBUTING FACTOR VEHICLE 1']) &\
        (collisions['Conditions'].str.contains('Snow')))
data = collisions.loc[mask]
print "Data size: %s" % len(data.index)

# Integer-encode the categorical columns used downstream; encode_column adds
# a "<col>_encoded" column to a copy of the frame.
data, _ = encode_column(data, 'BOROUGH')
data, _ = encode_column(data, 'Conditions')
data, _ = encode_column(data, 'CONTRIBUTING FACTOR VEHICLE 1')
data, _ = encode_column(data, 'VEHICLE TYPE CODE 1')

data.TemperatureC = data.TemperatureC.astype('float64')

# Number of clusters and one display color per cluster.
k = 30
colormap = get_spaced_colors(k)

# kmeans, centoid = kmean(k, data.loc[mask], ['Conditions_encoded'])
# data['KMEANS_CON'] = kmeans

# NOTE(review): `data` already equals collisions.loc[mask]; re-indexing it
# with the collisions-indexed mask relies on label alignment and raises an
# IndexingError in newer pandas — plain `data` was presumably intended.
kmeans, centoid = kmean(k, data.loc[mask], ['TemperatureC'])
data['KMEANS_TEMP'] = kmeans

kmeans, centoid = kmean(k, data.loc[mask], ['LATITUDE', 'LONGITUDE'])
data['KMEANS_LOC'] = kmeans

# for i in range(0, k):
#     lat, lon = coords(i)

#     locs = {'lon': lon, 'lat': lat}
#     gpl.dot(locs, color=colormap[i])

# gpl.inline()


Data size: 4414

In [147]:
# Peek at the first rows of the filtered, encoded frame (wide table below).
data.head()

Out[147]:
Unnamed: 0 DATE TIME BOROUGH ZIP CODE LATITUDE LONGITUDE LOCATION ON STREET NAME CROSS STREET NAME OFF STREET NAME NUMBER OF PERSONS INJURED NUMBER OF PERSONS KILLED NUMBER OF PEDESTRIANS INJURED NUMBER OF PEDESTRIANS KILLED NUMBER OF CYCLIST INJURED NUMBER OF CYCLIST KILLED NUMBER OF MOTORIST INJURED NUMBER OF MOTORIST KILLED CONTRIBUTING FACTOR VEHICLE 1 CONTRIBUTING FACTOR VEHICLE 2 CONTRIBUTING FACTOR VEHICLE 3 CONTRIBUTING FACTOR VEHICLE 4 CONTRIBUTING FACTOR VEHICLE 5 UNIQUE KEY VEHICLE TYPE CODE 1 VEHICLE TYPE CODE 2 VEHICLE TYPE CODE 3 VEHICLE TYPE CODE 4 VEHICLE TYPE CODE 5 Conditions Precipitationmm TemperatureC VisibilityKm YEAR MONTH HOUR BOROUGH_encoded Conditions_encoded CONTRIBUTING FACTOR VEHICLE 1_encoded VEHICLE TYPE CODE 1_encoded KMEANS_TEMP KMEANS_LOC
42403 42403 12/29/2015 0:04 BRONX 10458 40.861944 -73.893727 (40.8619443, -73.8937266) EAST FORDHAM ROAD ELM PLACE NaN 0 0 0 0 0 0 0 0 Unspecified Unspecified NaN NaN NaN 3361861 PASSENGER VEHICLE PASSENGER VEHICLE NaN NaN NaN Light Snow 0.01 1 9.7 2015 12 0 0 0 0 0 7 17
42404 42404 12/29/2015 0:07 BROOKLYN 11217 40.683456 -73.975616 (40.6834565, -73.9756156) NaN NaN 625 ATLANTIC AVENUE 0 0 0 0 0 0 0 0 Unspecified Unspecified NaN NaN NaN 3362576 PASSENGER VEHICLE SPORT UTILITY / STATION WAGON NaN NaN NaN Light Snow 0.01 1 9.7 2015 12 0 1 0 0 0 7 21
42405 42405 12/29/2015 0:15 MANHATTAN 10004 40.706237 -74.014337 (40.706237, -74.014337) MORRIS STREET GREENWICH STREET NaN 0 0 0 0 0 0 0 0 Other Vehicular Other Vehicular NaN NaN NaN 3361663 SPORT UTILITY / STATION WAGON UNKNOWN NaN NaN NaN Light Snow 0.01 1 9.7 2015 12 0 2 0 1 1 7 25
42406 42406 12/29/2015 0:17 MANHATTAN 10019 40.762859 -73.989401 (40.762859, -73.9894015) 9 AVENUE WEST 49 STREET NaN 1 0 1 0 0 0 0 0 Failure to Yield Right-of-Way NaN NaN NaN NaN 3361733 SPORT UTILITY / STATION WAGON NaN NaN NaN NaN Light Snow 0.01 1 9.7 2015 12 0 2 0 2 1 7 10
42407 42407 12/29/2015 0:30 MANHATTAN 10014 40.738757 -74.008105 (40.7387575, -74.0081048) HORATIO STREET WASHINGTON STREET NaN 1 0 1 0 0 0 0 0 Driver Inattention/Distraction NaN NaN NaN NaN 3361675 TAXI NaN NaN NaN NaN Light Snow 0.01 1 9.7 2015 12 0 2 0 3 2 7 25

In [151]:
# Try to predict the collision HOUR from the weather conditions alone.
target_label = 'HOUR'
# NOTE(review): the '_encoded' suffix is commented out, so the model trains on
# the raw HOUR strings rather than the encoded column — confirm intended.
target_label_encoded = target_label#+'_encoded'
data = data[pd.notnull(data[target_label])]
data, target = encode_column(data, target_label)

# Features for prediction
features = ['Conditions_encoded']

# Split data set into training and test data
# NOTE(review): sequential head/tail split, no shuffling — rows are in date
# order, so train and test come from different time spans; rounding can also
# drop a row (80% + 20% of the count may fall one short of the full frame).
train_data = data.head(int(data.BOROUGH.count() * 0.80))
test_data = data.tail(int(data.BOROUGH.count() * 0.20))

# Train the decision tree
clf = train_tree(target_label_encoded, features, train_data)

# Test the decision tree
print "Prediction accuracy %f" % test_prediction(target_label_encoded, clf, test_data, features, target)


TRAINING WITH 3254 SAMPLES
FOUND 31 CORRECT PREDICTIONS
Prediction accuracy 0.038130

In [105]:
# 5-fold cross-validation of the forest over the full dataset.
# NOTE(review): sklearn.cross_validation was deprecated in 0.18 and removed in
# 0.20; modern code imports cross_val_score from sklearn.model_selection.
from sklearn.cross_validation import cross_val_score
scores = cross_val_score(clf, data[features].values, data[target_label_encoded].values, cv=5)
print("Accuracy: %0.6f (+/- %0.6f)" % (scores.mean(), scores.std() * 2))


Accuracy: 0.241691 (+/- 0.012609)

In [ ]: