notebook.community

Edit and run



In [1]:

    
import numpy as np
import pandas as pd
import itertools
from __future__ import division
from sklearn.tree import tree, DecisionTreeClassifier, export_graphviz
from sklearn import cluster

%matplotlib inline
pd.set_option("display.max_columns", 500)
pd.set_option("max_rows", 1000)



In [2]:

    
filePath = 'datasets/NYPD_Motor_Vehicle_Collisions_weather4.csv'
colls = pd.read_csv(filePath)
# colls = colls[(colls['VEHICLE TYPE CODE 1'] == 'TAXI') | \
#              (colls['VEHICLE TYPE CODE 2'] == 'TAXI') | \
#              (colls['VEHICLE TYPE CODE 3'] == 'TAXI') | \
#              (colls['VEHICLE TYPE CODE 4'] == 'TAXI') | \
#              (colls['VEHICLE TYPE CODE 5'] == 'TAXI')]

# colls = colls[colls['CONTRIBUTING FACTOR VEHICLE 1'] != 'Unspecified']

print len(colls.index)

colls = colls[pd.notnull(colls['LOCATION'])]

colls['HOUR'] = colls.TIME.str.split(':').str.get(0)
colls['MONTH'] = colls.DATE.str.split('/').str.get(0)

colls = colls[pd.notnull(colls['HOUR'])]

colls.head()









    



769054






    



/Users/masve/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2902: DtypeWarning: Columns (30,31) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)






    Out[2]:






  
    
      
      Unnamed: 0
      DATE
      TIME
      BOROUGH
      ZIP CODE
      LATITUDE
      LONGITUDE
      LOCATION
      ON STREET NAME
      CROSS STREET NAME
      OFF STREET NAME
      NUMBER OF PERSONS INJURED
      NUMBER OF PERSONS KILLED
      NUMBER OF PEDESTRIANS INJURED
      NUMBER OF PEDESTRIANS KILLED
      NUMBER OF CYCLIST INJURED
      NUMBER OF CYCLIST KILLED
      NUMBER OF MOTORIST INJURED
      NUMBER OF MOTORIST KILLED
      CONTRIBUTING FACTOR VEHICLE 1
      CONTRIBUTING FACTOR VEHICLE 2
      CONTRIBUTING FACTOR VEHICLE 3
      CONTRIBUTING FACTOR VEHICLE 4
      CONTRIBUTING FACTOR VEHICLE 5
      UNIQUE KEY
      VEHICLE TYPE CODE 1
      VEHICLE TYPE CODE 2
      VEHICLE TYPE CODE 3
      VEHICLE TYPE CODE 4
      VEHICLE TYPE CODE 5
      Conditions
      Precipitationmm
      TemperatureC
      VisibilityKm
      HOUR
      MONTH
    
  
  
    
      0
      0
      03/14/2016
      3:27
      QUEENS
      11372
      40.747734
      -73.882999
      (40.7477341, -73.8829986)
      ROOSEVELT AVENUE
      83 STREET
      NaN
      1
      0
      1
      0
      0
      0
      0
      0
      Unspecified
      NaN
      NaN
      NaN
      NaN
      3405169
      OTHER
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      3
      03
    
    
      3
      3
      03/14/2016
      0:45
      MANHATTAN
      10035
      40.808279
      -73.938793
      (40.8082795, -73.9387929)
      EAST 129 STREET
      MADISON AVENUE
      NaN
      0
      0
      0
      0
      0
      0
      0
      0
      Unspecified
      Unspecified
      NaN
      NaN
      NaN
      3405059
      PASSENGER VEHICLE
      PASSENGER VEHICLE
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      0
      03
    
    
      4
      4
      03/13/2016
      23:00
      BROOKLYN
      11206
      40.706653
      -73.950406
      (40.7066527, -73.9504063)
      UNION AVENUE
      MONTROSE AVENUE
      NaN
      0
      0
      0
      0
      0
      0
      0
      0
      Driver Inattention/Distraction
      Unspecified
      NaN
      NaN
      NaN
      3405121
      PASSENGER VEHICLE
      PASSENGER VEHICLE
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      23
      03
    
    
      6
      6
      03/13/2016
      9:48
      BROOKLYN
      11212
      40.661997
      -73.919593
      (40.661997, -73.9195931)
      KINGS HIGHWAY
      EAST 98 STREET
      NaN
      0
      0
      0
      0
      0
      0
      0
      0
      Passenger Distraction
      Unspecified
      NaN
      NaN
      NaN
      3404744
      PASSENGER VEHICLE
      UNKNOWN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      9
      03
    
    
      9
      9
      03/13/2016
      9:46
      QUEENS
      11106
      40.756580
      -73.929752
      (40.75658, -73.9297516)
      36 AVENUE
      31 STREET
      NaN
      0
      0
      0
      0
      0
      0
      0
      0
      Failure to Yield Right-of-Way
      Unspecified
      NaN
      NaN
      NaN
      3404995
      LIVERY VEHICLE
      PASSENGER VEHICLE
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      9
      03



In [3]:

    
from sklearn.ensemble import RandomForestClassifier
def encode_column(df, target_column):
#     print "copying dataset"
    df_mod = df.copy()
#     print "finding uniques"
    targets = pd.Series(df_mod[target_column].unique())
#     print "mapping to ints"
    map_to_int = {name: n for n, name in enumerate(targets)}
    df_mod[target_column+"_encoded"] = df_mod[target_column].replace(map_to_int)
    return (df_mod, targets)

def train_tree(prediction, features, dataset):
    clf = RandomForestClassifier(n_estimators=50, n_jobs=4)
    print "TRAINING WITH %d SAMPLES" % len(dataset) 
    X = np.array(dataset[features])
    Y = np.array(list(itertools.chain(*dataset[[prediction]].values)))
    return clf.fit(X, Y)

# Test forest
def test_tree(clf, test_data, features):
    return clf.predict(test_data[features])

# Tests test_data on the forest classifier clf with features and target_label.
# encoded_map is a lookup table of the encoded values (numeric) to actual string values
def test_prediction(target_label, clf, test_data, features, encoded_map):
    corrects = 0
    predictions = test_tree(clf, test_data[features], features)
    for i in range(0, len(predictions)):
        if predictions[i] == test_data.iloc[i][target_label]:
            corrects += 1
    print "FOUND %d CORRECT PREDICTIONS" % corrects
    # Return model accuracy [%]
    return corrects / len(predictions)

def convert_encoded_district_to_str(preditions):
    return map(lambda p: districts[p], preditions)

def kmeans(k, dataset, colums):
    md = cluster.KMeans(n_clusters=k).fit(dataset[colums])
    return md.predict(dataset[colums]),md.cluster_centers_



In [47]:

    
target_label = 'ZIP CODE'

data = colls[pd.notnull(colls[target_label])]

# Encoding target/label column
# mdata, target = encode_column(data, 'ZIP CODE')
mdata, target = encode_column(data, target_label)

# Encode the feature columns
# mdata, _ = encode_column(mdata, 'BOROUGH')
# mdata, _ = encode_column(mdata, 'CONTRIBUTING FACTOR VEHICLE 1')
# mdata, _ = encode_column(mdata, 'VEHICLE TYPE CODE 1')
mdata, _ = encode_column(mdata, 'Conditions')

# KNN
s, cen = kmeans(200, mdata, ['LONGITUDE', 'LATITUDE'])
mdata['KMEANS'] = s


# Splitting date and time into month and hour
# mdata['HOUR'] = data.TIME.str.split(':').str.get(0)
# mdata['MONTH'] = data.DATE.str.split('/').str.get(1)



In [48]:

    
# Features for prediction
features = ['KMEANS','Conditions_encoded','HOUR']

# Split data set into training and test data
train_data = mdata.head(int(mdata.BOROUGH.count() * 0.80))
test_data = mdata.tail(int(mdata.BOROUGH.count() * 0.20))

target_label_encoded = target_label+'_encoded'

# Train the decision tree
clf = train_tree(target_label_encoded, features, train_data)

# Test the decision tree
print "Prediction accuracy %f" % test_prediction(target_label_encoded, clf, test_data, features, target)









    



TRAINING WITH 467746 SAMPLES
FOUND 73581 CORRECT PREDICTIONS
Prediction accuracy 0.629242



In [49]:

    
mdata[['LATITUDE', 'LONGITUDE', 'KMEANS']].to_csv('datasets/clusters/c200.csv', sep=',')
mdata[['LATITUDE', 'LONGITUDE', 'KMEANS']].head(20000).to_csv('datasets/clusters/c200_small_20000.csv', sep=',')

2cluster = TRAINING WITH 467746 SAMPLES FOUND 3053 CORRECT PREDICTIONS Prediction accuracy 0.026108

5cluster = TRAINING WITH 467746 SAMPLES FOUND 6664 CORRECT PREDICTIONS Prediction accuracy 0.056988

10cluster = TRAINING WITH 467746 SAMPLES FOUND 12126 CORRECT PREDICTIONS Prediction accuracy 0.103698

20cluster = TRAINING WITH 467746 SAMPLES FOUND 21044 CORRECT PREDICTIONS Prediction accuracy 0.179962

30cluster = TRAINING WITH 467746 SAMPLES FOUND 28535 CORRECT PREDICTIONS Prediction accuracy 0.244022

50cluster = TRAINING WITH 467746 SAMPLES FOUND 41761 CORRECT PREDICTIONS Prediction accuracy 0.357127

100cluster = TRAINING WITH 467746 SAMPLES FOUND 60735 CORRECT PREDICTIONS Prediction accuracy 0.519387

200cluster = TRAINING WITH 467746 SAMPLES FOUND 73581 CORRECT PREDICTIONS Prediction accuracy 0.629242



In [50]:

    
import geoplotlib as gpl

def get_spaced_colors(n):
    max_value = 16581375 #255**3
    interval = int(max_value / n)
    colors = [hex(I)[2:].zfill(6) for I in range(0, max_value, interval)]
    
    return [[int(i[:2], 16), int(i[2:4], 16), int(i[4:], 16), 255] for i in colors]

colormap = get_spaced_colors(200)

def coords(k):
    lat = mdata[mdata['KMEANS'] == k].LATITUDE.values
    lon = mdata[mdata['KMEANS'] == k].LONGITUDE.values
    
    return lat,lon
    
for i in range(0, 200):
    lat, lon = coords(i)

    latlong = {'lon': lon, 'lat': lat}
    gpl.dot(latlong, color=colormap[i])

gpl.inline()



In [ ]:

    
from sklearn.cross_validation import cross_val_score
scores = cross_val_score(clf, mdata[features].values, mdata[target_label_encoded].values, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))



In [ ]:

    
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))



In [ ]:

    
from sklearn import tree
from sklearn.externals.six import StringIO  
import pydot 

from IPython.display import Image

dot_data = StringIO()  
tree.export_graphviz(clf, out_file=dot_data, feature_names=features,
                         filled=True, rounded=True,  
                         special_characters=True)  
graph = pydot.graph_from_dot_data(dot_data.getvalue())
# Image(graph.create_png())



In [ ]:

    
with open('datasets/tree.pdf', 'w') as f:
    f.write(graph.create_pdf())



In [ ]:

    
nan = colls[pd.isnull(colls.TemperatureC)]
print len(nan)
print len(colls)

Decision Tree Log

Finding ZIP CODE with HOUR, MONTH, BOROUGH

TRAINING WITH 438514 SAMPLES
FOUND 7884 CORRECT PREDICTIONS
Prediction accuracy 0.053937 ## Finding ZIP CODE with HOUR, MONTH, BOROUGH, CFV1, VTC1
TRAINING WITH 526217 SAMPLES
FOUND 3243 CORRECT PREDICTIONS
Prediction accuracy 0.055466 ## Finding VTC1 with HOUR, MONTH, BOROUGH, CFV1
TRAINING WITH 438514 SAMPLES
FOUND 77135 CORRECT PREDICTIONS
Prediction accuracy 0.527704



In [ ]:

    
colls[colls['DATE'] == "03/08/2015"].head(1000).sort_values(by='TIME')



In [ ]:

	Unnamed: 0	DATE	TIME	BOROUGH	ZIP CODE	LATITUDE	LONGITUDE	LOCATION	ON STREET NAME	CROSS STREET NAME	OFF STREET NAME	NUMBER OF PERSONS INJURED	NUMBER OF PEDESTRIANS INJURED	CONTRIBUTING FACTOR VEHICLE 1	CONTRIBUTING FACTOR VEHICLE 2	CONTRIBUTING FACTOR VEHICLE 3	CONTRIBUTING FACTOR VEHICLE 4	CONTRIBUTING FACTOR VEHICLE 5	UNIQUE KEY	VEHICLE TYPE CODE 1	VEHICLE TYPE CODE 2	VEHICLE TYPE CODE 3	VEHICLE TYPE CODE 4	VEHICLE TYPE CODE 5	Conditions	Precipitationmm	TemperatureC	VisibilityKm	HOUR	MONTH
0	0	03/14/2016	3:27	QUEENS	11372	40.747734	-73.882999	(40.7477341, -73.8829986)	ROOSEVELT AVENUE	83 STREET	NaN	1	1	Unspecified	NaN	NaN	NaN	NaN	3405169	OTHER	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	3	03
3	3	03/14/2016	0:45	MANHATTAN	10035	40.808279	-73.938793	(40.8082795, -73.9387929)	EAST 129 STREET	MADISON AVENUE	NaN	0	0	Unspecified	Unspecified	NaN	NaN	NaN	3405059	PASSENGER VEHICLE	PASSENGER VEHICLE	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0	03
4	4	03/13/2016	23:00	BROOKLYN	11206	40.706653	-73.950406	(40.7066527, -73.9504063)	UNION AVENUE	MONTROSE AVENUE	NaN	0	0	Driver Inattention/Distraction	Unspecified	NaN	NaN	NaN	3405121	PASSENGER VEHICLE	PASSENGER VEHICLE	NaN	NaN	NaN	NaN	NaN	NaN	NaN	23	03
6	6	03/13/2016	9:48	BROOKLYN	11212	40.661997	-73.919593	(40.661997, -73.9195931)	KINGS HIGHWAY	EAST 98 STREET	NaN	0	0	Passenger Distraction	Unspecified	NaN	NaN	NaN	3404744	PASSENGER VEHICLE	UNKNOWN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	9	03
9	9	03/13/2016	9:46	QUEENS	11106	40.756580	-73.929752	(40.75658, -73.9297516)	36 AVENUE	31 STREET	NaN	0	0	Failure to Yield Right-of-Way	Unspecified	NaN	NaN	NaN	3404995	LIVERY VEHICLE	PASSENGER VEHICLE	NaN	NaN	NaN	NaN	NaN	NaN	NaN	9	03