Classifying Manhattan with BigQuery and TensorFlow

Clear all Cells

Importing the training data from BigQuery



In [1]:

    
%%sql -d standard
SELECT timestamp, borough, latitude, longitude
FROM `bigquery-public-data.new_york.nypd_mv_collisions`
ORDER BY timestamp DESC  -- most recent collisions first
LIMIT 15  -- small preview only









    Out[1]:





    timestamp borough latitude longitude
2017-06-03 23:58:00 MANHATTAN    
2017-06-03 23:58:00 40.722553 -74.00141
2017-06-03 23:55:00 QUEENS 40.687458 -73.85726
2017-06-03 23:55:00    
2017-06-03 23:51:00 40.800983 -73.92927
2017-06-03 23:50:00 STATEN ISLAND 40.601006 -74.076454
2017-06-03 23:45:00 MANHATTAN 40.764626 -73.99555
2017-06-03 23:45:00 BROOKLYN 40.69731 -73.932274
2017-06-03 23:27:00 BRONX 40.832047 -73.88775
2017-06-03 23:26:00 BROOKLYN 40.68993 -73.98148
2017-06-03 23:19:00 BROOKLYN 40.644432 -73.9296
2017-06-03 23:18:00 QUEENS 40.693386 -73.75984
2017-06-03 23:15:00 40.58407 -73.92344
2017-06-03 23:15:00 40.732613 -73.925156
2017-06-03 23:10:00 BROOKLYN 40.68043 -74.01046
    
(rows: 15, time: 3.8s,    29MB processed, job: job_rWIlB6Yqi5pQ4qD494Tvb6Bzuk0)

Preprocess the training data on BigQuery



In [2]:

    
%%sql --module nyc_collisions
SELECT
  IF(borough = 'MANHATTAN', 1, 0) AS is_mt,  -- label is 1 for Manhattan rows and 0 otherwise
  latitude, longitude
FROM `bigquery-public-data.new_york.nypd_mv_collisions`
WHERE LENGTH(borough) > 0 AND borough != 'BRONX'  -- borough must be present and Bronx rows are left out
  AND latitude IS NOT NULL AND latitude != 0.0  -- drop rows with missing or zero coordinates
  AND longitude IS NOT NULL AND longitude != 0.0
ORDER BY RAND()  -- random order so that LIMIT keeps a random sample
LIMIT 10000

Import the BigQuery SQL result as a NumPy array



In [3]:

    
import datalab.bigquery as bq
import numpy as np

# Run the `nyc_collisions` query module and pull the result into a 2-D array.
# `.values` is used instead of `.as_matrix()`: as_matrix() was deprecated in
# pandas 0.23 and removed in pandas 1.0, while `.values` returns the same array
# on both old and new pandas versions.
nyc_cols = bq.Query(nyc_collisions).to_dataframe(dialect='standard').values

# Column 0 is the is_mt label; columns 1 and 2 are latitude and longitude
# (the column order of the SELECT list in the nyc_collisions query).
is_mt = nyc_cols[:,0].astype(np.int32)
latlng = nyc_cols[:,1:3].astype(np.float32)

print("Is Manhattan: " + str(is_mt))
print("\nLat/Lng: \n\n" + str(latlng))
print("\nLoaded " + str(is_mt.size) + " rows.")









    



Is Manhattan: [0 1 1 ..., 1 1 0]

Lat/Lng: 

[[ 40.66989136 -73.93379974]
 [ 40.84839249 -73.93766785]
 [ 40.75763702 -73.96340942]
 ..., 
 [ 40.79355621 -73.96701813]
 [ 40.75833893 -73.99640656]
 [ 40.6225853  -73.96279907]]

Loaded 10000 rows.

Feature scaling and plotting



In [4]:

    
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# standardization: rescale the latitude and longitude columns with StandardScaler
# (note: the scaler is fit on every loaded row, before any train/test split)
scaler = StandardScaler()
latlng_std = scaler.fit_transform(latlng)

# plotting: column 0 is latitude (y axis), column 1 is longitude (x axis)
lat, lng = latlng_std[:,0], latlng_std[:,1]

# one scatter layer per class: Manhattan (1) in blue first, then the rest (0) in yellow
for label, color in ((1, 'b'), (0, 'y')):
  members = is_mt == label
  plt.scatter(lng[members], lat[members], c=color)
plt.show()

Split the data into "Training Data" and "Test Data"



In [5]:

    
# Positional 80/20 split. The rows arrive in random order (the query sorts by
# RAND()), so taking the leading rows for training is enough.
# The cut-off is derived from the number of loaded rows instead of assuming
# exactly 10,000 of them, so a smaller query result still gets an 80/20 split.
# Integer arithmetic keeps the result an exact int on Python 2 and 3.
n_train = len(latlng_std) * 4 // 5  # 8,000 when 10,000 rows were loaded

# first 80% of the pairs for training
latlng_train = latlng_std[:n_train]
is_mt_train = is_mt[:n_train]

# remaining 20% of the pairs for test
latlng_test = latlng_std[n_train:]
is_mt_test = is_mt[n_train:]

Define a neural network



In [6]:

    
import tensorflow as tf

# report errors only, so that warning messages are suppressed
tf.logging.set_verbosity(tf.logging.ERROR)

# the model input is one real-valued feature column of dimension 2
feature_columns = [tf.contrib.layers.real_valued_column("", dimension=2)]

# create a two-class neural network with four hidden layers of 20 units each
hidden_layer_sizes = [20] * 4
dnnc = tf.contrib.learn.DNNClassifier(
  hidden_units=hidden_layer_sizes,
  feature_columns=feature_columns,
  n_classes=2)

# last expression of the cell: display the classifier's repr
dnnc









    Out[6]:





DNNClassifier(params={'head': <tensorflow.contrib.learn.python.learn.estimators.head._BinaryLogisticHead object at 0x7fbf24ad5950>, 'hidden_units': [20, 20, 20, 20], 'feature_columns': (_RealValuedColumn(column_name='', dimension=2, default_value=None, dtype=tf.float32, normalizer=None),), 'embedding_lr_multipliers': None, 'optimizer': None, 'dropout': None, 'gradient_clip_norm': None, 'activation_fn': <function relu at 0x7fbf344fa488>, 'input_layer_min_slice_size': None})

Check the accuracy of the neural network



In [7]:

    
# plot a predicted map of Manhattan
def plot_predicted_map(classifier):
  """Scatter-plot all standardized points, colored by the classifier's prediction.

  Calls `classifier.predict` on `latlng_std` (with `as_iterable=False`, so the
  predictions come back as one array), then draws the points predicted as
  class 1 in blue and the points predicted as class 0 in yellow.
  """
  predicted = classifier.predict(latlng_std, as_iterable=False)
  for label, color in ((1, 'b'), (0, 'y')):
    hits = predicted == label
    plt.scatter(lng[hits], lat[hits], c=color)
  plt.show()

# print the accuracy of the neural network
def print_accuracy(classifier):
  """Evaluate `classifier` on the test split and print its accuracy as a percentage.

  Uses `latlng_test` as inputs and `is_mt_test` as labels, and reads the
  "accuracy" entry of the metrics returned by `classifier.evaluate`.
  """
  metrics = classifier.evaluate(x=latlng_test, y=is_mt_test)
  print('Accuracy: {:.2%}'.format(metrics["accuracy"]))
  
# train the model for just one step, then report how it performs so far
initial_steps = 1
dnnc.fit(x=latlng_train, y=is_mt_train, steps=initial_steps)

# same order as before: the predicted map first, then the test accuracy
for report in (plot_predicted_map, print_accuracy):
  report(dnnc)









    



/usr/local/lib/python2.7/dist-packages/tensorflow/python/util/deprecation.py:247: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.
  equality = a == b






    












    



Accuracy: 73.50%

Train the neural network



In [8]:

    
steps = 20  # training steps per round
rounds = 5

# Train in rounds of `steps` steps and redraw the predicted map after each round.
# The printed count covers only the steps run by this loop.
for total_steps in range(steps, rounds * steps + 1, steps):
  dnnc.fit(x=latlng_train, y=is_mt_train, steps=steps)
  plot_predicted_map(dnnc)
  print('Steps: ' + str(total_steps))

print('\nTraining Finished.')
print_accuracy(dnnc)









    












    



Steps: 20






    












    



Steps: 40






    












    



Steps: 60






    












    



Steps: 80






    












    



Steps: 100

Training Finished.
Accuracy: 99.70%



In [ ]:

timestamp	borough	latitude	longitude
2017-06-03 23:58:00	MANHATTAN
2017-06-03 23:58:00		40.722553	-74.00141
2017-06-03 23:55:00	QUEENS	40.687458	-73.85726
2017-06-03 23:55:00
2017-06-03 23:51:00		40.800983	-73.92927
2017-06-03 23:50:00	STATEN ISLAND	40.601006	-74.076454
2017-06-03 23:45:00	MANHATTAN	40.764626	-73.99555
2017-06-03 23:45:00	BROOKLYN	40.69731	-73.932274
2017-06-03 23:27:00	BRONX	40.832047	-73.88775
2017-06-03 23:26:00	BROOKLYN	40.68993	-73.98148
2017-06-03 23:19:00	BROOKLYN	40.644432	-73.9296
2017-06-03 23:18:00	QUEENS	40.693386	-73.75984
2017-06-03 23:15:00		40.58407	-73.92344
2017-06-03 23:15:00		40.732613	-73.925156
2017-06-03 23:10:00	BROOKLYN	40.68043	-74.01046