Please run setup first.


In [1]:
import pandas as pd
import numpy as np

In [2]:
%%bash
# Create the local data directory (if missing) and download the trip CSV into it.
mkdir -p data
cd data
pwd
# --no-clobber skips the download when the file is already present locally.
# NOTE(review): confirm this URL is still being served before relying on a fresh download.
wget --no-clobber https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2015-01.csv


/home/jovyan/work/nyc_taxi_datashader/data
File ‘yellow_tripdata_2015-01.csv’ already there; not retrieving.


In [3]:
%%bash
# List the data directory to confirm the CSV is present and check its size.
ls -l data


total 1939420
-rw-r--r-- 1 jovyan users 1985964692 Aug 16  2016 yellow_tripdata_2015-01.csv

In [4]:
# Read the downloaded CSV into a DataFrame; %time reports how long the load takes.
%time df = pd.read_csv('data/yellow_tripdata_2015-01.csv')


CPU times: user 1min 33s, sys: 20.5 s, total: 1min 53s
Wall time: 2min 22s

In [5]:
# Preview the last rows to check that the columns were parsed as expected.
df.tail()


Out[5]:
VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count trip_distance pickup_longitude pickup_latitude RateCodeID store_and_fwd_flag dropoff_longitude dropoff_latitude payment_type fare_amount extra mta_tax tip_amount tolls_amount improvement_surcharge total_amount
12748981 1 2015-01-10 19:01:44 2015-01-10 19:05:40 2 1.0 -73.951988 40.786217 1 N -73.953735 40.775162 1 5.5 0.0 0.5 1.25 0.0 0.3 7.55
12748982 1 2015-01-10 19:01:44 2015-01-10 19:07:26 2 0.8 -73.982742 40.728184 1 N -73.974976 40.720013 1 6.0 0.0 0.5 2.00 0.0 0.3 8.80
12748983 1 2015-01-10 19:01:44 2015-01-10 19:15:01 1 3.4 -73.979324 40.749550 1 N -73.969101 40.787800 2 13.5 0.0 0.5 0.00 0.0 0.3 14.30
12748984 1 2015-01-10 19:01:44 2015-01-10 19:17:03 1 1.3 -73.999565 40.738483 1 N -73.981819 40.737652 1 10.5 0.0 0.5 2.25 0.0 0.3 13.55
12748985 1 2015-01-10 19:01:45 2015-01-10 19:07:33 1 0.7 -73.960350 40.766399 1 N -73.968643 40.760777 2 5.5 0.0 0.5 0.00 0.0 0.3 6.30

In [ ]:
# Keep only trips whose pickup AND dropoff both fall strictly inside the
# longitude/latitude bounding box of interest (bounds are exclusive).
lng_min, lng_max = -74.15, -73.75
lat_min, lat_max = 40.68, 40.84

pickup_in_box = (
    (df.pickup_longitude > lng_min) & (df.pickup_longitude < lng_max) &
    (df.pickup_latitude > lat_min) & (df.pickup_latitude < lat_max)
)
dropoff_in_box = (
    (df.dropoff_longitude > lng_min) & (df.dropoff_longitude < lng_max) &
    (df.dropoff_latitude > lat_min) & (df.dropoff_latitude < lat_max)
)

# .copy() detaches the result from the original frame so later column
# assignments operate on an independent DataFrame.
df = df.loc[pickup_in_box & dropoff_in_box].copy()

In [6]:
def latlng_to_meters(df, lat_name, lng_name):
    """Convert latitude/longitude columns to Web Mercator meters, in place.

    The columns named ``lat_name`` and ``lng_name`` (degrees) are overwritten
    with their spherical-Mercator ``y`` and ``x`` coordinates respectively.
    All other columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the coordinate columns; it is modified in place.
    lat_name : str
        Name of the latitude column (degrees). Replaced by the projected y.
    lng_name : str
        Name of the longitude column (degrees). Replaced by the projected x.

    Returns
    -------
    None

    Notes
    -----
    Latitudes outside the open interval (-90, 90) are not representable in
    this projection: they yield NaN or infinite values and make NumPy emit a
    RuntimeWarning from ``np.log``. Filter such rows out before calling.
    """
    earth_radius = 6378137  # meters, radius used by the spherical Mercator formula
    # Half the projected circumference: the x value that longitude 180 maps to.
    origin_shift = np.pi * earth_radius

    lat = df[lat_name]
    lng = df[lng_name]

    mx = lng * origin_shift / 180.0
    my = np.log(np.tan((90 + lat) * np.pi / 360.0)) / (np.pi / 180.0)
    my = my * origin_shift / 180.0

    # Replace the columns outright instead of writing through ``df.loc[:, col]``.
    # ``.loc`` tries to store the floats into the existing column dtype, which
    # warns (and is slated to fail) on recent pandas when the column is integer.
    df[lng_name] = mx
    df[lat_name] = my

In [7]:
# Project both trip endpoints from degrees to meters (modifies df in place).
for endpoint in ('pickup', 'dropoff'):
    latlng_to_meters(df, endpoint + '_latitude', endpoint + '_longitude')

# The columns now hold projected coordinates, so rename them to x / y.
projected_names = {
    'pickup_longitude': 'pickup_x',
    'dropoff_longitude': 'dropoff_x',
    'pickup_latitude': 'pickup_y',
    'dropoff_latitude': 'dropoff_y',
}
df.rename(columns=projected_names, inplace=True)


/opt/conda/envs/python2/lib/python2.7/site-packages/ipykernel_launcher.py:6: RuntimeWarning: invalid value encountered in log
  

In [8]:
# Write the converted frame to CSV; index=False leaves the row index out of the file.
df.to_csv('data/nyc_taxi.csv', index=False)

In [9]:
# Show the first lines of the written file as a quick sanity check.
!head data/nyc_taxi.csv


VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_x,pickup_y,RateCodeID,store_and_fwd_flag,dropoff_x,dropoff_y,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
2,2015-01-15 19:05:39,2015-01-15 19:23:42,1,1.59,-8236962.87845,4975552.61692,1,N,-8234835.38116,4975627.16997,1,12.0,1.0,0.5,3.25,0.0,0.3,17.05
1,2015-01-10 20:33:38,2015-01-10 20:53:28,1,3.3,-8237825.76757,4971752.28598,1,N,-8237020.63087,4976875.03705,1,14.5,0.5,0.5,2.0,0.0,0.3,17.8
1,2015-01-10 20:33:38,2015-01-10 20:43:41,1,1.8,-8233561.43069,4983296.28074,1,N,-8232278.98722,4986477.06136,2,9.5,0.5,0.5,0.0,0.0,0.3,10.8
1,2015-01-10 20:33:39,2015-01-10 20:35:31,1,0.5,-8238653.83538,4970221.02621,1,N,-8238123.87198,4971126.9786,2,3.5,0.5,0.5,0.0,0.0,0.3,4.8
1,2015-01-10 20:33:39,2015-01-10 20:52:58,1,3.0,-8234433.66211,4977362.79122,1,N,-8238107.73527,4974456.80877,2,15.0,0.5,0.5,0.0,0.0,0.3,16.3
1,2015-01-10 20:33:39,2015-01-10 20:53:52,1,9.0,-8223657.73973,4979070.68341,1,N,-8236192.56307,4976740.48846,1,27.0,0.5,0.5,6.7,5.33,0.3,40.33
1,2015-01-10 20:33:39,2015-01-10 20:58:31,1,2.2,-8235780.65241,4972011.72183,1,N,-8236804.05929,4975482.54878,2,14.0,0.5,0.5,0.0,0.0,0.3,15.3
1,2015-01-10 20:33:39,2015-01-10 20:42:20,3,0.8,-8237938.72451,4973206.44838,1,N,-8237086.8763,4972058.2305,1,7.0,0.5,0.5,1.66,0.0,0.3,9.96
1,2015-01-10 20:33:39,2015-01-10 21:11:35,3,18.2,-8213490.76567,4960024.90509,2,N,-8236261.35639,4976911.47761,2,52.0,0.0,0.5,0.0,5.33,0.3,58.13