Please run setup first.
The preprocessing steps below are based on `taxi_preprocessing_example.py`.
In [1]:
    
import pandas as pd
import numpy as np
    
In [2]:
    
%%bash
# Make sure the data directory exists, then work inside it.
mkdir -p data
cd data
# Print the working directory so the download location is visible in the output.
pwd
# --no-clobber: do not download again if the file is already present.
wget --no-clobber https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2015-01.csv
    
    
    
In [3]:
    
%%bash
# List the data directory to confirm what was downloaded.
ls -l data
    
    
In [4]:
    
# Load the downloaded trip CSV into a DataFrame; %time reports how long the read takes.
%time df = pd.read_csv('data/yellow_tripdata_2015-01.csv')
    
    
In [5]:
    
# Display the last rows of the loaded frame as a quick check of its contents.
df.tail()
    
    Out[5]:
In [ ]:
    
# Keep only trips whose pickup AND dropoff both lie strictly inside the box
#   -74.15 < longitude < -73.75   and   40.68 < latitude < 40.84   (degrees).
# .copy() gives an independent frame, so the later column writes act on it
# rather than on a slice of the frame that was read from disk.
df = df[
    df.pickup_longitude.gt(-74.15) & df.pickup_longitude.lt(-73.75)
    & df.dropoff_longitude.gt(-74.15) & df.dropoff_longitude.lt(-73.75)
    & df.pickup_latitude.gt(40.68) & df.pickup_latitude.lt(40.84)
    & df.dropoff_latitude.gt(40.68) & df.dropoff_latitude.lt(40.84)
].copy()
    
In [6]:
    
def latlng_to_meters(df, lat_name, lng_name):
    """Project latitude/longitude columns to Web Mercator meters, in place.

    The two named columns of ``df`` are overwritten: ``lng_name`` receives the
    easting (x) and ``lat_name`` the northing (y), computed with the spherical
    Mercator formulas on a sphere of radius 6378137 m.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the coordinate columns; it is modified in place.
    lat_name : str
        Name of the column containing latitudes in degrees.
    lng_name : str
        Name of the column containing longitudes in degrees.

    Returns
    -------
    None
        The result is written back into ``df``; nothing is returned.
    """
    # Half the equatorial circumference: the projected distance at 180 degrees.
    half_circumference = 2 * np.pi * 6378137 / 2.0

    latitudes = df[lat_name]
    longitudes = df[lng_name]

    # Northing: Mercator-stretched latitude (in degrees), then scaled to meters.
    northing = (
        np.log(np.tan((90 + latitudes) * np.pi / 360.0)) / (np.pi / 180.0)
        * half_circumference / 180.0
    )
    # Easting: longitude scales linearly with distance along the equator.
    easting = longitudes * half_circumference / 180.0

    # Both results are computed before either column is overwritten.
    df.loc[:, lng_name] = easting
    df.loc[:, lat_name] = northing
    
In [7]:
    
# Convert both coordinate pairs to meters (the helper writes into df), then
# rename the columns so their names reflect x/y values instead of degrees.
latlng_to_meters(df, lat_name='pickup_latitude', lng_name='pickup_longitude')
latlng_to_meters(df, lat_name='dropoff_latitude', lng_name='dropoff_longitude')
df.rename(
    columns={
        'pickup_longitude': 'pickup_x',
        'pickup_latitude': 'pickup_y',
        'dropoff_longitude': 'dropoff_x',
        'dropoff_latitude': 'dropoff_y',
    },
    inplace=True,
)
    
    
In [8]:
    
# Write the processed frame to CSV; index=False leaves the row index out of the file.
df.to_csv('data/nyc_taxi.csv', index=False)
    
In [9]:
    
# Show the first lines of the written CSV as a sanity check.
!head data/nyc_taxi.csv