Please run setup first.
Based on `taxi_preprocessing_example.py`.
In [1]:
import pandas as pd
import numpy as np
In [2]:
%%bash
# Download the January 2015 yellow-taxi trip CSV into ./data.
# wget -nc (--no-clobber) leaves an already-downloaded copy untouched.
url='https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2015-01.csv'
mkdir -p data
cd data
pwd
wget -nc "$url"
In [3]:
%%bash
# List the contents of the data directory (long format) to inspect the download.
ls -l data
In [4]:
# Read the downloaded trip CSV into a DataFrame; the %time magic reports how long the read takes.
%time df = pd.read_csv('data/yellow_tripdata_2015-01.csv')
In [5]:
# Display the last rows of the loaded frame.
df.tail()
Out[5]:
In [ ]:
# Keep only trips whose pickup AND drop-off both lie strictly inside the box
# longitude (-74.15, -73.75) x latitude (40.68, 40.84); .copy() detaches the
# result from the original frame so later column writes do not touch a view.
df = df.query(
    "pickup_longitude > -74.15 and pickup_longitude < -73.75 and "
    "dropoff_longitude > -74.15 and dropoff_longitude < -73.75 and "
    "pickup_latitude > 40.68 and pickup_latitude < 40.84 and "
    "dropoff_latitude > 40.68 and dropoff_latitude < 40.84"
).copy()
In [6]:
def latlng_to_meters(df, lat_name, lng_name):
    """Project a latitude/longitude column pair to metres, overwriting both columns.

    Applies the spherical Mercator formulas with a sphere radius of
    6378137 m. The inputs are treated as degrees (the formulas divide by
    180 and 360).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the two coordinate columns. It is modified in place.
    lat_name : str
        Name of the latitude column; replaced by the projected y value.
    lng_name : str
        Name of the longitude column; replaced by the projected x value.

    Returns
    -------
    None
        The result is written back into ``df``.
    """
    # Half the circumference of the sphere: the x value at 180 degrees.
    half_circumference = 2 * np.pi * 6378137 / 2.0

    # Read both inputs before either column is overwritten.
    latitudes, longitudes = df[lat_name], df[lng_name]

    # Longitude scales linearly onto the x axis.
    easting = longitudes * half_circumference / 180.0

    # Latitude goes through the Mercator stretch log(tan(pi/4 + lat/2)),
    # expressed in degrees first and then scaled like the x axis.
    half_angle = (90 + latitudes) * np.pi / 360.0
    northing_deg = np.log(np.tan(half_angle)) / (np.pi / 180.0)
    northing = northing_deg * half_circumference / 180.0

    df.loc[:, lng_name] = easting
    df.loc[:, lat_name] = northing
In [7]:
# Project pickup and drop-off coordinates to metres (the columns are overwritten),
# then rename them so the column names reflect x/y metres rather than degrees.
latlng_to_meters(df, lat_name='pickup_latitude', lng_name='pickup_longitude')
latlng_to_meters(df, lat_name='dropoff_latitude', lng_name='dropoff_longitude')

# inplace=True is kept on purpose: it renames without copying the large frame.
df.rename(
    columns=dict(
        pickup_longitude='pickup_x',
        pickup_latitude='pickup_y',
        dropoff_longitude='dropoff_x',
        dropoff_latitude='dropoff_y',
    ),
    inplace=True,
)
In [8]:
# Write the processed frame to CSV; index=False omits the row index column.
df.to_csv('data/nyc_taxi.csv', index=False)
In [9]:
# Show the first lines of the written file as a quick check of the output.
!head data/nyc_taxi.csv