Let's do some clean-up.
In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [ ]:
# Relative paths of the raw trip and fare CSV subsets.
data_filename, fare_filename = (
    '../data/trip_data_subset.csv',
    '../data/trip_fare_subset.csv',
)
In [ ]:
# Load the trip CSV and then the fare CSV into DataFrames.
data, fare = (pd.read_csv(path) for path in (data_filename, fare_filename))
First, we remove extra spaces in the headers.
In [ ]:
# Strip leading/trailing whitespace from every column name of the fare table.
fare.columns = list(map(str.strip, fare.columns))
Then, we sort the rows by pickup date and time.
In [ ]:
# Parse the timestamp columns into pandas datetimes so they sort
# chronologically rather than as plain strings.
data['pickup_datetime'] = pd.to_datetime(data['pickup_datetime'])
data['dropoff_datetime'] = pd.to_datetime(data['dropoff_datetime'])
fare['pickup_datetime'] = pd.to_datetime(fare['pickup_datetime'])
In [ ]:
# Row labels of `data` ordered by pickup time.
# `DataFrame.sort` was removed from pandas (0.20); `sort_values` replaces it.
sorted_index = data.sort_values('pickup_datetime').index
In [ ]:
# Reorder both tables with the same row labels so their rows stay paired.
# `.ix` was removed in pandas 1.0; `.loc` is the label-based equivalent.
# NOTE(review): assumes `fare` carries the same row labels as `data` — confirm.
data = data.loc[sorted_index]
fare = fare.loc[sorted_index]
We now reset the index, so that the rows are numbered consecutively again.
In [ ]:
# Replace the row labels of `data` with a fresh 0..n-1 range.
# `drop=True` discards the old labels directly instead of materializing an
# 'index' column and deleting it afterwards, which would delete the wrong
# column if the table already had one named 'index'.
data = data.reset_index(drop=True)
In [ ]:
# Replace the row labels of `fare` with a fresh 0..n-1 range.
# `drop=True` discards the old labels directly instead of materializing an
# 'index' column and deleting it afterwards, which would delete the wrong
# column if the table already had one named 'index'.
fare = fare.reset_index(drop=True)
We find a rectangle around Manhattan (using http://www.openstreetmap.org/).
In [ ]:
# Bounding rectangle: longitude limits first, then latitude limits.
lon_min = -74.1
lon_max = -73
lat_min = 40
lat_max = 41
We get rid of values outside this area.
In [ ]:
# Short aliases for the pickup (p) and dropoff (d) coordinate columns.
lon_p = data['pickup_longitude']
lat_p = data['pickup_latitude']
lon_d = data['dropoff_longitude']
lat_d = data['dropoff_latitude']
In [ ]:
# Boolean mask: True where the pickup point lies strictly inside the rectangle.
pickup_inside = (
    (lon_p > lon_min) & (lon_p < lon_max) &
    (lat_p > lat_min) & (lat_p < lat_max)
)
# Same test for the dropoff point.
dropoff_inside = (
    (lon_d > lon_min) & (lon_d < lon_max) &
    (lat_d > lat_min) & (lat_d < lat_max)
)
# A row is kept only when both of its endpoints are inside.
to_keep = pickup_inside & dropoff_inside
We keep those rows where the coordinates are within the rectangle.
In [ ]:
# Keep only the rows flagged by the boolean mask, in both tables.
# `.ix` was removed in pandas 1.0; `.loc` accepts a boolean Series and aligns
# it on the row labels.
# NOTE(review): assumes `fare` carries the same row labels as `data` — confirm.
data = data.loc[to_keep]
fare = fare.loc[to_keep]
Finally, we save the cleaned-up datasets.
In [ ]:
# Write both cleaned tables to CSV; index=False leaves the row labels out of
# the files.
data.to_csv(path_or_buf='../data/nyc_data.csv', index=False)
fare.to_csv(path_or_buf='../data/nyc_fare.csv', index=False)