In [1]:
from datetime import datetime
from datetime import timedelta

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', 100)
%matplotlib inline

In [2]:
df = pd.read_csv("DATA/babs_open_data_year_1/201402_babs_open_data/201402_trip_data.csv")

In [3]:
df.shape


Out[3]:
(144015, 11)

In [4]:
df['Start Date'] = pd.to_datetime(df['Start Date'])
df['End Date'] = pd.to_datetime(df['End Date'])

In [6]:
df.rename(columns={'Start Date':'Start Date Time', 'End Date':'End Date Time'}, inplace=True)

In [7]:
# fix the end time
df['End Date Time'] = df['Start Date Time'] + df['Duration'].apply(lambda x: np.timedelta64(x, 'm'))

In [8]:
df.head()


Out[8]:
Trip ID Duration Start Date Time Start Station Start Terminal End Date Time End Station End Terminal Bike # Subscription Type Zip Code
0 4576 63 2013-08-29 14:13:00 South Van Ness at Market 66 2013-08-29 15:16:00 South Van Ness at Market 66 520 Subscriber 94127
1 4607 70 2013-08-29 14:42:00 San Jose City Hall 10 2013-08-29 15:52:00 San Jose City Hall 10 661 Subscriber 95138
2 4130 71 2013-08-29 10:16:00 Mountain View City Hall 27 2013-08-29 11:27:00 Mountain View City Hall 27 48 Subscriber 97214
3 4251 77 2013-08-29 11:29:00 San Jose City Hall 10 2013-08-29 12:46:00 San Jose City Hall 10 26 Subscriber 95060
4 4299 83 2013-08-29 12:02:00 South Van Ness at Market 66 2013-08-29 13:25:00 Market at 10th 67 319 Subscriber 94103

In [9]:
df['Start Date'] = df['Start Date Time'].apply(lambda x: x.date())
df['End Date'] = df['End Date Time'].apply(lambda x: x.date())

df['Start Time'] = df['Start Date Time'].apply(lambda x: x.time())
df['End Time'] = df['End Date Time'].apply(lambda x: x.time())

In [10]:
df.head()


Out[10]:
Trip ID Duration Start Date Time Start Station Start Terminal End Date Time End Station End Terminal Bike # Subscription Type Zip Code Start Date End Date Start Time End Time
0 4576 63 2013-08-29 14:13:00 South Van Ness at Market 66 2013-08-29 15:16:00 South Van Ness at Market 66 520 Subscriber 94127 2013-08-29 2013-08-29 14:13:00 15:16:00
1 4607 70 2013-08-29 14:42:00 San Jose City Hall 10 2013-08-29 15:52:00 San Jose City Hall 10 661 Subscriber 95138 2013-08-29 2013-08-29 14:42:00 15:52:00
2 4130 71 2013-08-29 10:16:00 Mountain View City Hall 27 2013-08-29 11:27:00 Mountain View City Hall 27 48 Subscriber 97214 2013-08-29 2013-08-29 10:16:00 11:27:00
3 4251 77 2013-08-29 11:29:00 San Jose City Hall 10 2013-08-29 12:46:00 San Jose City Hall 10 26 Subscriber 95060 2013-08-29 2013-08-29 11:29:00 12:46:00
4 4299 83 2013-08-29 12:02:00 South Van Ness at Market 66 2013-08-29 13:25:00 Market at 10th 67 319 Subscriber 94103 2013-08-29 2013-08-29 12:02:00 13:25:00

In [11]:
df.shape


Out[11]:
(144015, 15)

In [12]:
df_loc = pd.read_csv("DATA/babs_open_data_year_1/201402_babs_open_data/201402_station_data.csv")

In [13]:
df_loc.head()


Out[13]:
station_id name lat long dockcount landmark installation
0 2 San Jose Diridon Caltrain Station 37.329732 -121.901782 27 San Jose 8/6/2013
1 3 San Jose Civic Center 37.330698 -121.888979 15 San Jose 8/5/2013
2 4 Santa Clara at Almaden 37.333988 -121.894902 11 San Jose 8/6/2013
3 5 Adobe on Almaden 37.331415 -121.893200 19 San Jose 8/5/2013
4 6 San Pedro Square 37.336721 -121.894074 15 San Jose 8/7/2013

In [14]:
df_merged_1 = pd.merge(df, df_loc, left_on='Start Station', right_on='name', how='inner', sort=False)

In [15]:
df_merged_1.shape


Out[15]:
(144015, 22)

In [16]:
df_merged_1.head()


Out[16]:
Trip ID Duration Start Date Time Start Station Start Terminal End Date Time End Station End Terminal Bike # Subscription Type Zip Code Start Date End Date Start Time End Time station_id name lat long dockcount landmark installation
0 4576 63 2013-08-29 14:13:00 South Van Ness at Market 66 2013-08-29 15:16:00 South Van Ness at Market 66 520 Subscriber 94127 2013-08-29 2013-08-29 14:13:00 15:16:00 66 South Van Ness at Market 37.774814 -122.418954 19 San Francisco 8/23/2013
1 4299 83 2013-08-29 12:02:00 South Van Ness at Market 66 2013-08-29 13:25:00 Market at 10th 67 319 Subscriber 94103 2013-08-29 2013-08-29 12:02:00 13:25:00 66 South Van Ness at Market 37.774814 -122.418954 19 San Francisco 8/23/2013
2 4760 113 2013-08-29 17:01:00 South Van Ness at Market 66 2013-08-29 18:54:00 South Van Ness at Market 66 553 Subscriber 94103 2013-08-29 2013-08-29 17:01:00 18:54:00 66 South Van Ness at Market 37.774814 -122.418954 19 San Francisco 8/23/2013
3 5070 168 2013-08-29 21:43:00 South Van Ness at Market 66 2013-08-30 00:31:00 South Van Ness at Market 66 598 Subscriber 94115 2013-08-29 2013-08-30 21:43:00 00:31:00 66 South Van Ness at Market 37.774814 -122.418954 19 San Francisco 8/23/2013
4 4765 228 2013-08-29 17:05:00 South Van Ness at Market 66 2013-08-29 20:53:00 Market at 10th 67 553 Subscriber 94103 2013-08-29 2013-08-29 17:05:00 20:53:00 66 South Van Ness at Market 37.774814 -122.418954 19 San Francisco 8/23/2013

In [17]:
df_merged_1.rename(columns={'lat':'Start Latitute',
                            'long':'Start Longitude',
                            'dockcount': 'Start Station Dockcount',
                            'landmark': 'Start Station Landmark',
                            'installation': 'Start Station Installation'},
                   inplace=True)

In [18]:
df_merged_1.head()


Out[18]:
Trip ID Duration Start Date Time Start Station Start Terminal End Date Time End Station End Terminal Bike # Subscription Type Zip Code Start Date End Date Start Time End Time station_id name Start Latitute Start Longitude Start Station Dockcount Start Station Landmark Start Station Installation
0 4576 63 2013-08-29 14:13:00 South Van Ness at Market 66 2013-08-29 15:16:00 South Van Ness at Market 66 520 Subscriber 94127 2013-08-29 2013-08-29 14:13:00 15:16:00 66 South Van Ness at Market 37.774814 -122.418954 19 San Francisco 8/23/2013
1 4299 83 2013-08-29 12:02:00 South Van Ness at Market 66 2013-08-29 13:25:00 Market at 10th 67 319 Subscriber 94103 2013-08-29 2013-08-29 12:02:00 13:25:00 66 South Van Ness at Market 37.774814 -122.418954 19 San Francisco 8/23/2013
2 4760 113 2013-08-29 17:01:00 South Van Ness at Market 66 2013-08-29 18:54:00 South Van Ness at Market 66 553 Subscriber 94103 2013-08-29 2013-08-29 17:01:00 18:54:00 66 South Van Ness at Market 37.774814 -122.418954 19 San Francisco 8/23/2013
3 5070 168 2013-08-29 21:43:00 South Van Ness at Market 66 2013-08-30 00:31:00 South Van Ness at Market 66 598 Subscriber 94115 2013-08-29 2013-08-30 21:43:00 00:31:00 66 South Van Ness at Market 37.774814 -122.418954 19 San Francisco 8/23/2013
4 4765 228 2013-08-29 17:05:00 South Van Ness at Market 66 2013-08-29 20:53:00 Market at 10th 67 553 Subscriber 94103 2013-08-29 2013-08-29 17:05:00 20:53:00 66 South Van Ness at Market 37.774814 -122.418954 19 San Francisco 8/23/2013

In [19]:
df_merged_1 = df_merged_1.drop('name', 1)

In [20]:
df_merged_1 = pd.merge(df_merged_1, df_loc, left_on='End Station', right_on='name', how='inner', sort=False)

In [21]:
df_merged_1.head()


Out[21]:
Trip ID Duration Start Date Time Start Station Start Terminal End Date Time End Station End Terminal Bike # Subscription Type Zip Code Start Date End Date Start Time End Time station_id_x Start Latitute Start Longitude Start Station Dockcount Start Station Landmark Start Station Installation station_id_y name lat long dockcount landmark installation
0 4576 63 2013-08-29 14:13:00 South Van Ness at Market 66 2013-08-29 15:16:00 South Van Ness at Market 66 520 Subscriber 94127 2013-08-29 2013-08-29 14:13:00 15:16:00 66 37.774814 -122.418954 19 San Francisco 8/23/2013 66 South Van Ness at Market 37.774814 -122.418954 19 San Francisco 8/23/2013
1 4760 113 2013-08-29 17:01:00 South Van Ness at Market 66 2013-08-29 18:54:00 South Van Ness at Market 66 553 Subscriber 94103 2013-08-29 2013-08-29 17:01:00 18:54:00 66 37.774814 -122.418954 19 San Francisco 8/23/2013 66 South Van Ness at Market 37.774814 -122.418954 19 San Francisco 8/23/2013
2 5070 168 2013-08-29 21:43:00 South Van Ness at Market 66 2013-08-30 00:31:00 South Van Ness at Market 66 598 Subscriber 94115 2013-08-29 2013-08-30 21:43:00 00:31:00 66 37.774814 -122.418954 19 San Francisco 8/23/2013 66 South Van Ness at Market 37.774814 -122.418954 19 San Francisco 8/23/2013
3 4584 262 2013-08-29 14:17:00 South Van Ness at Market 66 2013-08-29 18:39:00 South Van Ness at Market 66 587 Subscriber 94612 2013-08-29 2013-08-29 14:17:00 18:39:00 66 37.774814 -122.418954 19 San Francisco 8/23/2013 66 South Van Ness at Market 37.774814 -122.418954 19 San Francisco 8/23/2013
4 4079 995 2013-08-29 09:35:00 South Van Ness at Market 66 2013-08-30 02:10:00 South Van Ness at Market 66 327 Subscriber 94102 2013-08-29 2013-08-30 09:35:00 02:10:00 66 37.774814 -122.418954 19 San Francisco 8/23/2013 66 South Van Ness at Market 37.774814 -122.418954 19 San Francisco 8/23/2013

In [22]:
df_merged_1.rename(columns={'lat':'End Latitute',
                            'long':'End Longitude',
                            'dockcount':'End Station Dockcount',
                            'landmark':'End Station Landmark',
                            'installation':'End Station Installation'},
                   inplace=True)

In [23]:
df_merged_1 = df_merged_1.drop('name', 1)

In [24]:
df_merged_1.to_csv("DATA/babs_open_data_year_1/201402_babs_open_data/201402_merged_data.csv", index=False)

In [25]:
df_merged_1.head()


Out[25]:
Trip ID Duration Start Date Time Start Station Start Terminal End Date Time End Station End Terminal Bike # Subscription Type Zip Code Start Date End Date Start Time End Time station_id_x Start Latitute Start Longitude Start Station Dockcount Start Station Landmark Start Station Installation station_id_y End Latitute End Longitude End Station Dockcount End Station Landmark End Station Installation
0 4576 63 2013-08-29 14:13:00 South Van Ness at Market 66 2013-08-29 15:16:00 South Van Ness at Market 66 520 Subscriber 94127 2013-08-29 2013-08-29 14:13:00 15:16:00 66 37.774814 -122.418954 19 San Francisco 8/23/2013 66 37.774814 -122.418954 19 San Francisco 8/23/2013
1 4760 113 2013-08-29 17:01:00 South Van Ness at Market 66 2013-08-29 18:54:00 South Van Ness at Market 66 553 Subscriber 94103 2013-08-29 2013-08-29 17:01:00 18:54:00 66 37.774814 -122.418954 19 San Francisco 8/23/2013 66 37.774814 -122.418954 19 San Francisco 8/23/2013
2 5070 168 2013-08-29 21:43:00 South Van Ness at Market 66 2013-08-30 00:31:00 South Van Ness at Market 66 598 Subscriber 94115 2013-08-29 2013-08-30 21:43:00 00:31:00 66 37.774814 -122.418954 19 San Francisco 8/23/2013 66 37.774814 -122.418954 19 San Francisco 8/23/2013
3 4584 262 2013-08-29 14:17:00 South Van Ness at Market 66 2013-08-29 18:39:00 South Van Ness at Market 66 587 Subscriber 94612 2013-08-29 2013-08-29 14:17:00 18:39:00 66 37.774814 -122.418954 19 San Francisco 8/23/2013 66 37.774814 -122.418954 19 San Francisco 8/23/2013
4 4079 995 2013-08-29 09:35:00 South Van Ness at Market 66 2013-08-30 02:10:00 South Van Ness at Market 66 327 Subscriber 94102 2013-08-29 2013-08-30 09:35:00 02:10:00 66 37.774814 -122.418954 19 San Francisco 8/23/2013 66 37.774814 -122.418954 19 San Francisco 8/23/2013

In [26]:
df_merged_1[['Start Time', 'End Time', 'Duration']].head()


Out[26]:
Start Time End Time Duration
0 14:13:00 15:16:00 63
1 17:01:00 18:54:00 113
2 21:43:00 00:31:00 168
3 14:17:00 18:39:00 262
4 09:35:00 02:10:00 995