In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from datetime import datetime
from datetime import timedelta

In [2]:
df = pd.read_csv("DATA/babs_master/trip_master.csv")

In [3]:
df[4000:4020]


Out[3]:
Trip ID Duration Start Date Start Station Start Terminal End Date End Station End Terminal Bike # Subscription Type Zip Code
4000 10165 772 9/3/2013 19:14 Post at Kearney 47 9/3/2013 19:27 Embarcadero at Folsom 51 543 Subscriber 94105
4001 10166 754 9/3/2013 19:14 Post at Kearney 47 9/3/2013 19:27 Embarcadero at Folsom 51 415 Subscriber 94105
4002 10168 471 9/3/2013 19:15 2nd at Folsom 62 9/3/2013 19:23 San Francisco Caltrain (Townsend at 4th) 70 485 Subscriber 94107
4003 10169 794 9/3/2013 19:20 Powell Street BART 39 9/3/2013 19:33 Clay at Battery 41 528 Customer NaN
4004 10171 783 9/3/2013 19:20 Powell Street BART 39 9/3/2013 19:33 Clay at Battery 41 494 Customer NaN
4005 10172 363 9/3/2013 19:21 San Francisco Caltrain (Townsend at 4th) 70 9/3/2013 19:27 Townsend at 7th 65 557 Subscriber 94103
4006 10173 601 9/3/2013 19:23 Clay at Battery 41 9/3/2013 19:33 Powell at Post (Union Square) 71 508 Subscriber 94107
4007 10174 1044 9/3/2013 19:24 San Francisco City Hall 58 9/3/2013 19:41 Spear at Folsom 49 634 Subscriber 94105
4008 10175 770 9/3/2013 19:26 Commercial at Montgomery 45 9/3/2013 19:39 Grant Avenue at Columbus Avenue 73 537 Customer NaN
4009 10176 242 9/3/2013 19:28 2nd at Folsom 62 9/3/2013 19:32 5th at Howard 57 462 Customer NaN
4010 10177 571 9/3/2013 19:29 Commercial at Montgomery 45 9/3/2013 19:38 Grant Avenue at Columbus Avenue 73 553 Customer NaN
4011 10178 523 9/3/2013 19:30 Commercial at Montgomery 45 9/3/2013 19:38 Grant Avenue at Columbus Avenue 73 273 Customer NaN
4012 10179 926 9/3/2013 19:31 Grant Avenue at Columbus Avenue 73 9/3/2013 19:46 Civic Center BART (7th at Market) 72 603 Subscriber 94117
4013 10183 832 9/3/2013 19:33 Embarcadero at Vallejo 48 9/3/2013 19:47 Harry Bridges Plaza (Ferry Building) 50 567 Customer 94019
4014 10184 801 9/3/2013 19:34 Embarcadero at Vallejo 48 9/3/2013 19:47 Harry Bridges Plaza (Ferry Building) 50 517 Customer 94402
4015 10185 1061 9/3/2013 19:36 Paseo de San Antonio 7 9/3/2013 19:54 SJSU 4th at San Carlos 12 24 Customer 95113
4016 10186 671 9/3/2013 19:38 2nd at Townsend 61 9/3/2013 19:49 Steuart at Market 74 548 Customer 94401
4017 10187 576 9/3/2013 19:40 2nd at Townsend 61 9/3/2013 19:49 Davis at Jackson 42 540 Subscriber 94111
4018 10188 643 9/3/2013 19:40 Powell Street BART 39 9/3/2013 19:51 Embarcadero at Folsom 51 613 Subscriber 94107
4019 10189 532 9/3/2013 19:45 Washington at Kearney 46 9/3/2013 19:54 Market at 4th 76 589 Subscriber 94597

In [4]:
df.rename(columns = {'Start Date':'Start Date Time', 'End Date':'End Date Time'}, inplace = True)

In [5]:
df['Start Date'], df['Start Time'] = zip(*df['Start Date Time'].apply(lambda x: x.split(' ')))
df['End Date'], df['End Time'] = zip(*df['End Date Time'].apply(lambda x: x.split(' ')))

In [6]:
df.head()


Out[6]:
Trip ID Duration Start Date Time Start Station Start Terminal End Date Time End Station End Terminal Bike # Subscription Type Zip Code Start Date Start Time End Date End Time
0 4576 63 8/29/2013 14:13 South Van Ness at Market 66 8/29/2013 14:14 South Van Ness at Market 66 520 Subscriber 94127 8/29/2013 14:13 8/29/2013 14:14
1 4607 70 8/29/2013 14:42 San Jose City Hall 10 8/29/2013 14:43 San Jose City Hall 10 661 Subscriber 95138 8/29/2013 14:42 8/29/2013 14:43
2 4130 71 8/29/2013 10:16 Mountain View City Hall 27 8/29/2013 10:17 Mountain View City Hall 27 48 Subscriber 97214 8/29/2013 10:16 8/29/2013 10:17
3 4251 77 8/29/2013 11:29 San Jose City Hall 10 8/29/2013 11:30 San Jose City Hall 10 26 Subscriber 95060 8/29/2013 11:29 8/29/2013 11:30
4 4299 83 8/29/2013 12:02 South Van Ness at Market 66 8/29/2013 12:04 Market at 10th 67 319 Subscriber 94103 8/29/2013 12:02 8/29/2013 12:04

In [7]:
df.shape


Out[7]:
(669959, 15)

In [8]:
df_loc = pd.read_csv("DATA/babs_master/station_master.csv")

In [9]:
df_loc.head()


Out[9]:
station_id name lat long dockcount landmark installation
0 2 San Jose Diridon Caltrain Station 37.329732 -121.901782 27 San Jose 8/6/2013
1 3 San Jose Civic Center 37.330698 -121.888979 15 San Jose 8/5/2013
2 4 Santa Clara at Almaden 37.333988 -121.894902 11 San Jose 8/6/2013
3 5 Adobe on Almaden 37.331415 -121.893200 19 San Jose 8/5/2013
4 6 San Pedro Square 37.336721 -121.894074 15 San Jose 8/7/2013

In [10]:
df_merged_1 = pd.merge(df, df_loc, left_on='Start Station', right_on='name', how='inner', sort=False)

In [11]:
df_merged_1.shape


Out[11]:
(658617, 22)

In [12]:
df_merged_1.head()


Out[12]:
Trip ID Duration Start Date Time Start Station Start Terminal End Date Time End Station End Terminal Bike # Subscription Type ... Start Time End Date End Time station_id name lat long dockcount landmark installation
0 4576 63 8/29/2013 14:13 South Van Ness at Market 66 8/29/2013 14:14 South Van Ness at Market 66 520 Subscriber ... 14:13 8/29/2013 14:14 66 South Van Ness at Market 37.774814 -122.418954 19 San Francisco 8/23/2013
1 4299 83 8/29/2013 12:02 South Van Ness at Market 66 8/29/2013 12:04 Market at 10th 67 319 Subscriber ... 12:02 8/29/2013 12:04 66 South Van Ness at Market 37.774814 -122.418954 19 San Francisco 8/23/2013
2 4760 113 8/29/2013 17:01 South Van Ness at Market 66 8/29/2013 17:03 South Van Ness at Market 66 553 Subscriber ... 17:01 8/29/2013 17:03 66 South Van Ness at Market 37.774814 -122.418954 19 San Francisco 8/23/2013
3 5070 168 8/29/2013 21:43 South Van Ness at Market 66 8/29/2013 21:46 South Van Ness at Market 66 598 Subscriber ... 21:43 8/29/2013 21:46 66 South Van Ness at Market 37.774814 -122.418954 19 San Francisco 8/23/2013
4 4765 228 8/29/2013 17:05 South Van Ness at Market 66 8/29/2013 17:08 Market at 10th 67 553 Subscriber ... 17:05 8/29/2013 17:08 66 South Van Ness at Market 37.774814 -122.418954 19 San Francisco 8/23/2013

5 rows × 22 columns


In [13]:
df_merged_1.rename(columns = {'lat':'Start Latitute', 'long':'Start Longitude', 'dockcount': 'Start Station Dockcount', 'landmark': 'Start Station Landmark', 'installation': 'Start Station Installation'}, inplace = True)

In [14]:
df_merged_1.head()


Out[14]:
Trip ID Duration Start Date Time Start Station Start Terminal End Date Time End Station End Terminal Bike # Subscription Type ... Start Time End Date End Time station_id name Start Latitute Start Longitude Start Station Dockcount Start Station Landmark Start Station Installation
0 4576 63 8/29/2013 14:13 South Van Ness at Market 66 8/29/2013 14:14 South Van Ness at Market 66 520 Subscriber ... 14:13 8/29/2013 14:14 66 South Van Ness at Market 37.774814 -122.418954 19 San Francisco 8/23/2013
1 4299 83 8/29/2013 12:02 South Van Ness at Market 66 8/29/2013 12:04 Market at 10th 67 319 Subscriber ... 12:02 8/29/2013 12:04 66 South Van Ness at Market 37.774814 -122.418954 19 San Francisco 8/23/2013
2 4760 113 8/29/2013 17:01 South Van Ness at Market 66 8/29/2013 17:03 South Van Ness at Market 66 553 Subscriber ... 17:01 8/29/2013 17:03 66 South Van Ness at Market 37.774814 -122.418954 19 San Francisco 8/23/2013
3 5070 168 8/29/2013 21:43 South Van Ness at Market 66 8/29/2013 21:46 South Van Ness at Market 66 598 Subscriber ... 21:43 8/29/2013 21:46 66 South Van Ness at Market 37.774814 -122.418954 19 San Francisco 8/23/2013
4 4765 228 8/29/2013 17:05 South Van Ness at Market 66 8/29/2013 17:08 Market at 10th 67 553 Subscriber ... 17:05 8/29/2013 17:08 66 South Van Ness at Market 37.774814 -122.418954 19 San Francisco 8/23/2013

5 rows × 22 columns


In [15]:
df_merged_1 = df_merged_1.drop('name', 1)

In [16]:
df_merged_1 = pd.merge(df_merged_1, df_loc, left_on='End Station', right_on='name', how='inner', sort=False)

In [21]:
df_merged_1.head()


Out[21]:
Trip ID Duration Start Date Time Start Station Start Terminal End Date Time End Station End Terminal Bike # Subscription Type ... Start Longitude Start Station Dockcount Start Station Landmark Start Station Installation station_id_y End Latitute End Longitude End Station Dockcount End Station Landmark End Station Installation
0 4576 63 8/29/2013 14:13 South Van Ness at Market 66 8/29/2013 14:14 South Van Ness at Market 66 520 Subscriber ... -122.418954 19 San Francisco 8/23/2013 66 37.774814 -122.418954 19 San Francisco 8/23/2013
1 4760 113 8/29/2013 17:01 South Van Ness at Market 66 8/29/2013 17:03 South Van Ness at Market 66 553 Subscriber ... -122.418954 19 San Francisco 8/23/2013 66 37.774814 -122.418954 19 San Francisco 8/23/2013
2 5070 168 8/29/2013 21:43 South Van Ness at Market 66 8/29/2013 21:46 South Van Ness at Market 66 598 Subscriber ... -122.418954 19 San Francisco 8/23/2013 66 37.774814 -122.418954 19 San Francisco 8/23/2013
3 4584 262 8/29/2013 14:17 South Van Ness at Market 66 8/29/2013 14:21 South Van Ness at Market 66 587 Subscriber ... -122.418954 19 San Francisco 8/23/2013 66 37.774814 -122.418954 19 San Francisco 8/23/2013
4 4079 995 8/29/2013 9:35 South Van Ness at Market 66 8/29/2013 9:52 South Van Ness at Market 66 327 Subscriber ... -122.418954 19 San Francisco 8/23/2013 66 37.774814 -122.418954 19 San Francisco 8/23/2013

5 rows × 27 columns


In [18]:
df_merged_1.rename(columns = {'lat':'End Latitute', 'long':'End Longitude', 'dockcount': 'End Station Dockcount', 'landmark': 'End Station Landmark', 'installation': 'End Station Installation'}, inplace = True)

In [19]:
df_merged_1 = df_merged_1.drop('name', 1)

In [22]:
df_weather = pd.read_csv("DATA/babs_master/weather_master.csv")
df_merged_1 = pd.merge(df_merged_1, df_weather, left_on='Start Date', right_on='Date', how='inner', sort=False)

In [25]:
df_merged_1.columns


Out[25]:
Index([u'Trip ID', u'Duration', u'Start Date Time', u'Start Station',
       u'Start Terminal', u'End Date Time', u'End Station', u'End Terminal',
       u'Bike #', u'Subscription Type', u'Zip Code', u'Start Date',
       u'Start Time', u'End Date', u'End Time', u'station_id_x',
       u'Start Latitute', u'Start Longitude', u'Start Station Dockcount',
       u'Start Station Landmark', u'Start Station Installation',
       u'station_id_y', u'End Latitute', u'End Longitude',
       u'End Station Dockcount', u'End Station Landmark',
       u'End Station Installation', u'Date', u'Max_Temperature_F',
       u'Mean_Temperature_F', u'Min_TemperatureF', u'Max_Dew_Point_F',
       u'MeanDew_Point_F', u'Min_Dewpoint_F', u'Max_Humidity',
       u'Mean_Humidity ', u'Min_Humidity ', u'Max_Sea_Level_Pressure_In ',
       u'Mean_Sea_Level_Pressure_In ', u'Min_Sea_Level_Pressure_In ',
       u'Max_Visibility_Miles ', u'Mean_Visibility_Miles ',
       u'Min_Visibility_Miles ', u'Max_Wind_Speed_MPH ',
       u'Mean_Wind_Speed_MPH ', u'Max_Gust_Speed_MPH', u'Precipitation_In ',
       u'Cloud_Cover ', u'Events', u'Wind_Dir_Degrees'],
      dtype='object')

In [24]:
df_merged_1 = df_merged_1.drop('zip', 1)

In [26]:
df_merged_1 = df_merged_1.drop('Date', 1)

In [27]:
df = df_merged_1

In [28]:
month =[]
day_of_month =[]
hour_of_day =[]
day_of_week =[]
weekend = []

for i in range(len(df['Start Date Time'].values)):
    date = datetime.strptime(df['Start Date Time'].values[i], "%m/%d/%Y %H:%M")
    moy = date.month
    dom = date.day
    hod = date.hour
    month.append(moy)
    day_of_month.append(dom)
    hour_of_day.append(hod)
    day_of_week.append(date.isoweekday())
    if date.isoweekday() in range(1, 6):
        weekend.append(0)
    else:
        weekend.append(1)
df['month'] = month
df['day_of_month'] = day_of_month
df['hour_of_day'] = hour_of_day
df['day_of_week'] = day_of_week
df['weekend'] = weekend
df['Duration_in_mins'] = df['Duration'] /60

In [ ]:
df.shape

In [30]:
df.to_csv("DATA/babs_master/merged_master.csv", index=False)

In [29]:
df.head()


Out[29]:
Trip ID Duration Start Date Time Start Station Start Terminal End Date Time End Station End Terminal Bike # Subscription Type ... Precipitation_In Cloud_Cover Events Wind_Dir_Degrees month day_of_month hour_of_day day_of_week weekend Duration_in_mins
0 4576 63 8/29/2013 14:13 South Van Ness at Market 66 8/29/2013 14:14 South Van Ness at Market 66 520 Subscriber ... 0 4 NaN 286 8 29 14 4 0 1.05
1 4576 63 8/29/2013 14:13 South Van Ness at Market 66 8/29/2013 14:14 South Van Ness at Market 66 520 Subscriber ... 0 5 NaN 313 8 29 14 4 0 1.05
2 4576 63 8/29/2013 14:13 South Van Ness at Market 66 8/29/2013 14:14 South Van Ness at Market 66 520 Subscriber ... 0 4 NaN 355 8 29 14 4 0 1.05
3 4576 63 8/29/2013 14:13 South Van Ness at Market 66 8/29/2013 14:14 South Van Ness at Market 66 520 Subscriber ... 0 4 NaN 355 8 29 14 4 0 1.05
4 4576 63 8/29/2013 14:13 South Van Ness at Market 66 8/29/2013 14:14 South Van Ness at Market 66 520 Subscriber ... 0 4 NaN 320 8 29 14 4 0 1.05

5 rows × 55 columns


In [ ]: