In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
# import plotly.plotly as py
%matplotlib inline
from datetime import datetime, timedelta

In [2]:
df = pd.read_csv("DATA/babs_master/merged_master.csv", low_memory=False)

In [3]:
df_station = pd.read_csv("DATA/babs_master/station_master.csv")

In [4]:
df_station


Out[4]:
station_id name lat long dockcount landmark installation
0 2 San Jose Diridon Caltrain Station 37.329732 -121.901782 27 San Jose 8/6/2013
1 3 San Jose Civic Center 37.330698 -121.888979 15 San Jose 8/5/2013
2 4 Santa Clara at Almaden 37.333988 -121.894902 11 San Jose 8/6/2013
3 5 Adobe on Almaden 37.331415 -121.893200 19 San Jose 8/5/2013
4 6 San Pedro Square 37.336721 -121.894074 15 San Jose 8/7/2013
5 7 Paseo de San Antonio 37.333798 -121.886943 15 San Jose 8/7/2013
6 8 San Salvador at 1st 37.330165 -121.885831 15 San Jose 8/5/2013
7 9 Japantown 37.348742 -121.894715 15 San Jose 8/5/2013
8 10 San Jose City Hall 37.337391 -121.886995 15 San Jose 8/6/2013
9 11 MLK Library 37.335885 -121.885660 19 San Jose 8/6/2013
10 12 SJSU 4th at San Carlos 37.332808 -121.883891 19 San Jose 8/7/2013
11 13 St James Park 37.339301 -121.889937 15 San Jose 8/6/2013
12 14 Arena Green / SAP Center 37.332692 -121.900084 19 San Jose 8/5/2013
13 16 SJSU - San Salvador at 9th 37.333955 -121.877349 15 San Jose 8/7/2013
14 21 Franklin at Maple 37.481758 -122.226904 15 Redwood City 8/12/2013
15 22 Redwood City Caltrain Station 37.486078 -122.232089 25 Redwood City 8/15/2013
16 23 San Mateo County Center 37.487616 -122.229951 15 Redwood City 8/15/2013
17 24 Redwood City Public Library 37.484219 -122.227424 15 Redwood City 8/12/2013
18 25 Stanford in Redwood City 37.485370 -122.203288 15 Redwood City 8/12/2013
19 26 Redwood City Medical Center 37.487682 -122.223492 15 Redwood City 8/12/2013
20 27 Mountain View City Hall 37.389218 -122.081896 15 Mountain View 8/16/2013
21 28 Mountain View Caltrain Station 37.394358 -122.076713 23 Mountain View 8/15/2013
22 29 San Antonio Caltrain Station 37.406940 -122.106758 23 Mountain View 8/15/2013
23 30 Evelyn Park and Ride 37.390277 -122.066553 15 Mountain View 8/16/2013
24 31 San Antonio Shopping Center 37.400443 -122.108338 15 Mountain View 12/31/2013
25 32 Castro Street and El Camino Real 37.385956 -122.083678 11 Mountain View 12/31/2013
26 33 Rengstorff Avenue / California Street 37.400241 -122.099076 15 Mountain View 8/16/2013
27 34 Palo Alto Caltrain Station 37.443988 -122.164759 23 Palo Alto 8/14/2013
28 35 University and Emerson 37.444521 -122.163093 11 Palo Alto 8/15/2013
29 36 California Ave Caltrain Station 37.429082 -122.142805 15 Palo Alto 8/14/2013
... ... ... ... ... ... ... ...
40 51 Embarcadero at Folsom 37.791464 -122.391034 19 San Francisco 8/20/2013
41 39 Powell Street BART 37.783871 -122.408433 19 San Francisco 8/25/2013
42 54 Embarcadero at Bryant 37.787152 -122.388013 15 San Francisco 8/20/2013
43 55 Temporary Transbay Terminal (Howard at Beale) 37.789756 -122.394643 23 San Francisco 8/20/2013
44 56 Beale at Market 37.792251 -122.397086 19 San Francisco 8/20/2013
45 57 5th at Howard 37.781752 -122.405127 15 San Francisco 8/21/2013
46 58 San Francisco City Hall 37.778650 -122.418235 19 San Francisco 8/21/2013
47 59 Golden Gate at Polk 37.781332 -122.418603 23 San Francisco 8/21/2013
48 60 Embarcadero at Sansome 37.804770 -122.403234 15 San Francisco 8/21/2013
49 61 2nd at Townsend 37.780526 -122.390288 27 San Francisco 8/22/2013
50 62 2nd at Folsom 37.785299 -122.396236 19 San Francisco 8/22/2013
51 63 Howard at 2nd 37.786978 -122.398108 19 San Francisco 8/22/2013
52 64 2nd at South Park 37.782259 -122.392738 15 San Francisco 8/22/2013
53 65 Townsend at 7th 37.771058 -122.402717 15 San Francisco 8/22/2013
54 66 South Van Ness at Market 37.774814 -122.418954 19 San Francisco 8/23/2013
55 67 Market at 10th 37.776619 -122.417385 27 San Francisco 8/23/2013
56 68 Yerba Buena Center of the Arts (3rd @ Howard) 37.784878 -122.401014 19 San Francisco 8/23/2013
57 69 San Francisco Caltrain 2 (330 Townsend) 37.776600 -122.395470 23 San Francisco 8/23/2013
58 70 San Francisco Caltrain (Townsend at 4th) 37.776617 -122.395260 19 San Francisco 8/23/2013
59 71 Powell at Post (Union Square) 37.788446 -122.408499 19 San Francisco 8/23/2013
60 72 Civic Center BART (7th at Market) 37.781039 -122.411748 23 San Francisco 8/23/2013
61 73 Grant Avenue at Columbus Avenue 37.798522 -122.407245 15 San Francisco 8/21/2013
62 74 Steuart at Market 37.794139 -122.394434 23 San Francisco 8/25/2013
63 75 Mechanics Plaza (Market at Battery) 37.791300 -122.399051 19 San Francisco 8/25/2013
64 76 Market at 4th 37.786305 -122.404966 19 San Francisco 8/25/2013
65 77 Market at Sansome 37.789625 -122.400811 27 San Francisco 8/25/2013
66 80 Santa Clara County Civic Center 37.352601 -121.905733 15 San Jose 12/31/2013
67 82 Broadway St at Battery St 37.798541 -122.400862 15 San Francisco 1/22/2014
68 83 Mezes Park 37.491269 -122.236234 15 Redwood City 2/20/2014
69 84 Ryland Park 37.342725 -121.895617 15 San Jose 4/9/2014

70 rows × 7 columns


In [5]:
df.columns


Out[5]:
Index([u'Trip ID', u'Duration', u'Start Date Time', u'Start Station',
       u'Start Terminal', u'End Date Time', u'End Station', u'End Terminal',
       u'Bike #', u'Subscription Type', u'Zip Code', u'Start Date',
       u'Start Time', u'End Date', u'End Time', u'station_id_x',
       u'Start Latitute', u'Start Longitude', u'Start Station Dockcount',
       u'Start Station Landmark', u'Start Station Installation',
       u'station_id_y', u'End Latitute', u'End Longitude',
       u'End Station Dockcount', u'End Station Landmark',
       u'End Station Installation'],
      dtype='object')

In [6]:
df.head()


Out[6]:
Trip ID Duration Start Date Time Start Station Start Terminal End Date Time End Station End Terminal Bike # Subscription Type ... Start Longitude Start Station Dockcount Start Station Landmark Start Station Installation station_id_y End Latitute End Longitude End Station Dockcount End Station Landmark End Station Installation
0 4576 63 8/29/2013 14:13 South Van Ness at Market 66 8/29/2013 14:14 South Van Ness at Market 66 520 Subscriber ... -122.418954 19 San Francisco 8/23/2013 66 37.774814 -122.418954 19 San Francisco 8/23/2013
1 4760 113 8/29/2013 17:01 South Van Ness at Market 66 8/29/2013 17:03 South Van Ness at Market 66 553 Subscriber ... -122.418954 19 San Francisco 8/23/2013 66 37.774814 -122.418954 19 San Francisco 8/23/2013
2 5070 168 8/29/2013 21:43 South Van Ness at Market 66 8/29/2013 21:46 South Van Ness at Market 66 598 Subscriber ... -122.418954 19 San Francisco 8/23/2013 66 37.774814 -122.418954 19 San Francisco 8/23/2013
3 4584 262 8/29/2013 14:17 South Van Ness at Market 66 8/29/2013 14:21 South Van Ness at Market 66 587 Subscriber ... -122.418954 19 San Francisco 8/23/2013 66 37.774814 -122.418954 19 San Francisco 8/23/2013
4 4079 995 8/29/2013 9:35 South Van Ness at Market 66 8/29/2013 9:52 South Van Ness at Market 66 327 Subscriber ... -122.418954 19 San Francisco 8/23/2013 66 37.774814 -122.418954 19 San Francisco 8/23/2013

5 rows × 27 columns

Duration


In [7]:
df = df[df['Duration'] < 1440]
df['minutes'] = df['Duration'] /60

plt.hist(df['minutes'], bins = 10, range = (df['minutes'].min(),df['minutes'].max()), color="orange")
plt.title("Duration of Bike Rental")
plt.xlabel("Bike Ride in Minutes")
plt.ylabel("Frequency")

plt.plot()


Out[7]:
[]

Subscription Type


In [8]:
df['Subscription Type'].value_counts()


Out[8]:
Subscriber    540819
Customer       60422
Name: Subscription Type, dtype: int64

In [9]:
labels = ['Subscriber', 'Customer']
fig, ax = plt.subplots()
y = [df['Subscription Type'].value_counts()[0], df['Subscription Type'].value_counts()[1]]
N = len(y)
x = range(N)
ax.set_xticklabels(labels, rotation='vertical')
width = 1/1.5
plt.bar(x, y, width, color="green")
plt.show()


Dock Counts


In [10]:
labels = df_station['name'].unique()
fig, ax = plt.subplots()
y = [df_station['dockcount'][i] for i in range(len(df_station))]
N = len(y)
x = range(N)
ax.set_xticklabels(labels, rotation='vertical')
width = 1/1.5
plt.bar(x, y, width, color="blue")
plt.show()


Start Station


In [11]:
df['Start Station'].value_counts()


Out[11]:
San Francisco Caltrain (Townsend at 4th)         46782
San Francisco Caltrain 2 (330 Townsend)          32346
Harry Bridges Plaza (Ferry Building)             27456
Temporary Transbay Terminal (Howard at Beale)    25158
2nd at Townsend                                  24191
Steuart at Market                                23103
Embarcadero at Sansome                           23050
Market at Sansome                                22760
Townsend at 7th                                  22715
Market at 10th                                   18495
Market at 4th                                    17995
2nd at South Park                                17190
Powell Street BART                               16190
2nd at Folsom                                    15110
Beale at Market                                  14842
Grant Avenue at Columbus Avenue                  14592
Embarcadero at Bryant                            13425
Embarcadero at Folsom                            12962
Civic Center BART (7th at Market)                12822
5th at Howard                                    12499
Howard at 2nd                                    12440
South Van Ness at Market                         11802
Commercial at Montgomery                         11256
Spear at Folsom                                  11247
Mechanics Plaza (Market at Battery)              11075
Powell at Post (Union Square)                    10507
Yerba Buena Center of the Arts (3rd @ Howard)    10229
Broadway St at Battery St                         9665
San Jose Diridon Caltrain Station                 9330
Davis at Jackson                                  9111
                                                 ...  
Paseo de San Antonio                              2042
Castro Street and El Camino Real                  1832
MLK Library                                       1806
San Antonio Caltrain Station                      1770
Japantown                                         1664
Ryland Park                                       1639
San Antonio Shopping Center                       1621
St James Park                                     1583
Palo Alto Caltrain Station                        1578
San Salvador at 1st                               1531
Evelyn Park and Ride                              1482
Redwood City Caltrain Station                     1375
SJSU - San Salvador at 9th                        1353
Arena Green / SAP Center                          1295
Washington at Kearney                             1220
Adobe on Almaden                                  1153
Cowper at University                              1129
San Jose Civic Center                             1127
SJSU 4th at San Carlos                            1013
Rengstorff Avenue / California Street              937
Santa Clara County Civic Center                    761
California Ave Caltrain Station                    699
Park at Olive                                      546
University and Emerson                             518
Stanford in Redwood City                           388
Mezes Park                                         302
Redwood City Medical Center                        281
San Mateo County Center                            195
Redwood City Public Library                        169
Franklin at Maple                                  168
Name: Start Station, dtype: int64

In [12]:
labels = df['Start Station'].unique()
fig, ax = plt.subplots()
y = [df['Start Station'].value_counts()[i] for i in range(len(df['Start Station'].unique()))]
N = len(y)
x = range(N)
ax.set_xticklabels(labels, rotation='vertical')
width = 1/1.5
plt.bar(x, y, width, color="blue")
plt.show()


End Station


In [13]:
df['Start Station'].value_counts()


Out[13]:
San Francisco Caltrain (Townsend at 4th)         46782
San Francisco Caltrain 2 (330 Townsend)          32346
Harry Bridges Plaza (Ferry Building)             27456
Temporary Transbay Terminal (Howard at Beale)    25158
2nd at Townsend                                  24191
Steuart at Market                                23103
Embarcadero at Sansome                           23050
Market at Sansome                                22760
Townsend at 7th                                  22715
Market at 10th                                   18495
Market at 4th                                    17995
2nd at South Park                                17190
Powell Street BART                               16190
2nd at Folsom                                    15110
Beale at Market                                  14842
Grant Avenue at Columbus Avenue                  14592
Embarcadero at Bryant                            13425
Embarcadero at Folsom                            12962
Civic Center BART (7th at Market)                12822
5th at Howard                                    12499
Howard at 2nd                                    12440
South Van Ness at Market                         11802
Commercial at Montgomery                         11256
Spear at Folsom                                  11247
Mechanics Plaza (Market at Battery)              11075
Powell at Post (Union Square)                    10507
Yerba Buena Center of the Arts (3rd @ Howard)    10229
Broadway St at Battery St                         9665
San Jose Diridon Caltrain Station                 9330
Davis at Jackson                                  9111
                                                 ...  
Paseo de San Antonio                              2042
Castro Street and El Camino Real                  1832
MLK Library                                       1806
San Antonio Caltrain Station                      1770
Japantown                                         1664
Ryland Park                                       1639
San Antonio Shopping Center                       1621
St James Park                                     1583
Palo Alto Caltrain Station                        1578
San Salvador at 1st                               1531
Evelyn Park and Ride                              1482
Redwood City Caltrain Station                     1375
SJSU - San Salvador at 9th                        1353
Arena Green / SAP Center                          1295
Washington at Kearney                             1220
Adobe on Almaden                                  1153
Cowper at University                              1129
San Jose Civic Center                             1127
SJSU 4th at San Carlos                            1013
Rengstorff Avenue / California Street              937
Santa Clara County Civic Center                    761
California Ave Caltrain Station                    699
Park at Olive                                      546
University and Emerson                             518
Stanford in Redwood City                           388
Mezes Park                                         302
Redwood City Medical Center                        281
San Mateo County Center                            195
Redwood City Public Library                        169
Franklin at Maple                                  168
Name: Start Station, dtype: int64

In [14]:
labels = df['End Station'].unique()
fig, ax = plt.subplots()
y = [df['End Station'].value_counts()[i] for i in range(len(df['Start Station'].unique()))]
N = len(y)
x = range(N)
ax.set_xticklabels(labels, rotation='vertical')
width = 1/1.5
plt.bar(x, y, width, color="red")
plt.show()


Start Zip Code/Lat-Long

End Zip code/Lat- Long

New Features

Start


In [15]:
month =[]
day_of_month =[]
hour_of_day =[]
day_of_week =[]
weekend = []

for i in range(len(df['Start Date Time'].values)):
    date = datetime.strptime(df['Start Date Time'].values[i], "%m/%d/%Y %H:%M")
    moy = date.month
    dom = date.day
    hod = date.hour
    month.append(moy)
    day_of_month.append(dom)
    hour_of_day.append(hod)
    day_of_week.append(date.isoweekday())
    if date.isoweekday() in range(1, 6):
        weekend.append(0)
    else:
        weekend.append(1)
df['month'] = month
df['day_of_month'] = day_of_month
df['hour_of_day'] = hour_of_day
df['day_of_week'] = day_of_week
df['weekend'] = weekend

Day of the Week


In [15]:
labels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
fig, ax = plt.subplots()
plt.hist(df['day_of_week'], bins = 7, range = (df['day_of_week'].min(),df['day_of_week'].max()))
ax.set_xticklabels(labels, rotation='vertical')
plt.title("day of the Week for Bike Rental")
plt.xlabel("Day of the Week")
plt.ylabel("Frequency")

plt.plot()


Out[15]:
[]

Hour of the Day


In [16]:
plt.hist(df['hour_of_day'], bins = 24, range = (df['hour_of_day'].min(),df['hour_of_day'].max()))
plt.title("hour of day of Bike Rental")
plt.xlabel("Hour of the day of Bike Rental")
plt.ylabel("Frequency")

plt.plot()


Out[16]:
[]

Month


In [17]:
labels = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
fig, ax = plt.subplots()
plt.hist(df['month'], bins = 12, range = (df['month'].min(),df['month'].max()))
ax.set_xticklabels(labels, rotation='vertical')
plt.title("month of Bike Rental")
plt.xlabel("Value")
plt.ylabel("Frequency")

plt.plot()


Out[17]:
[]

Weekend/Weekday


In [104]:
labels = ['Weekday', 'Weekend']
fig, ax = plt.subplots()
plt.hist(df['weekend'],bins = 2, width = .4, range = (df['weekend'].min(),df['weekend'].max()) )
plt.title("Is Weekend of Bike Rental")
ax.set_xticklabels(labels, rotation='vertical')
plt.xlabel("Value")
plt.ylabel("Frequency")


Out[104]:
<matplotlib.text.Text at 0x1127ce110>

Distance

Plot Trips


In [18]:
df['Start Latitute'].max(), df['Start Latitute'].min(), df['Start Longitude'].max(), df['Start Longitude'].min()


Out[18]:
(37.804769999999998, 37.329732, -121.877349, -122.418954)

In [19]:
fig = plt.figure(figsize=(20,10))

ax = fig.add_subplot(111)
ax.set_axis_bgcolor('black')
plt.scatter( x=df['Start Longitude'], y=df['Start Latitute'], color='white', s=.02,alpha=.6)


Out[19]:
<matplotlib.collections.PathCollection at 0x10e82f950>

Weather


In [21]:
df_w = pd.read_csv("DATA/babs_master/weather_master.csv")

In [44]:
df_w.columns


Out[44]:
Index([u'Date', u'Max_Temperature_F', u'Mean_Temperature_F',
       u'Min_TemperatureF', u'Max_Dew_Point_F', u'MeanDew_Point_F',
       u'Min_Dewpoint_F', u'Max_Humidity', u'Mean_Humidity ', u'Min_Humidity ',
       u'Max_Sea_Level_Pressure_In ', u'Mean_Sea_Level_Pressure_In ',
       u'Min_Sea_Level_Pressure_In ', u'Max_Visibility_Miles ',
       u'Mean_Visibility_Miles ', u'Min_Visibility_Miles ',
       u'Max_Wind_Speed_MPH ', u'Mean_Wind_Speed_MPH ', u'Max_Gust_Speed_MPH',
       u'Precipitation_In ', u'Cloud_Cover ', u'Events', u'Wind_Dir_Degrees',
       u'zip', u'month'],
      dtype='object')

In [25]:
df_w['month'] = df_w['Date'].apply(lambda x : x.split("/")[0])

In [40]:
df_grp = df_w.groupby('month').mean()

In [48]:
df_grp.columns


Out[48]:
Index([u'Max_Temperature_F', u'Mean_Temperature_F', u'Min_TemperatureF',
       u'Max_Dew_Point_F', u'MeanDew_Point_F', u'Min_Dewpoint_F',
       u'Max_Humidity', u'Mean_Humidity ', u'Min_Humidity ',
       u'Max_Sea_Level_Pressure_In ', u'Mean_Sea_Level_Pressure_In ',
       u'Min_Sea_Level_Pressure_In ', u'Max_Visibility_Miles ',
       u'Mean_Visibility_Miles ', u'Min_Visibility_Miles ',
       u'Max_Wind_Speed_MPH ', u'Mean_Wind_Speed_MPH ', u'Max_Gust_Speed_MPH',
       u'Cloud_Cover ', u'Wind_Dir_Degrees', u'zip'],
      dtype='object')

In [63]:
mpl.rc('savefig', dpi=200)
params = {'figure.dpi' : 200,
          'figure.figsize' : (12, 10),
          'axes.axisbelow' : True,
          'lines.antialiased' : True,
          'axes.titlesize' : 'xx-large',
          'axes.labelsize' : 'x-large',
          'xtick.labelsize' : 'large',
          'ytick.labelsize' : 'large'}

for (k, v) in params.iteritems():
    plt.rcParams[k] = v

In [64]:
y = df_grp['Mean_Temperature_F'].values
N = len(y)
x = range(N)
labels = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

plt.bar(x, y, width=0.8, align='center', edgecolor='#cccccc')

plt.title("Mean Temperature")
plt.xlabel("\nMonth")
plt.ylabel("Temperature in F")

plt.gca().get_yaxis().set_major_formatter(
    mpl.ticker.FuncFormatter(lambda y, p: format(int(y), ','))
)

plt.xlim(-1, N);
plt.xticks(range(len(labels))[::], labels[::], rotation=90);



In [49]:
y = df_grp['Mean_Humidity '].values
N = len(y)
x = range(N)
labels = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

plt.bar(x, y, width=0.8, align='center', edgecolor='#cccccc')

plt.title("Mean Humidity")
plt.xlabel("\nMonth")
plt.ylabel("Humidity in %")

plt.gca().get_yaxis().set_major_formatter(
    mpl.ticker.FuncFormatter(lambda y, p: format(int(y), ','))
)

plt.xlim(-1, N);
plt.xticks(range(len(labels))[::], labels[::], rotation=90);



In [52]:
y = df_grp['Max_Visibility_Miles '].values
N = len(y)
x = range(N)
labels = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

plt.bar(x, y, width=0.8, align='center', edgecolor='#cccccc')

plt.title("Max Visibility")
plt.xlabel("\nMonth")
plt.ylabel("Max Visibility in miles")

plt.gca().get_yaxis().set_major_formatter(
    mpl.ticker.FuncFormatter(lambda y, p: format(int(y), ','))
)

plt.xlim(-1, N);
plt.xticks(range(len(labels))[::], labels[::], rotation=90);



In [53]:
y = df_grp['Mean_Wind_Speed_MPH '].values
N = len(y)
x = range(N)
labels = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

plt.bar(x, y, width=0.8, align='center', edgecolor='#cccccc')

plt.title("Wind Speed")
plt.xlabel("\nMonth")
plt.ylabel("Wind Speed in miles/hour")

plt.gca().get_yaxis().set_major_formatter(
    mpl.ticker.FuncFormatter(lambda y, p: format(int(y), ','))
)

plt.xlim(-1, N);
plt.xticks(range(len(labels))[::], labels[::], rotation=90);



In [54]:
y = df_grp['Cloud_Cover '].values
N = len(y)
x = range(N)
labels = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

plt.bar(x, y, width=0.8, align='center', edgecolor='#cccccc')

plt.title("Cloud Cover")
plt.xlabel("\nMonth")
plt.ylabel("Cloud Cover")

plt.gca().get_yaxis().set_major_formatter(
    mpl.ticker.FuncFormatter(lambda y, p: format(int(y), ','))
)

plt.xlim(-1, N);
plt.xticks(range(len(labels))[::], labels[::], rotation=90);



In [ ]: