How rider usage varies with temperature when binning months

Setting up the data


In [15]:
import numpy as np
import pandas as pd
import datetime
from pandas import Series, DataFrame
# Read the three tab-separated input files that sit next to the notebook.
# read_csv with an explicit tab separator is what read_table does by default.
stations, usage, weather = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('stations.tsv', 'usage_2012.tsv', 'daily_weather.tsv')
)

def change_seasons():
    """Rewrite ``season_desc`` in the module-level ``weather`` frame from ``season_code``.

    Codes 1, 2, 3 and 4 are labelled 'Winter', 'Spring', 'Summer' and 'Fall'
    respectively. Rows carrying any other code are left untouched. The frame
    is modified in place and nothing is returned.
    """
    code_to_label = ((1, 'Winter'), (2, 'Spring'), (3, 'Summer'), (4, 'Fall'))
    for code, label in code_to_label:
        in_season = weather["season_code"] == code
        weather.loc[in_season, "season_desc"] = label


def convert_dates():
    """Convert the ``date`` column of the module-level ``weather`` frame to ``datetime.date``.

    Every value is passed through ``str()`` and parsed with the strict
    ``%Y-%m-%d`` format, so running the function again on an already
    converted column yields the same dates. The frame is modified in place
    and nothing is returned.

    Raises:
        ValueError: if any value does not match ``%Y-%m-%d``. The column is
            only replaced after every value has parsed, so a failure leaves
            it unchanged.
    """
    # Build the whole column in one assignment instead of writing cell by
    # cell through DataFrame.ix, which no longer exists in current pandas.
    weather['date'] = [
        datetime.datetime.strptime(str(raw), "%Y-%m-%d").date()
        for raw in weather['date']
    ]


def add_months():
    """Add an integer ``month`` column to the module-level ``weather`` frame.

    The month is read from the ``.month`` attribute of each value in the
    ``date`` column, so those values must already be date-like objects
    (``convert_dates`` produces ``datetime.date`` values). The frame is
    modified in place and nothing is returned.
    """
    # Assign the full column at once. Creating it cell by cell through the
    # removed DataFrame.ix indexer first filled it with NaN, which made the
    # month numbers floats.
    weather['month'] = [day.month for day in weather['date']]
        

# Run the preparation steps on the module-level `weather` frame.
# Order matters: add_months() reads the .month attribute of each 'date'
# value, so convert_dates() has to turn the date strings into date objects first.
change_seasons()
convert_dates()
add_months()

Getting the data into groupby objects and getting a correlation table


In [23]:
# Group the daily weather rows by calendar month.
months = weather[['month', 'subjective_temp', 'total_riders']].groupby('month')

# Correlate only the two measurement columns within each month. Selecting
# them explicitly keeps the grouping key out of the result on every pandas
# version, so no `del corrdf['month']` cleanup is needed (that deletion
# raises KeyError when pandas already leaves the key out).
corrdf = months[['subjective_temp', 'total_riders']].corr()

# Drop any rows for which a correlation could not be computed (NaN).
corrdf = corrdf.dropna()

# print() with a single argument behaves the same on Python 2 and Python 3.
print(corrdf)


                       subjective_temp  total_riders
month                                               
1     subjective_temp         1.000000      0.692407
      total_riders            0.692407      1.000000
2     subjective_temp         1.000000      0.733898
      total_riders            0.733898      1.000000
3     subjective_temp         1.000000      0.728631
      total_riders            0.728631      1.000000
4     subjective_temp         1.000000      0.551594
      total_riders            0.551594      1.000000
5     subjective_temp         1.000000      0.064179
      total_riders            0.064179      1.000000
6     subjective_temp         1.000000     -0.339782
      total_riders           -0.339782      1.000000
7     subjective_temp         1.000000     -0.224447
      total_riders           -0.224447      1.000000
8     subjective_temp         1.000000      0.045404
      total_riders            0.045404      1.000000
9     subjective_temp         1.000000     -0.351309
      total_riders           -0.351309      1.000000
10    subjective_temp         1.000000      0.498876
      total_riders            0.498876      1.000000
11    subjective_temp         1.000000      0.535224
      total_riders            0.535224      1.000000
12    subjective_temp         1.000000      0.718123
      total_riders            0.718123      1.000000

[24 rows x 2 columns]

Comments on above

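As the correlations show, ridership rises with temperature in the cooler months (October through April, with correlations of roughly 0.50 to 0.73) and tends to fall as temperature rises in the warmer months (June, July and September are all negative). Riders do not seem to appreciate especially cold or especially hot temperatures. The exceptions are May and August, where the correlation is close to zero (0.06 and 0.05), so riders seem to be neither encouraged nor discouraged by temperature in those months.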

This is further illustrated by regrouping the data by season:


In [29]:
# Group the daily weather rows by season label.
seasons = weather[['season_desc', 'subjective_temp', 'total_riders']].groupby('season_desc')

# Correlate only the two measurement columns within each season; the
# grouping key is deliberately left out of the selection.
corrdf = seasons[['subjective_temp', 'total_riders']].corr()

# Drop any rows for which a correlation could not be computed (NaN).
corrdf = corrdf.dropna()

# print() with a single argument behaves the same on Python 2 and Python 3.
print(corrdf)


                             subjective_temp  total_riders
season_desc                                               
Fall        subjective_temp         1.000000      0.635616
            total_riders            0.635616      1.000000
Spring      subjective_temp         1.000000      0.435419
            total_riders            0.435419      1.000000
Summer      subjective_temp         1.000000     -0.282228
            total_riders           -0.282228      1.000000
Winter      subjective_temp         1.000000      0.812550
            total_riders            0.812550      1.000000

[8 rows x 2 columns]

In [ ]: