Data scientist, programmer, co-organize Montréal All-Girl Hack Night, PyLadies MTL
You can follow along with this talk at: http://bit.ly/pyconca-pandas
sudo apt-get install ipython-notebook
sudo apt-get install python-pandas
pip install ipython tornado pyzmq
pip install pandas
Taken from http://donnees.ville.montreal.qc.ca/fiche/velos-comptage/ (click "Vélos - comptage")
Number of people per day on 7 bike paths (collected using sensors)
Download and unzip the zip file from this page to run this yourself.
In [1]:
import pandas as pd
In [2]:
# Display and figure configuration, kept in one cell near the top so a
# fresh kernel picks it up before any plotting.
import matplotlib
import matplotlib.style

pd.set_option('display.max_columns', 15)
# 'display.line_width' was renamed to 'display.width' in pandas
pd.set_option('display.width', 400)
# 'display.mpl_style' was removed from pandas; matplotlib's style
# machinery is the modern equivalent of the old pandas 'default' style
matplotlib.style.use('ggplot')
# rcParams must be qualified with the module — the bare `rcParams` name
# only exists after `%pylab`, which this notebook does not use
matplotlib.rcParams['figure.figsize'] = (14, 7)

# 'normal' is not a real font family (matplotlib warns and falls back);
# 'sans-serif' is the intended default
font = {'family': 'sans-serif',
        'weight': 'bold',
        'size': 22}
matplotlib.rc('font', **font)
In [3]:
# First, naive attempt at reading the file — the result looks wrong
# because the file is actually latin1-encoded and ';'-separated.
csv_path = "./2012.csv"
bike_data = pd.read_csv(csv_path)
bike_data.head()
Out[3]:
In [4]:
bike_data = pd.read_csv("./2012.csv", encoding='latin1', sep=';', index_col='Date', parse_dates=True, dayfirst=True)
In [5]:
# Drop every column that contains missing values
bike_data = bike_data.dropna(axis='columns')
# Only use 3 of the columns so it all fits on the screen
In [6]:
# Keep three paths so the frame fits on screen, then peek at it
paths_to_keep = ['Berri 1', u'Côte-Sainte-Catherine', 'Maisonneuve 1']
bike_data = bike_data[paths_to_keep]
bike_data.head()
Out[6]:
Exercise: Parse the CSVs from 2011 and earlier (warning: it's annoying)
In [7]:
bike_data[:3]
Out[7]:
In [8]:
bike_data.plot()
Out[8]:
In [9]:
bike_data.median()
Out[9]:
In [10]:
bike_data.median().plot(kind='bar')
Out[10]:
In [11]:
# Column slice: pick two paths (the name column_slice is reused below)
two_paths = ['Berri 1', 'Maisonneuve 1']
column_slice = bike_data[two_paths]
# Row slice: first three rows
column_slice.head(3)
Out[11]:
In [12]:
column_slice.plot()
Out[12]:
In [13]:
# Add a weekday column from the DatetimeIndex (0 = Monday … 6 = Sunday)
bike_data['weekday'] = bike_data.index.weekday
bike_data[:5]
Out[13]:
In [14]:
# Total riders per weekday. The original passed numpy.sum, but numpy is
# never imported in this notebook, so this raised NameError on a fresh
# kernel; the 'sum' aggregation alias is equivalent and needs no import.
counts_by_day = bike_data.groupby('weekday').aggregate('sum')
counts_by_day
Out[14]:
In [15]:
# Replace the 0-6 weekday index with readable names, then plot
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
counts_by_day.index = weekday_names
counts_by_day.plot(kind='line')
Out[15]:
In [16]:
bike_data['Berri 1'].plot()
Out[16]:
In [17]:
def get_weather_data(year):
    """Download hourly weather for Montréal's airport station for ``year``.

    Fetches one CSV per month from Environment Canada, strips the degree
    sign from column names (so 'Temp (°C)' becomes 'Temp (C)', matching
    how the column is referenced later in the notebook), drops the
    redundant date/quality columns, and concatenates the twelve months
    into a single frame indexed by timestamp.

    NOTE(review): this bulkdata_e.html endpoint has since been retired by
    Environment Canada — confirm the current bulk-download URL before
    running.
    """
    # mctavish station: 10761, airport station: 5415
    url_template = ("http://climate.weather.gc.ca/climateData/bulkdata_e.html"
                    "?format=csv&stationID=5415&Year={year}&Month={month}"
                    "&timeframe=1&submit=Download+Data")
    data_by_month = []
    for month in range(1, 13):
        url = url_template.format(year=year, month=month)
        weather_data = pd.read_csv(url, skiprows=16, index_col='Date/Time',
                                   parse_dates=True).dropna(axis=1)
        # List comprehension instead of map(): under Python 3, map()
        # returns an iterator, and assigning that to .columns raises
        # ("Index(...) must be called with a collection").
        weather_data.columns = [c.replace('\xb0', '')
                                for c in weather_data.columns]
        weather_data = weather_data.drop(
            ['Year', 'Day', 'Month', 'Time', 'Data Quality'], axis=1)
        data_by_month.append(weather_data.dropna())
    # Concatenate and drop any empty columns
    return pd.concat(data_by_month).dropna(axis=1, how='all').dropna()
In [18]:
weather_data = get_weather_data(2012)
In [19]:
weather_data[:5]
Out[19]:
In [20]:
bike_data['mean temp'] = weather_data['Temp (C)'].resample('D', how='mean')
In [21]:
bike_data.head()
Out[21]:
In [22]:
bike_data[['Berri 1', 'mean temp']].plot(subplots=True)
Out[22]:
In [23]:
bike_data['Rain'] = weather_data['Weather'].str.contains('Rain').map(lambda x: int(x)).resample('D', how='mean')
In [24]:
bike_data[['Berri 1', 'Rain']].plot(subplots=True)
Out[24]:
In [25]:
# Look at everything between May and September (label-based row slice
# on the DatetimeIndex; .loc makes the row slicing explicit)
summertime_data = bike_data.loc['2012-05-01':'2012-09-01']
In [26]:
summertime_data['Berri 1'][:5] < 2500
Out[26]:
In [27]:
# Summer days with unusually few riders on Berri, shown with the
# weather/weekday columns that might explain them
summertime_data = bike_data['2012-05-01':'2012-09-01']
quiet = summertime_data['Berri 1'] < 2500
bad_days = summertime_data[quiet]
bad_days[['Berri 1', 'Rain', 'mean temp', 'weekday']]
Out[27]:
In [28]:
julia = {'email': 'julia@jvns.ca', 'twitter': 'http://twitter.com/b0rk'}
In [29]:
# Python 3: print is a function, not a statement — the original Python 2
# `print 'x'` form is a SyntaxError on a modern kernel. Output text is
# unchanged.
print('Email:', julia['email'])
print('Twitter:', julia['twitter'])
print('Slides: http://bit.ly/pyconca-pandas')