In [1]:
from pandas import DataFrame, Series
import pandas as pd
import numpy as np
When working with matplotlib, we usually import its pyplot interface:
In [2]:
import matplotlib.pyplot as plt
Then we use an IPython magic command so that plots show up inline, right here in the notebook:
In [3]:
# IPython magic: render matplotlib figures inline, directly below the cell that draws them.
%matplotlib inline
We can check that this works with a quick sample plot:
In [8]:
# Smoke test for inline plotting: one line segment from (1, 2) to (3, 4),
# with a title and both axes labelled.
sample_x = [1, 3]
sample_y = [2, 4]
plt.plot(sample_x, sample_y)
plt.title("This is just a sample graph")
plt.xlabel("This is just an x-axis")
plt.ylabel("This is just a y-axis")
Out[8]:
In [10]:
# Ten bars at x = 0..9 with random heights.
# No random seed is set here, so the heights differ on every run.
bar_positions = range(10)
bar_heights = np.random.rand(10)
plt.bar(bar_positions, bar_heights)
plt.title("A random bar chart")
Out[10]:
In [16]:
xs = range(10)

# One (colour, marker, legend label) triple per series. Each series is the
# line y = x plus its own random offsets scaled by 5.
series_styles = [
    ('r', '*', 'series1'),
    ('g', 'o', 'series2'),
]
for series_color, series_marker, series_label in series_styles:
    noisy_ys = 5 * np.random.rand(10) + xs
    plt.scatter(xs, noisy_ys, color=series_color, marker=series_marker, label=series_label)

plt.title("A scatterplot with two series")
# 'upper center' is the named form of legend location code 9.
plt.legend(loc='upper center')
Out[16]:
Now let's pull in our bike share data
In [17]:
# Load the three tab-separated bike-share tables; the listed columns are
# parsed as datetimes while reading.
weather = pd.read_table('daily_weather.tsv', parse_dates=['date'])
stations = pd.read_table('stations.tsv')
usage = pd.read_table('usage_2012.tsv', parse_dates=['time_start', 'time_end'])

# Index the weather rows by date so they can be plotted against time.
weather.index = pd.DatetimeIndex(weather['date'])

# Relabel every season one step back (Spring -> Winter, Winter -> Fall, ...).
# NOTE(review): presumably the labels in the raw file are shifted by one
# season -- confirm against the data.
# Bracket assignment is used on purpose: attribute assignment only updates a
# column that already exists and otherwise silently creates a plain attribute.
season_relabel = {
    'Spring': 'Winter',
    'Winter': 'Fall',
    'Fall': 'Summer',
    'Summer': 'Spring',
}
weather['season_desc'] = weather['season_desc'].map(season_relabel)
We can now plot the temperature across the year:
In [28]:
# Daily temperature over the year; the x values are the weather frame's DatetimeIndex.
plt.scatter(weather.index, weather['temp'])
# Label the axes so the figure can be read on its own.
plt.xlabel("date")
plt.ylabel("temperature")
Out[28]:
Or look at the scatterplot of temperature and humidity:
In [30]:
# Humidity (x) against temperature (y), one point per row of the weather table.
plt.scatter(weather['humidity'], weather['temp'])
# Label the axes so the figure can be read on its own.
plt.xlabel("humidity")
plt.ylabel("temperature")
Out[30]:
Or look at the scatterplot of the number of riders against temperature:
In [32]:
# Temperature (x) against total riders (y), one point per row of the weather table.
plt.scatter(weather['temp'], weather['total_riders'])
# Label the axes so the figure can be read on its own.
plt.xlabel("temperature")
plt.ylabel("# of riders")
Out[32]:
Let's break that down by season. That gives us a good example of mixing vanilla Python code with matplotlib code:
In [41]:
# Plot temperature against ridership with one colour per season.
# Each season is drawn with its own scatter call so that it gets its own
# legend entry.
for season, color in zip(['Winter', 'Spring', 'Summer', 'Fall'],
                         ['blue', 'green', 'orange', 'brown']):
    # Filter once per season and reuse the subset for both columns.
    in_season = weather[weather['season_desc'] == season]
    temps = in_season['temp']
    riders = in_season['total_riders']
    plt.scatter(temps, riders, color=color, label=season)

# The figure-level decorations are applied once, after all series are drawn.
plt.legend(loc=4)  # location code 4 is 'lower right'
plt.ylim([0, 10000])
plt.xlabel("temperature")
plt.ylabel("# of riders")
Out[41]:
Scatterplot matrix
In [42]:
# `pandas.tools.plotting` is no longer available in current pandas releases;
# `scatter_matrix` is provided by `pandas.plotting`. The fallback keeps the
# notebook working on old installations that predate `pandas.plotting`.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix
In [46]:
# Pairwise scatterplots for the selected weather and ridership columns.
matrix_columns = ['temp', 'humidity', 'windspeed', 'total_riders']
scatter_matrix(weather[matrix_columns])
Out[46]:
You can also call plots directly on the dataframes (or series) themselves:
In [47]:
# Histogram of the temperature column, drawn with the Series' own plotting method.
temp_values = weather['temp']
temp_values.hist()
Out[47]:
In [48]:
# Plot the temperature column against the frame's index using the Series' own plot method.
temp_by_date = weather['temp']
temp_by_date.plot()
Out[48]:
In [61]:
# 2012 was a leap year, so a full year of 2012 usage spans 366 days, not 365.
# NOTE(review): assumes usage_2012.tsv covers the whole calendar year -- confirm.
DAYS_IN_2012 = 366

# Number of trips that started at each station, averaged per day.
trips_per_station = usage.groupby('station_start').size()
avg_daily_trips = trips_per_station / DAYS_IN_2012
trips = DataFrame({'avg_daily_trips': avg_daily_trips})

# Attach each station's coordinates. `trips` is indexed by the start-station
# value, which is matched against the `station` column of the stations table.
station_geos = stations[['station', 'lat', 'long']]
trips_by_geo = pd.merge(station_geos, trips, left_on='station', right_index=True)
trips_by_geo
Out[61]:
In [62]:
# Draw each station at its (long, lat) position, with the marker size given
# by the station's average daily trips.
station_longitudes = trips_by_geo['long']
station_latitudes = trips_by_geo['lat']
marker_sizes = trips_by_geo['avg_daily_trips']
plt.scatter(station_longitudes, station_latitudes, s=marker_sizes)
Out[62]:
In [ ]: