In [1]:
import os
from urllib.request import urlretrieve
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn')
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
In [2]:
# Download the Fremont Bridge bicycle-count CSV unless a local copy exists.
# Set force=True to re-download a fresh copy.
url = 'https://data.seattle.gov/api/views/65db-xm6k/rows.csv?accessType=DOWNLOAD'
filename = 'fremont.csv'
force = False
already_cached = os.path.exists(filename) and not force
if not already_cached:
    urlretrieve(url, filename)
In [3]:
# Load the counts with the Date column as the index.
# Parsing dates at read time (parse_dates=True) works but is very slow,
# so the string index is converted to datetimes in the next cell instead.
# data = pd.read_csv(filename, index_col='Date', parse_dates=True)
data = pd.read_csv(filename).set_index('Date')
In [4]:
# Convert the Date index from strings to datetimes.
# Supplying an explicit format is much faster than letting pandas infer it;
# if the upstream file format ever changes, fall back to automatic inference.
# NOTE: a format mismatch raises ValueError (not TypeError), so both must be
# caught or the "foolproof" fallback never actually fires.
# reference: http://strftime.org/
try:
    data.index = pd.to_datetime(data.index, format='%m/%d/%Y %I:%M:%S %p')
except (TypeError, ValueError):
    data.index = pd.to_datetime(data.index)
In [5]:
# rename columns to short sidewalk labels
# NOTE(review): assumes the CSV has exactly two count columns ordered
# west-then-east — TODO confirm against the downloaded file's header
data.columns = ['West', 'East']
In [6]:
# check data shape: (number of hourly observations, 2 count columns)
data.shape
Out[6]:
In [7]:
# peek at the earliest rows to sanity-check parsing and column names
data.head()
Out[7]:
In [8]:
# peek at the most recent rows to see how current the data is
data.tail()
Out[8]:
In [9]:
# riders over the full history at native (hourly) resolution
# one point per observation makes this plot very dense — smoothed below
data.plot()
Out[9]:
In [10]:
# riders by week: weekly totals smooth out the hour-to-hour noise
# 'W' is the canonical weekly frequency alias; the lowercase 'w' is
# deprecated as of pandas 2.2
data.resample('W').sum().plot()
Out[10]:
In [11]:
# total hourly riders = east sidewalk + west sidewalk
data['Total'] = data['East'] + data['West']
In [12]:
# riders by year: a trailing 365-day rolling sum of daily totals reveals
# annual trends without seasonal noise
# 'D' is the canonical daily frequency alias; lowercase 'd' is deprecated
# as of pandas 2.2
ax = data.resample('D').sum().rolling(365).sum().plot()
# anchor the y axis at zero so trends are not visually exaggerated
ax.set_ylim(0, None)
Out[12]:
In [13]:
# average riders at each time of day — rush-hour peaks suggest commuters
time_of_day_profile = data.groupby(data.index.time).mean()
time_of_day_profile.plot()
Out[13]:
In [14]:
# reshape to one row per time of day and one column per calendar day
# (pivot_table's default mean aggregation is a no-op here: each cell has
# at most one observation)
pivoted = pd.pivot_table(data, values='Total',
                         index=data.index.time, columns=data.index.date)
pivoted.iloc[:5, :5]
Out[14]:
In [15]:
# overlay every day's time-of-day profile; low alpha makes the overlap
# readable and two distinct daily shapes stand out
pivoted.plot(legend=False, alpha=0.1)
Out[15]:
In [16]:
# shape is (times of day, days): 24 hourly observations for each day
pivoted.shape
Out[16]:
In [17]:
# build the clustering matrix: one row per day, 24 hourly features per row
# (missing hours filled with 0 so every day has a complete profile)
filled = pivoted.fillna(0)
x = filled.values.T
x.shape
Out[17]:
In [18]:
# project the 24-dimensional daily profiles down to two dimensions for
# visualization (full SVD solver is deterministic)
pca = PCA(n_components=2, svd_solver='full')
x2 = pca.fit_transform(x)
x2.shape
Out[18]:
In [19]:
# the projected days separate into two groups, matching the two daily
# patterns seen in the overlaid profile plot
plt.scatter(x2[:, 0], x2[:, 1])
Out[19]:
In [20]:
# Cluster the full 24-dimensional daily profiles with a two-component
# Gaussian mixture, then color the 2-D projection by cluster.
# random_state pins the otherwise-random initialization so the cluster
# label assignment (0/1) is reproducible across runs
gmm = GaussianMixture(2, random_state=0).fit(x)
labels = gmm.predict(x)
plt.scatter(x2[:, 0], x2[:, 1], c=labels, cmap='rainbow')
plt.colorbar();
In [21]:
# plot each cluster's days in its own panel to compare the two profiles
fig, ax = plt.subplots(1, 2, figsize=(14, 6))
panel_titles = ['Purple Cluster', 'Red Cluster']  # weekend vs weekday pattern?
for cluster in (0, 1):
    pivoted.T[labels == cluster].T.plot(legend=False, alpha=0.1, ax=ax[cluster])
    ax[cluster].set_title(panel_titles[cluster])
In [22]:
# day-of-week (Mon=0 … Sun=6) for every column (calendar day) of the pivot
day_index = pd.DatetimeIndex(pivoted.columns)
dayofweek = day_index.dayofweek
In [23]:
# color the projection by day of week to check whether the two clusters
# split weekdays from weekends
plt.scatter(x2[:, 0], x2[:, 1], c=dayofweek, cmap='rainbow')
plt.colorbar();
In [28]:
# weekdays that fell into cluster 0 (the low-traffic pattern) —
# presumably public holidays; TODO confirm against a holiday calendar
dates = pd.DatetimeIndex(pivoted.columns)
is_weekday = dayofweek < 5
dates[(labels == 0) & is_weekday]
Out[28]:
In [ ]: