Unsupervised Analysis of Days of week


In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn')

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture

Get Data


In [2]:
from jupyterworkflow.data import get_fremont_data
data = get_fremont_data()
pivoted = data.pivot_table('Total', index = data.index.time, columns = data.index.date)
pivoted.plot(legend = False, alpha = 0.01)


Out[2]:
<matplotlib.axes._subplots.AxesSubplot at 0x1f298f69eb8>

Principal Component Analysis


In [3]:
X = pivoted.fillna(0).T.values
X.shape


Out[3]:
(1824, 24)

In [4]:
X2 = PCA(2).fit_transform(X)
X2.shape


Out[4]:
(1824, 2)

In [5]:
plt.scatter(X2[:, 0], X2[:, 1])


Out[5]:
<matplotlib.collections.PathCollection at 0x1f29c6bf940>

Unsupervised Clustering


In [6]:
gmm = GaussianMixture(2).fit(X)
labels = gmm.predict(X)
labels


Out[6]:
array([1, 1, 1, ..., 1, 1, 0], dtype=int64)

In [7]:
plt.scatter(X2[:, 0], X2[:, 1], c = labels, cmap = 'rainbow')
plt.colorbar()


Out[7]:
<matplotlib.colorbar.Colorbar at 0x1f29c81ec88>

In [8]:
fig, ax = plt.subplots(1, 2, figsize=(14,6))

pivoted.T[labels == 0].T.plot(legend = False, alpha = 0.01, ax=ax[0])
pivoted.T[labels == 1].T.plot(legend = False, alpha = 0.01, ax=ax[1])

ax[0].set_title('Purple Cluster')
ax[1].set_title('Red Cluster')


Out[8]:
<matplotlib.text.Text at 0x1f29c806cf8>

Comparing with Day of week


In [9]:
dayofweek = pd.DatetimeIndex(pivoted.columns).dayofweek
plt.scatter(X2[:, 0], X2[:, 1], c = dayofweek, cmap = 'rainbow')
plt.colorbar()


Out[9]:
<matplotlib.colorbar.Colorbar at 0x1f2a1985b70>

Analyzing Outliers

The following points are weekdays with a holiday-like pattern


In [10]:
dates = pd.DatetimeIndex(pivoted.columns)
dates[(labels == 1) & (dayofweek < 5)]


Out[10]:
DatetimeIndex(['2012-10-03', '2012-10-04', '2012-10-05', '2012-10-08',
               '2012-10-09', '2012-10-10', '2012-10-11', '2012-10-12',
               '2012-10-15', '2012-10-16',
               ...
               '2017-09-18', '2017-09-19', '2017-09-20', '2017-09-21',
               '2017-09-22', '2017-09-25', '2017-09-26', '2017-09-27',
               '2017-09-28', '2017-09-29'],
              dtype='datetime64[ns]', length=1259, freq=None)