In [35]:
%matplotlib inline
import pandas as pd
In [2]:
from jupyterworkflow.data import get_fremont_data
# Load the Fremont data through the project helper (its internals are not visible here).
data = get_fremont_data()

# Reshape so that each row is a time of day and each column a calendar date,
# with the 'Total' column supplying the cell values.
pivoted = data.pivot_table(
    values='Total',
    index=data.index.time,
    columns=data.index.date,
)

# One very faint line per date column; the legend is disabled because it would
# contain one entry per day.
pivoted.plot(legend=False, alpha=0.01)
Out[2]:
In [13]:
# Feature matrix for the models below: missing values become 0, then the table
# is transposed so every row is one day and every column one time-of-day slot
# (original note: 24 hours per 1631 days).
X = pivoted.fillna(value=0).transpose().values
X.shape
Out[13]:
In [20]:
from sklearn.decomposition import PCA
# Reduce each day's profile to its first two principal components.
pca = PCA(n_components=2, svd_solver='full')
X2 = pca.fit_transform(X)
In [21]:
# Sanity check: X2 should have one row per row of X and two columns
# (the two PCA components requested above).
X2.shape
Out[21]:
In [22]:
import matplotlib.pyplot as plt
# Original observation: the projected days form two clusters, i.e. there are
# two types of days.
plt.scatter(x=X2[:, 0], y=X2[:, 1])
Out[22]:
In [23]:
from sklearn.mixture import GaussianMixture
# Fit a two-component Gaussian mixture on the full per-day profiles (X, not the
# 2-D projection) and assign every day to one of the two components.
# random_state is pinned because the mixture's initialisation is random: without
# it the 0/1 cluster numbering can swap between runs, which silently changes
# what the later `labels == 0` / `labels == 1` cells select.
gmm = GaussianMixture(2, random_state=0)
gmm.fit(X)
labels = gmm.predict(X)
In [26]:
# Same 2-D projection, now coloured by the mixture label assigned to each day.
points = plt.scatter(x=X2[:, 0], y=X2[:, 1], c=labels, cmap='rainbow')
plt.colorbar(points)
Out[26]:
In [30]:
# Show cluster 0: keep only the date columns labelled 0 and overlay their curves.
pivoted.loc[:, labels == 0].plot(legend=False, alpha=0.1);
In [31]:
# Show cluster 1: keep only the date columns labelled 1 and overlay their curves.
pivoted.loc[:, labels == 1].plot(legend=False, alpha=0.1);
In [38]:
# Weekday number (Monday=0 ... Sunday=6) for every date column of the pivot table.
day_index = pd.DatetimeIndex(pivoted.columns)
dayofweek = day_index.dayofweek
In [41]:
# Colour each projected day by its weekday number instead of its cluster label.
pc_first, pc_second = X2[:, 0], X2[:, 1]
plt.scatter(pc_first, pc_second, c=dayofweek, cmap='rainbow')
plt.colorbar()
Out[41]:
In [45]:
# Weekdays (Mon-Fri) that nevertheless landed in the non-commute ("holiday") cluster.
# Author's note: in Seattle, 6/2 had bad weather and people did not commute.
# The mixture's 0/1 numbering is arbitrary, so instead of hard-coding cluster 1
# the non-commute cluster is taken to be the label most weekend days received.
dates = pd.DatetimeIndex(pivoted.columns)
is_weekend = dayofweek >= 5
non_commute_cluster = pd.Series(labels[is_weekend]).mode().iloc[0]
dates[(labels == non_commute_cluster) & ~is_weekend]
Out[45]:
In [ ]: