In [ ]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline
sns.set_context('paper')
sns.set_style('darkgrid')
PCA example as is often shown in books:
In [ ]:
X = np.random.multivariate_normal([0., 0.], [[1., .7],
[.7, 2]], size=1000)
plt.plot(X[:,0], X[:,1], 'o', alpha=.5)
Pretty straight-forward to do in sklearn
:
In [ ]:
import sklearn.decomposition
In [ ]:
pca = sklearn.decomposition.PCA()
Xt = pca.fit_transform(X)
What does the following return?
In [ ]:
(pca.components_**2).sum(1)
In [ ]:
plt.subplot(2,2,1)
plt.plot(X[:,0], X[:,1], 'o', alpha=.1)
plt.arrow(0, 0, pca.components_[0,0], pca.components_[0,1], head_width=.2, linewidth=3, zorder=99)
plt.arrow(0, 0, pca.components_[1,0], pca.components_[1,1], head_width=.2, linewidth=3, zorder=99)
plt.subplot(2,2,2)
plt.plot(Xt[:,0], Xt[:,1], 'o', alpha=.1)
In [ ]:
def sim_data(n=200, p=20, mu=None, sigma=None,
comp_pr=0, subs_pr=0, seed=123456):
""" data generation procedure
n : int, default=200
number of facilities
p : int, default=20
number of resource availability indicators
mu : list, default=[2,0,-2]
means for love mixture
sigma : list, default=[.6,.4,.6]
spreads for love mixture
comp_pr : float, default=0
fraction of resources to be complementary
subs_pr : float, default=0
fraction of resources to be substitutable
seed : int, default=123456
random seed, for reproducibility
"""
# set random seed for reproducibility
np.random.seed(seed)
# means and spreads for the logit_love mixture distribution
if mu == None or sigma == None:
mu = [2, 0, -2]
sigma = [.6, .4, .6]
# sample logit_love from mixture of gaussians
cluster = np.empty(n, dtype=int)
logit_love = np.empty(n)
for i in range(n):
cluster[i] = np.random.choice(range(len(mu)))
logit_love[i] = np.random.normal(mu[cluster[i]], sigma[cluster[i]])
# now generate preliminary resource availability indicators
logit_thing = np.random.normal(0., 2., size=p)
X = np.empty((n,p))
for i in range(n):
for j in range(p):
X[i,j] = np.random.uniform() < 1 / (1 + np.exp(-( \
logit_love[i] + logit_thing[j])))
# now simulate complementarity and substitutability
complementary_to = np.empty(p)
for j in range(p):
if np.random.uniform() < comp_pr:
complementary_to[j] = np.random.choice(range(p))
else:
complementary_to[j] = np.nan
substitutable_to = np.empty(p)
for j in range(p):
if np.random.uniform() < subs_pr:
substitutable_to[j] = np.random.choice(range(p))
else:
substitutable_to[j] = np.nan
for i in range(n):
for j in range(p):
# only include a complementary resource if the resource it complements is present
j_prime = complementary_to[j]
if not np.isnan(j_prime):
if X[i,j_prime] == False:
X[i,j] = False
# only include a substitutable resource if the resource that substitutes for it is absent
j_prime = substitutable_to[j]
if not np.isnan(j_prime):
if X[i,j_prime] == True:
X[i,j] = False
results = pd.DataFrame(index=range(n))
results['cluster']=cluster
results['logit_love']=logit_love
for j in range(p):
results['X_%d'%j] = X[:,j]
# stick all the other parameters into the results as class attributes, in case they are useful
results.mu = mu
results.sigma = sigma
results.logit_thing = logit_thing
results.substitutable_to = substitutable_to
results.complementary_to = complementary_to
return results
sim_data(n=5, p=3)
What will the shape of the sim_data be for $n=19$ and $p=103$? And will it take long to make?
In [ ]:
sim_data(n=19, p=103).shape
We will use a simulated dataset with a lot rows and a few columns for getting to know PCA:
In [ ]:
df = sim_data(n=1000, p=7)
In [ ]:
df.head()
In [ ]:
X = np.array(df.filter(like='X_'))
In [ ]:
plt.plot(X.T + np.random.normal(0., .2, size=(7,1000)), alpha=.2)
;
In [ ]:
plt.plot(df.logit_love, X.sum(1), 'o')
So what will PCA do? Let's see:
In [ ]:
import sklearn.decomposition
In [ ]:
pca = sklearn.decomposition.PCA()
Xt = pca.fit_transform(X)
How long does this take?
In [ ]:
%%time
pca = sklearn.decomposition.PCA()
Xt = pca.fit_transform(X)
What does it look like?
In [ ]:
plt.plot(Xt[:,0], Xt[:,1], 'o')
plt.xlabel('$PC_0$')
plt.ylabel('$PC_1$')
How much variance is explained by each principle component?
In [ ]:
plt.plot(pca.explained_variance_ratio_, 's-')
plt.axis(xmin=-.5, xmax=6.5, ymax=.5)
And how does it compare the logit(love)?
In [ ]:
plt.plot(df.logit_love, Xt[:,0], 'o')
plt.xlabel('logit(love)')
plt.ylabel('$PC_0$')
The good news is that we have way more than 7 indicators of quality:
In [ ]:
df = sim_data(n=1000, p=70)
X = np.array(df.filter(like='X_'))
plt.plot(df.logit_love, X.sum(1), 'o')
How long do you think this will take now?
In [ ]:
X.shape
In [ ]:
%%time
pca = sklearn.decomposition.PCA()
Xt = pca.fit_transform(X)
What will plot of explained variance ratio look like?
In [ ]:
plt.plot(pca.explained_variance_ratio_, 's-')
plt.axis(xmin=-.5)
And what do you think scatter of first two principle components will look like?
In [ ]:
plt.plot(Xt[:,0], Xt[:,1], 'o')
plt.xlabel('$PC_0$')
plt.ylabel('$PC_1$')
We should color that in... We did start with a mixture model, after all, and that means we know the clusters.
In [ ]:
for c, dfc in df.groupby('cluster'):
plt.plot(Xt[dfc.index, 0], Xt[dfc.index, 1], 'o', label='Cluster %s'%c)
plt.xlabel('$PC_0$')
plt.ylabel('$PC_1$')
plt.legend(loc=(1,.1))
And did I actually manage to improve the quality of my quality scores?
In [ ]:
plt.subplot(2,2,1)
plt.plot(df.logit_love, X.sum(1), 'o', alpha=.1)
plt.xlabel('logit(love)')
plt.ylabel('number of resources')
plt.subplot(2,2,2)
plt.plot(df.logit_love, Xt[:,0], 'o', alpha=.1)
plt.xlabel('logit(love)')
plt.ylabel('$PC_0$')
It does not seem so, at least, not dramatically.
In [ ]:
# change the units of df.X_0, so that it is 100X larger
df.X_0 *= 100.
In [ ]:
np.all(df['X_0'] == df.X_0)
In [ ]:
df.X_71 = 'abie'
In [ ]:
df.X_71
In [ ]:
df.mu
What will happen now?
In [ ]:
X = np.array(df.filter(like='X_'))
pca = sklearn.decomposition.PCA()
Xt = pca.fit_transform(X)
What will the explained variance ratio plot look like?
In [ ]:
plt.plot(pca.explained_variance_ratio_, 's-')
plt.axis(xmin=-.5)
What will the cluster-colored scatter of the first two principle components look like?
In [ ]:
for c, dfc in df.groupby('cluster'):
plt.plot(Xt[dfc.index, 0], Xt[dfc.index, 1], 'o', label='Cluster %s'%c, alpha=.1)
plt.xlabel('$PC_0$')
plt.ylabel('$PC_1$')
plt.legend(loc=(1,.1))
What is the moral of that story?
Now, can I take another opportunity to demonstrate the kernel stuff? Not really... this example does not make a good case for the utility of kernel PCA.
In [ ]:
df = sim_data(n=1000, p=70, mu=[0], sigma=[2], subs_pr=.6, comp_pr=.5)
X = np.array(df.filter(like='X_'))
plt.plot(df.logit_love, X.sum(1), 'o')
Have a look at plain, old PCA on this data:
In [ ]:
pca = sklearn.decomposition.PCA()
Xt = pca.fit_transform(X)
In [ ]:
plt.subplot(2,2,1)
plt.plot(df.logit_love, X.sum(1), 'o', alpha=.1)
plt.xlabel('logit(love)')
plt.ylabel('number of resources')
plt.subplot(2,2,2)
plt.plot(df.logit_love, Xt[:,0], 'o', alpha=.1)
plt.xlabel('logit(love)')
plt.ylabel('$PC_0$')
Compare that the kernel PCA:
In [ ]:
%%time
pca = sklearn.decomposition.KernelPCA(n_components=2, kernel='rbf', gamma=.001)
Xt = pca.fit_transform(X)
In [ ]:
plt.subplot(2,2,1)
plt.plot(df.logit_love, X.sum(1), 'o', alpha=.1)
plt.xlabel('logit(love)')
plt.ylabel('number of resources')
plt.subplot(2,2,2)
plt.plot(df.logit_love, Xt[:,0], 'o', alpha=.1)
plt.xlabel('logit(love)')
plt.ylabel('$PC_0$')
And here is a custom kernel, that I was messing around with:
In [ ]:
import sklearn.metrics.pairwise
In [ ]:
%%time
pca = sklearn.decomposition.KernelPCA(n_components=2, kernel='precomputed')
D = sklearn.metrics.pairwise.pairwise_distances(X, metric='minkowski', p=.9)
Xt = pca.fit_transform(D)
A little bit different from PCA, but think of them as very similar: http://stats.stackexchange.com/questions/123063/is-there-any-good-reason-to-use-pca-instead-of-efa
In [ ]:
df = sim_data(n=1000, p=70, mu=[-1,2], sigma=[.5,.5])
X = np.array(df.filter(like='X_'))
plt.plot(df.logit_love, X.sum(1), 'o')
In [ ]:
pca = sklearn.decomposition.PCA()
Xt_pca = pca.fit_transform(X)
In [ ]:
efa = sklearn.decomposition.FactorAnalysis()
Xt = efa.fit_transform(X)
Have a look at how they compare:
In [ ]:
plt.subplot(2,2,1)
for c, dfc in df.groupby('cluster'):
plt.plot(Xt_pca[dfc.index, 0], Xt_pca[dfc.index, 1], 'o', label='Cluster %s'%c, alpha=.6)
plt.xlabel('$PC_0$')
plt.ylabel('$PC_1$')
plt.subplot(2,2,2)
for c, dfc in df.groupby('cluster'):
plt.plot(Xt[dfc.index, 0], Xt[dfc.index, 1], 'o', label='Cluster %s'%c, alpha=.6)
plt.xlabel('$F_0$')
plt.ylabel('$F_1$')
plt.legend(loc=(1,.1))
There are a ton of interest ways to embed high-dimensional data in low-dimensional spaces. I will do more next week if it seems like your projects will benefit from them...
In [ ]:
import sklearn.manifold
In [ ]:
%%time
mds = sklearn.manifold.MDS()
Xt = mds.fit_transform(X)
In [ ]:
for c, dfc in df.groupby('cluster'):
plt.plot(Xt[dfc.index, 0], Xt[dfc.index, 1], 'o', label='Cluster %s'%c)
plt.xlabel('$Y_0$')
plt.ylabel('$Y_1$')
plt.legend(loc=(1,.1))
Not to mention ways to embed low-dimensional data into high-dimensional spaces:
In [ ]:
import sklearn.ensemble
Read more, if interested: http://scikit-learn.org/stable/modules/ensemble.html#totally-random-trees-embedding
In [ ]:
rte = sklearn.ensemble.RandomTreesEmbedding()
Xt = rte.fit_transform(X)
Xt
In [ ]:
import sklearn.cluster
In [ ]:
K = 2
clf = sklearn.cluster.KMeans(n_clusters=K, n_init=100, n_jobs=2)
In [ ]:
%%time
clf.fit(X)
In [ ]:
y_pred = clf.predict(X)
We can use PCA to look at it:
In [ ]:
pca = sklearn.decomposition.PCA()
Xt = pca.fit_transform(X)
In [ ]:
for k in range(K):
i = np.where(y_pred == k)[0]
plt.plot(Xt[i,0], Xt[i,1], 'o', label='Cluster %d'%k, alpha=.5)
plt.legend(loc=(1,.1))
We can hack max_iter to make a flipbook of how this works:
In [ ]:
K =5
clf = sklearn.cluster.KMeans(n_clusters=K, max_iter=1)
y_pred = clf.fit_predict(X)
for k in range(K):
i = np.where(y_pred == k)[0]
plt.plot(Xt[i,0], Xt[i,1], 'o', label='Cluster %d'%k, alpha=.5)
plt.legend(loc=(1,.1))
Clear-ish? Big question: how to pick $K$.
In [ ]:
# try a range
y_pred = {}
for K in range(1, 10):
clf = sklearn.cluster.KMeans(n_clusters=K)
y_pred[K] = clf.fit_predict(X)
In [ ]:
# make a clustergram
cluster_means = {}
for k in y_pred.keys():
g = df.filter(like='X_').groupby(y_pred[k])
cluster_means[k] = g.mean().mean(axis=1)
k = pd.Series(range(1,10))
for i in range(len(y_pred[1])):
yy = [cluster_means[kk][y_pred[kk][i]]*np.exp(np.random.normal(0, .02)) for kk in k] # why is that so complicated???
plt.plot(k, yy, 'k-', alpha=.1)
plt.xlabel('Number of clusters ($K$)')
plt.ylabel('Cluster Mean of Means')
plt.title('Clustergram!')
TODO: Refactor this into a function that takes a list of clusterings.
In [ ]:
clf = sklearn.cluster.AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='ward')
y_pred = clf.fit_predict(X)
In [ ]:
for k in np.unique(y_pred):
i = np.where(y_pred == k)[0]
plt.plot(Xt[i,0], Xt[i,1], 'o', label='Cluster %d'%k)
plt.legend(loc=(1,.1))
Good time to refactor, and to experiment with a few other values for affinity and for linkage.
In [ ]:
def plot_clusters():
for k in np.unique(y_pred):
i = np.where(y_pred == k)[0]
plt.plot(Xt[i,0], Xt[i,1], 'o', label='Cluster %s'%str(k))
plt.legend(loc=(1,.1))
In [ ]:
clf = sklearn.cluster.AgglomerativeClustering(n_clusters=3, affinity='l1', linkage='complete')
y_pred = clf.fit_predict(X)
plot_clusters()
In [ ]:
clf = sklearn.cluster.AgglomerativeClustering(n_clusters=3, affinity='cosine', linkage='average')
y_pred = clf.fit_predict(X)
plot_clusters()
Pretty clear that distance matters (linkage matters, too...)
In [ ]:
import sklearn.metrics.pairwise
In [ ]:
D = sklearn.metrics.pairwise.pairwise_distances(X, metric='jaccard')
X.shape, D.shape
In [ ]:
clf = sklearn.cluster.AgglomerativeClustering(n_clusters=3, affinity='precomputed', linkage='complete')
y_pred = clf.fit_predict(D)
plot_clusters()
In [ ]:
import scipy.spatial
In [ ]:
help(scipy.spatial.distance.minkowski)
In [ ]:
D = sklearn.metrics.pairwise.pairwise_distances(X, metric='minkowski', p=.5)
In [ ]:
clf = sklearn.cluster.AgglomerativeClustering(n_clusters=3, affinity='precomputed', linkage='average')
y_pred = clf.fit_predict(D)
plot_clusters()
A few other ways:
Affinity Propagation
In [ ]:
clf = sklearn.cluster.AffinityPropagation(damping=.99)
y_pred = clf.fit_predict(X)
plot_clusters()
In [ ]:
clf = sklearn.cluster.AffinityPropagation(damping=.95, affinity='precomputed')
y_pred = clf.fit_predict(D)
plot_clusters()
DBSCAN
In [ ]:
clf = sklearn.cluster.DBSCAN(eps=10)
y_pred = clf.fit_predict(X)
plot_clusters()
In [ ]:
clf = sklearn.cluster.DBSCAN(eps=100, metric='precomputed')
y_pred = clf.fit_predict(D)
plot_clusters()
In [ ]:
import sklearn.mixture
In [ ]:
K=2
clf = sklearn.mixture.GMM(n_components=K)
clf.fit(X)
y_pred = clf.predict(X)
plot_clusters()
In [ ]:
K=2
clf = sklearn.mixture.DPGMM(n_components=K)
clf.fit(X)
y_pred = clf.predict(X)
plot_clusters()
In [ ]:
K=2
clf = sklearn.mixture.VBGMM(n_components=K)
clf.fit(X)
y_pred = clf.predict(X)
plot_clusters()
Here is the jackpot of distances available in sklearn:
In [ ]:
import sklearn.metrics.pairwise
help(sklearn.metrics.pairwise.pairwise_distances)
In [ ]: