In [364]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline
sns.set_context('talk')
sns.set_style('darkgrid')
In [365]:
def sim_data(n=200, p=20, mu=None, sigma=None,
             comp_pr=0, subs_pr=0, seed=123456):
    """ data generation procedure

    Each facility i gets a latent quality score ("logit_love") drawn from a
    Gaussian mixture; resource j is then present at facility i with
    probability expit(logit_love[i] + logit_thing[j]).  Optionally some
    resources are complementary (present only if the resource they complement
    is present) or substitutable (absent if the substituting resource is
    present).

    n : int, default=200
        number of facilities
    p : int, default=20
        number of resource availability indicators
    mu : list, default=[2,0,-2]
        means for love mixture
    sigma : list, default=[.6,.4,.6]
        spreads for love mixture
    comp_pr : float, default=0
        fraction of resources to be complementary
    subs_pr : float, default=0
        fraction of resources to be substitutable
    seed : int, default=123456
        random seed, for reproducibility

    Returns
    -------
    pandas.DataFrame with columns 'cluster', 'logit_love', 'X_0'..'X_{p-1}';
    simulation parameters are attached as instance attributes (mu, sigma,
    logit_thing, substitutable_to, complementary_to).
    """
    # set random seed for reproducibility
    np.random.seed(seed)
    # means and spreads for the logit_love mixture distribution;
    # use `is None` (== None breaks on array arguments) and handle mu and
    # sigma independently, so a caller-supplied value is never discarded
    if mu is None:
        mu = [2, 0, -2]
    if sigma is None:
        sigma = [.6, .4, .6]
    # sample logit_love from mixture of gaussians
    cluster = np.empty(n, dtype=int)
    logit_love = np.empty(n)
    for i in range(n):
        cluster[i] = np.random.choice(range(len(mu)))
        logit_love[i] = np.random.normal(mu[cluster[i]], sigma[cluster[i]])
    # now generate preliminary resource availability indicators
    logit_thing = np.random.normal(0., 2., size=p)
    X = np.empty((n, p))
    for i in range(n):
        for j in range(p):
            X[i, j] = np.random.uniform() < 1 / (1 + np.exp(-(logit_love[i] + logit_thing[j])))
    # now simulate complementarity and substitutability
    # (np.empty gives a float array, so index values are stored as floats and
    # NaN marks "no partner")
    complementary_to = np.empty(p)
    for j in range(p):
        if np.random.uniform() < comp_pr:
            complementary_to[j] = np.random.choice(range(p))
        else:
            complementary_to[j] = np.nan
    substitutable_to = np.empty(p)
    for j in range(p):
        if np.random.uniform() < subs_pr:
            substitutable_to[j] = np.random.choice(range(p))
        else:
            substitutable_to[j] = np.nan
    for i in range(n):
        for j in range(p):
            # only include a complementary resource if the resource it complements is present
            j_prime = complementary_to[j]
            if not np.isnan(j_prime):
                # cast to int: indexing with a float is an error in modern numpy
                if X[i, int(j_prime)] == False:
                    X[i, j] = False
            # only include a substitutable resource if the resource that substitutes for it is absent
            j_prime = substitutable_to[j]
            if not np.isnan(j_prime):
                if X[i, int(j_prime)] == True:
                    X[i, j] = False
    results = pd.DataFrame(index=range(n))
    results['cluster'] = cluster
    results['logit_love'] = logit_love
    for j in range(p):
        results['X_%d' % j] = X[:, j]
    # stick all the other parameters into the results as instance attributes,
    # in case they are useful (NOTE: these do not survive pandas operations
    # that return new frames)
    results.mu = mu
    results.sigma = sigma
    results.logit_thing = logit_thing
    results.substitutable_to = substitutable_to
    results.complementary_to = complementary_to
    return results
sim_data(n=5, p=3)
Out[365]:
What will the shape of the sim_data be for $n=19$ and $p=103$? And will it take long to make?
In [366]:
%time sim_data(n=19, p=103).shape
Out[366]:
We will use a simulated dataset with a lot of rows and a few columns for getting to know PCA:
In [367]:
df = sim_data(n=1000, p=7)
In [368]:
df.head()
Out[368]:
In [369]:
X = np.array(df.filter(like='X_'))
In [370]:
plt.plot(X.T + np.random.normal(0., .2, size=(7,1000)), alpha=.2, color='k')
;
Out[370]:
In [371]:
plt.plot(df.logit_love, X.sum(1), 'o')
Out[371]:
So what will PCA do? Let's see:
In [372]:
import sklearn.decomposition
In [373]:
pca = sklearn.decomposition.PCA()
Xt = pca.fit_transform(X)
How long does this take?
In [374]:
%%time
pca = sklearn.decomposition.PCA()
Xt = pca.fit_transform(X)
In [375]:
plt.plot(Xt[:,0], Xt[:,1], 'o')
plt.xlabel('$PC_0$')
plt.ylabel('$PC_1$')
Out[375]:
In [376]:
plt.plot(pca.explained_variance_ratio_, 's-')
plt.axis(xmin=-.5, xmax=6.5, ymax=.5)
Out[376]:
In [377]:
plt.plot(df.logit_love, Xt[:,0], 'o')
plt.xlabel('logit(love)')
plt.ylabel('$PC_0$')
Out[377]:
The good news is that we have way more than 7 indicators of quality:
In [378]:
df = sim_data(n=1000, p=70, comp_pr=.5, subs_pr=.5)
X = np.array(df.filter(like='X_'))
plt.plot(df.logit_love, X.sum(1), 'o')
Out[378]:
How long do you think this will take now?
In [379]:
X.shape
Out[379]:
In [380]:
%%time
pca = sklearn.decomposition.PCA()
Xt = pca.fit_transform(X)
What will plot of explained variance ratio look like?
In [381]:
plt.plot(pca.explained_variance_ratio_, 's-')
plt.axis(xmin=-.5)
Out[381]:
And what do you think the scatter of the first two principal components will look like?
In [382]:
plt.plot(Xt[:,0], Xt[:,1], 'o')
plt.xlabel('$PC_0$')
plt.ylabel('$PC_1$')
Out[382]:
We should color that in... We did start with a mixture model, after all, and that means we know the clusters.
In [383]:
for c, dfc in df.groupby('cluster'):
plt.plot(Xt[dfc.index, 0], Xt[dfc.index, 1], 'o', label='Cluster %s'%c)
plt.xlabel('$PC_0$')
plt.ylabel('$PC_1$')
plt.legend(loc=(1,.1))
Out[383]:
And did I actually manage to improve the quality of my quality scores?
In [384]:
plt.subplot(2,2,1)
plt.plot(df.logit_love, X.sum(1), 'o', alpha=.1)
plt.xlabel('logit(love)')
plt.ylabel('number of resources')
plt.subplot(2,2,2)
plt.plot(df.logit_love, Xt[:,0], 'o', alpha=.1)
plt.xlabel('logit(love)')
plt.ylabel('$PC_0$')
Out[384]:
It does not seem so, at least, not dramatically.
In [385]:
df.X_0 *= 100.
What will happen now?
In [386]:
X = np.array(df.filter(like='X_'))
pca = sklearn.decomposition.PCA()
Xt = pca.fit_transform(X)
What will the explained variance ratio plot look like?
In [387]:
plt.plot(pca.explained_variance_ratio_, 's-')
plt.axis(xmin=-.5)
Out[387]:
What will the cluster-colored scatter of the first two principal components look like?
In [388]:
for c, dfc in df.groupby('cluster'):
plt.plot(Xt[dfc.index, 0], Xt[dfc.index, 1], 'o', label='Cluster %s'%c, alpha=.1)
plt.xlabel('$PC_0$')
plt.ylabel('$PC_1$')
plt.legend(loc=(1,.1))
Out[388]:
What is the moral of that story?
In [389]:
# Scaling is important
Now, can I take another opportunity to demonstrate the kernel stuff?
In [390]:
df = sim_data(n=1000, p=70, mu=[0], sigma=[2], subs_pr=.6, comp_pr=.5)
X = np.array(df.filter(like='X_'))
plt.plot(df.logit_love, X.sum(1), 'o')
Out[390]:
In [391]:
pca = sklearn.decomposition.PCA()
Xt = pca.fit_transform(X)
In [392]:
plt.subplot(2,2,1)
plt.plot(df.logit_love, X.sum(1), 'o', alpha=.1)
plt.xlabel('logit(love)')
plt.ylabel('number of resources')
plt.subplot(2,2,2)
plt.plot(df.logit_love, Xt[:,0], 'o', alpha=.1)
plt.xlabel('logit(love)')
plt.ylabel('$PC_0$')
Out[392]:
In [393]:
%%time
pca = sklearn.decomposition.KernelPCA(n_components=2, kernel='rbf', gamma=.001)
Xt = pca.fit_transform(X)
In [394]:
plt.subplot(2,2,1)
plt.plot(df.logit_love, X.sum(1), 'o', alpha=.1)
plt.xlabel('logit(love)')
plt.ylabel('number of resources')
plt.subplot(2,2,2)
plt.plot(df.logit_love, Xt[:,0], 'o', alpha=.1)
plt.xlabel('logit(love)')
plt.ylabel('$PC_0$')
Out[394]:
What about this?
In [395]:
%%time
# On a fresh kernel sklearn.metrics.pairwise has not been imported yet
# (its import cell only comes later in the notebook), so import it here
# to keep Restart & Run All working
import sklearn.metrics.pairwise
pca = sklearn.decomposition.KernelPCA(n_components=2, kernel='precomputed')
# Minkowski with p < 1 is not a true metric, but still serves as a dissimilarity
D = sklearn.metrics.pairwise.pairwise_distances(X, metric='minkowski', p=.9)
Xt = pca.fit_transform(D)
In [396]:
plt.subplot(2,2,1)
plt.plot(df.logit_love, X.sum(1), 'o', alpha=.1)
plt.xlabel('logit(love)')
plt.ylabel('number of resources')
plt.subplot(2,2,2)
plt.plot(df.logit_love, Xt[:,0], 'o', alpha=.1)
plt.xlabel('logit(love)')
plt.ylabel('$PC_0$')
Out[396]:
That didn't work well at all...
A little bit different from PCA, but think of them as very similar: http://stats.stackexchange.com/questions/123063/is-there-any-good-reason-to-use-pca-instead-of-efa
In [397]:
df = sim_data(n=1000, p=70, mu=[-1,2], sigma=[.5,.5])
X = np.array(df.filter(like='X_'))
plt.plot(df.logit_love, X.sum(1), 'o')
Out[397]:
In [398]:
pca = sklearn.decomposition.PCA()
Xt_pca = pca.fit_transform(X)
In [399]:
efa = sklearn.decomposition.FactorAnalysis()
Xt = efa.fit_transform(X)
In [400]:
plt.subplot(2,2,1)
for c, dfc in df.groupby('cluster'):
plt.plot(Xt_pca[dfc.index, 0], Xt_pca[dfc.index, 1], 'o', label='Cluster %s'%c, alpha=.6)
plt.xlabel('$PC_0$')
plt.ylabel('$PC_1$')
plt.subplot(2,2,2)
for c, dfc in df.groupby('cluster'):
plt.plot(Xt[dfc.index, 0], Xt[dfc.index, 1], 'o', label='Cluster %s'%c, alpha=.6)
plt.xlabel('$F_0$')
plt.ylabel('$F_1$')
plt.legend(loc=(1,.1))
plt.subplots_adjust(wspace=.3)
There are a ton of interesting ways to embed high-dimensional data in low-dimensional spaces. I will do more next week if it seems like your projects will benefit from them...
In [401]:
import sklearn.manifold
In [402]:
%%time
mds = sklearn.manifold.MDS()
Xt = mds.fit_transform(X)
In [403]:
for c, dfc in df.groupby('cluster'):
plt.plot(Xt[dfc.index, 0], Xt[dfc.index, 1], 'o', label='Cluster %s'%c)
plt.xlabel('$Y_0$')
plt.ylabel('$Y_1$')
plt.legend(loc=(1,.1))
Out[403]:
Not to mention ways to embed low-dimensional data into high-dimensional spaces:
In [404]:
import sklearn.ensemble
Read more, if interested: http://scikit-learn.org/stable/modules/ensemble.html#totally-random-trees-embedding
In [405]:
rte = sklearn.ensemble.RandomTreesEmbedding()
Xt = rte.fit_transform(X)
Xt
Out[405]:
In [406]:
import sklearn.cluster
In [407]:
K = 2
# n_jobs was deprecated in scikit-learn 0.23 and removed in 1.0;
# KMeans parallelizes automatically now, so just drop it
clf = sklearn.cluster.KMeans(n_clusters=K, n_init=100)
In [408]:
X.shape
Out[408]:
In [409]:
%%time
clf.fit(X)
Out[409]:
In [410]:
y_pred = clf.predict(X)
We can use PCA to look at it:
In [411]:
pca = sklearn.decomposition.PCA()
Xt = pca.fit_transform(X)
In [412]:
for k in range(K):
i = np.where(y_pred == k)[0]
plt.plot(Xt[i,0], Xt[i,1], 'o', label='Cluster %d'%k, alpha=.5)
plt.legend(loc=(1,.1))
Out[412]:
In [413]:
df['y_pred'] = y_pred
In [414]:
df.groupby(['cluster', 'y_pred']).logit_love.count().unstack().fillna('')
Out[414]:
We can hack max_iter to make a flipbook of how this works:
In [415]:
K =5
clf = sklearn.cluster.KMeans(n_clusters=K, max_iter=30)
y_pred = clf.fit_predict(X)
for k in range(K):
i = np.where(y_pred == k)[0]
plt.plot(Xt[i,0], Xt[i,1], 'o', label='Cluster %d'%k, alpha=.5)
plt.legend(loc=(1,.1))
Out[415]:
In [416]:
df.y_pred = y_pred
df.groupby(['cluster', 'y_pred']).logit_love.count().unstack().fillna('')
Out[416]:
Clear-ish? Big question: how to pick $K$.
In [417]:
# try a range
y_pred = {}
for K in range(1, 10):
clf = sklearn.cluster.KMeans(n_clusters=K)
y_pred[K] = clf.fit_predict(X)
In [418]:
# make a clustergram: one trace per facility, following its cluster's
# "mean of means" as the number of clusters K grows
cluster_means = {}
for K_val in y_pred.keys():
    grouped = df.filter(like='X_').groupby(y_pred[K_val])
    cluster_means[K_val] = grouped.mean().mean(axis=1)
K_range = pd.Series(range(1, 10))
for i in range(len(y_pred[1])):
    # small multiplicative jitter keeps overlapping traces distinguishable
    yy = [cluster_means[K_cur][y_pred[K_cur][i]] * np.exp(np.random.normal(0, .02)) for K_cur in K_range]
    plt.plot(K_range, yy, 'k-', alpha=.1)
plt.xlabel('Number of clusters ($K$)')
plt.ylabel('Cluster Mean of Means')
plt.title('Clustergram!')
Out[418]:
In [419]:
clf = sklearn.cluster.AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='ward')
y_pred = clf.fit_predict(X)
In [420]:
for k in np.unique(y_pred):
i = np.where(y_pred == k)[0]
plt.plot(Xt[i,0], Xt[i,1], 'o', label='Cluster %d'%k)
plt.legend(loc=(1,.1))
Out[420]:
Good time to refactor, and to experiment with a few other values for affinity and for linkage.
In [421]:
def plot_clusters(labels=None, points=None):
    """Scatter the first two embedding dimensions, colored by cluster label.

    labels : array-like, optional
        cluster assignment per row; defaults to the global `y_pred`
        (preserves the original zero-argument usage)
    points : array-like of shape (n, >=2), optional
        low-dimensional embedding; defaults to the global `Xt`
    """
    if labels is None:
        labels = y_pred
    if points is None:
        points = Xt
    for k in np.unique(labels):
        i = np.where(labels == k)[0]
        plt.plot(points[i, 0], points[i, 1], 'o', label='Cluster %s' % str(k))
    plt.legend(loc=(1, .1))
In [422]:
clf = sklearn.cluster.AgglomerativeClustering(n_clusters=3, affinity='l1',
linkage='complete')
y_pred = clf.fit_predict(X)
plot_clusters()
In [423]:
clf = sklearn.cluster.AgglomerativeClustering(n_clusters=3,
affinity='cosine', linkage='average')
y_pred = clf.fit_predict(X)
plot_clusters()
Pretty clear that distance matters (linkage matters, too...)
In [424]:
import sklearn.metrics.pairwise
In [425]:
D = sklearn.metrics.pairwise.pairwise_distances(X, metric='jaccard')
X.shape, D.shape
Out[425]:
In [426]:
clf = sklearn.cluster.AgglomerativeClustering(n_clusters=3,
affinity='precomputed', linkage='complete')
y_pred = clf.fit_predict(D)
plot_clusters()
In [427]:
import scipy.spatial
In [428]:
help(scipy.spatial.distance.minkowski)
In [429]:
D = sklearn.metrics.pairwise.pairwise_distances(X, metric='minkowski',
p=.5)
In [430]:
clf = sklearn.cluster.AgglomerativeClustering(n_clusters=3, affinity='precomputed', linkage='average')
y_pred = clf.fit_predict(D)
plot_clusters()
A few other ways:
Affinity Propagation
In [431]:
clf = sklearn.cluster.AffinityPropagation(damping=.99)
y_pred = clf.fit_predict(X)
plot_clusters()
In [432]:
clf = sklearn.cluster.AffinityPropagation(damping=.95, affinity='precomputed')
y_pred = clf.fit_predict(D)
plot_clusters()
DBSCAN
In [433]:
clf = sklearn.cluster.DBSCAN(eps=10)
y_pred = clf.fit_predict(X)
plot_clusters()
In [434]:
clf = sklearn.cluster.DBSCAN(eps=100, metric='precomputed')
y_pred = clf.fit_predict(D)
plot_clusters()
In [435]:
import sklearn.mixture
In [436]:
K = 2
# sklearn.mixture.GMM was removed in scikit-learn 0.20;
# GaussianMixture is its documented replacement
clf = sklearn.mixture.GaussianMixture(n_components=K)
clf.fit(X)
y_pred = clf.predict(X)
plot_clusters()
In [437]:
K = 2
# sklearn.mixture.DPGMM was removed in scikit-learn 0.20; the replacement is
# BayesianGaussianMixture with a Dirichlet-process weight prior
clf = sklearn.mixture.BayesianGaussianMixture(
    n_components=K, weight_concentration_prior_type='dirichlet_process')
clf.fit(X)
y_pred = clf.predict(X)
plot_clusters()
In [438]:
K = 2
# sklearn.mixture.VBGMM was removed in scikit-learn 0.20; the replacement is
# BayesianGaussianMixture with a Dirichlet-distribution weight prior
clf = sklearn.mixture.BayesianGaussianMixture(
    n_components=K, weight_concentration_prior_type='dirichlet_distribution')
clf.fit(X)
y_pred = clf.predict(X)
plot_clusters()
Distances available in sklearn
In [439]:
import sklearn.metrics.pairwise
help(sklearn.metrics.pairwise.pairwise_distances)
In [ ]: