In [ ]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline
sns.set_context('paper')
sns.set_style('darkgrid')

A PCA example, as it is often shown in books:


In [ ]:
X = np.random.multivariate_normal([0., 0.], [[1., .7],
                                             [.7, 2]], size=1000)
plt.plot(X[:,0], X[:,1], 'o', alpha=.5)

Pretty straightforward to do in sklearn:


In [ ]:
import sklearn.decomposition

In [ ]:
pca = sklearn.decomposition.PCA()
Xt = pca.fit_transform(X)

What does the following return?


In [ ]:
(pca.components_**2).sum(1)

In [ ]:
plt.subplot(2,2,1)
plt.plot(X[:,0], X[:,1], 'o', alpha=.1)
plt.arrow(0, 0, pca.components_[0,0], pca.components_[0,1], head_width=.2, linewidth=3, zorder=99)
plt.arrow(0, 0, pca.components_[1,0], pca.components_[1,1], head_width=.2, linewidth=3, zorder=99)

plt.subplot(2,2,2)
plt.plot(Xt[:,0], Xt[:,1], 'o', alpha=.1)

In [ ]:
def sim_data(n=200, p=20, mu=None, sigma=None,
             comp_pr=0, subs_pr=0, seed=123456):
    """ data generation procedure
    n : int, default=200
      number of facilities
    p : int, default=20
      number of resource availability indicators
    mu : list, default=[2,0,-2]
      means for the logit_love mixture
    sigma : list, default=[.6,.4,.6]
      standard deviations for the logit_love mixture
    comp_pr : float, default=0
      probability that each resource is complementary to another (randomly chosen) resource
    subs_pr : float, default=0
      probability that each resource is substitutable by another (randomly chosen) resource
    seed : int, default=123456
      random seed, for reproducibility
    """
    
    # set random seed for reproducibility
    np.random.seed(seed)

    # means and spreads for the logit_love mixture distribution
    if mu is None or sigma is None:
        mu = [2, 0, -2]
        sigma = [.6, .4, .6]

    # sample logit_love from mixture of gaussians
    cluster = np.empty(n, dtype=int)
    logit_love = np.empty(n)

    for i in range(n):
        cluster[i] = np.random.choice(range(len(mu)))
        logit_love[i] = np.random.normal(mu[cluster[i]], sigma[cluster[i]])
    
    # now generate preliminary resource availability indicators
    logit_thing = np.random.normal(0., 2., size=p)

    X = np.empty((n,p))
    for i in range(n):
        for j in range(p):
            X[i,j] = np.random.uniform() < 1 / (1 + np.exp(-( \
                logit_love[i] + logit_thing[j])))


    # now simulate complementarity and substitutability

    complementary_to = np.empty(p)
    for j in range(p):
        if np.random.uniform() < comp_pr:
            complementary_to[j] = np.random.choice(range(p))
        else:
            complementary_to[j] = np.nan

    substitutable_to = np.empty(p)
    for j in range(p):
        if np.random.uniform() < subs_pr:
            substitutable_to[j] = np.random.choice(range(p))
        else:
            substitutable_to[j] = np.nan

    for i in range(n):
        for j in range(p):
            # only include a complementary resource if the resource it complements is present
            j_prime = complementary_to[j]
            if not np.isnan(j_prime):
                if X[i,int(j_prime)] == False:  # int cast: j_prime is a float because of the nan sentinel
                    X[i,j] = False

            # only include a substitutable resource if the resource that substitutes for it is absent
            j_prime = substitutable_to[j]
            if not np.isnan(j_prime):
                if X[i,int(j_prime)] == True:
                    X[i,j] = False

    results = pd.DataFrame(index=range(n))
    results['cluster']=cluster
    results['logit_love']=logit_love
    for j in range(p):
        results['X_%d'%j] = X[:,j]
    
    # stash the other simulation parameters on the results DataFrame as attributes (not columns), in case they are useful
    results.mu = mu
    results.sigma = sigma
    results.logit_thing = logit_thing
    results.substitutable_to = substitutable_to
    results.complementary_to = complementary_to
    return results
sim_data(n=5, p=3)

What will the shape of the sim_data result be for $n=19$ and $p=103$? And will it take long to make?


In [ ]:
sim_data(n=19, p=103).shape

We will use a simulated dataset with a lot of rows and a few columns for getting to know PCA:


In [ ]:
df = sim_data(n=1000, p=7)

In [ ]:
df.head()

In [ ]:
X = np.array(df.filter(like='X_'))

In [ ]:
plt.plot(X.T + np.random.normal(0., .2, size=(7,1000)), alpha=.2);

In [ ]:
plt.plot(df.logit_love, X.sum(1), 'o')

So what will PCA do? Let's see:


In [ ]:
import sklearn.decomposition

In [ ]:
pca = sklearn.decomposition.PCA()
Xt = pca.fit_transform(X)

How long does this take?


In [ ]:
%%time

pca = sklearn.decomposition.PCA()
Xt = pca.fit_transform(X)

What does it look like?


In [ ]:
plt.plot(Xt[:,0], Xt[:,1], 'o')
plt.xlabel('$PC_0$')
plt.ylabel('$PC_1$')

How much variance is explained by each principal component?


In [ ]:
plt.plot(pca.explained_variance_ratio_, 's-')
plt.axis(xmin=-.5, xmax=6.5, ymax=.5)

And how does it compare to logit(love)?


In [ ]:
plt.plot(df.logit_love, Xt[:,0], 'o')
plt.xlabel('logit(love)')
plt.ylabel('$PC_0$')

The good news is that we have way more than 7 indicators of quality:


In [ ]:
df = sim_data(n=1000, p=70)
X = np.array(df.filter(like='X_'))
plt.plot(df.logit_love, X.sum(1), 'o')

How long do you think this will take now?


In [ ]:
X.shape

In [ ]:
%%time

pca = sklearn.decomposition.PCA()
Xt = pca.fit_transform(X)

What will the plot of the explained variance ratio look like?


In [ ]:
plt.plot(pca.explained_variance_ratio_, 's-')
plt.axis(xmin=-.5)

And what do you think the scatter of the first two principal components will look like?


In [ ]:
plt.plot(Xt[:,0], Xt[:,1], 'o')
plt.xlabel('$PC_0$')
plt.ylabel('$PC_1$')

We should color that in... We did start with a mixture model, after all, and that means we know the clusters.


In [ ]:
for c, dfc in df.groupby('cluster'):
    plt.plot(Xt[dfc.index, 0], Xt[dfc.index, 1], 'o', label='Cluster %s'%c)
plt.xlabel('$PC_0$')
plt.ylabel('$PC_1$')
plt.legend(loc=(1,.1))

And did I actually manage to improve the quality of my quality scores?


In [ ]:
plt.subplot(2,2,1)
plt.plot(df.logit_love, X.sum(1), 'o', alpha=.1)
plt.xlabel('logit(love)')
plt.ylabel('number of resources')

plt.subplot(2,2,2)
plt.plot(df.logit_love, Xt[:,0], 'o', alpha=.1)
plt.xlabel('logit(love)')
plt.ylabel('$PC_0$')

It does not seem so, at least not dramatically.
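
One quick way to put a number on that is to compare correlations (just a sanity check with np.corrcoef; the sign of a principal component is arbitrary, so compare magnitudes):


In [ ]:
# correlation of logit(love) with the raw resource count vs. with PC_0
print(np.corrcoef(df.logit_love, X.sum(1))[0, 1])
print(np.corrcoef(df.logit_love, Xt[:, 0])[0, 1])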

Let's use this setup to investigate the importance of scaling:


In [ ]:
# change the units of df.X_0, so that it is 100X larger
df.X_0 *= 100.

In [ ]:
np.all(df['X_0'] == df.X_0)

In [ ]:
# note: X_71 is not an existing column, so this attribute assignment creates a plain
# Python attribute on the DataFrame rather than a new column
df.X_71 = 'abie'

In [ ]:
df.X_71

In [ ]:
df.mu

What will happen now?


In [ ]:
X = np.array(df.filter(like='X_'))
pca = sklearn.decomposition.PCA()
Xt = pca.fit_transform(X)

What will the explained variance ratio plot look like?


In [ ]:
plt.plot(pca.explained_variance_ratio_, 's-')
plt.axis(xmin=-.5)

What will the cluster-colored scatter of the first two principal components look like?


In [ ]:
for c, dfc in df.groupby('cluster'):
    plt.plot(Xt[dfc.index, 0], Xt[dfc.index, 1], 'o', label='Cluster %s'%c, alpha=.1)
plt.xlabel('$PC_0$')
plt.ylabel('$PC_1$')
plt.legend(loc=(1,.1))

What is the moral of that story?

Scaling matters a lot!
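
If we want PCA to ignore units like that, the usual fix is to standardize the columns first. Here is a minimal sketch with sklearn.preprocessing.StandardScaler (not part of the original analysis, just the standard recipe):


In [ ]:
import sklearn.preprocessing

# z-score each column so no single indicator dominates the variance
X_scaled = sklearn.preprocessing.StandardScaler().fit_transform(X)
pca = sklearn.decomposition.PCA()
Xt_scaled = pca.fit_transform(X_scaled)
plt.plot(pca.explained_variance_ratio_, 's-')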

Now, can I take another opportunity to demonstrate the kernel stuff? Not really... this example does not make a good case for the utility of kernel PCA.


In [ ]:
df = sim_data(n=1000, p=70, mu=[0], sigma=[2], subs_pr=.6, comp_pr=.5)
X = np.array(df.filter(like='X_'))
plt.plot(df.logit_love, X.sum(1), 'o')

Have a look at plain old PCA on this data:


In [ ]:
pca = sklearn.decomposition.PCA()
Xt = pca.fit_transform(X)

In [ ]:
plt.subplot(2,2,1)
plt.plot(df.logit_love, X.sum(1), 'o', alpha=.1)
plt.xlabel('logit(love)')
plt.ylabel('number of resources')

plt.subplot(2,2,2)
plt.plot(df.logit_love, Xt[:,0], 'o', alpha=.1)
plt.xlabel('logit(love)')
plt.ylabel('$PC_0$')

Compare that to kernel PCA:


In [ ]:
%%time

pca = sklearn.decomposition.KernelPCA(n_components=2, kernel='rbf', gamma=.001)
Xt = pca.fit_transform(X)

In [ ]:
plt.subplot(2,2,1)
plt.plot(df.logit_love, X.sum(1), 'o', alpha=.1)
plt.xlabel('logit(love)')
plt.ylabel('number of resources')

plt.subplot(2,2,2)
plt.plot(df.logit_love, Xt[:,0], 'o', alpha=.1)
plt.xlabel('logit(love)')
plt.ylabel('$PC_0$')

And here is a custom kernel that I was messing around with:


In [ ]:
import sklearn.metrics.pairwise

In [ ]:
%%time

# note: kernel='precomputed' expects a kernel (similarity) matrix; feeding it a
# distance matrix like this is just something to mess around with, not a proper kernel
pca = sklearn.decomposition.KernelPCA(n_components=2, kernel='precomputed')
D = sklearn.metrics.pairwise.pairwise_distances(X, metric='minkowski', p=.9)
Xt = pca.fit_transform(D)

Factor Analysis

A little bit different from PCA, but think of them as very similar: http://stats.stackexchange.com/questions/123063/is-there-any-good-reason-to-use-pca-instead-of-efa


In [ ]:
df = sim_data(n=1000, p=70, mu=[-1,2], sigma=[.5,.5])
X = np.array(df.filter(like='X_'))
plt.plot(df.logit_love, X.sum(1), 'o')

In [ ]:
pca = sklearn.decomposition.PCA()
Xt_pca = pca.fit_transform(X)

In [ ]:
efa = sklearn.decomposition.FactorAnalysis()
Xt = efa.fit_transform(X)

Have a look at how they compare:


In [ ]:
plt.subplot(2,2,1)
for c, dfc in df.groupby('cluster'):
    plt.plot(Xt_pca[dfc.index, 0], Xt_pca[dfc.index, 1], 'o', label='Cluster %s'%c, alpha=.6)
plt.xlabel('$PC_0$')
plt.ylabel('$PC_1$')

plt.subplot(2,2,2)
for c, dfc in df.groupby('cluster'):
    plt.plot(Xt[dfc.index, 0], Xt[dfc.index, 1], 'o', label='Cluster %s'%c, alpha=.6)
plt.xlabel('$F_0$')
plt.ylabel('$F_1$')
plt.legend(loc=(1,.1))

There are a ton of interesting ways to embed high-dimensional data in low-dimensional spaces. I will do more next week if it seems like your projects will benefit from them...


In [ ]:
import sklearn.manifold

In [ ]:
%%time

mds = sklearn.manifold.MDS()
Xt = mds.fit_transform(X)

In [ ]:
for c, dfc in df.groupby('cluster'):
    plt.plot(Xt[dfc.index, 0], Xt[dfc.index, 1], 'o', label='Cluster %s'%c)
plt.xlabel('$Y_0$')
plt.ylabel('$Y_1$')
plt.legend(loc=(1,.1))
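
Another option from the same sklearn.manifold module is t-SNE, which preserves local neighborhoods rather than global distances; a minimal sketch with default parameters (not tuned for this data):


In [ ]:
%%time

tsne = sklearn.manifold.TSNE(n_components=2)
Xt_tsne = tsne.fit_transform(X)

In [ ]:
for c, dfc in df.groupby('cluster'):
    plt.plot(Xt_tsne[dfc.index, 0], Xt_tsne[dfc.index, 1], 'o', label='Cluster %s'%c)
plt.legend(loc=(1,.1))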

Not to mention ways to embed low-dimensional data into high-dimensional spaces:


In [ ]:
import sklearn.ensemble

In [ ]:
rte = sklearn.ensemble.RandomTreesEmbedding()
Xt = rte.fit_transform(X)

Xt

Let's shift now to clustering


In [ ]:
import sklearn.cluster

In [ ]:
K = 2
# note: n_jobs was removed from KMeans in scikit-learn 1.0, so it is omitted here
clf = sklearn.cluster.KMeans(n_clusters=K, n_init=100)

In [ ]:
%%time

clf.fit(X)

In [ ]:
y_pred = clf.predict(X)

We can use PCA to look at it:


In [ ]:
pca = sklearn.decomposition.PCA()
Xt = pca.fit_transform(X)

In [ ]:
for k in range(K):
    i = np.where(y_pred == k)[0]
    plt.plot(Xt[i,0], Xt[i,1], 'o', label='Cluster %d'%k, alpha=.5)
plt.legend(loc=(1,.1))

We can hack max_iter to make a flipbook of how this works:


In [ ]:
K = 5
clf = sklearn.cluster.KMeans(n_clusters=K, max_iter=1)
y_pred = clf.fit_predict(X)

for k in range(K):
    i = np.where(y_pred == k)[0]
    plt.plot(Xt[i,0], Xt[i,1], 'o', label='Cluster %d'%k, alpha=.5)
    
plt.legend(loc=(1,.1))
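
To see the actual flipbook, one rough approach is to fix the initialization (a single random init with an arbitrary seed) and re-fit with increasing max_iter:


In [ ]:
K = 5
for frame, max_iter in enumerate([1, 2, 3, 5]):
    # same data, same initialization, more and more Lloyd iterations
    clf = sklearn.cluster.KMeans(n_clusters=K, init='random', n_init=1,
                                 max_iter=max_iter, random_state=12345)
    y_flip = clf.fit_predict(X)

    plt.subplot(2, 2, frame+1)
    for k in range(K):
        i = np.where(y_flip == k)[0]
        plt.plot(Xt[i,0], Xt[i,1], 'o', alpha=.5)
    plt.title('max_iter=%d' % max_iter)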

Clear-ish? Big question: how to pick $K$.


In [ ]:
# try a range
y_pred = {}

for K in range(1, 10):
    clf = sklearn.cluster.KMeans(n_clusters=K)
    y_pred[K] = clf.fit_predict(X)

In [ ]:
# make a clustergram: for each K, compute each cluster's mean (over indicators)
# of the column means, then trace each facility's assigned-cluster value across K
cluster_means = {}
for K in y_pred.keys():
    g = df.filter(like='X_').groupby(y_pred[K])
    cluster_means[K] = g.mean().mean(axis=1)

K_vals = pd.Series(range(1,10))
for i in range(len(y_pred[1])):
    # a little multiplicative jitter so overlapping lines stay visible
    yy = [cluster_means[K][y_pred[K][i]]*np.exp(np.random.normal(0, .02)) for K in K_vals]
    plt.plot(K_vals, yy, 'k-', alpha=.1)

plt.xlabel('Number of clusters ($K$)')
plt.ylabel('Cluster Mean of Means')
plt.title('Clustergram!')

TODO: Refactor this into a function that takes a list of clusterings.
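
Here is one rough way that refactor could look; the name plot_clustergram and its signature (an indicator DataFrame plus a dict mapping K to labels) are just one possible choice:


In [ ]:
def plot_clustergram(X_df, clusterings, jitter_sd=.02):
    """ sketch of a clustergram plotter

    X_df : DataFrame of indicator columns
    clusterings : dict mapping number of clusters K to an array of cluster labels
    """
    # per-K, per-cluster mean (over indicators) of the column means
    cluster_means = {K: X_df.groupby(labels).mean().mean(axis=1)
                     for K, labels in clusterings.items()}

    K_vals = sorted(clusterings.keys())
    for i in range(len(X_df)):
        # trace row i's assigned-cluster value across K, with a little jitter
        yy = [cluster_means[K][clusterings[K][i]] * np.exp(np.random.normal(0, jitter_sd))
              for K in K_vals]
        plt.plot(K_vals, yy, 'k-', alpha=.1)

    plt.xlabel('Number of clusters ($K$)')
    plt.ylabel('Cluster Mean of Means')
    plt.title('Clustergram!')

plot_clustergram(df.filter(like='X_'), y_pred)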

Let's try agglomerative clustering


In [ ]:
# note: in newer scikit-learn versions the `affinity` parameter is named `metric`
clf = sklearn.cluster.AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='ward')
y_pred = clf.fit_predict(X)

In [ ]:
for k in np.unique(y_pred):
    i = np.where(y_pred == k)[0]
    plt.plot(Xt[i,0], Xt[i,1], 'o', label='Cluster %d'%k)
    
plt.legend(loc=(1,.1))

Good time to refactor, and to experiment with a few other values for affinity and for linkage.


In [ ]:
def plot_clusters():
    for k in np.unique(y_pred):
        i = np.where(y_pred == k)[0]
        plt.plot(Xt[i,0], Xt[i,1], 'o', label='Cluster %s'%str(k))

    plt.legend(loc=(1,.1))

In [ ]:
clf = sklearn.cluster.AgglomerativeClustering(n_clusters=3, affinity='l1', linkage='complete')
y_pred = clf.fit_predict(X)

plot_clusters()

In [ ]:
clf = sklearn.cluster.AgglomerativeClustering(n_clusters=3, affinity='cosine', linkage='average')
y_pred = clf.fit_predict(X)

plot_clusters()

Pretty clear that distance matters (linkage matters, too...)


In [ ]:
import sklearn.metrics.pairwise

In [ ]:
D = sklearn.metrics.pairwise.pairwise_distances(X, metric='jaccard')
X.shape, D.shape

In [ ]:
clf = sklearn.cluster.AgglomerativeClustering(n_clusters=3, affinity='precomputed', linkage='complete')
y_pred = clf.fit_predict(D)

plot_clusters()

In [ ]:
import scipy.spatial

In [ ]:
help(scipy.spatial.distance.minkowski)

In [ ]:
# note: Minkowski with p < 1 is not a true metric (the triangle inequality fails),
# but it can still serve as a dissimilarity
D = sklearn.metrics.pairwise.pairwise_distances(X, metric='minkowski', p=.5)

In [ ]:
clf = sklearn.cluster.AgglomerativeClustering(n_clusters=3, affinity='precomputed', linkage='average')
y_pred = clf.fit_predict(D)

plot_clusters()

A few other ways:

Affinity Propagation


In [ ]:
clf = sklearn.cluster.AffinityPropagation(damping=.99)
y_pred = clf.fit_predict(X)
plot_clusters()

In [ ]:
clf = sklearn.cluster.AffinityPropagation(damping=.95, affinity='precomputed')
# affinity='precomputed' expects similarities (larger = more alike), so negate the distances
y_pred = clf.fit_predict(-D)
plot_clusters()

DBSCAN


In [ ]:
clf = sklearn.cluster.DBSCAN(eps=10)
y_pred = clf.fit_predict(X)
plot_clusters()

In [ ]:
clf = sklearn.cluster.DBSCAN(eps=100, metric='precomputed')
y_pred = clf.fit_predict(D)
plot_clusters()

Finally, for the statisticians, an actual statistical model (Gaussian Mixture Model):


In [ ]:
import sklearn.mixture

In [ ]:
K = 2
# sklearn.mixture.GMM was removed in scikit-learn 0.20; GaussianMixture is the replacement
clf = sklearn.mixture.GaussianMixture(n_components=K)
clf.fit(X)
y_pred = clf.predict(X)
plot_clusters()

In [ ]:
K = 2
# sklearn.mixture.DPGMM was removed; BayesianGaussianMixture with a Dirichlet process
# prior is the closest current equivalent
clf = sklearn.mixture.BayesianGaussianMixture(
    n_components=K, weight_concentration_prior_type='dirichlet_process')
clf.fit(X)
y_pred = clf.predict(X)
plot_clusters()

In [ ]:
K = 2
# sklearn.mixture.VBGMM was removed; BayesianGaussianMixture with a finite Dirichlet
# prior plays the same role
clf = sklearn.mixture.BayesianGaussianMixture(
    n_components=K, weight_concentration_prior_type='dirichlet_distribution')
clf.fit(X)
y_pred = clf.predict(X)
plot_clusters()
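
Since these are likelihood-based models, they also offer one more statistical answer to the how-to-pick-$K$ question: compare BIC across component counts (a quick sketch; lower is better):


In [ ]:
# fit GaussianMixture for a range of K and compare BIC
bics = []
for K in range(1, 10):
    clf = sklearn.mixture.GaussianMixture(n_components=K)
    clf.fit(X)
    bics.append(clf.bic(X))

plt.plot(range(1, 10), bics, 's-')
plt.xlabel('Number of components ($K$)')
plt.ylabel('BIC')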

Here is the jackpot of distances available in sklearn:


In [ ]:
import sklearn.metrics.pairwise

help(sklearn.metrics.pairwise.pairwise_distances)
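
For a quick programmatic listing of the metrics sklearn implements natively (scipy.spatial.distance adds many more), something like this should do it:


In [ ]:
# names of the pairwise distance functions built into sklearn
sorted(sklearn.metrics.pairwise.distance_metrics().keys())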

In [ ]: