In [364]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline
sns.set_context('talk')
sns.set_style('darkgrid')

In [365]:
def sim_data(n=200, p=20, mu=None, sigma=None,
             comp_pr=0, subs_pr=0, seed=123456):
    """ data generation procedure
    n : int, default=200
      number of facilities
    p : int, default=20
      number of resource availability indicators
    mu : list, default=[2,0,-2]
      means for the logit_love mixture components
    sigma : list, default=[.6,.4,.6]
      standard deviations for the logit_love mixture components
    comp_pr : float, default=0
      fraction of resources to be complementary
    subs_pr : float, default=0
      fraction of resources to be substitutable
    seed : int, default=123456
      random seed, for reproducibility
    """
    
    # set random seed for reproducibility
    np.random.seed(seed)

    # means and spreads for the logit_love mixture distribution
    if mu is None or sigma is None:
        mu = [2, 0, -2]
        sigma = [.6, .4, .6]

    # sample logit_love from mixture of gaussians
    cluster = np.empty(n, dtype=int)
    logit_love = np.empty(n)

    for i in range(n):
        cluster[i] = np.random.choice(range(len(mu)))
        logit_love[i] = np.random.normal(mu[cluster[i]], sigma[cluster[i]])
    
    # now generate preliminary resource availability indicators
    logit_thing = np.random.normal(0., 2., size=p)

    X = np.empty((n,p))
    for i in range(n):
        for j in range(p):
            X[i,j] = np.random.uniform() < 1 / (1 + np.exp(-(logit_love[i] + logit_thing[j])))


    # now simulate complementarity and substitutability

    complementary_to = np.empty(p)
    for j in range(p):
        if np.random.uniform() < comp_pr:
            complementary_to[j] = np.random.choice(range(p))
        else:
            complementary_to[j] = np.nan

    substitutable_to = np.empty(p)
    for j in range(p):
        if np.random.uniform() < subs_pr:
            substitutable_to[j] = np.random.choice(range(p))
        else:
            substitutable_to[j] = np.nan

    for i in range(n):
        for j in range(p):
            # only include a complementary resource if the resource it complements is present
            # (cast the index to int: the *_to arrays hold nan, so they are floats)
            j_prime = complementary_to[j]
            if not np.isnan(j_prime):
                if X[i, int(j_prime)] == False:
                    X[i, j] = False

            # only include a substitutable resource if the resource that substitutes for it is absent
            j_prime = substitutable_to[j]
            if not np.isnan(j_prime):
                if X[i, int(j_prime)] == True:
                    X[i, j] = False

    results = pd.DataFrame(index=range(n))
    results['cluster']=cluster
    results['logit_love']=logit_love
    for j in range(p):
        results['X_%d'%j] = X[:,j]
    
    # stash the other parameters on the results DataFrame as attributes, in case they are useful
    # (note: pandas does not preserve custom attributes through most operations)
    results.mu = mu
    results.sigma = sigma
    results.logit_thing = logit_thing
    results.substitutable_to = substitutable_to
    results.complementary_to = complementary_to
    return results
sim_data(n=5, p=3)


Out[365]:
cluster logit_love X_0 X_1 X_2
0 1 0.381006 1 1 0
1 0 2.021408 1 1 1
2 0 2.727267 1 1 1
3 0 1.896071 1 1 1
4 2 -1.734073 0 0 1

What will the shape of the sim_data output be for $n=19$ and $p=103$? And will it take long to make?


In [366]:
%time sim_data(n=19, p=103).shape


CPU times: user 58.2 ms, sys: 4.33 ms, total: 62.5 ms
Wall time: 48.1 ms
Out[366]:
(19, 105)
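
The two extra columns are cluster and logit_love, so the shape is $(n, p+2)$. Most of the runtime is the Python double loop over facilities and resources; a vectorized sketch of that inner step (same logistic model, written with broadcasting using the names inside sim_data; note the random draws come out in a different order, so the seed will not reproduce the loop version exactly):


In [ ]:
# hypothetical replacement for the i,j loop inside sim_data
logit = logit_love[:, None] + logit_thing[None, :]               # shape (n, p)
X = np.random.uniform(size=(n, p)) < 1 / (1 + np.exp(-logit))    # Bernoulli draws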

We will use a simulated dataset with a lot of rows and a few columns to get to know PCA:


In [367]:
df = sim_data(n=1000, p=7)

In [368]:
df.head()


Out[368]:
cluster logit_love X_0 X_1 X_2 X_3 X_4 X_5 X_6
0 1 0.381006 0 0 0 1 0 1 1
1 0 2.021408 1 0 1 1 1 1 1
2 0 2.727267 0 0 1 1 1 1 1
3 0 1.896071 1 0 1 1 1 1 0
4 2 -1.734073 0 0 0 0 0 0 0

In [369]:
X = np.array(df.filter(like='X_'))

In [370]:
plt.plot(X.T + np.random.normal(0., .2, size=(7,1000)), alpha=.2, color='k');



In [371]:
plt.plot(df.logit_love, X.sum(1), 'o')


Out[371]:
[<matplotlib.lines.Line2D at 0x7fab92cfab10>]

So what will PCA do? Let's see:


In [372]:
import sklearn.decomposition

In [373]:
pca = sklearn.decomposition.PCA()
Xt = pca.fit_transform(X)

How long does this take?


In [374]:
%%time

pca = sklearn.decomposition.PCA()
Xt = pca.fit_transform(X)


CPU times: user 1.1 ms, sys: 0 ns, total: 1.1 ms
Wall time: 742 µs
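
Under the hood, PCA diagonalizes the empirical covariance matrix. A quick numpy cross-check (a sketch; the comparison is up to sign flips and assumes no repeated eigenvalues):


In [ ]:
# eigendecomposition of the covariance matrix should match what PCA found
evals, evecs = np.linalg.eigh(np.cov(X, rowvar=False))
evals, evecs = evals[::-1], evecs[:, ::-1]   # eigh sorts ascending; flip to descending
print(np.allclose(np.abs(evecs.T), np.abs(pca.components_)))            # directions (up to sign)
print(np.allclose(evals / evals.sum(), pca.explained_variance_ratio_))  # normalized eigenvalues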

In [375]:
plt.plot(Xt[:,0], Xt[:,1], 'o')
plt.xlabel('$PC_0$')
plt.ylabel('$PC_1$')


Out[375]:
<matplotlib.text.Text at 0x7fab91a40350>

In [376]:
plt.plot(pca.explained_variance_ratio_, 's-')
plt.axis(xmin=-.5, xmax=6.5, ymax=.5)


Out[376]:
(-0.5, 6.5, 0.0, 0.5)
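
A complementary view is the cumulative explained variance (a sketch):


In [ ]:
plt.plot(np.cumsum(pca.explained_variance_ratio_), 's-')
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance ratio')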

In [377]:
plt.plot(df.logit_love, Xt[:,0], 'o')
plt.xlabel('logit(love)')
plt.ylabel('$PC_0$')


Out[377]:
<matplotlib.text.Text at 0x7fab92c02650>

The good news is that we have way more than 7 indicators of quality:


In [378]:
df = sim_data(n=1000, p=70, comp_pr=.5, subs_pr=.5)
X = np.array(df.filter(like='X_'))
plt.plot(df.logit_love, X.sum(1), 'o')


Out[378]:
[<matplotlib.lines.Line2D at 0x7fab92b43dd0>]

How long do you think this will take now?


In [379]:
X.shape


Out[379]:
(1000, 70)

In [380]:
%%time

pca = sklearn.decomposition.PCA()
Xt = pca.fit_transform(X)


CPU times: user 8.87 ms, sys: 15 µs, total: 8.89 ms
Wall time: 8.56 ms

What will the plot of the explained variance ratio look like?


In [381]:
plt.plot(pca.explained_variance_ratio_, 's-')
plt.axis(xmin=-.5)


Out[381]:
(-0.5, 70.0, 0.0, 0.30000000000000004)

And what do you think the scatter of the first two principal components will look like?


In [382]:
plt.plot(Xt[:,0], Xt[:,1], 'o')
plt.xlabel('$PC_0$')
plt.ylabel('$PC_1$')


Out[382]:
<matplotlib.text.Text at 0x7fab92df3790>

We should color that in... We did start with a mixture model, after all, and that means we know the clusters.


In [383]:
for c, dfc in df.groupby('cluster'):
    plt.plot(Xt[dfc.index, 0], Xt[dfc.index, 1], 'o', label='Cluster %s'%c)
plt.xlabel('$PC_0$')
plt.ylabel('$PC_1$')
plt.legend(loc=(1,.1))


Out[383]:
<matplotlib.legend.Legend at 0x7fab92b1ad10>

And did I actually manage to improve the quality of my quality scores?


In [384]:
plt.subplot(2,2,1)
plt.plot(df.logit_love, X.sum(1), 'o', alpha=.1)
plt.xlabel('logit(love)')
plt.ylabel('number of resources')

plt.subplot(2,2,2)
plt.plot(df.logit_love, Xt[:,0], 'o', alpha=.1)
plt.xlabel('logit(love)')
plt.ylabel('$PC_0$')


Out[384]:
<matplotlib.text.Text at 0x7fab928e43d0>

It does not seem so, at least, not dramatically.

Let's use this setup to investigate the importance of scaling:


In [385]:
df.X_0 *= 100.

What will happen now?


In [386]:
X = np.array(df.filter(like='X_'))
pca = sklearn.decomposition.PCA()
Xt = pca.fit_transform(X)

What will the explained variance ratio plot look like?


In [387]:
plt.plot(pca.explained_variance_ratio_, 's-')
plt.axis(xmin=-.5)


Out[387]:
(-0.5, 70.0, 0.0, 1.0)

What will the cluster-colored scatter of the first two principal components look like?


In [388]:
for c, dfc in df.groupby('cluster'):
    plt.plot(Xt[dfc.index, 0], Xt[dfc.index, 1], 'o', label='Cluster %s'%c, alpha=.1)
plt.xlabel('$PC_0$')
plt.ylabel('$PC_1$')
plt.legend(loc=(1,.1))


Out[388]:
<matplotlib.legend.Legend at 0x7fab92a3c450>

What is the moral of that story?


In [389]:
# Scaling is important
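
One common remedy is to standardize each column before PCA; a sketch, assuming sklearn.preprocessing is available:


In [ ]:
import sklearn.preprocessing

X_std = sklearn.preprocessing.StandardScaler().fit_transform(X)   # zero mean, unit variance per column
pca_std = sklearn.decomposition.PCA()
Xt_std = pca_std.fit_transform(X_std)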

Now, can I take another opportunity to demonstrate the kernel stuff?
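
As a reminder, the RBF kernel used below is $k(x, x') = \exp(-\gamma \lVert x - x' \rVert^2)$, so gamma sets the length scale: small values (like the .001 used here) make similarity decay slowly with distance.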


In [390]:
df = sim_data(n=1000, p=70, mu=[0], sigma=[2], subs_pr=.6, comp_pr=.5)
X = np.array(df.filter(like='X_'))
plt.plot(df.logit_love, X.sum(1), 'o')


Out[390]:
[<matplotlib.lines.Line2D at 0x7fab926b8150>]

In [391]:
pca = sklearn.decomposition.PCA()
Xt = pca.fit_transform(X)

In [392]:
plt.subplot(2,2,1)
plt.plot(df.logit_love, X.sum(1), 'o', alpha=.1)
plt.xlabel('logit(love)')
plt.ylabel('number of resources')

plt.subplot(2,2,2)
plt.plot(df.logit_love, Xt[:,0], 'o', alpha=.1)
plt.xlabel('logit(love)')
plt.ylabel('$PC_0$')


Out[392]:
<matplotlib.text.Text at 0x7fab92591690>

In [393]:
%%time

pca = sklearn.decomposition.KernelPCA(n_components=2, kernel='rbf', gamma=.001)
Xt = pca.fit_transform(X)


CPU times: user 51 ms, sys: 4.08 ms, total: 55.1 ms
Wall time: 54.6 ms

In [394]:
plt.subplot(2,2,1)
plt.plot(df.logit_love, X.sum(1), 'o', alpha=.1)
plt.xlabel('logit(love)')
plt.ylabel('number of resources')

plt.subplot(2,2,2)
plt.plot(df.logit_love, Xt[:,0], 'o', alpha=.1)
plt.xlabel('logit(love)')
plt.ylabel('$PC_0$')


Out[394]:
<matplotlib.text.Text at 0x7fab9241b990>

What about this?


In [395]:
%%time

import sklearn.metrics.pairwise   # explicit import (pairwise_distances is used just below)

pca = sklearn.decomposition.KernelPCA(n_components=2, kernel='precomputed')
D = sklearn.metrics.pairwise.pairwise_distances(X, metric='minkowski', p=.9)
Xt = pca.fit_transform(D)


CPU times: user 2.64 s, sys: 3.65 ms, total: 2.65 s
Wall time: 2.65 s

In [396]:
plt.subplot(2,2,1)
plt.plot(df.logit_love, X.sum(1), 'o', alpha=.1)
plt.xlabel('logit(love)')
plt.ylabel('number of resources')

plt.subplot(2,2,2)
plt.plot(df.logit_love, Xt[:,0], 'o', alpha=.1)
plt.xlabel('logit(love)')
plt.ylabel('$PC_0$')


Out[396]:
<matplotlib.text.Text at 0x7fab9229a7d0>

That didn't work well at all...
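
One likely reason: with kernel='precomputed', KernelPCA expects a similarity (kernel/Gram) matrix, but D is a distance matrix, so the most dissimilar pairs are being treated as the most similar. A sketch of one way to convert distances to similarities first (the bandwidth is illustrative, and the result is not guaranteed to be positive semidefinite for this metric):


In [ ]:
K_sim = np.exp(-D / D.mean())     # hypothetical distance-to-similarity conversion
kpca = sklearn.decomposition.KernelPCA(n_components=2, kernel='precomputed')
Xt_sim = kpca.fit_transform(K_sim)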

Factor Analysis

A little bit different from PCA, but think of them as very similar: http://stats.stackexchange.com/questions/123063/is-there-any-good-reason-to-use-pca-instead-of-efa


In [397]:
df = sim_data(n=1000, p=70, mu=[-1,2], sigma=[.5,.5])
X = np.array(df.filter(like='X_'))
plt.plot(df.logit_love, X.sum(1), 'o')


Out[397]:
[<matplotlib.lines.Line2D at 0x7fab92043cd0>]

In [398]:
pca = sklearn.decomposition.PCA()
Xt_pca = pca.fit_transform(X)

In [399]:
efa = sklearn.decomposition.FactorAnalysis()
Xt = efa.fit_transform(X)
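
Unlike PCA, factor analysis also fits a per-feature noise variance; a quick look at those estimates (a sketch, using sklearn's noise_variance_ attribute):


In [ ]:
plt.plot(efa.noise_variance_, 's-')
plt.xlabel('feature index $j$')
plt.ylabel('estimated noise variance')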

In [400]:
plt.subplot(2,2,1)
for c, dfc in df.groupby('cluster'):
    plt.plot(Xt_pca[dfc.index, 0], Xt_pca[dfc.index, 1], 'o', label='Cluster %s'%c, alpha=.6)
plt.xlabel('$PC_0$')
plt.ylabel('$PC_1$')

plt.subplot(2,2,2)
for c, dfc in df.groupby('cluster'):
    plt.plot(Xt[dfc.index, 0], Xt[dfc.index, 1], 'o', label='Cluster %s'%c, alpha=.6)
plt.xlabel('$F_0$')
plt.ylabel('$F_1$')
plt.legend(loc=(1,.1))


plt.subplots_adjust(wspace=.3)


There are a ton of interesting ways to embed high-dimensional data in low-dimensional spaces. I will do more next week if it seems like your projects will benefit from them...
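
For example, t-SNE (a sketch; the perplexity value is illustrative, and it is much slower than PCA):


In [ ]:
import sklearn.manifold

tsne = sklearn.manifold.TSNE(n_components=2, perplexity=30)
Xt_tsne = tsne.fit_transform(X)
plt.plot(Xt_tsne[:,0], Xt_tsne[:,1], 'o', alpha=.2)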


In [401]:
import sklearn.manifold

In [402]:
%%time

mds = sklearn.manifold.MDS()
Xt = mds.fit_transform(X)


CPU times: user 22.3 s, sys: 3.01 s, total: 25.3 s
Wall time: 25.3 s

In [403]:
for c, dfc in df.groupby('cluster'):
    plt.plot(Xt[dfc.index, 0], Xt[dfc.index, 1], 'o', label='Cluster %s'%c)
plt.xlabel('$Y_0$')
plt.ylabel('$Y_1$')
plt.legend(loc=(1,.1))


Out[403]:
<matplotlib.legend.Legend at 0x7fab92bedb90>

Not to mention ways to embed low-dimensional data into high-dimensional spaces:


In [404]:
import sklearn.ensemble

In [405]:
rte = sklearn.ensemble.RandomTreesEmbedding()
Xt = rte.fit_transform(X)

Xt


Out[405]:
<1000x310 sparse matrix of type '<type 'numpy.float64'>'
	with 10000 stored elements in Compressed Sparse Row format>

Let's shift now to clustering
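
As a reminder, k-means chooses cluster centers $\mu_k$ to minimize the within-cluster sum of squares, $\sum_k \sum_{i \in C_k} \lVert x_i - \mu_k \rVert^2$, alternating between assigning each point to its nearest center and recomputing each center as the mean of its points.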


In [406]:
import sklearn.cluster

In [407]:
K = 2
clf = sklearn.cluster.KMeans(n_clusters=K, n_init=100, n_jobs=2)

In [408]:
X.shape


Out[408]:
(1000, 70)

In [409]:
%%time

clf.fit(X)


CPU times: user 738 ms, sys: 77.7 ms, total: 816 ms
Wall time: 1.04 s
Out[409]:
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=2, n_init=100,
    n_jobs=2, precompute_distances=True, random_state=None, tol=0.0001,
    verbose=0)

In [410]:
y_pred = clf.predict(X)

We can use PCA to look at it:


In [411]:
pca = sklearn.decomposition.PCA()
Xt = pca.fit_transform(X)

In [412]:
for k in range(K):
    i = np.where(y_pred == k)[0]
    plt.plot(Xt[i,0], Xt[i,1], 'o', label='Cluster %d'%k, alpha=.5)
plt.legend(loc=(1,.1))


Out[412]:
<matplotlib.legend.Legend at 0x7fab91e69950>

In [413]:
df['y_pred'] = y_pred

In [414]:
df.groupby(['cluster', 'y_pred']).logit_love.count().unstack().fillna('')


Out[414]:
y_pred 0 1
cluster
0 8 488
1 502 2

We can hack max_iter to make a flipbook of how this works (a fuller sketch follows the cross-tab below):


In [415]:
K = 5
clf = sklearn.cluster.KMeans(n_clusters=K, max_iter=30)
y_pred = clf.fit_predict(X)

for k in range(K):
    i = np.where(y_pred == k)[0]
    plt.plot(Xt[i,0], Xt[i,1], 'o', label='Cluster %d'%k, alpha=.5)
    
plt.legend(loc=(1,.1))


Out[415]:
<matplotlib.legend.Legend at 0x7fab94ef1850>

In [416]:
df.y_pred = y_pred
df.groupby(['cluster', 'y_pred']).logit_love.count().unstack().fillna('')


Out[416]:
y_pred 0 1 2 3 4
cluster
0 144 162 7 183
1 5 231 268
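
Here is a sketch of the flipbook idea: fix the initialization and loop over max_iter, so each frame shows the same run stopped after a different number of iterations (init='random', n_init=1, and a fixed random_state keep the frames comparable):


In [ ]:
for t in range(1, 6):
    clf = sklearn.cluster.KMeans(n_clusters=K, init='random', n_init=1,
                                 max_iter=t, random_state=12345)
    y_t = clf.fit_predict(X)

    plt.figure()
    for k in range(K):
        i = np.where(y_t == k)[0]
        plt.plot(Xt[i,0], Xt[i,1], 'o', label='Cluster %d'%k, alpha=.5)
    plt.title('max_iter = %d'%t)
    plt.legend(loc=(1,.1))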

Clear-ish? Big question: how to pick $K$.


In [417]:
# try a range
y_pred = {}

for K in range(1, 10):
    clf = sklearn.cluster.KMeans(n_clusters=K)
    y_pred[K] = clf.fit_predict(X)

In [418]:
# make a clustergram
cluster_means = {}
for kk in y_pred.keys():
    g = df.filter(like='X_').groupby(y_pred[kk])
    cluster_means[kk] = g.mean().mean(axis=1)

ks = pd.Series(range(1, 10))
for i in range(len(y_pred[1])):
    # for facility i, look up its cluster's mean-of-means under each value of K,
    # with a small multiplicative jitter so overlapping lines remain visible
    yy = [cluster_means[kk][y_pred[kk][i]] * np.exp(np.random.normal(0, .02)) for kk in ks]
    plt.plot(ks, yy, 'k-', alpha=.1)

plt.xlabel('Number of clusters ($K$)')
plt.ylabel('Cluster Mean of Means')
plt.title('Clustergram!')


Out[418]:
<matplotlib.text.Text at 0x7fab91cfb7d0>
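
Another common heuristic is the average silhouette score, computed here for the clusterings we already have (a sketch; $K=1$ is excluded because the silhouette needs at least two clusters):


In [ ]:
import sklearn.metrics

scores = [sklearn.metrics.silhouette_score(X, y_pred[kk]) for kk in range(2, 10)]
plt.plot(range(2, 10), scores, 's-')
plt.xlabel('Number of clusters ($K$)')
plt.ylabel('Average silhouette score')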

Let's try agglomerative clustering


In [419]:
clf = sklearn.cluster.AgglomerativeClustering(n_clusters=3, affinity='euclidean', linkage='ward')
y_pred = clf.fit_predict(X)

In [420]:
for k in np.unique(y_pred):
    i = np.where(y_pred == k)[0]
    plt.plot(Xt[i,0], Xt[i,1], 'o', label='Cluster %d'%k)
    
plt.legend(loc=(1,.1))


Out[420]:
<matplotlib.legend.Legend at 0x7fab91d599d0>

Good time to refactor, and to experiment with a few other values for affinity and for linkage.


In [421]:
def plot_clusters():
    for k in np.unique(y_pred):
        i = np.where(y_pred == k)[0]
        plt.plot(Xt[i,0], Xt[i,1], 'o', label='Cluster %s'%str(k))

    plt.legend(loc=(1,.1))

In [422]:
clf = sklearn.cluster.AgglomerativeClustering(n_clusters=3, affinity='l1',
                                              linkage='complete')
y_pred = clf.fit_predict(X)

plot_clusters()



In [423]:
clf = sklearn.cluster.AgglomerativeClustering(n_clusters=3, 
                                              affinity='cosine', linkage='average')
y_pred = clf.fit_predict(X)

plot_clusters()


Pretty clear that distance matters (linkage matters, too...)


In [424]:
import sklearn.metrics.pairwise

In [425]:
D = sklearn.metrics.pairwise.pairwise_distances(X, metric='jaccard')
X.shape, D.shape


Out[425]:
((1000, 70), (1000, 1000))

In [426]:
clf = sklearn.cluster.AgglomerativeClustering(n_clusters=3, 
                                              affinity='precomputed', linkage='complete')
y_pred = clf.fit_predict(D)

plot_clusters()



In [427]:
import scipy.spatial

In [428]:
help(scipy.spatial.distance.minkowski)


Help on function minkowski in module scipy.spatial.distance:

minkowski(u, v, p)
    Computes the Minkowski distance between two 1-D arrays.
    
    The Minkowski distance between 1-D arrays `u` and `v`,
    is defined as
    
    .. math::
    
       {||u-v||}_p = (\sum{|u_i - v_i|^p})^{1/p}.
    
    Parameters
    ----------
    u : (N,) array_like
        Input array.
    v : (N,) array_like
        Input array.
    p : int
        The order of the norm of the difference :math:`{||u-v||}_p`.
    
    Returns
    -------
    d : double
        The Minkowski distance between vectors `u` and `v`.


In [429]:
D = sklearn.metrics.pairwise.pairwise_distances(X, metric='minkowski',
                                                p=.5)

In [430]:
clf = sklearn.cluster.AgglomerativeClustering(n_clusters=3, affinity='precomputed', linkage='average')
y_pred = clf.fit_predict(D)

plot_clusters()


A few other ways:

Affinity Propagation


In [431]:
clf = sklearn.cluster.AffinityPropagation(damping=.99)
y_pred = clf.fit_predict(X)
plot_clusters()



In [432]:
clf = sklearn.cluster.AffinityPropagation(damping=.95, affinity='precomputed')
y_pred = clf.fit_predict(D)
plot_clusters()


DBSCAN


In [433]:
clf = sklearn.cluster.DBSCAN(eps=10)
y_pred = clf.fit_predict(X)
plot_clusters()



In [434]:
clf = sklearn.cluster.DBSCAN(eps=100, metric='precomputed')
y_pred = clf.fit_predict(D)
plot_clusters()


Finally, for the statisticians, an actual statistical model:


In [435]:
import sklearn.mixture

In [436]:
K=2
clf = sklearn.mixture.GMM(n_components=K)
clf.fit(X)
y_pred = clf.predict(X)
plot_clusters()



In [437]:
K=2
clf = sklearn.mixture.DPGMM(n_components=K)
clf.fit(X)
y_pred = clf.predict(X)
plot_clusters()



In [438]:
K=2
clf = sklearn.mixture.VBGMM(n_components=K)
clf.fit(X)
y_pred = clf.predict(X)
plot_clusters()
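
Aside: GMM, DPGMM, and VBGMM were deprecated in scikit-learn 0.18 and later removed. The rough modern equivalents (a sketch, using the newer API's parameter names) are:


In [ ]:
# assumes scikit-learn >= 0.18
clf = sklearn.mixture.GaussianMixture(n_components=K)
y_pred = clf.fit(X).predict(X)

# Dirichlet-process-like behavior via the variational Bayesian mixture
clf = sklearn.mixture.BayesianGaussianMixture(
    n_components=K, weight_concentration_prior_type='dirichlet_process')
y_pred = clf.fit(X).predict(X)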


Distances available in sklearn


In [439]:
import sklearn.metrics.pairwise

help(sklearn.metrics.pairwise.pairwise_distances)


Help on function pairwise_distances in module sklearn.metrics.pairwise:

pairwise_distances(X, Y=None, metric='euclidean', n_jobs=1, **kwds)
    Compute the distance matrix from a vector array X and optional Y.
    
    This method takes either a vector array or a distance matrix, and returns
    a distance matrix. If the input is a vector array, the distances are
    computed. If the input is a distances matrix, it is returned instead.
    
    This method provides a safe way to take a distance matrix as input, while
    preserving compatibility with many other algorithms that take a vector
    array.
    
    If Y is given (default is None), then the returned matrix is the pairwise
    distance between the arrays from both X and Y.
    
    Valid values for metric are:
    
    - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
      'manhattan']. These metrics support sparse matrix inputs.
    
    - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
      'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis',
      'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
      'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
      See the documentation for scipy.spatial.distance for details on these
      metrics. These metrics do not support sparse matrix inputs.
    
    Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are
    valid scipy.spatial.distance metrics), the scikit-learn implementation
    will be used, which is faster and has support for sparse matrices (except
    for 'cityblock'). For a verbose description of the metrics from
    scikit-learn, see the __doc__ of the sklearn.pairwise.distance_metrics
    function.
    
    Parameters
    ----------
    X : array [n_samples_a, n_samples_a] if metric == "precomputed", or,              [n_samples_a, n_features] otherwise
        Array of pairwise distances between samples, or a feature array.
    
    Y : array [n_samples_b, n_features]
        A second feature array only if X has shape [n_samples_a, n_features].
    
    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by scipy.spatial.distance.pdist for its metric parameter, or
        a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
        If metric is "precomputed", X is assumed to be a distance matrix.
        Alternatively, if metric is a callable function, it is called on each
        pair of instances (rows) and the resulting value recorded. The callable
        should take two arrays from X as input and return a value indicating
        the distance between them.
    
    n_jobs : int
        The number of jobs to use for the computation. This works by breaking
        down the pairwise matrix into n_jobs even slices and computing them in
        parallel.
    
        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging. For n_jobs below -1,
        (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one
        are used.
    
    `**kwds` : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.
    
    Returns
    -------
    D : array [n_samples_a, n_samples_a] or [n_samples_a, n_samples_b]
        A distance matrix D such that D_{i, j} is the distance between the
        ith and jth vectors of the given matrix X, if Y is None.
        If Y is not None, then D_{i, j} is the distance between the ith array
        from X and the jth array from Y.


In [ ]: