In [1]:
import pandas as pd
import numpy as np
import pylab as pl
%matplotlib inline
pl.rcParams['font.family'] = 'serif'  # matplotlib expects the lowercase generic family name

Not-yet-automated initial pipeline:


In [2]:
path = '../data/stem-cell/'
file_names=['NG_norm_concat.csv','NN_norm_concat.csv','4FI_norm_concat.csv']

Okay, so these look like the actual timestamps of when the cells went through the machine, not the barcode labels.

  • Why, then, are the timestamps wonky for the later cells in each category?

In [3]:
NN_filenames = ['NN_00.export.csv',
'NN_02.export.csv',
'NN_04.export.csv',
'NN_06.export.csv',
'NN_08.export.csv',
'NN_10.export.csv',
'NN_12.export.csv',
'NN_14.export.csv',
'NN_16.export.csv',
'NN_18.export.csv',
'NN_20.export.csv',
'NN_22.export.csv',
'NN_24.export.csv',
'NN_30.export.csv']

In [4]:
NG_filenames = ['NG_00.export.csv',
'NG_02.export.csv',
'NG_04.export.csv',
'NG_06.export.csv',
'NG_08.export.csv',
'NG_10.export.csv',
'NG_12.export.csv',
'NG_14.export.csv',
'NG_16.export.csv',
'NG_18.export.csv',
'NG_20.export.csv',
'NG_22.export.csv',
'NG_24.export.csv',
'NG_30.export.csv']

In [5]:
Oct4_filenames = [
'4FI_00.export.csv',
'4FI_01.export.csv',
'4FI_02.export.csv',
'4FI_04.export.csv',
'4FI_06.export.csv',
'4FI_08.export.csv',
'4FI_10.export.csv',
'4FI_12.export.csv',
'4FI_14.export.csv',
'4FI_16.export.csv',
'4FI_17.export.csv',
'4FI_18.export.csv',
'4FI_20.export.csv']
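
These lists follow a regular pattern, so they could equally be generated (a sketch; the day numbers are read straight off the filenames above):

In [ ]:
ng_days = [0,2,4,6,8,10,12,14,16,18,20,22,24,30]
oct_days = [0,1,2,4,6,8,10,12,14,16,17,18,20]
NG_filenames = ['NG_{0:02d}.export.csv'.format(d) for d in ng_days]
NN_filenames = ['NN_{0:02d}.export.csv'.format(d) for d in ng_days]
Oct4_filenames = ['4FI_{0:02d}.export.csv'.format(d) for d in oct_days]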

In [6]:
# extract the day number from each filename, e.g. 'NG_08.export.csv' -> 8
times = [int(s[3:5]) for s in NG_filenames]
print(times)
pl.plot(times)


[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 30]
Out[6]:
[<matplotlib.lines.Line2D at 0x10a755ac8>]

In [11]:
times_nn = [int(s[3:5]) for s in NN_filenames]

In [7]:
times_oct = [int(s[4:6]) for s in Oct4_filenames]
print(times_oct)
pl.plot(times_oct)


[0, 1, 2, 4, 6, 8, 10, 12, 14, 16, 17, 18, 20]
Out[7]:
[<matplotlib.lines.Line2D at 0x10a850828>]

In [143]:
# load every timepoint file for each cell line
NG = [pd.read_csv(path+'NG/'+fname) for fname in NG_filenames]
NN = [pd.read_csv(path+'NN/'+fname) for fname in NN_filenames]
Oct4 = [pd.read_csv(path+'4FI/'+fname) for fname in Oct4_filenames]

In [144]:
NG[0].columns


Out[144]:
Index(['Time', ' "Cell_length"', ' "BC-1(Pd102)Dd"', ' "BC-2(Pd104)Dd"', ' "BC-3(Pd105)Dd"', ' "BC-4(Pd106)Dd"', ' "BC-5(Pd108)Dd"', ' "BC-6(Pd110)Dd"', ' "pPLK1(In113)Dd"', ' "H3K9ac(In115)Dd"', ' "IdU(I127)Dd"', ' "H4Kac(La139)Dd"', ' "cCasp3(Pr141)Dd"', ' "CD54(Nd142)Dd"', ' "pStat3-705(Nd143)Dd"', ' "Thy1(Nd144)Dd"', ' "pAMPK(Nd145)Dd"', ' "CD24(Nd146)Dd"', ' "c-Myc(Sm147)Dd"', ' "CD73(Nd148)Dd"', ' "pS6(Sm149)Dd"', ' "Oct4(Nd150)Dd"', ' "pErk(Eu151)Dd"', ' "Ki67(Sm152)Dd"', ' "pStat3-727(Eu153)Dd"', ' "pSrc(Sm154)Dd"', ' "Ox(Gd155)Dd"', ' "Nanog(Gd156)Dd"', ' "pRb(Gd158)Dd"', ' "pAkt(Tb159)Dd"', ' "Sox2(Gd160)Dd"', ' "GFP(Dy162)Dd"', ' "CyclinB1(Dy164)Dd"', ' "SSEA1(Ho165)Dd"', ' "IKBa(Er166)Dd"', ' "Lin28(Er167)Dd"', ' "CD140a(Er168)Dd"', ' "EpCAM(Tm169)Dd"', ' "bCatenin(Er170)Dd"', ' "CD44(Yb171)Dd"', ' "MEFSK4(Yb172)Dd"', ' "Klf4(Yb174)Dd"', ' "p53(Lu175)Dd"', ' "pH3(Yb176)Dd"', ' "DNA-1(Ir191)Dd"', ' "DNA-2(Ir193)Dd"', ' "beadDist"', ' "barcode"', ' "Event #"'], dtype='object')

In [145]:
NG[0].columns[1:-3]


Out[145]:
Index([' "Cell_length"', ' "BC-1(Pd102)Dd"', ' "BC-2(Pd104)Dd"', ' "BC-3(Pd105)Dd"', ' "BC-4(Pd106)Dd"', ' "BC-5(Pd108)Dd"', ' "BC-6(Pd110)Dd"', ' "pPLK1(In113)Dd"', ' "H3K9ac(In115)Dd"', ' "IdU(I127)Dd"', ' "H4Kac(La139)Dd"', ' "cCasp3(Pr141)Dd"', ' "CD54(Nd142)Dd"', ' "pStat3-705(Nd143)Dd"', ' "Thy1(Nd144)Dd"', ' "pAMPK(Nd145)Dd"', ' "CD24(Nd146)Dd"', ' "c-Myc(Sm147)Dd"', ' "CD73(Nd148)Dd"', ' "pS6(Sm149)Dd"', ' "Oct4(Nd150)Dd"', ' "pErk(Eu151)Dd"', ' "Ki67(Sm152)Dd"', ' "pStat3-727(Eu153)Dd"', ' "pSrc(Sm154)Dd"', ' "Ox(Gd155)Dd"', ' "Nanog(Gd156)Dd"', ' "pRb(Gd158)Dd"', ' "pAkt(Tb159)Dd"', ' "Sox2(Gd160)Dd"', ' "GFP(Dy162)Dd"', ' "CyclinB1(Dy164)Dd"', ' "SSEA1(Ho165)Dd"', ' "IKBa(Er166)Dd"', ' "Lin28(Er167)Dd"', ' "CD140a(Er168)Dd"', ' "EpCAM(Tm169)Dd"', ' "bCatenin(Er170)Dd"', ' "CD44(Yb171)Dd"', ' "MEFSK4(Yb172)Dd"', ' "Klf4(Yb174)Dd"', ' "p53(Lu175)Dd"', ' "pH3(Yb176)Dd"', ' "DNA-1(Ir191)Dd"', ' "DNA-2(Ir193)Dd"'], dtype='object')

In [146]:
pl.plot(times,[len(n) for n in NG],label='Nanog-GFP')
pl.plot(times,[len(n) for n in NN],label='Nanog-Neo')
pl.plot(times_oct,[len(n) for n in Oct4],label='Oct4-GFP')
pl.legend(loc='best')
pl.ylabel('Number of cells')
pl.xlabel('Day')
pl.title('Cells per day')


Out[146]:
<matplotlib.text.Text at 0x135f64940>

In [147]:
max([len(n) for n in NG]) / min([len(n) for n in NG]),max([len(n) for n in NN]) / min([len(n) for n in NN])


Out[147]:
(10.929617720614505, 12.385311871227364)

In [148]:
max([len(n) for n in Oct4]) / min([len(n) for n in Oct4])


Out[148]:
4.937889660039995

In [574]:
# construct a numpy array containing all of the NG samples
all_data = np.vstack(NG)  # pass the list itself; handing vstack a generator is deprecated
data = all_data[:,2:-3]
print(data.shape)


(203017, 44)

In [592]:
# scaled arcsinh transform, a standard variance-stabilizer for mass cytometry data
data_p = np.arcsinh(data*15)
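
Side note: np.arcsinh(z) equals log(z + sqrt(z**2 + 1)), so the transform is roughly linear for small z and roughly log(2z) for large z; the multiplier (here 15) sets where that crossover happens. A quick check:

In [ ]:
z = np.array([0.01, 1.0, 100.0])
print(np.arcsinh(z))
print(np.log(z + np.sqrt(z**2 + 1)))  # identical by definition
print(np.log(2*z))                    # close once z is large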

In [593]:
from time import time
from scipy.cluster import hierarchy
from scipy.spatial import distance  # for squareform below
from sklearn.manifold import MDS

def embedding(X,labels=None,linkage='single',
              profile=True):
    ''' (1) compute linkage matrix,
    (2) compute pairwise distances using linkage matrix,
    (3) compute embedding using distances'''
    t = time()
    n = len(X)
    c = hierarchy.linkage(X,linkage)
    d = distance.squareform(hierarchy.cophenet(c))
    t1 = time()
    print('Computed cophenetic correlations in {0:.2f}s'.format(t1-t))
    
    mds = MDS(dissimilarity='precomputed')
    X_ = mds.fit_transform(d[:n,:n])
    t2 = time()
    print('Computed MDS embedding in {0:.2f}s'.format(t2-t1))
    if labels is None:
        pl.scatter(X_[:,0],X_[:,1])
    else:
        pl.scatter(X_[:,0],X_[:,1],c=labels,
           linewidths=0)
    return X_,c

In [594]:
np.random.seed(0)
sample_ind = np.random.rand(len(data_p)) < 0.005
sum(sample_ind)


Out[594]:
1003

In [595]:
sample = data_p[sample_ind]
results = embedding(sample)


Computed cophenetic correlations in 0.05s
Computed MDS embedding in 27.71s

In [596]:
t = day[sample_ind]
X_ = results[0]
pl.scatter(X_[:,0],X_[:,1],c=t,
           linewidths=0)
pl.colorbar()


Out[596]:
<matplotlib.colorbar.Colorbar at 0x13882c3c8>

In [591]:
import seaborn  # seaborn's first use in this notebook
seaborn.kdeplot(data[:,10])
pl.figure()
seaborn.kdeplot(np.log(data[:,10] - np.min(data[:,10])+1))
pl.figure()
seaborn.kdeplot(np.arcsinh(data[:,10]*15))


Out[591]:
<matplotlib.axes._subplots.AxesSubplot at 0x16a299c88>

In [531]:
seaborn.kdeplot(np.log(data[:,10:12] - np.min(data[:,10:12])+1))


Out[531]:
<matplotlib.axes._subplots.AxesSubplot at 0x12d5bd9b0>

In [570]:
signed_log_data = np.log(abs(data[:,10]))*(2*(data[:,10]>0)-1)
seaborn.kdeplot(signed_log_data)


Out[570]:
<matplotlib.axes._subplots.AxesSubplot at 0x19b269be0>

In [552]:
lim=5
subset = abs(data[:,10])<lim
seaborn.kdeplot(data[:,10][subset])
pl.xlim(-lim,lim)
sum(subset)


Out[552]:
91163

In [571]:
def signed_log(data):
    # note: log|x| is negative for |x| < 1 and -inf at 0, so this map is singular
    # at zero and not monotonic across it (see the check below)
    return np.log(abs(data))*(2*(data>=0)-1)
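
A quick check of that non-monotonicity:

In [ ]:
x = np.array([-0.5, 0.5, 2.0])
print(signed_log(x))  # [ 0.69, -0.69,  0.69]: ordering near zero is not preserved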

In [573]:
x = np.arange(-100,100,0.0001)
y = signed_log(x)
pl.plot(x,y)
#pl.xlim(-1,1)
pl.figure()
pl.plot(x,np.arcsinh(x))


Out[573]:
[<matplotlib.lines.Line2D at 0x151662588>]

In [564]:
x = np.arange(-100,100,0.0001)
y = np.log(x)  # nan for x <= 0; those points are simply not drawn
pl.plot(x,y)
pl.xlim(0,2)


Out[564]:
(0, 2)

In [402]:
pl.hist(data[:,10],bins=100);
pl.figure()
pl.hist(data[:,10],bins=100,log=True);



In [151]:
processed = 1/ (1+np.exp(-data*10))
pl.hist(processed[:,10],bins=50);



In [152]:
processed = np.log(data - np.min(data) + 1)
pl.hist(processed[:,10],bins=50,normed=True,log=True);
#pl.xscale('log')


Out[152]:
(array([  1.20841905e-04,   0.00000000e+00,   1.20841905e-04,
          1.20841905e-04,   8.05612698e-05,   2.81964444e-04,
          5.23648254e-04,   1.36954159e-03,   6.68658539e-03,
          5.30898768e-02,   3.62239721e+00,   2.06458394e+00,
          5.95508906e-01,   3.48427492e-01,   2.73988879e-01,
          2.11271930e-01,   1.67084074e-01,   1.38766787e-01,
          1.10932868e-01,   9.38135987e-02,   7.60095580e-02,
          6.17904939e-02,   5.38954895e-02,   4.33419631e-02,
          3.88305320e-02,   3.07341244e-02,   2.70685866e-02,
          2.44503454e-02,   2.13084559e-02,   1.92541435e-02,
          1.52663606e-02,   1.34940127e-02,   1.20439098e-02,
          9.74791364e-03,   8.01584634e-03,   7.69360126e-03,
          6.76714666e-03,   5.39760508e-03,   4.18918603e-03,
          3.10160889e-03,   2.01403174e-03,   1.57094476e-03,
          1.04729651e-03,   8.86173968e-04,   2.41683809e-04,
          1.61122540e-04,   4.02806349e-05,   4.02806349e-05,
          4.02806349e-05,   4.02806349e-05]),
 array([ 3.36575672,  3.48804118,  3.61032564,  3.73261011,  3.85489457,
         3.97717904,  4.0994635 ,  4.22174797,  4.34403243,  4.4663169 ,
         4.58860136,  4.71088583,  4.83317029,  4.95545476,  5.07773922,
         5.20002369,  5.32230815,  5.44459262,  5.56687708,  5.68916155,
         5.81144601,  5.93373048,  6.05601494,  6.1782994 ,  6.30058387,
         6.42286833,  6.5451528 ,  6.66743726,  6.78972173,  6.91200619,
         7.03429066,  7.15657512,  7.27885959,  7.40114405,  7.52342852,
         7.64571298,  7.76799745,  7.89028191,  8.01256638,  8.13485084,
         8.25713531,  8.37941977,  8.50170424,  8.6239887 ,  8.74627316,
         8.86855763,  8.99084209,  9.11312656,  9.23541102,  9.35769549,
         9.47997995]),
 <a list of 50 Patch objects>)

In [453]:
#for i,cofactor in enumerate([0.001,0.01,0.1,0.2,0.3,0.4,0.5,1.0,2.0,5.0,10.0,100.0,1000.0]):
for i,cofactor in enumerate([10**i for i in np.arange(-1,2,0.05)]):
    processed = np.arcsinh(data*cofactor)
    #pl.hist(processed[:,10],bins=100);
    seaborn.kdeplot(processed[:,10],shade=True)
    pl.title('Cofactor: {0:.3f}'.format(cofactor))
    pl.ylim(0,0.75)
    pl.xlim(-10,15)
    pl.savefig('../figures/effect-of-normalization/{0}.jpg'.format(i))
    pl.close()


In [430]:
processed = np.arcsinh(data*0.5)
seaborn.kdeplot(processed[:,30],shade=True)


Out[430]:
<matplotlib.axes._subplots.AxesSubplot at 0x12bb443c8>

In [516]:
blah = np.random.randn(100000)
seaborn.kdeplot(blah,shade=True)


Out[516]:
<matplotlib.axes._subplots.AxesSubplot at 0x1518ba860>

In [517]:
#for i,cofactor in enumerate([0.001,0.01,0.1,0.2,0.3,0.4,0.5,1.0,2.0,5.0,10.0,100.0,1000.0]):
for i,cofactor in enumerate([10**i for i in np.arange(-1,2,0.05)[::-1]]):
    proc_blah = np.arcsinh(blah/cofactor)
    #pl.hist(processed[:,10],bins=100);
    seaborn.kdeplot(proc_blah,shade=True)
    pl.title('Cofactor: {0:.3f}'.format(cofactor))
    #pl.ylim(0,0.1+np.max(proc_blah))
    pl.ylim(0,4)
    pl.xlim(-7,7)
    pl.savefig('../figures/effect-of-normalization-on-normal/{0}.jpg'.format(i))
    pl.close()

In [513]:
cofactors = [1,5,10,20,50,100]
for cofactor in cofactors:
    pl.plot(np.arange(-100,100),np.arcsinh(np.arange(-100,100)/cofactor),label=cofactor)
pl.xlabel('x')
pl.ylabel('arcsinh(x/cofactor)')
pl.title('Cofactors and linear region size')
pl.legend(loc='best')


Out[513]:
<matplotlib.legend.Legend at 0x1bb6af208>

In [497]:
data = processed  # overwrite data with the most recent arcsinh-transformed version

In [378]:
data_nn = np.arcsinh(np.vstack(NN)[:,2:-3])

In [156]:
# per-cell day label: each file's cells all get that file's day
day = np.hstack([np.ones(len(n))*times[i] for i,n in enumerate(NG)])
day.shape


Out[156]:
(203017,)
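
Equivalent one-liner, as a sketch: np.repeat expands each day label by that file's cell count.

In [ ]:
day_alt = np.repeat(times, [len(n) for n in NG])
assert (day_alt == day).all()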

In [157]:
day_nn = np.hstack([np.ones(len(n))*times_nn[i] for i,n in enumerate(NN)])
day_nn.shape


Out[157]:
(559654,)

In [158]:
day_Oct4 = np.hstack([np.ones(len(n))*times_oct[i] for i,n in enumerate(Oct4)])
day_Oct4.shape


Out[158]:
(296682,)

In [159]:
Oct4_processed = np.arcsinh(np.vstack(Oct4)[:,2:-3])

In [160]:
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(data)


Out[160]:
PCA(copy=True, n_components=None, whiten=False)

In [161]:
pl.plot(pca.explained_variance_ratio_)
sum(pca.explained_variance_ratio_[:2])


Out[161]:
0.42273775582152928

In [162]:
pca = PCA(2)
X_ = pca.fit_transform(data)
X_.shape


Out[162]:
(203017, 2)

In [163]:
pl.scatter(X_[:,0],X_[:,1],c=day,linewidths=0,alpha=0.5)
pl.colorbar()


Out[163]:
<matplotlib.colorbar.Colorbar at 0x10b532908>

In [164]:
Y_ = pca.inverse_transform(X_)
Y_.shape


Out[164]:
(203017, 44)

In [165]:
np.random.seed(0)
sample_size=10000
sample_ind=np.arange(len(data))[np.random.rand(len(data))<(1.0*sample_size/len(data))]
sample = data[sample_ind]
sample.shape


Out[165]:
(9905, 44)

In [166]:
pca = PCA(2)
X_ = pca.fit_transform(sample)
X_.shape


Out[166]:
(9905, 2)

In [167]:
pl.plot(pca.explained_variance_ratio_)
sum(pca.explained_variance_ratio_[:2])


Out[167]:
0.41991348436833031

In [168]:
X_ = X_[:,:2]

In [169]:
X_.shape,data.shape


Out[169]:
((9905, 2), (203017, 44))

In [170]:
pl.scatter(X_[:,0],X_[:,1],c=day[sample_ind],linewidths=0,alpha=0.5)
pl.xlabel("PC1")
pl.ylabel("PC2")
pl.title('PCA embedding of 10k-point subsample')
pl.colorbar()


Out[170]:
<matplotlib.colorbar.Colorbar at 0x134db0550>

In [34]:
Y_ = pca.inverse_transform(X_)

In [40]:
from sklearn.manifold import LocallyLinearEmbedding
lle = LocallyLinearEmbedding(n_neighbors=10)
lle.fit(sample)


Out[40]:
LocallyLinearEmbedding(eigen_solver='auto', hessian_tol=0.0001, max_iter=100,
            method='standard', modified_tol=1e-12, n_components=2,
            n_neighbors=10, neighbors_algorithm='auto', random_state=None,
            reg=0.001, tol=1e-06)

In [41]:
X_ = lle.embedding_
pl.scatter(X_[:,0],X_[:,1],c=day[sample_ind],linewidths=0,alpha=0.5)
pl.title('LLE embedding of 10k-point subsample')
pl.colorbar()


Out[41]:
<matplotlib.colorbar.Colorbar at 0x129a0d0b8>

In [37]:
from scipy.spatial import distance
pdist_orig = distance.pdist(sample)
pdist_reconstructed = distance.pdist(Y_)

In [38]:
diff = distance.squareform(pdist_orig) - distance.squareform(pdist_reconstructed)

In [39]:
np.sqrt(np.sum(diff**2))


Out[39]:
91903.64581602595

In [40]:
np.linalg.norm(diff)


Out[40]:
91903.645816027245

In [41]:
np.linalg.norm(diff) / len(sample)


Out[41]:
9.27851043069432

In [42]:
X_.shape,data.shape


Out[42]:
((9905, 2), (203017, 44))

In [51]:
from sklearn import neighbors
def one_nn_baseline(X,Y):
    '''fraction of points whose nearest neighbor in X is also their
    nearest neighbor in Y (e.g. X = original data, Y = its embedding)'''
    one_nn_X = neighbors.kneighbors_graph(X,2)
    one_nn_Y = neighbors.kneighbors_graph(Y,2)
    sames = 0
    for i in range(len(X)):
        neighbor_X = one_nn_X[i].indices[one_nn_X[i].indices!=i][0]
        neighbor_Y = one_nn_Y[i].indices[one_nn_Y[i].indices!=i][0]
        if neighbor_X == neighbor_Y:
            sames+=1
    return 1.0*sames / len(X)
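
Sanity check, as a sketch: comparing a dataset with itself should preserve every nearest neighbor.

In [ ]:
Z = np.random.randn(200,5)
print(one_nn_baseline(Z,Z))  # expect 1.0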

In [45]:
one_nn_baseline(sample,X_)


Out[45]:
0.003028773346794548

In [46]:
knn_X = neighbors.kneighbors_graph(X_,3)
knn_X[0].indices[knn_X[0].indices!=0]


Out[46]:
array([  34, 1204], dtype=int32)

In [47]:
sum(knn_X[0].indices[knn_X[0].indices!=0]==knn_X[0].indices[knn_X[0].indices!=0])


Out[47]:
2

In [62]:
s = set(range(10))
s.intersection(range(5))


Out[62]:
{0, 1, 2, 3, 4}

In [52]:
def knn_baseline(X,Y,k=5):
    '''average fraction of each point's k nearest neighbors in X
    that are also among its k nearest neighbors in Y'''
    k = k+1 # since self is counted as a neighbor in the kneighbors graph
    knn_X = neighbors.kneighbors_graph(X,k)
    knn_Y = neighbors.kneighbors_graph(Y,k)
    sames = 0
    for i in range(len(X)):
        neighbors_X = set(knn_X[i].indices[knn_X[i].indices!=i])
        neighbors_Y = set(knn_Y[i].indices[knn_Y[i].indices!=i])
        sames += len(neighbors_X.intersection(neighbors_Y))
    return 1.0*sames / (len(X)*(k-1))

In [77]:
knn_baseline(sample,X_,5)


Out[77]:
0.012417970721857647

In [52]:
sample.shape,X_.shape


Out[52]:
((9905, 44), (9905, 2))

In [50]:
pca = PCA(10)
X_ = pca.fit_transform(sample)
one_nn_baseline(sample,X_)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-50-e5e303ad8de0> in <module>()
      1 pca = PCA(10)
      2 X_ = pca.fit_transform(sample)
----> 3 one_nn_baseline(sample,X_)

NameError: name 'one_nn_baseline' is not defined

In [34]:
knn_baseline(sample,X_,5)


Out[34]:
0.262756183745583

In [45]:
from sklearn.cluster import DBSCAN

In [58]:
pl.hist(distance.pdist(sample),bins=50);
pl.figure()
pl.hist(distance.pdist(X_),bins=50);


Rules of thumb from the Wikipedia DBSCAN article:

  • min_samples > D (the dimensionality of the data)
  • eps = the knee in a sorted plot of each point's distance to its k = min_samples-th nearest neighbor (see the sketch after the k_dist cell below)

In [78]:
def k_dist(X,k=5):
    '''distance from each point to its k-th nearest neighbor'''
    k = k+1 # since self is counted as a neighbor in the kneighbors graph
    knn = neighbors.kneighbors_graph(X,k,mode='distance')
    dist = np.zeros(len(X))
    for i in range(len(X)):
        # CSR row entries are sorted by column index, not by distance,
        # so take the max of the stored neighbor distances explicitly
        dist[i] = knn[i].data.max()
    return dist
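
The standard heuristic reads eps off the knee of the sorted k-distances; a quick sketch:

In [ ]:
d_sorted = np.sort(k_dist(X_,X_.shape[1]))
pl.plot(d_sorted)
pl.xlabel('points, sorted by k-distance')
pl.ylabel('distance to k-th nearest neighbor')
pl.title('Sorted k-distance plot: choose eps near the knee')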

In [49]:
X_.shape


Out[49]:
(9905, 2)

In [87]:
d = k_dist(X_,X_.shape[1])

In [88]:
pl.hist(d,bins=50);



In [89]:
knn[10].indices[-1]  # knn is a leftover global from running the body of k_dist interactively


Out[89]:
100

In [86]:
dbs = DBSCAN(min_samples=50,eps=4.3)

In [87]:
dbs.fit(X_)


Out[87]:
DBSCAN(algorithm='auto', eps=4.3, leaf_size=30, metric='euclidean',
    min_samples=50, p=None, random_state=None)

In [88]:
pl.hist(dbs.labels_,bins=len(set(dbs.labels_)));



In [89]:
pl.scatter(X_[:,0],X_[:,1],c=dbs.labels_,linewidths=0,alpha=0.5)
pl.xlabel("PC1")
pl.ylabel("PC2")
pl.title('PCA embedding of 10k-point subsample')
pl.colorbar()


Out[89]:
<matplotlib.colorbar.Colorbar at 0x135fe8b70>

In [109]:
d = k_dist(sample,sample.shape[1])
pl.hist(d,bins=50);



In [119]:
dbs = DBSCAN(min_samples=sample.shape[1],eps=9.0)
dbs.fit(sample)


Out[119]:
DBSCAN(algorithm='auto', eps=9.0, leaf_size=30, metric='euclidean',
    min_samples=44, p=None, random_state=None)

In [120]:
pl.hist(dbs.labels_,bins=len(set(dbs.labels_)));



In [31]:
from sklearn.manifold import Isomap
iso = Isomap(10)
iso.fit(sample)
X_ = iso.embedding_
print(one_nn_baseline(sample,X_))
print(knn_baseline(sample,X_,5))


0.005451792024230187
0.015689045936395758

In [53]:
from sklearn.manifold import Isomap
iso = Isomap()
iso.fit(sample)


Out[53]:
Isomap(eigen_solver='auto', max_iter=None, n_components=2, n_neighbors=5,
    neighbors_algorithm='auto', path_method='auto', tol=0)

In [54]:
iso.reconstruction_error()


Out[54]:
616.11588539516492

In [78]:
X_ = iso.embedding_

In [67]:
pl.scatter(X_[:,0],X_[:,1],c=day[sample_ind],linewidths=0,alpha=0.5)
pl.title('Isomap embedding of 10k-point subsample')
pl.colorbar()


Out[67]:
<matplotlib.colorbar.Colorbar at 0x129225d30>

In [68]:
one_nn_baseline(sample,X_)


Out[68]:
0.006057546693589096

In [79]:
knn_baseline(sample,X_,5)


Out[79]:
0.019040888440181727

In [126]:
sample.shape


Out[126]:
(9905, 44)

In [142]:
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(n_components=2,kernel='poly',degree=2)
X_ = kpca.fit_transform(sample)

In [143]:
pl.scatter(X_[:,0],X_[:,1],c=day[sample_ind],linewidths=0,alpha=0.5)
pl.title('KPCA embedding of 10k-point subsample')
pl.colorbar()


Out[143]:
<matplotlib.colorbar.Colorbar at 0x1373773c8>

In [144]:
from sklearn.svm import SVR

In [148]:
svr = SVR(kernel='poly')

In [ ]:
svr.fit(sample,day[sample_ind])

In [147]:
svr.score(sample,day[sample_ind])


Out[147]:
0.75073358518553945

In [ ]:
from sklearn import kernel_approximation

In [ ]:
from sklearn.manifold import SpectralEmbedding
se = SpectralEmbedding()
X_ = se.fit_transform(sample)

In [ ]:
pl.scatter(X_[:,0],X_[:,1],c=day[sample_ind],linewidths=0,alpha=0.5)
pl.title('Spectral embedding of 10k-point subsample')
pl.colorbar()


In [ ]:
one_nn_baseline(sample,X_)

In [ ]:
knn_baseline(sample,X_,10)

In [171]:
from sklearn.cross_decomposition import PLSRegression

In [172]:
pls = PLSRegression(2)

In [173]:
len(day),len(data)


Out[173]:
(203017, 203017)

In [174]:
np.random.seed(0)
ind = np.arange(len(day))
ind_nn = np.arange(len(day_nn))
np.random.shuffle(ind)
np.random.shuffle(ind_nn)
split = 0.9
n_train,n_train_nn = int(len(ind)*split),int(len(ind_nn)*split)  # slice indices must be ints
train,test = ind[:n_train],ind[n_train:]
train_nn,test_nn = ind_nn[:n_train_nn],ind_nn[n_train_nn:]
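
sklearn also ships a helper for this; an equivalent sketch (the sklearn.cross_validation module name matches the library version used elsewhere in this notebook):

In [ ]:
from sklearn.cross_validation import train_test_split
train, test = train_test_split(np.arange(len(day)), test_size=0.1, random_state=0)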

In [175]:
len(train),len(test),len(train_nn),len(test_nn)


Out[175]:
(182715, 20302, 503688, 55966)

In [176]:
train


Out[176]:
array([149910, 183376, 124875, ..., 138496,  39419, 132654])

In [177]:
data.shape,data[train].shape,data[test].shape


Out[177]:
((203017, 44), (182715, 44), (20302, 44))

In [178]:
pls.fit(data[train],day[train])


Out[178]:
PLSRegression(copy=True, max_iter=500, n_components=2, scale=True, tol=1e-06)

In [179]:
pls.score(data[train],day[train]),pls.score(data[test],day[test])


Out[179]:
(0.64794832986917905, 0.6555345616272843)

In [180]:
pls.fit(data_nn[train_nn],day_nn[train_nn])
pls.score(data_nn[train_nn],day_nn[train_nn]),pls.score(data_nn[test_nn],day_nn[test_nn])


Out[180]:
(0.63536083668704668, 0.63721208533053919)

In [181]:
train_scores = []
test_scores = []
models = []
ns = list(range(1,10)) + [15,20,25,30,35,40]

for i in ns:
    pls = PLSRegression(i)
    pls.fit(data[train],day[train])
    train_scores.append(pls.score(data[train],day[train]))
    test_scores.append(pls.score(data[test],day[test]))
    models.append(pls)

In [182]:
pl.plot(ns,train_scores,label='Train (90% of data)')
pl.plot(ns,test_scores,label='Test (10% of data)')
pl.xlabel('Number of Components')
pl.ylabel('Score')
pl.legend(loc='best')
pl.title('PLS Regression Scores (Nanog-GFP)')


Out[182]:
<matplotlib.text.Text at 0x1355e79e8>

In [82]:
train_scores = []
test_scores = []
models = []
ns = list(range(1,10)) + [15,20,25,30,35,40]

for i in ns:
    pls = PLSRegression(i)
    pls.fit(data_nn[train_nn],day_nn[train_nn])
    train_scores.append(pls.score(data_nn[train_nn],day_nn[train_nn]))
    test_scores.append((pls.score(data_nn[test_nn],day_nn[test_nn])))
    models.append(pls)

In [83]:
pl.plot(ns,train_scores,label='Train (90% of data)')
pl.plot(ns,test_scores,label='Test (10% of data)')
pl.xlabel('Number of Components')
pl.ylabel('Score')
pl.legend(loc='best')
pl.title('PLS Regression Scores (Nanog-Neo)')


Out[83]:
<matplotlib.text.Text at 0x1275ca908>

In [84]:
# the effect of overfitting is very slight
pl.plot(ns,np.array(train_scores) - np.array(test_scores))


Out[84]:
[<matplotlib.lines.Line2D at 0x129924b38>]

In [85]:
for i in range(5):
    print('Dim: {0}, Score: {1:.3f}'.format(ns[i],test_scores[i]))


Dim: 1, Score: 0.416
Dim: 2, Score: 0.637
Dim: 3, Score: 0.732
Dim: 4, Score: 0.776
Dim: 5, Score: 0.802


In [86]:
m = models[1]
X_ = m.transform(data)
X_ = m.transform(data_nn)  # overwrites the line above; the NN projection is what gets plotted next

In [88]:
pl.scatter(X_[:,0],X_[:,1],c=day_nn,cmap='winter',linewidths=0,alpha=0.5)
pl.xlabel("PLS Component 1")
pl.ylabel("PLS Component 2")
pl.colorbar()
pl.title("Partial least squares 'progression analysis'")


Out[88]:
<matplotlib.text.Text at 0x1286e4dd8>

In [89]:
xlim = np.min(X_[:,0]),np.max(X_[:,0])
ylim = np.min(X_[:,1]),np.max(X_[:,1])
xlim,ylim


Out[89]:
((-7.7928443605304913, 10.150609137642995),
 (-7.6848229302663134, 7.0137687963623536))

In [130]:
def PLS_progression_analysis(data,times,
                             sample_name='',
                             dims=list(range(1,10))+list(range(10,20,2)),
                             train_test_split=0.9):
    # train-test split
    np.random.seed(0)
    ind = np.arange(len(times))
    np.random.shuffle(ind)
    split_index = int(len(ind)*train_test_split)  # slice indices must be ints
    train,test = ind[:split_index],ind[split_index:]
    print(data[train].shape,times[train].shape)
    
    # fit models
    train_scores = []
    test_scores = []
    models = []
    
    for i in dims:        
        pls = PLSRegression(i)
        pls.fit(data[train],times[train])
        train_scores.append(pls.score(data[train],times[train]))
        test_scores.append(pls.score(data[test],times[test]))
        models.append(pls)
    
    train_p = 100*train_test_split
    test_p = 100-train_p
    
    pl.plot(dims,train_scores,label='Train ({0:.0f}% of data)'.format(train_p))
    pl.plot(dims,test_scores,label='Test ({0:.0f}% of data)'.format(test_p))
    pl.xlabel('Number of Components')
    pl.ylabel('Score')
    pl.legend(loc='best')
    title = 'PLS Regression Scores'
    if len(sample_name) > 0:
        title = ': '.join((title,sample_name))
    pl.title(title)
    
    return train_scores,test_scores,models

In [131]:
results = PLS_progression_analysis(sample,day[sample_ind],train_test_split=0.8)


(7924, 44) (7924,)

In [132]:
results = PLS_progression_analysis(Oct4_processed,day_Oct4,'Oct4',train_test_split=0.8)


(237345, 44) (237345,)


In [ ]:
for t in times:
    pl.figure()
    kde = KernelDensity(0.25)
    kde.fit(X_[day==t])
    c = np.exp(kde.score_samples(X_[day==t]))
    pl.scatter(X_[day==t,0],X_[day==t,1],c=c,cmap='winter',linewidths=0,alpha=0.5)
    
    pl.xlim(*xlim)
    pl.ylim(*ylim)
    pl.xlabel("PLS Component 1")
    pl.ylabel("PLS Component 2")
    pl.title("PLS 'progression analysis:' Nanog-GFP, Day {0}".format(t))
    pl.savefig('../figures/progression-animation/ng-day {0}.jpg'.format(t))

In [96]:
for t in times:
    pl.figure()
    kde = KernelDensity(0.25)
    kde.fit(X_[day_nn==t])
    c = np.exp(kde.score_samples(X_[day_nn==t]))
    pl.scatter(X_[day_nn==t,0],X_[day_nn==t,1],c=c,cmap='winter',linewidths=0,alpha=0.5)
    
    pl.xlim(*xlim)
    pl.ylim(*ylim)
    pl.xlabel("PLS Component 1")
    pl.ylabel("PLS Component 2")
    pl.title("PLS 'progression analysis:' Nanog-Neo, Day {0}".format(t))
    pl.savefig('../figures/progression-animation/nn-day {0}.jpg'.format(t))



In [186]:
m = models[0]
X_ = m.transform(data)

In [202]:
# let's plot progression on just the leading 'progression axis' identified by PLS

In [204]:
day1 = X_[day==day[0]]  # cells from the first timepoint, projected on the leading PLS axis
day1.shape


Out[204]:
(2938, 1)

In [184]:
from sklearn.neighbors import KernelDensity
kde = KernelDensity(2)

In [224]:
kde.fit(day1)


Out[224]:
KernelDensity(algorithm='auto', atol=0, bandwidth=2, breadth_first=True,
       kernel='gaussian', leaf_size=40, metric='euclidean',
       metric_params=None, rtol=0)

In [225]:
min(day1),max(day1)


Out[225]:
(array([-5.8874691]), array([ 4.7322055]))

In [226]:
x = np.arange(day1.min(),day1.max(),0.1)
y = kde.score_samples(np.reshape(x,(len(x),1)))

In [227]:
pl.plot(x,y)


Out[227]:
[<matplotlib.lines.Line2D at 0x163d70898>]

In [231]:
from sklearn.grid_search import GridSearchCV

# this snippet from documentation: 
# http://scikit-learn.org/stable/auto_examples/neighbors/plot_digits_kde_sampling.html
params = {'bandwidth': np.logspace(-1, 1, 10)}
grid = GridSearchCV(KernelDensity(), params)
grid.fit(day1)

print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))

# use the best estimator to compute the kernel density estimate
kde = grid.best_estimator_
# estimate density
kde.fit(day1)

x = np.arange(min(day1),max(day1),0.1)
y = kde.score_samples(np.reshape(x,(len(x),1)))


best bandwidth: 0.46415888336127786

In [232]:
pl.plot(x,y)


Out[232]:
[<matplotlib.lines.Line2D at 0x1670c6860>]

In [198]:
x = np.arange(X_.min(),X_.max(),0.02)

In [238]:
progression = []
for d in set(day):
    today = X_[day==d]
    # this snippet from documentation: 
    # http://scikit-learn.org/stable/auto_examples/neighbors/plot_digits_kde_sampling.html
    params = {'bandwidth': np.logspace(-1, 1, 10)}
    grid = GridSearchCV(KernelDensity(), params)
    grid.fit(today)

    print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))

    # use the best estimator to compute the kernel density estimate
    kde = grid.best_estimator_
    # estimate density
    kde.fit(today)

    y = kde.score_samples(np.reshape(x,(len(x),1)))
    progression.append(y)


best bandwidth: 0.46415888336127786
best bandwidth: 0.2782559402207124
best bandwidth: 0.2782559402207124
best bandwidth: 0.46415888336127786
best bandwidth: 0.46415888336127786
best bandwidth: 0.2782559402207124
best bandwidth: 0.2782559402207124
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-238-8b2aa93403eb> in <module>()
      6     params = {'bandwidth': np.logspace(-1, 1, 10)}
      7     grid = GridSearchCV(KernelDensity(), params)
----> 8     grid.fit(today)

KeyboardInterrupt: 

In [199]:
progression = []
for d in set(day):
    today = X_[day==d]
    kde = KernelDensity(0.2)  # fixed bandwidth, since the per-day grid search above was interrupted
    kde.fit(today)

    y = kde.score_samples(np.reshape(x,(len(x),1)))
    progression.append(y)

In [241]:
prog = np.exp(np.array(progression)).T
prog.shape


Out[241]:
(742, 14)

In [242]:
aspect = float(prog.shape[1]) / prog.shape[0]
aspect


Out[242]:
0.018867924528301886

In [243]:
pl.imshow(-prog,aspect=0.5*aspect,
          interpolation='none',cmap='gray')
pl.xlabel('Timestep')
pl.ylabel('Progression')


Out[243]:
<matplotlib.text.Text at 0x1246c1048>

In [219]:
X_.shape


Out[219]:
(203017, 1)

In [229]:
pl.scatter(day+np.random.randn(len(day))*0.1,X_[:,0],linewidths=0,alpha=0.005)


Out[229]:
<matplotlib.collections.PathCollection at 0x12318cc50>

In [268]:
# color snippet modified from:
# http://stackoverflow.com/questions/8931268/using-colormaps-to-set-color-of-line-in-matplotlib
import matplotlib.colors as colors
import matplotlib.cm as cmx
jet = pl.get_cmap('winter')
cNorm  = colors.Normalize(vmin=0, vmax=len(times))
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=jet)

for i,y in enumerate(progression):
    colorVal = scalarMap.to_rgba(i)
    pl.plot(x,np.exp(y),label=times[i],color=colorVal)
    #pl.legend(loc='best')
pl.xlabel("PLS Component 1")
pl.ylabel("Probability density")
pl.title("1D Partial least squares 'progression analysis'")


Out[268]:
<matplotlib.text.Text at 0x1823a1ba8>

In [ ]:
# another way to do this: use a color-intensity channel to encode probability density,
# with time on the x axis and progression on the y axis

In [317]:
# color snippet modified from:
# http://stackoverflow.com/questions/8931268/using-colormaps-to-set-color-of-line-in-matplotlib
import matplotlib.colors as colors
import matplotlib.cm as cmx
jet = pl.get_cmap('winter')
cNorm  = colors.Normalize(vmin=0, vmax=5)
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=jet)

for i,y in enumerate(progression):
    pl.figure()
    colorVal = scalarMap.to_rgba(0)
    pl.plot(x,np.exp(y),color=colorVal,linewidth=3)
    pl.ylim(0,0.45)
    for j in range(5):
        if i-j>=0:
            colorVal = scalarMap.to_rgba(j+1)
            pl.plot(x,np.exp(progression[i-j]),color=colorVal,alpha=1/(j+1),linewidth=min(1,3-j))
    #pl.legend(loc='best')
    pl.xlabel("PLS Component 1")
    pl.ylabel("Probability density")
    pl.title("1D PLS 'progression analysis' Day {0}".format(times[i]))
    pl.savefig('../figures/1d-progression-animation/ng-day {0}.jpg'.format(times[i]))



In [235]:
from scipy.spatial import distance

In [236]:
prog = np.array(progression)
prog.shape


Out[236]:
(14, 742)

In [237]:
distmat = distance.squareform(distance.pdist(prog,'canberra'))
distmat.shape


Out[237]:
(14, 14)
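
(Canberra distance between vectors u and v is sum_i |u_i - v_i| / (|u_i| + |v_i|), which weights differences in low-density bins more heavily than Euclidean distance would.)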

In [244]:
pl.imshow(distmat,interpolation='none',cmap='Blues')
pl.xlabel('Timestep')
pl.ylabel('Timestep')


Out[244]:
<matplotlib.text.Text at 0x1246624a8>

In [134]:
pl.scatter(models[-1].predict(data),day,alpha=0.1,linewidths=0)
pl.plot(day,day)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-134-f82e81c28df0> in <module>()
----> 1 pl.scatter(models[-1].predict(data),day,alpha=0.1,linewidths=0)
      2 pl.plot(day,day)

NameError: name 'models' is not defined

Another idea: cluster the full dataset, and describe each timepoint by the relative occupancy of each cluster
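
A compact sketch of that idea (assuming data, day, times as defined above): cluster once, then featurize each day by its normalized cluster histogram.

In [ ]:
from sklearn.cluster import MiniBatchKMeans
labels = MiniBatchKMeans(20).fit_predict(data)
occupancy = np.array([np.bincount(labels[day==t],minlength=20) for t in times])
occupancy = occupancy / occupancy.sum(axis=1,keepdims=True)  # relative occupancy per day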


In [12]:
from sklearn.cluster import MiniBatchKMeans

In [13]:
num_clust=20

In [14]:
km = MiniBatchKMeans(num_clust)

In [359]:
km.fit(data)


Out[359]:
MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
        init_size=None, max_iter=100, max_no_improvement=10, n_clusters=20,
        n_init=3, random_state=None, reassignment_ratio=0.01, tol=0.0,
        verbose=0)

In [360]:
y = km.predict(data)

In [361]:
def num_in_clust(Y,num_clusters=50):
    assert(num_clusters > max(Y))  # labels are 0-indexed, so max(Y) must be a valid index
    occupancy = np.zeros(num_clusters)
    for y in Y:
        occupancy[y] += 1
    return occupancy
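
Equivalent one-liner, as a sketch: np.bincount computes the same histogram.

In [ ]:
occupancy = np.bincount(y,minlength=num_clust)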


In [362]:
pl.bar(range(num_clust),num_in_clust(y,num_clust))
pl.title('Overall')
pl.figure()
pl.bar(range(num_clust),num_in_clust(y[day==times[0]],num_clust))
pl.title('Initial')
pl.figure()
pl.bar(range(num_clust),num_in_clust(y[day==times[-1]],num_clust))
pl.title('Final')


Out[362]:
<matplotlib.text.Text at 0x12fd88748>

In [363]:
featurized = np.array([num_in_clust(y[day==t]) for t in times])  # default num_clusters=50, hence 50 columns
featurized.shape


Out[363]:
(14, 50)

In [364]:
distmat = distance.squareform(distance.pdist(featurized,'canberra'))
pl.imshow(distmat,interpolation='none',cmap='Blues')


Out[364]:
<matplotlib.image.AxesImage at 0x12bd57588>

In [365]:
#clust_ind = sorted(range(50),key=lambda i:featurized[:,i].dot(times))
# order clusters by the sum of their members' row indices -- rows are concatenated
# in day order, so clusters populated later in time tend to sort later
clust_ind = sorted(range(num_clust),key=lambda i:(y==i).dot(np.arange(len(y))))
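
Caveat: that dot product is the sum of member indices, so large clusters sort late regardless of timing; the mean index is probably closer to the intent. A sketch (assumes every cluster is non-empty):

In [ ]:
mean_index = [np.arange(len(y))[y==i].mean() for i in range(num_clust)]
clust_ind = sorted(range(num_clust),key=lambda i:mean_index[i])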


In [367]:
clust_mat = np.array([num_in_clust(y[day==t],num_clust)[clust_ind] for t in times]).T

In [368]:
for i in range(len(times)):
    pl.figure()
    pl.bar(range(num_clust),num_in_clust(y[day==times[i]],num_clust)[clust_ind])
    pl.title('Cluster occupancies: Day {0}'.format(times[i]))
    pl.xlabel('Cluster ID')
    pl.ylabel('Occupancy')
    pl.ylim(0,np.max(clust_mat))
    pl.savefig('../figures/occupancy-animation/ng-day {0}.jpg'.format(times[i]))



In [369]:
aspect = float(clust_mat.shape[1]) / clust_mat.shape[0]
pl.imshow(clust_mat,aspect=aspect,
          interpolation='none',cmap='Blues')
pl.xlabel('Timestep')
pl.ylabel('Cluster ID')
pl.title('Cluster occupancy per time step')
pl.colorbar()


Out[369]:
<matplotlib.colorbar.Colorbar at 0x12c59e358>

In [370]:
pl.imshow(clust_mat,interpolation='none')


Out[370]:
<matplotlib.image.AxesImage at 0x12bfe9550>

In [356]:
y[clust_ind]


Out[356]:
array([6, 6, 8, 8, 6, 6, 6, 3, 6, 6], dtype=int32)

In [376]:
y.shape


Out[376]:
(203017,)

In [380]:
X_ = pca.fit_transform(data)
# note: clust_ind maps rank -> cluster id; coloring by progression rank would need
# the inverse permutation (np.argsort(clust_ind))
pl.scatter(X_[:,0],X_[:,1],c=[clust_ind[i] for i in y],cmap='Blues',linewidths=0,alpha=0.5)
pl.colorbar()


Out[380]:
<matplotlib.colorbar.Colorbar at 0x1dcef8ac8>

In [33]:
len(all_data)


Out[33]:
762671

In [21]:
all_data = np.vstack((data,data_nn))  # note: reuses the name all_data for NG+NN combined
all_data.shape


Out[21]:
(762671, 44)

In [22]:
pca = PCA()
pca.fit(data)
pl.plot(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_[:2]))


0.422737755822

In [23]:
pca = PCA()
pca.fit(data_nn)
pl.plot(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_[:2]))


0.321411102281

In [386]:
X_ = models[1].transform(data)
X_.shape


Out[386]:
(203017, 2)

In [387]:
import seaborn

In [399]:
xlim = np.min(X_[:,0]),np.max(X_[:,0])
ylim = np.min(X_[:,1]),np.max(X_[:,1])

In [400]:
for i,d in enumerate(set(day)):
    seaborn.kdeplot(X_[day==d]);
    pl.xlim(*xlim)
    pl.ylim(*ylim)
    pl.savefig('../figures/2d-contour-progression-animation/{0}.jpg'.format(i))
    pl.close()
