In [1]:
import pandas as pd
import numpy as np
import pylab as pl
%matplotlib inline
pl.rcParams['font.family'] = 'serif'  # matplotlib expects the lowercase generic family name

Not-yet-automated initial pipeline:


In [2]:
path = '../data/stem-cell/'
file_names=['NG_norm_concat.csv','NN_norm_concat.csv','4FI_norm_concat.csv']

Okay, so these look like the actual timestamps of when the cells went through the machine, not the barcode labels.

  • Why, then, are the timestamps wonky for the later cells in each category?

In [3]:
NN_filenames = ['NN_00.export.csv',
'NN_02.export.csv',
'NN_04.export.csv',
'NN_06.export.csv',
'NN_08.export.csv',
'NN_10.export.csv',
'NN_12.export.csv',
'NN_14.export.csv',
'NN_16.export.csv',
'NN_18.export.csv',
'NN_20.export.csv',
'NN_22.export.csv',
'NN_24.export.csv',
'NN_30.export.csv']

In [4]:
NG_filenames = ['NG_00.export.csv',
'NG_02.export.csv',
'NG_04.export.csv',
'NG_06.export.csv',
'NG_08.export.csv',
'NG_10.export.csv',
'NG_12.export.csv',
'NG_14.export.csv',
'NG_16.export.csv',
'NG_18.export.csv',
'NG_20.export.csv',
'NG_22.export.csv',
'NG_24.export.csv',
'NG_30.export.csv']

In [5]:
Oct4_filenames = [
'4FI_00.export.csv',
'4FI_01.export.csv',
'4FI_02.export.csv',
'4FI_04.export.csv',
'4FI_06.export.csv',
'4FI_08.export.csv',
'4FI_10.export.csv',
'4FI_12.export.csv',
'4FI_14.export.csv',
'4FI_16.export.csv',
'4FI_17.export.csv',
'4FI_18.export.csv',
'4FI_20.export.csv']
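
These lists follow a regular pattern, so they could equally be generated (a sketch; the day numbers are read straight off the filenames above):

In [ ]:
ng_days = [0,2,4,6,8,10,12,14,16,18,20,22,24,30]
oct_days = [0,1,2,4,6,8,10,12,14,16,17,18,20]
NG_filenames = ['NG_{0:02d}.export.csv'.format(d) for d in ng_days]
NN_filenames = ['NN_{0:02d}.export.csv'.format(d) for d in ng_days]
Oct4_filenames = ['4FI_{0:02d}.export.csv'.format(d) for d in oct_days]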

In [6]:
# extract the day number from each filename, e.g. 'NG_08.export.csv' -> 8
times = [int(s[3:5]) for s in NG_filenames]
print(times)
pl.plot(times)


[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 30]
Out[6]:
[<matplotlib.lines.Line2D at 0x10a755ac8>]

In [11]:
times_nn = [int(s[3:5]) for s in NN_filenames]

In [7]:
times_oct = [int(s[4:6]) for s in Oct4_filenames]
print(times_oct)
pl.plot(times_oct)


[0, 1, 2, 4, 6, 8, 10, 12, 14, 16, 17, 18, 20]
Out[7]:
[<matplotlib.lines.Line2D at 0x10a850828>]

In [143]:
# load every timepoint file for each cell line
NG = [pd.read_csv(path+'NG/'+fname) for fname in NG_filenames]
NN = [pd.read_csv(path+'NN/'+fname) for fname in NN_filenames]
Oct4 = [pd.read_csv(path+'4FI/'+fname) for fname in Oct4_filenames]

In [144]:
NG[0].columns


Out[144]:
Index(['Time', ' "Cell_length"', ' "BC-1(Pd102)Dd"', ' "BC-2(Pd104)Dd"', ' "BC-3(Pd105)Dd"', ' "BC-4(Pd106)Dd"', ' "BC-5(Pd108)Dd"', ' "BC-6(Pd110)Dd"', ' "pPLK1(In113)Dd"', ' "H3K9ac(In115)Dd"', ' "IdU(I127)Dd"', ' "H4Kac(La139)Dd"', ' "cCasp3(Pr141)Dd"', ' "CD54(Nd142)Dd"', ' "pStat3-705(Nd143)Dd"', ' "Thy1(Nd144)Dd"', ' "pAMPK(Nd145)Dd"', ' "CD24(Nd146)Dd"', ' "c-Myc(Sm147)Dd"', ' "CD73(Nd148)Dd"', ' "pS6(Sm149)Dd"', ' "Oct4(Nd150)Dd"', ' "pErk(Eu151)Dd"', ' "Ki67(Sm152)Dd"', ' "pStat3-727(Eu153)Dd"', ' "pSrc(Sm154)Dd"', ' "Ox(Gd155)Dd"', ' "Nanog(Gd156)Dd"', ' "pRb(Gd158)Dd"', ' "pAkt(Tb159)Dd"', ' "Sox2(Gd160)Dd"', ' "GFP(Dy162)Dd"', ' "CyclinB1(Dy164)Dd"', ' "SSEA1(Ho165)Dd"', ' "IKBa(Er166)Dd"', ' "Lin28(Er167)Dd"', ' "CD140a(Er168)Dd"', ' "EpCAM(Tm169)Dd"', ' "bCatenin(Er170)Dd"', ' "CD44(Yb171)Dd"', ' "MEFSK4(Yb172)Dd"', ' "Klf4(Yb174)Dd"', ' "p53(Lu175)Dd"', ' "pH3(Yb176)Dd"', ' "DNA-1(Ir191)Dd"', ' "DNA-2(Ir193)Dd"', ' "beadDist"', ' "barcode"', ' "Event #"'], dtype='object')

In [145]:
NG[0].columns[1:-3]


Out[145]:
Index([' "Cell_length"', ' "BC-1(Pd102)Dd"', ' "BC-2(Pd104)Dd"', ' "BC-3(Pd105)Dd"', ' "BC-4(Pd106)Dd"', ' "BC-5(Pd108)Dd"', ' "BC-6(Pd110)Dd"', ' "pPLK1(In113)Dd"', ' "H3K9ac(In115)Dd"', ' "IdU(I127)Dd"', ' "H4Kac(La139)Dd"', ' "cCasp3(Pr141)Dd"', ' "CD54(Nd142)Dd"', ' "pStat3-705(Nd143)Dd"', ' "Thy1(Nd144)Dd"', ' "pAMPK(Nd145)Dd"', ' "CD24(Nd146)Dd"', ' "c-Myc(Sm147)Dd"', ' "CD73(Nd148)Dd"', ' "pS6(Sm149)Dd"', ' "Oct4(Nd150)Dd"', ' "pErk(Eu151)Dd"', ' "Ki67(Sm152)Dd"', ' "pStat3-727(Eu153)Dd"', ' "pSrc(Sm154)Dd"', ' "Ox(Gd155)Dd"', ' "Nanog(Gd156)Dd"', ' "pRb(Gd158)Dd"', ' "pAkt(Tb159)Dd"', ' "Sox2(Gd160)Dd"', ' "GFP(Dy162)Dd"', ' "CyclinB1(Dy164)Dd"', ' "SSEA1(Ho165)Dd"', ' "IKBa(Er166)Dd"', ' "Lin28(Er167)Dd"', ' "CD140a(Er168)Dd"', ' "EpCAM(Tm169)Dd"', ' "bCatenin(Er170)Dd"', ' "CD44(Yb171)Dd"', ' "MEFSK4(Yb172)Dd"', ' "Klf4(Yb174)Dd"', ' "p53(Lu175)Dd"', ' "pH3(Yb176)Dd"', ' "DNA-1(Ir191)Dd"', ' "DNA-2(Ir193)Dd"'], dtype='object')

In [146]:
pl.plot(times,[len(n) for n in NG],label='Nanog-GFP')
pl.plot(times,[len(n) for n in NN],label='Nanog-Neo')
pl.plot(times_oct,[len(n) for n in Oct4],label='Oct4-GFP')
pl.legend(loc='best')
pl.ylabel('Number of cells')
pl.xlabel('Day')
pl.title('Cells per day')


Out[146]:
<matplotlib.text.Text at 0x135f64940>

In [147]:
max([len(n) for n in NG]) / min([len(n) for n in NG]),max([len(n) for n in NN]) / min([len(n) for n in NN])


Out[147]:
(10.929617720614505, 12.385311871227364)

In [148]:
max([len(n) for n in Oct4]) / min([len(n) for n in Oct4])


Out[148]:
4.937889660039995

In [574]:
# construct a numpy array containing all of the NG samples
all_data = np.vstack(NG)  # pass the list itself; handing vstack a generator is deprecated
data = all_data[:,2:-3]
print(data.shape)


(203017, 44)

In [592]:
# scaled arcsinh transform, a standard variance-stabilizer for mass cytometry data
data_p = np.arcsinh(data*15)
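
Side note: np.arcsinh(z) equals log(z + sqrt(z**2 + 1)), so the transform is roughly linear for small z and roughly log(2z) for large z; the multiplier (here 15) sets where that crossover happens. A quick check:

In [ ]:
z = np.array([0.01, 1.0, 100.0])
print(np.arcsinh(z))
print(np.log(z + np.sqrt(z**2 + 1)))  # identical by definition
print(np.log(2*z))                    # close once z is large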

In [593]:
from time import time
from scipy.cluster import hierarchy
from scipy.spatial import distance  # for squareform below
from sklearn.manifold import MDS

def embedding(X,labels=None,linkage='single',
              profile=True):
    ''' (1) compute linkage matrix,
    (2) compute pairwise distances using linkage matrix,
    (3) compute embedding using distances'''
    t = time()
    n = len(X)
    c = hierarchy.linkage(X,linkage)
    d = distance.squareform(hierarchy.cophenet(c))
    t1 = time()
    print('Computed cophenetic correlations in {0:.2f}s'.format(t1-t))
    
    mds = MDS(dissimilarity='precomputed')
    X_ = mds.fit_transform(d[:n,:n])
    t2 = time()
    print('Computed MDS embedding in {0:.2f}s'.format(t2-t1))
    if labels is None:
        pl.scatter(X_[:,0],X_[:,1])
    else:
        pl.scatter(X_[:,0],X_[:,1],c=labels,
           linewidths=0)
    return X_,c

In [594]:
np.random.seed(0)
sample_ind = np.random.rand(len(data_p)) < 0.005
sum(sample_ind)


Out[594]:
1003

In [595]:
sample = data_p[sample_ind]
results = embedding(sample)


Computed cophenetic correlations in 0.05s
Computed MDS embedding in 27.71s

In [596]:
t = day[sample_ind]
X_ = results[0]
pl.scatter(X_[:,0],X_[:,1],c=t,
           linewidths=0)
pl.colorbar()


Out[596]:
<matplotlib.colorbar.Colorbar at 0x13882c3c8>

In [591]:
import seaborn  # seaborn's first use in this notebook
seaborn.kdeplot(data[:,10])
pl.figure()
seaborn.kdeplot(np.log(data[:,10] - np.min(data[:,10])+1))
pl.figure()
seaborn.kdeplot(np.arcsinh(data[:,10]*15))


Out[591]:
<matplotlib.axes._subplots.AxesSubplot at 0x16a299c88>

In [531]:
seaborn.kdeplot(np.log(data[:,10:12] - np.min(data[:,10:12])+1))


Out[531]:
<matplotlib.axes._subplots.AxesSubplot at 0x12d5bd9b0>

In [570]:
signed_log_data = np.log(abs(data[:,10]))*(2*(data[:,10]>0)-1)
seaborn.kdeplot(signed_log_data)


Out[570]:
<matplotlib.axes._subplots.AxesSubplot at 0x19b269be0>

In [552]:
lim=5
subset = abs(data[:,10])<lim
seaborn.kdeplot(data[:,10][subset])
pl.xlim(-lim,lim)
sum(subset)


Out[552]:
91163

In [571]:
def signed_log(data):
    # note: log|x| is negative for |x| < 1 and -inf at 0, so this map is singular
    # at zero and not monotonic across it (see the check below)
    return np.log(abs(data))*(2*(data>=0)-1)
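
A quick check of that non-monotonicity:

In [ ]:
x = np.array([-0.5, 0.5, 2.0])
print(signed_log(x))  # [ 0.69, -0.69,  0.69]: ordering near zero is not preserved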

In [573]:
x = np.arange(-100,100,0.0001)
y = signed_log(x)
pl.plot(x,y)
#pl.xlim(-1,1)
pl.figure()
pl.plot(x,np.arcsinh(x))


Out[573]:
[<matplotlib.lines.Line2D at 0x151662588>]

In [564]:
x = np.arange(-100,100,0.0001)
y = np.log(x)  # nan for x <= 0; those points are simply not drawn
pl.plot(x,y)
pl.xlim(0,2)


Out[564]:
(0, 2)

In [402]:
pl.hist(data[:,10],bins=100);
pl.figure()
pl.hist(data[:,10],bins=100,log=True);



In [151]:
processed = 1/ (1+np.exp(-data*10))
pl.hist(processed[:,10],bins=50);



In [152]:
processed = np.log(data - np.min(data) + 1)
pl.hist(processed[:,10],bins=50,normed=True,log=True);
#pl.xscale('log')


Out[152]:
(array([  1.20841905e-04,   0.00000000e+00,   1.20841905e-04,
          1.20841905e-04,   8.05612698e-05,   2.81964444e-04,
          5.23648254e-04,   1.36954159e-03,   6.68658539e-03,
          5.30898768e-02,   3.62239721e+00,   2.06458394e+00,
          5.95508906e-01,   3.48427492e-01,   2.73988879e-01,
          2.11271930e-01,   1.67084074e-01,   1.38766787e-01,
          1.10932868e-01,   9.38135987e-02,   7.60095580e-02,
          6.17904939e-02,   5.38954895e-02,   4.33419631e-02,
          3.88305320e-02,   3.07341244e-02,   2.70685866e-02,
          2.44503454e-02,   2.13084559e-02,   1.92541435e-02,
          1.52663606e-02,   1.34940127e-02,   1.20439098e-02,
          9.74791364e-03,   8.01584634e-03,   7.69360126e-03,
          6.76714666e-03,   5.39760508e-03,   4.18918603e-03,
          3.10160889e-03,   2.01403174e-03,   1.57094476e-03,
          1.04729651e-03,   8.86173968e-04,   2.41683809e-04,
          1.61122540e-04,   4.02806349e-05,   4.02806349e-05,
          4.02806349e-05,   4.02806349e-05]),
 array([ 3.36575672,  3.48804118,  3.61032564,  3.73261011,  3.85489457,
         3.97717904,  4.0994635 ,  4.22174797,  4.34403243,  4.4663169 ,
         4.58860136,  4.71088583,  4.83317029,  4.95545476,  5.07773922,
         5.20002369,  5.32230815,  5.44459262,  5.56687708,  5.68916155,
         5.81144601,  5.93373048,  6.05601494,  6.1782994 ,  6.30058387,
         6.42286833,  6.5451528 ,  6.66743726,  6.78972173,  6.91200619,
         7.03429066,  7.15657512,  7.27885959,  7.40114405,  7.52342852,
         7.64571298,  7.76799745,  7.89028191,  8.01256638,  8.13485084,
         8.25713531,  8.37941977,  8.50170424,  8.6239887 ,  8.74627316,
         8.86855763,  8.99084209,  9.11312656,  9.23541102,  9.35769549,
         9.47997995]),
 <a list of 50 Patch objects>)

In [453]:
#for i,cofactor in enumerate([0.001,0.01,0.1,0.2,0.3,0.4,0.5,1.0,2.0,5.0,10.0,100.0,1000.0]):
for i,cofactor in enumerate([10**i for i in np.arange(-1,2,0.05)]):
    processed = np.arcsinh(data*cofactor)
    #pl.hist(processed[:,10],bins=100);
    seaborn.kdeplot(processed[:,10],shade=True)
    pl.title('Cofactor: {0:.3f}'.format(cofactor))
    pl.ylim(0,0.75)
    pl.xlim(-10,15)
    pl.savefig('../figures/effect-of-normalization/{0}.jpg'.format(i))
    pl.close()


In [430]:
processed = np.arcsinh(data*0.5)
seaborn.kdeplot(processed[:,30],shade=True)


Out[430]:
<matplotlib.axes._subplots.AxesSubplot at 0x12bb443c8>

In [516]:
blah = np.random.randn(100000)
seaborn.kdeplot(blah,shade=True)


Out[516]:
<matplotlib.axes._subplots.AxesSubplot at 0x1518ba860>

In [517]:
#for i,cofactor in enumerate([0.001,0.01,0.1,0.2,0.3,0.4,0.5,1.0,2.0,5.0,10.0,100.0,1000.0]):
for i,cofactor in enumerate([10**i for i in np.arange(-1,2,0.05)[::-1]]):
    proc_blah = np.arcsinh(blah/cofactor)
    #pl.hist(processed[:,10],bins=100);
    seaborn.kdeplot(proc_blah,shade=True)
    pl.title('Cofactor: {0:.3f}'.format(cofactor))
    #pl.ylim(0,0.1+np.max(proc_blah))
    pl.ylim(0,4)
    pl.xlim(-7,7)
    pl.savefig('../figures/effect-of-normalization-on-normal/{0}.jpg'.format(i))
    pl.close()

In [513]:
cofactors = [1,5,10,20,50,100]
for cofactor in cofactors:
    pl.plot(np.arange(-100,100),np.arcsinh(np.arange(-100,100)/cofactor),label=cofactor)
pl.xlabel('x')
pl.ylabel('arcsinh(x/cofactor)')
pl.title('Cofactors and linear region size')
pl.legend(loc='best')


Out[513]:
<matplotlib.legend.Legend at 0x1bb6af208>

In [497]:
data = processed  # overwrite data with the most recent arcsinh-transformed version

In [378]:
data_nn = np.arcsinh(np.vstack(NN)[:,2:-3])

In [156]:
# per-cell day label: each file's cells all get that file's day
day = np.hstack([np.ones(len(n))*times[i] for i,n in enumerate(NG)])
day.shape


Out[156]:
(203017,)
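
Equivalent one-liner, as a sketch: np.repeat expands each day label by that file's cell count.

In [ ]:
day_alt = np.repeat(times, [len(n) for n in NG])
assert (day_alt == day).all()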

In [157]:
day_nn = np.hstack([np.ones(len(n))*times_nn[i] for i,n in enumerate(NN)])
day_nn.shape


Out[157]:
(559654,)

In [158]:
day_Oct4 = np.hstack([np.ones(len(n))*times_oct[i] for i,n in enumerate(Oct4)])
day_Oct4.shape


Out[158]:
(296682,)

In [159]:
Oct4_processed = np.arcsinh(np.vstack(Oct4)[:,2:-3])

In [160]:
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(data)


Out[160]:
PCA(copy=True, n_components=None, whiten=False)

In [161]:
pl.plot(pca.explained_variance_ratio_)
sum(pca.explained_variance_ratio_[:2])


Out[161]:
0.42273775582152928

In [162]:
pca = PCA(2)
X_ = pca.fit_transform(data)
X_.shape


Out[162]:
(203017, 2)

In [163]:
pl.scatter(X_[:,0],X_[:,1],c=day,linewidths=0,alpha=0.5)
pl.colorbar()


Out[163]:
<matplotlib.colorbar.Colorbar at 0x10b532908>

In [164]:
Y_ = pca.inverse_transform(X_)
Y_.shape


Out[164]:
(203017, 44)

In [165]:
np.random.seed(0)
sample_size=10000
sample_ind=np.arange(len(data))[np.random.rand(len(data))<(1.0*sample_size/len(data))]
sample = data[sample_ind]
sample.shape


Out[165]:
(9905, 44)

In [166]:
pca = PCA(2)
X_ = pca.fit_transform(sample)
X_.shape


Out[166]:
(9905, 2)

In [167]:
pl.plot(pca.explained_variance_ratio_)
sum(pca.explained_variance_ratio_[:2])


Out[167]:
0.41991348436833031

In [168]:
X_ = X_[:,:2]

In [169]:
X_.shape,data.shape


Out[169]:
((9905, 2), (203017, 44))

In [170]:
pl.scatter(X_[:,0],X_[:,1],c=day[sample_ind],linewidths=0,alpha=0.5)
pl.xlabel("PC1")
pl.ylabel("PC2")
pl.title('PCA embedding of 10k-point subsample')
pl.colorbar()


Out[170]:
<matplotlib.colorbar.Colorbar at 0x134db0550>

In [34]:
Y_ = pca.inverse_transform(X_)

In [40]:
from sklearn.manifold import LocallyLinearEmbedding
lle = LocallyLinearEmbedding(n_neighbors=10)
lle.fit(sample)


Out[40]:
LocallyLinearEmbedding(eigen_solver='auto', hessian_tol=0.0001, max_iter=100,
            method='standard', modified_tol=1e-12, n_components=2,
            n_neighbors=10, neighbors_algorithm='auto', random_state=None,
            reg=0.001, tol=1e-06)

In [41]:
X_ = lle.embedding_
pl.scatter(X_[:,0],X_[:,1],c=day[sample_ind],linewidths=0,alpha=0.5)
pl.title('LLE embedding of 10k-point subsample')
pl.colorbar()


Out[41]:
<matplotlib.colorbar.Colorbar at 0x129a0d0b8>

In [37]:
from scipy.spatial import distance
pdist_orig = distance.pdist(sample)
pdist_reconstructed = distance.pdist(Y_)

In [38]:
diff = distance.squareform(pdist_orig) - distance.squareform(pdist_reconstructed)

In [39]:
np.sqrt(np.sum(diff**2))


Out[39]:
91903.64581602595

In [40]:
np.linalg.norm(diff)


Out[40]:
91903.645816027245

In [41]:
np.linalg.norm(diff) / len(sample)


Out[41]:
9.27851043069432

In [42]:
X_.shape,data.shape


Out[42]:
((9905, 2), (203017, 44))

In [51]:
from sklearn import neighbors
def one_nn_baseline(X,Y):
    '''fraction of points whose nearest neighbor in X is also their
    nearest neighbor in Y (e.g. X = original data, Y = its embedding)'''
    one_nn_X = neighbors.kneighbors_graph(X,2)
    one_nn_Y = neighbors.kneighbors_graph(Y,2)
    sames = 0
    for i in range(len(X)):
        neighbor_X = one_nn_X[i].indices[one_nn_X[i].indices!=i][0]
        neighbor_Y = one_nn_Y[i].indices[one_nn_Y[i].indices!=i][0]
        if neighbor_X == neighbor_Y:
            sames+=1
    return 1.0*sames / len(X)
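
Sanity check, as a sketch: comparing a dataset with itself should preserve every nearest neighbor.

In [ ]:
Z = np.random.randn(200,5)
print(one_nn_baseline(Z,Z))  # expect 1.0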

In [45]:
one_nn_baseline(sample,X_)


Out[45]:
0.003028773346794548

In [46]:
knn_X = neighbors.kneighbors_graph(X_,3)
knn_X[0].indices[knn_X[0].indices!=0]


Out[46]:
array([  34, 1204], dtype=int32)

In [47]:
sum(knn_X[0].indices[knn_X[0].indices!=0]==knn_X[0].indices[knn_X[0].indices!=0])


Out[47]:
2

In [62]:
s = set(range(10))
s.intersection(range(5))


Out[62]:
{0, 1, 2, 3, 4}

In [52]:
def knn_baseline(X,Y,k=5):
    '''average fraction of each point's k nearest neighbors in X
    that are also among its k nearest neighbors in Y'''
    k = k+1 # since self is counted as a neighbor in the kneighbors graph
    knn_X = neighbors.kneighbors_graph(X,k)
    knn_Y = neighbors.kneighbors_graph(Y,k)
    sames = 0
    for i in range(len(X)):
        neighbors_X = set(knn_X[i].indices[knn_X[i].indices!=i])
        neighbors_Y = set(knn_Y[i].indices[knn_Y[i].indices!=i])
        sames += len(neighbors_X.intersection(neighbors_Y))
    return 1.0*sames / (len(X)*(k-1))

In [77]:
knn_baseline(sample,X_,5)


Out[77]:
0.012417970721857647

In [52]:
sample.shape,X_.shape


Out[52]:
((9905, 44), (9905, 2))

In [50]:
pca = PCA(10)
X_ = pca.fit_transform(sample)
one_nn_baseline(sample,X_)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-50-e5e303ad8de0> in <module>()
      1 pca = PCA(10)
      2 X_ = pca.fit_transform(sample)
----> 3 one_nn_baseline(sample,X_)

NameError: name 'one_nn_baseline' is not defined

In [34]:
knn_baseline(sample,X_,5)


Out[34]:
0.262756183745583

In [45]:
from sklearn.cluster import DBSCAN

In [58]:
pl.hist(distance.pdist(sample),bins=50);
pl.figure()
pl.hist(distance.pdist(X_),bins=50);


Rules of thumb from the Wikipedia DBSCAN article:

  • min_samples > D (the dimensionality of the data)
  • eps = the knee in a sorted plot of each point's distance to its k = min_samples-th nearest neighbor (see the sketch after the k_dist cell below)

In [78]:
def k_dist(X,k=5):
    '''distance from each point to its k-th nearest neighbor'''
    k = k+1 # since self is counted as a neighbor in the kneighbors graph
    knn = neighbors.kneighbors_graph(X,k,mode='distance')
    dist = np.zeros(len(X))
    for i in range(len(X)):
        # CSR row entries are sorted by column index, not by distance,
        # so take the max of the stored neighbor distances explicitly
        dist[i] = knn[i].data.max()
    return dist
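
The standard heuristic reads eps off the knee of the sorted k-distances; a quick sketch:

In [ ]:
d_sorted = np.sort(k_dist(X_,X_.shape[1]))
pl.plot(d_sorted)
pl.xlabel('points, sorted by k-distance')
pl.ylabel('distance to k-th nearest neighbor')
pl.title('Sorted k-distance plot: choose eps near the knee')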

In [49]:
X_.shape


Out[49]:
(9905, 2)

In [87]:
d = k_dist(X_,X_.shape[1])

In [88]:
pl.hist(d,bins=50);



In [89]:
knn[10].indices[-1]  # knn is a leftover global from running the body of k_dist interactively


Out[89]:
100

In [86]:
dbs = DBSCAN(min_samples=50,eps=4.3)

In [87]:
dbs.fit(X_)


Out[87]:
DBSCAN(algorithm='auto', eps=4.3, leaf_size=30, metric='euclidean',
    min_samples=50, p=None, random_state=None)

In [88]:
pl.hist(dbs.labels_,bins=len(set(dbs.labels_)));



In [89]:
pl.scatter(X_[:,0],X_[:,1],c=dbs.labels_,linewidths=0,alpha=0.5)
pl.xlabel("PC1")
pl.ylabel("PC2")
pl.title('PCA embedding of 10k-point subsample')
pl.colorbar()


Out[89]:
<matplotlib.colorbar.Colorbar at 0x135fe8b70>

In [109]:
d = k_dist(sample,sample.shape[1])
pl.hist(d,bins=50);



In [119]:
dbs = DBSCAN(min_samples=sample.shape[1],eps=9.0)
dbs.fit(sample)


Out[119]:
DBSCAN(algorithm='auto', eps=9.0, leaf_size=30, metric='euclidean',
    min_samples=44, p=None, random_state=None)

In [120]:
pl.hist(dbs.labels_,bins=len(set(dbs.labels_)));



In [31]:
from sklearn.manifold import Isomap
iso = Isomap(10)
iso.fit(sample)
X_ = iso.embedding_
print(one_nn_baseline(sample,X_))
print(knn_baseline(sample,X_,5))


0.005451792024230187
0.015689045936395758

In [53]:
from sklearn.manifold import Isomap
iso = Isomap()
iso.fit(sample)


Out[53]:
Isomap(eigen_solver='auto', max_iter=None, n_components=2, n_neighbors=5,
    neighbors_algorithm='auto', path_method='auto', tol=0)

In [54]:
iso.reconstruction_error()


Out[54]:
616.11588539516492

In [78]:
X_ = iso.embedding_

In [67]:
pl.scatter(X_[:,0],X_[:,1],c=day[sample_ind],linewidths=0,alpha=0.5)
pl.title('Isomap embedding of 10k-point subsample')
pl.colorbar()


Out[67]:
<matplotlib.colorbar.Colorbar at 0x129225d30>

In [68]:
one_nn_baseline(sample,X_)


Out[68]:
0.006057546693589096

In [79]:
knn_baseline(sample,X_,5)


Out[79]:
0.019040888440181727

In [126]:
sample.shape


Out[126]:
(9905, 44)

In [142]:
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(n_components=2,kernel='poly',degree=2)
X_ = kpca.fit_transform(sample)

In [143]:
pl.scatter(X_[:,0],X_[:,1],c=day[sample_ind],linewidths=0,alpha=0.5)
pl.title('KPCA embedding of 10k-point subsample')
pl.colorbar()


Out[143]:
<matplotlib.colorbar.Colorbar at 0x1373773c8>

In [144]:
from sklearn.svm import SVR

In [148]:
svr = SVR(kernel='poly')

In [ ]:
svr.fit(sample,day[sample_ind])

In [147]:
svr.score(sample,day[sample_ind])


Out[147]:
0.75073358518553945

In [ ]:
from sklearn import kernel_approximation

In [ ]:
from sklearn.manifold import SpectralEmbedding
se = SpectralEmbedding()
X_ = se.fit_transform(sample)

In [ ]:
pl.scatter(X_[:,0],X_[:,1],c=day[sample_ind],linewidths=0,alpha=0.5)
pl.title('Spectral embedding of 10k-point subsample')
pl.colorbar()


In [ ]:
one_nn_baseline(sample,X_)

In [ ]:
knn_baseline(sample,X_,10)

In [171]:
from sklearn.cross_decomposition import PLSRegression

In [172]:
pls = PLSRegression(2)

In [173]:
len(day),len(data)


Out[173]:
(203017, 203017)

In [174]:
np.random.seed(0)
ind = np.arange(len(day))
ind_nn = np.arange(len(day_nn))
np.random.shuffle(ind)
np.random.shuffle(ind_nn)
split = 0.9
n_train,n_train_nn = int(len(ind)*split),int(len(ind_nn)*split)  # slice indices must be ints
train,test = ind[:n_train],ind[n_train:]
train_nn,test_nn = ind_nn[:n_train_nn],ind_nn[n_train_nn:]
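
sklearn also ships a helper for this; an equivalent sketch (the sklearn.cross_validation module name matches the library version used elsewhere in this notebook):

In [ ]:
from sklearn.cross_validation import train_test_split
train, test = train_test_split(np.arange(len(day)), test_size=0.1, random_state=0)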

In [175]:
len(train),len(test),len(train_nn),len(test_nn)


Out[175]:
(182715, 20302, 503688, 55966)

In [176]:
train


Out[176]:
array([149910, 183376, 124875, ..., 138496,  39419, 132654])

In [177]:
data.shape,data[train].shape,data[test].shape


Out[177]:
((203017, 44), (182715, 44), (20302, 44))

In [178]:
pls.fit(data[train],day[train])


Out[178]:
PLSRegression(copy=True, max_iter=500, n_components=2, scale=True, tol=1e-06)

In [179]:
pls.score(data[train],day[train]),pls.score(data[test],day[test])


Out[179]:
(0.64794832986917905, 0.6555345616272843)

In [180]:
pls.fit(data_nn[train_nn],day_nn[train_nn])
pls.score(data_nn[train_nn],day_nn[train_nn]),pls.score(data_nn[test_nn],day_nn[test_nn])


Out[180]:
(0.63536083668704668, 0.63721208533053919)

In [181]:
train_scores = []
test_scores = []
models = []
ns = list(range(1,10)) + [15,20,25,30,35,40]

for i in ns:
    pls = PLSRegression(i)
    pls.fit(data[train],day[train])
    train_scores.append(pls.score(data[train],day[train]))
    test_scores.append(pls.score(data[test],day[test]))
    models.append(pls)

In [182]:
pl.plot(ns,train_scores,label='Train (90% of data)')
pl.plot(ns,test_scores,label='Test (10% of data)')
pl.xlabel('Number of Components')
pl.ylabel('Score')
pl.legend(loc='best')
pl.title('PLS Regression Scores (Nanog-GFP)')


Out[182]:
<matplotlib.text.Text at 0x1355e79e8>

In [82]:
train_scores = []
test_scores = []
models = []
ns = list(range(1,10)) + [15,20,25,30,35,40]

for i in ns:
    pls = PLSRegression(i)
    pls.fit(data_nn[train_nn],day_nn[train_nn])
    train_scores.append(pls.score(data_nn[train_nn],day_nn[train_nn]))
    test_scores.append((pls.score(data_nn[test_nn],day_nn[test_nn])))
    models.append(pls)

In [83]:
pl.plot(ns,train_scores,label='Train (90% of data)')
pl.plot(ns,test_scores,label='Test (10% of data)')
pl.xlabel('Number of Components')
pl.ylabel('Score')
pl.legend(loc='best')
pl.title('PLS Regression Scores (Nanog-Neo)')


Out[83]:
<matplotlib.text.Text at 0x1275ca908>

In [84]:
# the effect of overfitting is very slight
pl.plot(ns,np.array(train_scores) - np.array(test_scores))


Out[84]:
[<matplotlib.lines.Line2D at 0x129924b38>]

In [85]:
for i in range(5):
    print('Dim: {0}, Score: {1:.3f}'.format(ns[i],test_scores[i]))


Dim: 1, Score: 0.416
Dim: 2, Score: 0.637
Dim: 3, Score: 0.732
Dim: 4, Score: 0.776
Dim: 5, Score: 0.802


In [86]:
m = models[1]
X_ = m.transform(data)
X_ = m.transform(data_nn)  # overwrites the line above; the NN projection is what gets plotted next

In [88]:
pl.scatter(X_[:,0],X_[:,1],c=day_nn,cmap='winter',linewidths=0,alpha=0.5)
pl.xlabel("PLS Component 1")
pl.ylabel("PLS Component 2")
pl.colorbar()
pl.title("Partial least squares 'progression analysis'")


Out[88]:
<matplotlib.text.Text at 0x1286e4dd8>

In [89]:
xlim = np.min(X_[:,0]),np.max(X_[:,0])
ylim = np.min(X_[:,1]),np.max(X_[:,1])
xlim,ylim


Out[89]:
((-7.7928443605304913, 10.150609137642995),
 (-7.6848229302663134, 7.0137687963623536))

In [130]:
def PLS_progression_analysis(data,times,
                             sample_name='',
                             dims=list(range(1,10))+list(range(10,20,2)),
                             train_test_split=0.9):
    # train-test split
    np.random.seed(0)
    ind = np.arange(len(times))
    np.random.shuffle(ind)
    split_index = int(len(ind)*train_test_split)  # slice indices must be ints
    train,test = ind[:split_index],ind[split_index:]
    print(data[train].shape,times[train].shape)
    
    # fit models
    train_scores = []
    test_scores = []
    models = []
    
    for i in dims:        
        pls = PLSRegression(i)
        pls.fit(data[train],times[train])
        train_scores.append(pls.score(data[train],times[train]))
        test_scores.append(pls.score(data[test],times[test]))
        models.append(pls)
    
    train_p = 100*train_test_split
    test_p = 100-train_p
    
    pl.plot(dims,train_scores,label='Train ({0:.0f}% of data)'.format(train_p))
    pl.plot(dims,test_scores,label='Test ({0:.0f}% of data)'.format(test_p))
    pl.xlabel('Number of Components')
    pl.ylabel('Score')
    pl.legend(loc='best')
    title = 'PLS Regression Scores'
    if len(sample_name) > 0:
        title = ': '.join((title,sample_name))
    pl.title(title)
    
    return train_scores,test_scores,models

In [131]:
results = PLS_progression_analysis(sample,day[sample_ind],train_test_split=0.8)


(7924, 44) (7924,)

In [132]:
results = PLS_progression_analysis(Oct4_processed,day_Oct4,'Oct4',train_test_split=0.8)


(237345, 44) (237345,)


In [ ]:
for t in times:
    pl.figure()
    kde = KernelDensity(0.25)
    kde.fit(X_[day==t])
    c = np.exp(kde.score_samples(X_[day==t]))
    pl.scatter(X_[day==t,0],X_[day==t,1],c=c,cmap='winter',linewidths=0,alpha=0.5)
    
    pl.xlim(*xlim)
    pl.ylim(*ylim)
    pl.xlabel("PLS Component 1")
    pl.ylabel("PLS Component 2")
    pl.title("PLS 'progression analysis:' Nanog-GFP, Day {0}".format(t))
    pl.savefig('../figures/progression-animation/ng-day {0}.jpg'.format(t))

In [96]:
for t in times:
    pl.figure()
    kde = KernelDensity(0.25)
    kde.fit(X_[day_nn==t])
    c = np.exp(kde.score_samples(X_[day_nn==t]))
    pl.scatter(X_[day_nn==t,0],X_[day_nn==t,1],c=c,cmap='winter',linewidths=0,alpha=0.5)
    
    pl.xlim(*xlim)
    pl.ylim(*ylim)
    pl.xlabel("PLS Component 1")
    pl.ylabel("PLS Component 2")
    pl.title("PLS 'progression analysis:' Nanog-Neo, Day {0}".format(t))
    pl.savefig('../figures/progression-animation/nn-day {0}.jpg'.format(t))



In [186]:
m = models[0]
X_ = m.transform(data)

In [202]:
# let's plot progression on just the leading 'progression axis' identified by PLS

In [204]:
day1 = X_[day==day[0]]  # cells from the first timepoint, projected on the leading PLS axis
day1.shape


Out[204]:
(2938, 1)

In [184]:
from sklearn.neighbors import KernelDensity
kde = KernelDensity(2)

In [224]:
kde.fit(day1)


Out[224]:
KernelDensity(algorithm='auto', atol=0, bandwidth=2, breadth_first=True,
       kernel='gaussian', leaf_size=40, metric='euclidean',
       metric_params=None, rtol=0)

In [225]:
min(day1),max(day1)


Out[225]:
(array([-5.8874691]), array([ 4.7322055]))

In [226]:
x = np.arange(day1.min(),day1.max(),0.1)
y = kde.score_samples(np.reshape(x,(len(x),1)))

In [227]:
pl.plot(x,y)


Out[227]:
[<matplotlib.lines.Line2D at 0x163d70898>]

In [231]:
from sklearn.grid_search import GridSearchCV

# this snippet from documentation: 
# http://scikit-learn.org/stable/auto_examples/neighbors/plot_digits_kde_sampling.html
params = {'bandwidth': np.logspace(-1, 1, 10)}
grid = GridSearchCV(KernelDensity(), params)
grid.fit(day1)

print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))

# use the best estimator to compute the kernel density estimate
kde = grid.best_estimator_
# estimate density
kde.fit(day1)

x = np.arange(min(day1),max(day1),0.1)
y = kde.score_samples(np.reshape(x,(len(x),1)))


best bandwidth: 0.46415888336127786

In [232]:
pl.plot(x,y)


Out[232]:
[<matplotlib.lines.Line2D at 0x1670c6860>]

In [198]:
x = np.arange(X_.min(),X_.max(),0.02)

In [238]:
progression = []
for d in set(day):
    today = X_[day==d]
    # this snippet from documentation: 
    # http://scikit-learn.org/stable/auto_examples/neighbors/plot_digits_kde_sampling.html
    params = {'bandwidth': np.logspace(-1, 1, 10)}
    grid = GridSearchCV(KernelDensity(), params)
    grid.fit(today)

    print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))

    # use the best estimator to compute the kernel density estimate
    kde = grid.best_estimator_
    # estimate density
    kde.fit(today)

    y = kde.score_samples(np.reshape(x,(len(x),1)))
    progression.append(y)


best bandwidth: 0.46415888336127786
best bandwidth: 0.2782559402207124
best bandwidth: 0.2782559402207124
best bandwidth: 0.46415888336127786
best bandwidth: 0.46415888336127786
best bandwidth: 0.2782559402207124
best bandwidth: 0.2782559402207124
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-238-8b2aa93403eb> in <module>()
      6     params = {'bandwidth': np.logspace(-1, 1, 10)}
      7     grid = GridSearchCV(KernelDensity(), params)
----> 8     grid.fit(today)

KeyboardInterrupt: 

In [199]:
progression = []
for d in set(day):
    today = X_[day==d]
    kde = KernelDensity(0.2)  # fixed bandwidth, since the per-day grid search above was interrupted
    kde.fit(today)

    y = kde.score_samples(np.reshape(x,(len(x),1)))
    progression.append(y)

In [241]:
prog = np.exp(np.array(progression)).T
prog.shape


Out[241]:
(742, 14)

In [242]:
aspect = float(prog.shape[1]) / prog.shape[0]
aspect


Out[242]:
0.018867924528301886

In [243]:
pl.imshow(-prog,aspect=0.5*aspect,
          interpolation='none',cmap='gray')
pl.xlabel('Timestep')
pl.ylabel('Progression')


Out[243]:
<matplotlib.text.Text at 0x1246c1048>

In [219]:
X_.shape


Out[219]:
(203017, 1)

In [229]:
pl.scatter(day+np.random.randn(len(day))*0.1,X_[:,0],linewidths=0,alpha=0.005)


Out[229]:
<matplotlib.collections.PathCollection at 0x12318cc50>

In [268]:
# color snippet modified from:
# http://stackoverflow.com/questions/8931268/using-colormaps-to-set-color-of-line-in-matplotlib
import matplotlib.colors as colors
import matplotlib.cm as cmx
jet = pl.get_cmap('winter')
cNorm  = colors.Normalize(vmin=0, vmax=len(times))
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=jet)

for i,y in enumerate(progression):
    colorVal = scalarMap.to_rgba(i)
    pl.plot(x,np.exp(y),label=times[i],color=colorVal)
    #pl.legend(loc='best')
pl.xlabel("PLS Component 1")
pl.ylabel("Probability density")
pl.title("1D Partial least squares 'progression analysis'")


Out[268]:
<matplotlib.text.Text at 0x1823a1ba8>

In [ ]:
# another way to do this: use a color-intensity channel to encode probability density,
# with time on the x axis and progression on the y axis

In [317]:
# color snippet modified from:
# http://stackoverflow.com/questions/8931268/using-colormaps-to-set-color-of-line-in-matplotlib
import matplotlib.colors as colors
import matplotlib.cm as cmx
jet = pl.get_cmap('winter')
cNorm  = colors.Normalize(vmin=0, vmax=5)
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=jet)

for i,y in enumerate(progression):
    pl.figure()
    colorVal = scalarMap.to_rgba(0)
    pl.plot(x,np.exp(y),color=colorVal,linewidth=3)
    pl.ylim(0,0.45)
    for j in range(5):
        if i-j>=0:
            colorVal = scalarMap.to_rgba(j+1)
            pl.plot(x,np.exp(progression[i-j]),color=colorVal,alpha=1/(j+1),linewidth=min(1,3-j))
    #pl.legend(loc='best')
    pl.xlabel("PLS Component 1")
    pl.ylabel("Probability density")
    pl.title("1D PLS 'progression analysis' Day {0}".format(times[i]))
    pl.savefig('../figures/1d-progression-animation/ng-day {0}.jpg'.format(times[i]))



In [235]:
from scipy.spatial import distance

In [236]:
prog = np.array(progression)
prog.shape


Out[236]:
(14, 742)

In [237]:
distmat = distance.squareform(distance.pdist(prog,'canberra'))
distmat.shape


Out[237]:
(14, 14)
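
(Canberra distance between vectors u and v is sum_i |u_i - v_i| / (|u_i| + |v_i|), which weights differences in low-density bins more heavily than Euclidean distance would.)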

In [244]:
pl.imshow(distmat,interpolation='none',cmap='Blues')
pl.xlabel('Timestep')
pl.ylabel('Timestep')


Out[244]:
<matplotlib.text.Text at 0x1246624a8>

In [134]:
pl.scatter(models[-1].predict(data),day,alpha=0.1,linewidths=0)
pl.plot(day,day)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-134-f82e81c28df0> in <module>()
----> 1 pl.scatter(models[-1].predict(data),day,alpha=0.1,linewidths=0)
      2 pl.plot(day,day)

NameError: name 'models' is not defined

Another idea: cluster the full dataset, and describe each timepoint by the relative occupancy of each cluster
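
A compact sketch of that idea (assuming data, day, times as defined above): cluster once, then featurize each day by its normalized cluster histogram.

In [ ]:
from sklearn.cluster import MiniBatchKMeans
labels = MiniBatchKMeans(20).fit_predict(data)
occupancy = np.array([np.bincount(labels[day==t],minlength=20) for t in times])
occupancy = occupancy / occupancy.sum(axis=1,keepdims=True)  # relative occupancy per day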


In [12]:
from sklearn.cluster import MiniBatchKMeans

In [13]:
num_clust=20

In [14]:
km = MiniBatchKMeans(num_clust)

In [359]:
km.fit(data)


Out[359]:
MiniBatchKMeans(batch_size=100, compute_labels=True, init='k-means++',
        init_size=None, max_iter=100, max_no_improvement=10, n_clusters=20,
        n_init=3, random_state=None, reassignment_ratio=0.01, tol=0.0,
        verbose=0)

In [360]:
y = km.predict(data)

In [361]:
def num_in_clust(Y,num_clusters=50):
    assert(num_clusters > max(Y))  # labels are 0-indexed, so max(Y) must be a valid index
    occupancy = np.zeros(num_clusters)
    for y in Y:
        occupancy[y] += 1
    return occupancy
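
Equivalent one-liner, as a sketch: np.bincount computes the same histogram.

In [ ]:
occupancy = np.bincount(y,minlength=num_clust)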


In [362]:
pl.bar(range(num_clust),num_in_clust(y,num_clust))
pl.title('Overall')
pl.figure()
pl.bar(range(num_clust),num_in_clust(y[day==times[0]],num_clust))
pl.title('Initial')
pl.figure()
pl.bar(range(num_clust),num_in_clust(y[day==times[-1]],num_clust))
pl.title('Final')


Out[362]:
<matplotlib.text.Text at 0x12fd88748>

In [363]:
featurized = np.array([num_in_clust(y[day==t]) for t in times])  # default num_clusters=50, hence 50 columns
featurized.shape


Out[363]:
(14, 50)

In [364]:
distmat = distance.squareform(distance.pdist(featurized,'canberra'))
pl.imshow(distmat,interpolation='none',cmap='Blues')


Out[364]:
<matplotlib.image.AxesImage at 0x12bd57588>

In [365]:
#clust_ind = sorted(range(50),key=lambda i:featurized[:,i].dot(times))
# order clusters by the sum of their members' row indices -- rows are concatenated
# in day order, so clusters populated later in time tend to sort later
clust_ind = sorted(range(num_clust),key=lambda i:(y==i).dot(np.arange(len(y))))
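
Caveat: that dot product is the sum of member indices, so large clusters sort late regardless of timing; the mean index is probably closer to the intent. A sketch (assumes every cluster is non-empty):

In [ ]:
mean_index = [np.arange(len(y))[y==i].mean() for i in range(num_clust)]
clust_ind = sorted(range(num_clust),key=lambda i:mean_index[i])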


In [367]:
clust_mat = np.array([num_in_clust(y[day==t],num_clust)[clust_ind] for t in times]).T

In [368]:
for i in range(len(times)):
    pl.figure()
    pl.bar(range(num_clust),num_in_clust(y[day==times[i]],num_clust)[clust_ind])
    pl.title('Cluster occupancies: Day {0}'.format(times[i]))
    pl.xlabel('Cluster ID')
    pl.ylabel('Occupancy')
    pl.ylim(0,np.max(clust_mat))
    pl.savefig('../figures/occupancy-animation/ng-day {0}.jpg'.format(times[i]))



In [369]:
aspect = float(clust_mat.shape[1]) / clust_mat.shape[0]
pl.imshow(clust_mat,aspect=aspect,
          interpolation='none',cmap='Blues')
pl.xlabel('Timestep')
pl.ylabel('Cluster ID')
pl.title('Cluster occupancy per time step')
pl.colorbar()


Out[369]:
<matplotlib.colorbar.Colorbar at 0x12c59e358>

In [370]:
pl.imshow(clust_mat,interpolation='none')


Out[370]:
<matplotlib.image.AxesImage at 0x12bfe9550>

In [356]:
y[clust_ind]


Out[356]:
array([6, 6, 8, 8, 6, 6, 6, 3, 6, 6], dtype=int32)

In [376]:
y.shape


Out[376]:
(203017,)

In [380]:
X_ = pca.fit_transform(data)
# note: clust_ind maps rank -> cluster id; coloring by progression rank would need
# the inverse permutation (np.argsort(clust_ind))
pl.scatter(X_[:,0],X_[:,1],c=[clust_ind[i] for i in y],cmap='Blues',linewidths=0,alpha=0.5)
pl.colorbar()


Out[380]:
<matplotlib.colorbar.Colorbar at 0x1dcef8ac8>

In [33]:
len(all_data)


Out[33]:
762671

In [21]:
all_data = np.vstack((data,data_nn))  # note: reuses the name all_data for NG+NN combined
all_data.shape


Out[21]:
(762671, 44)

In [22]:
pca = PCA()
pca.fit(data)
pl.plot(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_[:2]))


0.422737755822

In [23]:
pca = PCA()
pca.fit(data_nn)
pl.plot(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_[:2]))


0.321411102281

In [386]:
X_ = models[1].transform(data)
X_.shape


Out[386]:
(203017, 2)

In [387]:
import seaborn

In [399]:
xlim = np.min(X_[:,0]),np.max(X_[:,0])
ylim = np.min(X_[:,1]),np.max(X_[:,1])

In [400]:
for i,d in enumerate(set(day)):
    seaborn.kdeplot(X_[day==d]);
    pl.xlim(*xlim)
    pl.ylim(*ylim)
    pl.savefig('../figures/2d-contour-progression-animation/{0}.jpg'.format(i))
    pl.close()
