Analysis of branches

A transition count matrix is used to run this notebook.

The rows are the branches (identified as child-parent) and the columns are the transitions

Load the data

import the compressed NumPy matrix and show an example tree to help visualize.



In [5]:

    
import os,time
import gensim
import numpy as np
from vertebratesLib import *

outputFile = os.path.join("..","data","hv-compressed","branches.npz")
npz = np.load(outputFile)
mat = npz['matrix']
rows = npz['rows']
columns = npz['columns']



In [2]:

    
from IPython.display import Image
Image(filename='./figures/simple-tree.png')









    Out[2]:

Summary statistics and plots



In [3]:

    
## make example plots
import matplotlib.pyplot as plt

rowMeans = mat.mean(axis=0)
colMeans = mat.mean(axis=1)
print("total transitions: %s"%rowMeans.size)
print("total branches: %s"%colMeans.size)
fig = plt.figure()
ax1 = fig.add_subplot(211)
ax1.hist(rowMeans,bins=50)
ax1.set_title("Transitions")
ax2 = fig.add_subplot(212) 
ax2.hist(colMeans,bins=50)
ax2.set_title("Branches")
plt.savefig(os.path.join(".","figures","branches-histograms.png"),dpi=300)









    



total transitions: 400
total branches: 197



In [4]:

    
Image(filename=os.path.join(".","figures","branches-histograms.png"))









    Out[4]:

Run LDA



In [22]:

    
## create the documents from the matrix
vocab = npz['columns']

## create the documents from the matrix     
texts = []
usedColumns = set([])
usedRows = set([])
for r in range(mat.shape[0]):
    hitInds = np.where(mat[r,:] > 1)[0]
    
    ## ignore inplace transitions and documents with < 2 changes
    hits = vocab[hitInds]
    hits = list(set(hits.tolist()))
    passedHits = []
    for hit in hits:
        if hit[0] == hit[1]:
            continue
        passedHits.append(hit)

    if len(passedHits) < 2:
        continue
        
    usedRows.update([r])
    text = []
    for hi in hitInds:
        word = vocab[hi]

        if word[0] == word[1]:
            continue

        usedColumns.update([hi])
        text.extend([vocab[hi]] * mat[r,hi])
    texts.append(text)
usedColumns = list(usedColumns)
usedRows = list(usedRows)
actualRows = rows[usedRows]

dictionary = gensim.corpora.Dictionary(texts)
dictionary.save('/tmp/branchs.dict')

## create a corpus from the documents
corpus = [dictionary.doc2bow(text) for text in texts]
gensim.corpora.MmCorpus.serialize('/tmp/branches.mm', corpus)
mm = gensim.corpora.MmCorpus('/tmp/branches.mm')
print mm









    



MmCorpus(197 documents, 378 features, 33763 non-zero entries)

Run the model



In [15]:

    
timeStart = time.time()
numTopics = 25
lda = gensim.models.LdaMulticore(corpus=mm, num_topics=numTopics, id2word=dictionary,chunksize=1,iterations=50, passes=50)
for t in range(numTopics):
    print("topic-%s: %s"%(t,lda.print_topic(t,topn=8)))
print("end: %s"%time.strftime('%H:%M:%S',time.gmtime(time.time()-timeStart)))









    



topic-0: 0.041*DE + 0.039*IV + 0.036*VI + 0.036*AT + 0.033*ED + 0.028*RK + 0.025*TS + 0.024*TA
topic-1: 0.055*ED + 0.049*DE + 0.040*KR + 0.040*IV + 0.038*VI + 0.029*TS + 0.023*SA + 0.022*AS
topic-2: 0.037*ED + 0.036*DE + 0.033*KR + 0.026*TS + 0.025*VI + 0.025*ST + 0.022*IV + 0.021*LI
topic-3: 0.042*ED + 0.041*RK + 0.038*VI + 0.029*DE + 0.028*KR + 0.027*ST + 0.026*IV + 0.023*AS
topic-4: 0.042*ED + 0.041*VI + 0.036*DE + 0.032*TS + 0.028*IV + 0.025*AT + 0.025*KR + 0.024*ST
topic-5: 0.076*VI + 0.037*IV + 0.035*AT + 0.031*AV + 0.028*SN + 0.026*PS + 0.025*ED + 0.023*NS
topic-6: 0.047*IV + 0.046*ED + 0.035*DE + 0.033*ST + 0.031*VI + 0.031*KR + 0.028*TS + 0.026*AS
topic-7: 0.042*IV + 0.041*DE + 0.038*VI + 0.037*ED + 0.031*KR + 0.026*RK + 0.026*IL + 0.023*ST
topic-8: 0.032*KR + 0.030*DE + 0.028*ED + 0.028*IV + 0.025*SA + 0.024*VI + 0.020*IL + 0.020*RK
topic-9: 0.056*VI + 0.045*KR + 0.035*ED + 0.034*IV + 0.034*DE + 0.024*AV + 0.023*RK + 0.023*TA
topic-10: 0.038*IV + 0.037*ED + 0.028*AT + 0.028*KR + 0.027*TA + 0.025*ST + 0.024*AS + 0.021*RK
topic-11: 0.060*DE + 0.054*IV + 0.049*ED + 0.041*VI + 0.034*KR + 0.030*RK + 0.026*NS + 0.024*ST
topic-12: 0.067*ED + 0.062*DE + 0.058*IV + 0.056*VI + 0.054*KR + 0.039*RK + 0.036*TS + 0.029*AS
topic-13: 0.057*ED + 0.048*IV + 0.041*VI + 0.040*DE + 0.035*KR + 0.033*TS + 0.026*RK + 0.024*AS
topic-14: 0.043*AT + 0.039*VI + 0.033*IV + 0.028*KR + 0.028*TA + 0.023*NS + 0.023*AV + 0.022*SN
topic-15: 0.051*VI + 0.040*IV + 0.034*ED + 0.033*ST + 0.030*AT + 0.030*DE + 0.025*TS + 0.023*NS
topic-16: 0.043*ED + 0.036*VI + 0.030*DE + 0.027*KR + 0.026*AS + 0.024*IV + 0.022*ST + 0.021*SA
topic-17: 0.043*VI + 0.042*ED + 0.032*IV + 0.026*DE + 0.025*TA + 0.022*AT + 0.022*TS + 0.022*LV
topic-18: 0.059*ED + 0.051*VI + 0.041*DE + 0.039*TS + 0.037*RK + 0.035*IV + 0.032*KR + 0.028*ST
topic-19: 0.044*ED + 0.035*IV + 0.034*ST + 0.033*VI + 0.031*DE + 0.028*AT + 0.026*SN + 0.025*RK
topic-20: 0.064*KR + 0.051*IV + 0.040*ED + 0.034*VI + 0.031*DE + 0.025*SA + 0.025*TS + 0.025*NS
topic-21: 0.064*IV + 0.042*ED + 0.042*VI + 0.040*TA + 0.035*KR + 0.032*DE + 0.028*AT + 0.027*RK
topic-22: 0.059*IV + 0.052*VI + 0.040*AT + 0.030*ED + 0.028*KR + 0.028*TA + 0.027*SN + 0.027*NS
topic-23: 0.050*IV + 0.042*ED + 0.040*VI + 0.039*KR + 0.034*DE + 0.031*AS + 0.028*TS + 0.024*RK
topic-24: 0.071*IV + 0.044*TA + 0.041*VI + 0.039*KR + 0.036*NS + 0.033*AT + 0.024*VA + 0.023*ED
end: 00:00:26

Calculate similarities among branches



In [28]:

    
index = gensim.similarities.MatrixSimilarity(lda[mm],num_features=numTopics)
index.save('/tmp/branches.index')
index = gensim.similarities.MatrixSimilarity.load('/tmp/branches.index')

r = 0
rowId = actualRows[r]
print rowId
text = texts[r]
textVector = mm[r]
probDistn = lda[textVector]
sims = index[probDistn]
distMat = np.array([])

for r,rowId in enumerate(actualRows):
    text = texts[r]
    textVector = mm[r]
    probDistn = lda[textVector]
    sims = index[probDistn]
    if distMat.size != 0:
        distMat = np.vstack((distMat,sims))
    else:
        distMat = sims
        
print distMat.shape









    



Acipenser#N1
(197, 197)



In [33]:

    
%matplotlib inline
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(11,11),dpi=300)
cmap = plt.cm.PuBuGn
ax = fig.add_subplot(111)
hmap = ax.imshow(distMat,interpolation='nearest',aspect='auto',cmap=cmap)
#_ = ax.set_title("class=%s"%c)
_ = ax.set_xticks(range(actualRows.size))
_ = ax.set_yticks(range(actualRows.size))
_ = ax.set_xticklabels(actualRows,fontsize=5)
_ = ax.set_yticklabels(actualRows,fontsize=5)
xlabs = ax.get_xticklabels()
plt.setp(xlabs, rotation=90)
#ax.set_aspect(1./ax.get_data_ratio())
cbar = fig.colorbar(hmap, orientation='vertical')



In [39]:

    
from htsint.tools import Heatmap
rowLabels = actualRows 
colLabels = actualRows
hm = Heatmap(distMat,rowLabels,colLabels,fontSize=3)
hm.draw_heatmap(cmap='uy',clabels=True,rlabels=True,rowFont=3)
hm.save("./figures/heatmap.png",dpi=900)









    



clustering the rows...
clustering the columns...



In [ ]: