Analysis of branches

This notebook is driven by a transition count matrix.

The rows are the branches (identified as child-parent pairs) and the columns are the transitions.
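
For orientation, here is a minimal sketch of the expected layout, with made-up branch and transition labels (the real matrix is loaded from the *-positions.npz files below):

import numpy as np

## toy transition count matrix: rows = branches (child-parent), columns = transitions
branches = ["taxonA-taxonB", "taxonC-taxonB"]       ## hypothetical branch labels
transitions = np.array(["ED", "DE", "ND", "DN"])    ## hypothetical transition labels
counts = np.array([[3, 0, 2, 0],
                   [0, 1, 0, 4]])                   ## counts[i,j] = times transition j was observed on branch i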


In [43]:
import os,time
import numpy as np
import gensim
from vertebratesLib import *

In [44]:
from IPython.display import Image
Image(filename='./figures/simple-tree.png')


Out[44]:

Prepare the corpus of documents


In [45]:
## make example plots
import matplotlib.pyplot as plt

## load position matrix
mat = None
for split in SPLITS:
    outputFile = os.path.join("..","data","hv-compressed","%s-positions.npz"%(split))
    npz = np.load(outputFile)

    if mat is None:
        mat = npz['tree'].astype(int)
    else:
        mat = np.vstack((mat,npz['tree'].astype(int)))

vocab = npz['columns']

## create the documents from the matrix
texts = []
usedColumns = set([])
usedRows = set([])
for r in range(mat.shape[0]):
    hitInds = np.where(mat[r,:] > 1)[0]
    
    ## ignore in-place transitions and documents with < 2 changes
    hits = vocab[hitInds]
    hits = list(set(hits.tolist()))
    passedHits = []
    for hit in hits:
        if hit[0] == hit[1]:
            continue
        passedHits.append(hit)

    if len(passedHits) < 2:
        continue
        
    usedRows.update([r])
    text = []
    for hi in hitInds:
        word = vocab[hi]

        if word[0] == word[1]:
            continue

        usedColumns.update([hi])
        text.extend([vocab[hi]] * mat[r,hi])
    texts.append(text)
usedColumns = list(usedColumns)
usedRows = list(usedRows)
actualVocab = vocab[usedColumns]
    
dictionary = gensim.corpora.Dictionary(texts)
dictionary.save('/tmp/positions.dict')

## create a corpus from the documents
corpus = [dictionary.doc2bow(text) for text in texts]
gensim.corpora.MmCorpus.serialize('/tmp/positions.mm', corpus)
mm = gensim.corpora.MmCorpus('/tmp/positions.mm')
print mm

rowMeans = mat.mean(axis=0)
colMeans = mat.mean(axis=1)
fig = plt.figure()
ax1 = fig.add_subplot(211)
ax1.hist(rowMeans,bins=50)
ax1.set_title("Transitions")
ax2 = fig.add_subplot(212) 
ax2.hist(colMeans,bins=50)
ax2.set_title("Positions")
plt.savefig(os.path.join(".","figures","positions-histograms.png"),dpi=300)


MmCorpus(149102 documents, 372 features, 463257 non-zero entries)
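
As a quick sanity check on what went into the corpus, each document is simply a bag of transition "words"; a minimal sketch of inspecting one entry (index 0 is an arbitrary choice):

## inspect one document and its bag-of-words vector (index 0 is arbitrary)
print texts[0][:10]
print dictionary.doc2bow(texts[0])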

In [46]:
Image(filename=os.path.join(".","figures","positions-histograms.png"))


Out[46]:

Run LDA

Run the model


In [47]:
timeStart = time.time()
lda = gensim.models.LdaMulticore(corpus=mm, num_topics=50, id2word=dictionary,chunksize=1000, passes=10)
for t in range(50):
    print("topic-%s: %s"%(t,lda.print_topic(t,topn=8)))
print("end: %s"%time.strftime('%H:%M:%S',time.gmtime(time.time()-timeStart)))


topic-0: 0.244*HY + 0.200*YH + 0.184*CS + 0.127*SC + 0.042*QE + 0.028*CG + 0.016*GC + 0.012*EQ
topic-1: 0.236*SN + 0.130*SR + 0.126*SD + 0.080*SK + 0.064*NS + 0.043*ST + 0.033*TS + 0.032*DS
topic-2: 0.220*NH + 0.196*HN + 0.128*NS + 0.082*ND + 0.048*NE + 0.041*NK + 0.039*SN + 0.022*HS
topic-3: 0.358*DE + 0.210*DN + 0.087*DG + 0.084*ED + 0.063*DS + 0.042*DA + 0.019*QD + 0.018*DH
topic-4: 0.295*RH + 0.276*HR + 0.076*RQ + 0.053*HQ + 0.051*RC + 0.045*CR + 0.016*WR + 0.013*NT
topic-5: 0.425*SP + 0.174*SA + 0.154*ST + 0.043*LH + 0.032*HL + 0.024*SL + 0.010*PS + 0.007*TS
topic-6: 0.217*DE + 0.147*ED + 0.144*IV + 0.059*VI + 0.020*IL + 0.020*PD + 0.017*QE + 0.017*DN
topic-7: 0.172*EV + 0.133*VE + 0.060*ET + 0.058*VG + 0.049*EA + 0.047*WL + 0.046*TE + 0.043*ED
topic-8: 0.214*GE + 0.176*GD + 0.143*EG + 0.063*DG + 0.061*GN + 0.058*AE + 0.044*AD + 0.042*GS
topic-9: 0.296*VM + 0.218*VL + 0.150*VI + 0.044*LV + 0.040*IV + 0.022*IL + 0.021*KR + 0.018*MV
topic-10: 0.382*SG + 0.377*GS + 0.036*AS + 0.026*SA + 0.020*GA + 0.017*AT + 0.014*AK + 0.014*ST
topic-11: 0.197*RK + 0.169*KR + 0.060*ST + 0.049*TS + 0.030*SA + 0.025*SN + 0.024*SP + 0.021*YF
topic-12: 0.272*KQ + 0.251*QK + 0.104*KR + 0.090*QR + 0.069*QE + 0.055*KE + 0.024*KN + 0.014*RQ
topic-13: 0.256*HQ + 0.237*QH + 0.155*EK + 0.150*KE + 0.032*EQ + 0.026*QE + 0.022*ED + 0.018*ER
topic-14: 0.255*LS + 0.215*SL + 0.045*LA + 0.035*LP + 0.031*LT + 0.028*AV + 0.023*QK + 0.021*ED
topic-15: 0.181*NK + 0.181*KN + 0.106*KT + 0.100*TK + 0.070*HQ + 0.064*LQ + 0.040*HP + 0.029*KS
topic-16: 0.321*PS + 0.164*PA + 0.148*SP + 0.102*AP + 0.064*PL + 0.060*PT + 0.041*LP + 0.011*PQ
topic-17: 0.465*IV + 0.459*VI + 0.019*VL + 0.007*IL + 0.006*IT + 0.005*IM + 0.002*LV + 0.002*MI
topic-18: 0.500*KR + 0.436*RK + 0.018*KQ + 0.005*KN + 0.004*KT + 0.003*KE + 0.003*QE + 0.003*RQ
topic-19: 0.374*ND + 0.335*DN + 0.029*DE + 0.018*QL + 0.016*PS + 0.012*NS + 0.012*DS + 0.011*SP
topic-20: 0.517*TS + 0.400*ST + 0.009*TA + 0.007*SN + 0.006*KR + 0.005*KE + 0.004*SA + 0.004*ED
topic-21: 0.496*ST + 0.052*SV + 0.052*SC + 0.046*CS + 0.042*SF + 0.037*TS + 0.032*SA + 0.032*SL
topic-22: 0.439*AT + 0.204*AS + 0.128*TA + 0.053*AV + 0.039*TS + 0.037*ST + 0.024*SA + 0.023*AP
topic-23: 0.345*AE + 0.133*EA + 0.051*VA + 0.050*AS + 0.041*AV + 0.025*SA + 0.023*NP + 0.022*RN
topic-24: 0.133*RK + 0.101*KR + 0.098*LV + 0.065*RA + 0.055*LI + 0.049*VL + 0.037*TS + 0.022*ST
topic-25: 0.518*IL + 0.194*LI + 0.142*IV + 0.062*IM + 0.031*IF + 0.007*RQ + 0.006*QR + 0.006*LV
topic-26: 0.175*VI + 0.128*IV + 0.056*KR + 0.056*WR + 0.044*RK + 0.026*RV + 0.026*IR + 0.025*ED
topic-27: 0.324*IT + 0.199*IV + 0.053*NS + 0.051*IA + 0.043*TI + 0.038*IS + 0.030*VI + 0.022*IM
topic-28: 0.353*EQ + 0.246*QE + 0.116*EK + 0.085*EA + 0.057*GV + 0.011*ED + 0.011*AE + 0.008*VA
topic-29: 0.107*LV + 0.077*LI + 0.076*DE + 0.071*VI + 0.060*IV + 0.054*YF + 0.051*ED + 0.044*SP
topic-30: 0.278*GA + 0.265*QP + 0.137*PQ + 0.090*GS + 0.038*AG + 0.036*GT + 0.024*QS + 0.015*VI
topic-31: 0.243*TN + 0.216*TS + 0.135*NT + 0.089*ST + 0.054*NS + 0.042*SN + 0.024*IV + 0.023*ML
topic-32: 0.400*FL + 0.343*LF + 0.057*FI + 0.047*FV + 0.017*YN + 0.012*IF + 0.009*VF + 0.009*NY
topic-33: 0.375*LV + 0.309*LI + 0.090*VL + 0.071*IL + 0.055*LM + 0.020*LF + 0.016*LA + 0.008*IV
topic-34: 0.119*RS + 0.113*GR + 0.079*AL + 0.068*RG + 0.058*FS + 0.054*KS + 0.051*GK + 0.044*ES
topic-35: 0.334*VA + 0.200*VI + 0.121*VT + 0.085*IV + 0.062*AV + 0.051*VL + 0.018*VM + 0.016*VS
topic-36: 0.514*TA + 0.190*AT + 0.124*TS + 0.035*TV + 0.019*TP + 0.017*VA + 0.015*AV + 0.011*TI
topic-37: 0.346*VL + 0.176*LV + 0.162*TI + 0.085*TV + 0.066*VI + 0.031*IT + 0.022*TL + 0.021*TM
topic-38: 0.158*LR + 0.137*RL + 0.122*DE + 0.086*ED + 0.058*LQ + 0.031*SA + 0.028*LK + 0.027*LI
topic-39: 0.321*QR + 0.219*QH + 0.126*QL + 0.067*RQ + 0.058*QK + 0.030*LQ + 0.027*HQ + 0.025*QE
topic-40: 0.386*RQ + 0.245*RK + 0.075*QR + 0.041*KR + 0.021*RE + 0.016*RN + 0.015*VI + 0.015*RT
topic-41: 0.512*AV + 0.246*VA + 0.081*AT + 0.045*AI + 0.009*VI + 0.009*AM + 0.008*AG + 0.007*YF
topic-42: 0.385*TP + 0.069*PT + 0.064*TR + 0.043*TA + 0.041*TS + 0.033*VI + 0.022*SA + 0.022*AV
topic-43: 0.402*NS + 0.357*SN + 0.053*SG + 0.045*GS + 0.040*NT + 0.018*NG + 0.011*ST + 0.009*GN
topic-44: 0.443*AG + 0.237*GA + 0.063*AS + 0.044*AT + 0.024*DE + 0.020*ED + 0.018*AP + 0.014*PA
topic-45: 0.486*ML + 0.334*LM + 0.060*MI + 0.017*MK + 0.010*MV + 0.007*YH + 0.006*MR + 0.005*MF
topic-46: 0.488*SA + 0.390*AS + 0.052*ST + 0.033*AT + 0.004*AG + 0.003*TA + 0.002*SG + 0.002*AV
topic-47: 0.239*MV + 0.218*MI + 0.119*IM + 0.091*MT + 0.088*ML + 0.066*VM + 0.050*TM + 0.037*LM
topic-48: 0.371*YF + 0.329*FY + 0.050*YC + 0.047*CY + 0.044*YH + 0.023*YS + 0.015*YL + 0.013*SR
topic-49: 0.515*ED + 0.437*DE + 0.007*EQ + 0.007*EK + 0.005*EN + 0.004*QE + 0.003*EA + 0.002*DN
end: 00:01:18
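
Training takes over a minute here, so it can be convenient to persist the fitted model rather than re-running the cell; a minimal sketch using gensim's standard save/load methods (the /tmp path is just an example):

## persist the fitted model and reload it later without retraining (path is an example)
lda.save('/tmp/positions.lda')
lda = gensim.models.LdaMulticore.load('/tmp/positions.lda')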

In [49]:
i = 100
actualPosition = usedRows[i]
text = texts[i]
textVector = mm[i]
print len(texts)
print len(mm)

#print("row: %s"%actualPosition)
#print("document: %s"%(text))
#print "vectorized document: ",textVector
for tv in textVector:
    print "...",dictionary[tv[0]]
probDistn = lda[textVector]
print "prob: ", probDistn
print("topics...")
for topic in probDistn:
    print topic[0],round(topic[1],2),
    print lda.print_topic(topic[0], topn=8)


149102
149102
... ED
... DE
... DN
... ND
prob:  [(19, 0.59864912886079169), (49, 0.33277944256778053)]
topics...
19 0.6 0.374*ND + 0.335*DN + 0.029*DE + 0.018*QL + 0.016*PS + 0.012*NS + 0.012*DS + 0.011*SP
49 0.33 0.515*ED + 0.437*DE + 0.007*EQ + 0.007*EK + 0.005*EN + 0.004*QE + 0.003*EA + 0.002*DN
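
The same lookup extends from a single document to the whole corpus; a minimal sketch (variable names are illustrative) that streams every document through the model and tallies which topic dominates each one:

## tally the dominant topic of every document in the corpus (illustrative sketch)
from collections import Counter
dominantTopics = Counter()
for bow in mm:
    distn = lda[bow]
    if len(distn) > 0:
        best = max(distn, key=lambda tp: tp[1])[0]
        dominantTopics[best] += 1
print dominantTopics.most_common(5)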

In [ ]: