In [43]:
import os,time
import numpy as np
import gensim
from vertebratesLib import *
In [44]:
from IPython.display import Image
# Render the pre-built example tree figure as this cell's output.
simpleTreePath = './figures/simple-tree.png'
Image(filename=simpleTreePath)
Out[44]:
In [45]:
## make example plots
import matplotlib.pyplot as plt

## load position matrix
# Gather the per-split matrices and stack them once at the end.  This avoids
# testing an ndarray against None (an elementwise comparison, not an identity
# check) and avoids re-copying the growing matrix on every iteration.
splitMats = []
for split in SPLITS:
    outputFile = os.path.join("..","data","hv-compressed","%s-positions.npz"%(split))
    npz = np.load(outputFile)
    try:
        splitMats.append(npz['tree'].astype(int))
        # NOTE(review): the labels from the last split are the ones kept; this
        # presumes every split stores the same 'columns' array -- confirm.
        vocab = npz['columns']
    finally:
        # release the archive's file handle once the arrays are in memory
        npz.close()
# Raises ValueError if SPLITS is empty, since there is nothing to stack.
mat = np.vstack(splitMats)

## create the documents from the matrix
# One document per retained row of `mat`; its words are the column labels of
# the selected cells, each repeated as many times as the cell's count.
texts = []
usedColumns = set()
# Kept as a list appended in row order so that usedRows[k] is the row of `mat`
# that texts[k] was built from (a set would not guarantee that order).
usedRows = []
for r in range(mat.shape[0]):
    # NOTE(review): cells are selected with "> 1", not "> 0" -- confirm that
    # this threshold matches how the counts are encoded in the matrix.
    hitInds = np.where(mat[r,:] > 1)[0]
    ## ignore inplace transitions and documents with < 2 changes
    # A label whose first two characters are equal is treated as "in place".
    distinctChanges = set(hit for hit in vocab[hitInds].tolist() if hit[0] != hit[1])
    if len(distinctChanges) < 2:
        continue
    usedRows.append(r)
    text = []
    for hi in hitInds:
        word = vocab[hi]
        if word[0] == word[1]:
            continue
        usedColumns.add(hi)
        text.extend([word] * mat[r,hi])
    texts.append(text)

# Sorted so that actualVocab has a deterministic order between runs.
usedColumns = sorted(usedColumns)
actualVocab = vocab[usedColumns]

# Build the gensim dictionary and persist it.
dictionary = gensim.corpora.Dictionary(texts)
dictionary.save('/tmp/positions.dict')

## create a corpus from the documents
corpus = [dictionary.doc2bow(text) for text in texts]
# Serialize to Matrix Market format and reopen it from disk; later cells use
# the on-disk corpus `mm`.
gensim.corpora.MmCorpus.serialize('/tmp/positions.mm', corpus)
mm = gensim.corpora.MmCorpus('/tmp/positions.mm')
print(mm)

# Naming caveat: axis=0 averages over rows, so `rowMeans` holds one value per
# COLUMN of mat (plotted as "Transitions"); axis=1 averages over columns, so
# `colMeans` holds one value per ROW (plotted as "positions").  The names are
# kept unchanged in case other cells read them.
rowMeans = mat.mean(axis=0)
colMeans = mat.mean(axis=1)

fig = plt.figure()
ax1 = fig.add_subplot(211)
ax1.hist(rowMeans,bins=50)
ax1.set_title("Transitions")
ax2 = fig.add_subplot(212)
ax2.hist(colMeans,bins=50)
ax2.set_title("positions")
plt.savefig(os.path.join(".","figures","positions-histograms.png"),dpi=300)
In [46]:
# Display the histogram figure written to disk by the plotting cell.
histogramPath = os.path.join(".", "figures", "positions-histograms.png")
Image(filename=histogramPath)
Out[46]:
Run the model
In [47]:
# Number of LDA topics.  Defined once so the model and the reporting loop
# below cannot drift apart (the value used to be repeated as a literal).
NUM_TOPICS = 50

# Fit the topic model on the serialized corpus and time the whole cell.
timeStart = time.time()
lda = gensim.models.LdaMulticore(corpus=mm, num_topics=NUM_TOPICS, id2word=dictionary,chunksize=1000, passes=10)

# Print the 8 highest-weighted words of every topic.
for t in range(NUM_TOPICS):
    print("topic-%s: %s"%(t,lda.print_topic(t,topn=8)))

# The value printed after "end:" is the elapsed wall time formatted as
# HH:MM:SS, not a clock time; it wraps around for runs longer than 24 hours.
print("end: %s"%time.strftime('%H:%M:%S',time.gmtime(time.time()-timeStart)))
In [49]:
# Inspect a single document: list its words, then show which topics the
# fitted model assigns to it.
i = 100
# usedRows[i] is intended to be the row of `mat` behind document i; it is only
# needed by the optional debugging prints below.
actualPosition = usedRows[i]
text = texts[i]
textVector = mm[i]

# The in-memory documents and the serialized corpus should have equal length.
for size in (len(texts), len(mm)):
    print(size)

# Optional debugging output (disabled):
#   print("row: %s"%actualPosition)
#   print("document: %s"%(text))
#   print("vectorized document: %s" % (textVector,))

# One line per distinct word id in the bag-of-words vector.
for wordId, wordCount in textVector:
    print("... %s" % (dictionary[wordId],))

# Topic distribution of the document as (topic id, weight) pairs.
probDistn = lda[textVector]
print("prob:  %s" % (probDistn,))
print("topics...")
for topicId, weight in probDistn:
    # topic id, weight rounded to 2 decimals, then the topic's top 8 words
    print("%s %s %s" % (topicId, round(weight, 2), lda.print_topic(topicId, topn=8)))
In [ ]: