In [5]:
import os,time
import gensim
import numpy as np
from vertebratesLib import *
## load the saved arrays: the matrix itself plus its row and column labels
outputFile = os.path.join("..", "data", "hv-compressed", "branches.npz")
npz = np.load(outputFile)
mat, rows, columns = (npz[key] for key in ('matrix', 'rows', 'columns'))
In [2]:
from IPython.display import Image
## show the example tree figure inline
treeFigure = './figures/simple-tree.png'
Image(filename=treeFigure)
Out[2]:
In [3]:
## make example plots
import matplotlib.pyplot as plt

## NOTE: mean(axis=0) averages down the rows and yields one value per column,
## so `rowMeans` has one entry per column (reported below as transitions) and
## `colMeans` has one entry per row (reported below as branches)
rowMeans = mat.mean(axis=0)
colMeans = mat.mean(axis=1)
print("total transitions: %s"%rowMeans.size)
print("total branches: %s"%colMeans.size)

## two stacked panels, one histogram of means in each
fig = plt.figure()
ax1, ax2 = fig.add_subplot(211), fig.add_subplot(212)
for panel, means, title in ((ax1, rowMeans, "Transitions"), (ax2, colMeans, "Branches")):
    panel.hist(means, bins=50)
    panel.set_title(title)

histogramFile = os.path.join(".", "figures", "branches-histograms.png")
plt.savefig(histogramFile, dpi=300)
In [4]:
## show the saved histogram figure inline
histogramFigure = os.path.join(".", "figures", "branches-histograms.png")
Image(filename=histogramFigure)
Out[4]:
In [22]:
## create the documents from the matrix
## every matrix row becomes one document; every column label in `vocab` is a word
vocab = npz['columns']
texts = []
usedColumns = set()
## a list (not a set) so that usedRows[i] is always the row behind texts[i];
## converting a set to a list does not guarantee insertion order
usedRows = []
for r in range(mat.shape[0]):
    ## candidate words: columns whose count in this row is greater than 1
    hitInds = np.where(mat[r,:] > 1)[0]

    ## ignore inplace transitions (labels whose first two elements are equal)
    keptInds = [hi for hi in hitInds if vocab[hi][0] != vocab[hi][1]]

    ## ignore documents with < 2 distinct changes
    if len(set(vocab[hi] for hi in keptInds)) < 2:
        continue
    usedRows.append(r)

    ## repeat each word once per count; int() keeps list repetition valid
    ## even if the matrix is stored with a floating point dtype
    text = []
    for hi in keptInds:
        usedColumns.add(hi)
        text.extend([vocab[hi]] * int(mat[r,hi]))
    texts.append(text)

## sorted so the column order is deterministic from run to run
usedColumns = sorted(usedColumns)
## labels of the rows that produced a document, in the same order as `texts`
actualRows = rows[usedRows]
## map every distinct word in the documents to an integer id
## NOTE(review): 'branchs.dict' looks like a typo for 'branches.dict'; the path
## is left unchanged because other code may read this exact file -- confirm
dictionary = gensim.corpora.Dictionary(texts)
dictionary.save('/tmp/branchs.dict')

## create a corpus from the documents (one bag-of-words vector per document)
corpus = [dictionary.doc2bow(text) for text in texts]

## write the corpus to disk in Matrix Market format and read it back
gensim.corpora.MmCorpus.serialize('/tmp/branches.mm', corpus)
mm = gensim.corpora.MmCorpus('/tmp/branches.mm')
print(mm)
Run the model
In [15]:
## fit the LDA topic model, list the top words of each topic, report the runtime
timeStart = time.time()
numTopics = 25
lda = gensim.models.LdaMulticore(
    corpus=mm,
    num_topics=numTopics,
    id2word=dictionary,
    chunksize=1,
    iterations=50,
    passes=50
)

for t in range(numTopics):
    topicSummary = lda.print_topic(t, topn=8)
    print("topic-%s: %s"%(t, topicSummary))

## the elapsed seconds are formatted as a clock time (hours:minutes:seconds)
elapsedSeconds = time.time() - timeStart
print("end: %s"%time.strftime('%H:%M:%S', time.gmtime(elapsedSeconds)))
In [28]:
## index every document by its topic distribution, then save and reload the index
index = gensim.similarities.MatrixSimilarity(lda[mm],num_features=numTopics)
index.save('/tmp/branches.index')
index = gensim.similarities.MatrixSimilarity.load('/tmp/branches.index')

## sanity check: label of the first document
print(actualRows[0])

## query the index with each document in turn; row r of distMat holds the
## scores of document r against every indexed document.
## NOTE: MatrixSimilarity returns similarities, so despite its name distMat
## holds similarity values rather than distances
simRows = []
for r,rowId in enumerate(actualRows):
    textVector = mm[r]
    probDistn = lda[textVector]
    sims = index[probDistn]
    simRows.append(sims)

## stack once at the end instead of growing the array inside the loop;
## the result is always 2-D, even when there is only one document
distMat = np.vstack(simRows)
print(distMat.shape)
In [33]:
%matplotlib inline
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(11,11),dpi=300)
cmap = plt.cm.PuBuGn
ax = fig.add_subplot(111)
hmap = ax.imshow(distMat,interpolation='nearest',aspect='auto',cmap=cmap)
#_ = ax.set_title("class=%s"%c)
_ = ax.set_xticks(range(actualRows.size))
_ = ax.set_yticks(range(actualRows.size))
_ = ax.set_xticklabels(actualRows,fontsize=5)
_ = ax.set_yticklabels(actualRows,fontsize=5)
xlabs = ax.get_xticklabels()
plt.setp(xlabs, rotation=90)
#ax.set_aspect(1./ax.get_data_ratio())
cbar = fig.colorbar(hmap, orientation='vertical')
In [39]:
from htsint.tools import Heatmap

## draw the same matrix with the htsint Heatmap helper; rows and columns
## share the same labels
rowLabels = colLabels = actualRows
hm = Heatmap(distMat, rowLabels, colLabels, fontSize=3)
hm.draw_heatmap(cmap='uy', rlabels=True, clabels=True, rowFont=3)
hm.save("./figures/heatmap.png", dpi=900)
In [ ]: