In [ ]:
# SEMANTIGRAM
# a visualization of topic change over time in transcript data

# @author cj carr
# http://cortexel.us
# http://github.com/cortexelus/semantigram

# todo

In [1]:
from xml.etree import cElementTree as ET 
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3
import HTMLParser 
h = HTMLParser.HTMLParser()

transcript = 'transcripts/physics-of-information2.xml'

In [2]:
tree = ET.parse(transcript)
root = tree.getroot()
root


Out[2]:
<Element 'transcript' at 0x7fd2471bbd20>

In [4]:
# Print examples from transcript
i = 0
for child in root:
    print child.attrib
    print child.text
    if i > 10: break
    i += 1


{'start': '0.68', 'dur': '0.98'}
you
{'start': '1.66', 'dur': '5.35'}
year on Rome unused condition and still
reading science
{'start': '7.01', 'dur': '4.27'}
a special panel discussion among good
example information
{'start': '11.28', 'dur': '5.46'}
brought to you by Canada&#39;s Perimeter
Institute for Theoretical Physics
{'start': '16.74', 'dur': '5.57'}
than independent nonprofit scientific
research and education outreach
{'start': '22.31', 'dur': '4.609'}
organization home to a growing cluster
up international scientists
{'start': '26.919', 'dur': '5.02'}
contemplating calculate new ideas about
the very essence in space
{'start': '31.939', 'dur': '6.531'}
time matter and information 300
Institute also offers a wide array of
{'start': '38.47', 'dur': '2.13'}
educational outreach activities
{'start': '40.6', 'dur': '3.35'}
for students teachers and members of the
general public
{'start': '43.95', 'dur': '3.11'}
in order to share the joys and
scientific research
{'start': '47.06', 'dur': '5.73'}
discovery and innovations

In [5]:
def formatText(text):
    text = text.replace("\n"," ")
    text = h.unescape(text)
    return text

In [6]:
def chunkByTexts(root, chunkSize):
    chunks = [] 
    i = 0
    for i in xrange(len(root) - chunkSize ):
        #chunk = " ".join(map(lambda x:x.text.replace("\n"," "),root[i:i+chunkSizeInTexts]))
        texts = []
        duration = 0
        for j in xrange(chunkSize):
            texts.append(formatText(root[i+j].text))
            duration += float(root[i+j].attrib["dur"])
        text = " ".join(texts)
        chunk = {"start": float(root[i].attrib["start"]), "duration": duration, "text": text}
        chunks.append(chunk)
    return chunks

In [10]:
def chunkByTime(root, minimumTime):
    chunks = [] 
    i = 0
    for i in xrange(len(root)):
        #chunk = " ".join(map(lambda x:x.text.replace("\n"," "),root[i:i+chunkSizeInTexts]))
        texts = []
        duration = 0
        for j in xrange(i,len(root)):
            texts.append(formatText(root[j].text))
            duration += float(root[j].attrib["dur"])
            if(duration>=minimumTime): break
        text = " ".join(texts)
        chunk = {"start": float(root[i].attrib["start"]), "duration": duration, "text": text}
        chunks.append(chunk)
    return chunks

In [11]:
# load nltk's English stopwords as variable called 'stopwords'
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend([u"'s",u"n't",u"'m",u"'d"])
print stopwords


[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u'not', u'only', u'own', u'same', u'so', u'than', u'too', u'very', u's', u't', u'can', u'will', u'just', u'don', u'should', u'now', u"'s", u"n't", u"'m", u"'d"]

In [12]:
# load nltk's SnowballStemmer as variabled 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

# here I define a tokenizer and stemmer which returns the set of stems in the text that it is passed
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

In [13]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora, models, similarities 

import time as time
import numpy as np
import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d.axes3d as p3
from sklearn.cluster import AgglomerativeClustering

In [47]:
"""
#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.9, max_features=200000,
                                 min_df=0.0, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(documents) #fit the vectorizer to synopses

print(tfidf_matrix.shape)
terms = tfidf_vectorizer.get_feature_names()
print terms """


Out[47]:
"\n#define vectorizer parameters\ntfidf_vectorizer = TfidfVectorizer(max_df=0.9, max_features=200000,\n                                 min_df=0.0, stop_words='english',\n                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))\n\n%time tfidf_matrix = tfidf_vectorizer.fit_transform(documents) #fit the vectorizer to synopses\n\nprint(tfidf_matrix.shape)\nterms = tfidf_vectorizer.get_feature_names()\nprint terms "

In [17]:
import ImageDraw
from PIL import Image 
from IPython.display import Image as ImageDisplay

def drawTopicogram(corpus, ntopics, topicOrder, lda, filename):
    W = len(corpus)
    H = 100 #topics
    blockHeight = H/ntopics

    img = Image.new("RGB", (W, H), "black")
    draw = ImageDraw.Draw(img)
    
    for x in xrange(W):
        doc_lda = lda[corpus[x]]
        for t in doc_lda:
            topicNum = topicOrder.index(t[0])
            hue = int(360 * (topicNum*blockHeight/float(H)))
            saturation = 100
            lightness = int(pow(t[1],1) * 100)
            color = "hsl("+str(hue)+","+str(saturation)+"%,"+str(lightness)+"%)" 

            for y in xrange(topicNum *blockHeight,(topicNum+1) * blockHeight):
                draw.point((x,y), fill=color)


    img.save(filename, "PNG")

In [27]:
def buildTopicogram(root, time_chunking, size, ntopics, passes, filename):
    if(time_chunking):
        chunks = chunkByTime(root, size) # size: each chunk is at least size seconds long
    else: 
        chunks = chunkByTexts(root, size) # size: each chunk is the concatenation of this # of texts 
    documents = []
    for chunk in chunks:
        #chunk["tokenized"] = tokenize_and_stem(chunk["text"])
        documents.append(chunk["text"])
    #print len(documents)

    #tokenize
    tokenized_text = [tokenize_and_stem(text) for text in documents]

    #remove stop words
    texts = [[word for word in text if word not in stopwords] for text in tokenized_text] 
    
    #create a Gensim dictionary from the texts
    dictionary = corpora.Dictionary(texts)

    #remove extremes (similar to the min/max df step used when creating the tf-idf matrix)
    #dictionary.filter_extremes(no_below=0, no_above=1)

    #convert the dictionary to a bag of words corpus for reference
    corpus = [dictionary.doc2bow(text) for text in texts]

    #print len(dictionary)
    #print len(corpus)
    
    lda = models.LdaModel(corpus, num_topics=ntopics, id2word=dictionary, passes=passes, chunksize=100)
    print lda
    print lda.show_topics(ntopics) 
    #print corpus
        
    # sort topics by initial max
    # find maxs
    #topics = [[] for k in range(len(corpus))]
    #print "topics", topics
    topicMaxs = [(0,0,0)]*ntopics
    for x in xrange(len(corpus)):
        for y in lda[corpus[x]]:
            t = y[0]
            v = y[1]
            #print ":", x, y, t, v
            if topicMaxs[t][1] < v:
                # new max
                topicMaxs[t] = (t,v,x)
            #print ">", topicMaxs
    # sort by maxs
    topicOrder = sorted(topicMaxs, key = lambda s: s[2])
    topicOrder = map(lambda s: s[0], topicOrder)
    # draw it
    drawTopicogram(corpus, ntopics, topicOrder, lda, filename)




In [36]:
filename = "topicogramtest.png"
buildTopicogram(root, True, 60, 5, 10, filename)
ImageDisplay(filename=filename)


LdaModel(num_terms=1726, num_topics=5, decay=0.5, chunksize=100)
[(0, u'0.083*inform + 0.020*veri + 0.018*area + 0.017*talk + 0.015*reproduc + 0.014*interest + 0.014*way + 0.013*right + 0.012*hole + 0.012*black'), (1, u'0.065*comput + 0.051*univers + 0.019*way + 0.018*question + 0.016*go + 0.016*program + 0.016*physic + 0.013*electron + 0.013*good + 0.013*like'), (2, u'0.046*inform + 0.025*know + 0.023*think + 0.021*thing + 0.019*one + 0.019*actual + 0.017*like + 0.016*chang + 0.016*well + 0.015*system'), (3, u'0.023*one + 0.022*would + 0.022*think + 0.020*paper + 0.017*someth + 0.017*quantum + 0.016*differ + 0.016*might + 0.015*mean + 0.014*veri'), (4, u'0.025*question + 0.019*data + 0.016*thank + 0.015*go + 0.014*re + 0.014*panel + 0.012*ca + 0.011*perhap + 0.011*problem + 0.010*onli')]
Out[36]:

In [37]:
filename = "topicogramtes60-5.png"
buildTopicogram(root, True, 60, 5, 10, filename)
ImageDisplay(filename=filename)


LdaModel(num_terms=1726, num_topics=5, decay=0.5, chunksize=100)
[(0, u'0.033*paper + 0.022*quantum + 0.021*differ + 0.021*one + 0.020*might + 0.017*veri + 0.016*come + 0.014*someth + 0.013*mechan + 0.013*look'), (1, u'0.037*comput + 0.020*univers + 0.018*thing + 0.017*think + 0.016*say + 0.016*way + 0.016*inform + 0.014*know + 0.013*go + 0.013*piec'), (2, u'0.059*inform + 0.021*bit + 0.019*get + 0.017*teleport + 0.017*quantum + 0.017*veri + 0.016*actual + 0.014*one + 0.014*area + 0.014*measur'), (3, u'0.034*question + 0.024*univers + 0.018*like + 0.016*someth + 0.015*futur + 0.015*physic + 0.015*ca + 0.014*biolog + 0.013*program + 0.013*make'), (4, u'0.054*inform + 0.031*know + 0.025*think + 0.022*would + 0.021*get + 0.019*one + 0.018*mean + 0.018*arm + 0.015*certain + 0.012*go')]
Out[37]:

In [45]:
filename = "topicogramtest5-8.png"
buildTopicogram(root, True, 5, 8, 5, filename)
ImageDisplay(filename=filename)


LdaModel(num_terms=1726, num_topics=8, decay=0.5, chunksize=100)
[(0, u'0.036*time + 0.032*system + 0.026*arm + 0.025*ani + 0.024*question + 0.023*chang + 0.022*fact + 0.018*term + 0.018*certain + 0.017*give'), (1, u'0.082*comput + 0.054*univers + 0.049*go + 0.037*quantum + 0.034*thing + 0.029*think + 0.027*take + 0.026*said + 0.022*law + 0.021*say'), (2, u'0.041*happen + 0.039*mechan + 0.033*program + 0.031*quantum + 0.023*right + 0.021*area + 0.021*number + 0.021*predict + 0.020*becaus + 0.018*leonard'), (3, u'0.046*way + 0.037*paper + 0.029*answer + 0.027*question + 0.021*idea + 0.020*futur + 0.018*veri + 0.017*even + 0.017*differ + 0.016*word'), (4, u'0.048*mean + 0.048*know + 0.036*get + 0.032*good + 0.030*doe + 0.025*like + 0.021*inform + 0.018*thing + 0.018*anoth + 0.017*process'), (5, u'0.092*one + 0.063*someth + 0.058*veri + 0.034*inform + 0.027*much + 0.025*electron + 0.022*place + 0.021*peopl + 0.019*measur + 0.019*particular'), (6, u'0.072*inform + 0.053*actual + 0.041*could + 0.030*piec + 0.026*know + 0.024*make + 0.022*call + 0.022*realli + 0.019*ca + 0.019*get'), (7, u'0.042*think + 0.042*talk + 0.036*would + 0.030*well + 0.030*like + 0.027*inform + 0.026*might + 0.024*also + 0.021*mayb + 0.019*univers')]
Out[45]:

In [43]:
filename = "topicogramtest15-8.png"
buildTopicogram(root, True, 15, 8, 5, filename)
ImageDisplay(filename=filename)


LdaModel(num_terms=1726, num_topics=8, decay=0.5, chunksize=100)
[(0, u'0.036*think + 0.023*mayb + 0.023*data + 0.022*ask + 0.021*onli + 0.020*particular + 0.018*reproduc + 0.017*abl + 0.017*game + 0.015*person'), (1, u'0.059*quantum + 0.046*question + 0.041*mechan + 0.039*right + 0.028*said + 0.021*thank + 0.021*would + 0.020*veri + 0.019*someth + 0.019*like'), (2, u'0.034*paper + 0.032*come + 0.032*differ + 0.031*veri + 0.025*talk + 0.024*way + 0.022*much + 0.020*state + 0.020*inform + 0.019*someth'), (3, u'0.085*one + 0.040*bit + 0.036*word + 0.034*well + 0.032*number + 0.021*ll + 0.020*room + 0.020*hold + 0.017*say + 0.016*rather'), (4, u'0.044*good + 0.041*make + 0.032*mean + 0.028*know + 0.028*answer + 0.027*doe + 0.027*might + 0.024*electron + 0.021*inform + 0.020*ca'), (5, u'0.042*chang + 0.040*one + 0.034*inform + 0.026*arm + 0.023*anoth + 0.023*law + 0.021*biolog + 0.019*physic + 0.018*work + 0.018*would'), (6, u'0.081*univers + 0.071*comput + 0.026*program + 0.025*also + 0.025*think + 0.021*whole + 0.017*inform + 0.016*sinc + 0.016*happen + 0.016*physic'), (7, u'0.064*inform + 0.039*go + 0.036*thing + 0.034*like + 0.034*know + 0.025*actual + 0.024*could + 0.023*say + 0.023*get + 0.022*look')]
Out[43]:

In [35]:
filename = "topicogramtest.png"
buildTopicogram(root, True, 15, 10, 5, filename)
ImageDisplay(filename=filename)


LdaModel(num_terms=1726, num_topics=10, decay=0.5, chunksize=100)
[(0, u'0.075*comput + 0.053*univers + 0.035*think + 0.030*doe + 0.028*good + 0.023*look + 0.022*know + 0.020*said + 0.020*electron + 0.019*call'), (1, u'0.071*inform + 0.031*like + 0.028*know + 0.025*chang + 0.025*actual + 0.024*go + 0.022*thing + 0.022*get + 0.021*system + 0.018*ca'), (2, u'0.042*way + 0.040*bit + 0.038*go + 0.034*could + 0.029*fact + 0.029*talk + 0.020*time + 0.019*littl + 0.019*think + 0.016*state'), (3, u'0.076*question + 0.042*right + 0.026*good + 0.024*data + 0.023*mean + 0.022*lot + 0.022*answer + 0.020*inform + 0.019*reproduc + 0.018*panel'), (4, u'0.080*inform + 0.046*veri + 0.034*much + 0.033*differ + 0.030*think + 0.030*would + 0.030*physic + 0.022*complet + 0.021*paper + 0.019*rule'), (5, u'0.057*happen + 0.034*mean + 0.034*program + 0.034*futur + 0.030*predict + 0.024*mathemat + 0.022*molecul + 0.022*prove + 0.019*come + 0.019*state'), (6, u'0.057*make + 0.043*might + 0.032*paper + 0.032*piec + 0.031*someth + 0.030*work + 0.023*thank + 0.022*done + 0.020*got + 0.019*let'), (7, u'0.097*one + 0.051*someth + 0.035*get + 0.030*thing + 0.027*well + 0.025*say + 0.024*word + 0.023*whi + 0.023*would + 0.021*measur'), (8, u'0.053*like + 0.040*long + 0.031*audienc + 0.031*veri + 0.031*room + 0.029*surfac + 0.021*deal + 0.020*wo + 0.019*hologram + 0.018*clear'), (9, u'0.069*quantum + 0.041*univers + 0.038*mechan + 0.035*whole + 0.032*area + 0.025*number + 0.022*hole + 0.022*black + 0.020*say + 0.018*peopl')]
Out[35]:

In [34]:
filename = "topicogramtest.png"
buildTopicogram(root, True, 30, 10, 5, filename)
ImageDisplay(filename=filename)


LdaModel(num_terms=1726, num_topics=10, decay=0.5, chunksize=100)
[(0, u'0.043*know + 0.037*chang + 0.033*would + 0.033*system + 0.025*everyth + 0.024*becaus + 0.023*arm + 0.022*inform + 0.021*anoth + 0.019*certain'), (1, u'0.046*might + 0.035*number + 0.029*game + 0.025*rule + 0.019*evid + 0.019*hold + 0.019*room + 0.018*maximum + 0.016*specul + 0.016*possibl'), (2, u'0.087*inform + 0.035*get + 0.032*teleport + 0.023*mayb + 0.022*correl + 0.020*area + 0.020*said + 0.018*hole + 0.018*black + 0.015*super'), (3, u'0.086*inform + 0.048*go + 0.037*way + 0.035*quantum + 0.033*someth + 0.033*actual + 0.030*well + 0.026*thing + 0.025*get + 0.025*mechan'), (4, u'0.078*paper + 0.043*futur + 0.038*differ + 0.033*veri + 0.020*look + 0.018*kind + 0.017*holograph + 0.016*lot + 0.016*person + 0.013*sheep'), (5, u'0.100*comput + 0.072*univers + 0.046*think + 0.024*make + 0.023*law + 0.020*right + 0.020*time + 0.018*whole + 0.017*ani + 0.016*physic'), (6, u'0.036*like + 0.034*someth + 0.033*look + 0.032*one + 0.027*let + 0.024*abl + 0.024*said + 0.022*leonard + 0.021*observ + 0.021*know'), (7, u'0.045*electron + 0.041*ca + 0.036*around + 0.029*whi + 0.029*level + 0.028*like + 0.021*actual + 0.019*place + 0.018*random + 0.016*mysteri'), (8, u'0.059*question + 0.032*program + 0.025*answer + 0.023*time + 0.020*biolog + 0.020*word + 0.019*sinc + 0.017*univers + 0.016*physic + 0.015*like'), (9, u'0.053*one + 0.030*much + 0.029*good + 0.028*mean + 0.028*would + 0.028*come + 0.027*talk + 0.024*quantum + 0.024*principl + 0.021*state')]
Out[34]:

In [38]:
filename = "topicogramtest30-10.png"
buildTopicogram(root, True, 30, 10, 5, filename)
ImageDisplay(filename=filename)


LdaModel(num_terms=1726, num_topics=10, decay=0.5, chunksize=100)
[(0, u'0.046*electron + 0.037*sinc + 0.036*thank + 0.033*get + 0.023*idea + 0.023*zero + 0.021*energi + 0.019*know + 0.016*nom + 0.015*matter'), (1, u'0.067*one + 0.060*go + 0.039*know + 0.038*way + 0.037*like + 0.035*look + 0.034*happen + 0.025*inform + 0.022*call + 0.021*re'), (2, u'0.105*inform + 0.045*actual + 0.040*thing + 0.037*piec + 0.032*take + 0.028*place + 0.025*correl + 0.025*bit + 0.024*measur + 0.024*say'), (3, u'0.087*comput + 0.055*univers + 0.037*like + 0.034*get + 0.021*teleport + 0.020*ca + 0.020*know + 0.018*whi + 0.017*one + 0.015*possibl'), (4, u'0.039*think + 0.031*paper + 0.029*differ + 0.029*come + 0.029*talk + 0.027*inform + 0.027*someth + 0.022*good + 0.021*quantum + 0.021*much'), (5, u'0.041*veri + 0.041*think + 0.038*would + 0.033*competit + 0.020*observ + 0.019*reason + 0.018*cram + 0.018*year + 0.017*spin + 0.017*one'), (6, u'0.040*make + 0.039*chang + 0.034*system + 0.031*would + 0.027*law + 0.027*well + 0.025*mean + 0.022*anoth + 0.021*quantum + 0.021*becaus'), (7, u'0.044*program + 0.039*time + 0.039*answer + 0.030*biolog + 0.028*good + 0.025*perhap + 0.024*univers + 0.024*physic + 0.024*game + 0.022*doe'), (8, u'0.052*might + 0.037*level + 0.028*ca + 0.026*probabl + 0.026*found + 0.026*one + 0.024*surfac + 0.021*specul + 0.016*thought + 0.015*like'), (9, u'0.074*inform + 0.048*question + 0.025*data + 0.023*area + 0.021*find + 0.019*reproduc + 0.015*hole + 0.015*black + 0.014*univers + 0.014*almost')]
Out[38]:

In [ ]:
filename = "topicogramtest60-8.png"
%time buildTopicogram(root, True, 60, 8, 5, filename)
ImageDisplay(filename=filename)

In [40]:
filename = "topicogramtest60-10.png"
buildTopicogram(root, True, 60, 10, 5, filename) 
ImageDisplay(filename=filename)


LdaModel(num_terms=1726, num_topics=10, decay=0.5, chunksize=100)
[(0, u'0.045*think + 0.040*quantum + 0.039*one + 0.034*say + 0.032*thing + 0.028*would + 0.023*veri + 0.022*someth + 0.021*mechan + 0.019*state'), (1, u'0.035*inform + 0.030*chang + 0.027*like + 0.026*system + 0.023*well + 0.022*comput + 0.021*law + 0.021*time + 0.021*go + 0.020*know'), (2, u'0.073*paper + 0.062*futur + 0.027*experi + 0.027*wrong + 0.024*differ + 0.022*communiti + 0.021*home + 0.018*want + 0.018*good + 0.016*respons'), (3, u'0.099*inform + 0.031*know + 0.029*might + 0.027*mean + 0.023*talk + 0.020*lot + 0.019*veri + 0.018*think + 0.018*volum + 0.017*rule'), (4, u'0.042*make + 0.040*electron + 0.028*question + 0.027*ca + 0.024*one + 0.019*yessir + 0.018*energi + 0.018*audienc + 0.017*sinc + 0.016*panel'), (5, u'0.033*thank + 0.033*good + 0.028*level + 0.027*game + 0.024*word + 0.022*particular + 0.022*know + 0.020*observ + 0.020*written + 0.018*dead'), (6, u'0.045*one + 0.044*inform + 0.041*teleport + 0.029*actual + 0.026*measur + 0.026*well + 0.024*copi + 0.024*quantum + 0.023*thing + 0.022*get'), (7, u'0.030*someth + 0.023*look + 0.021*probabl + 0.020*zero + 0.020*get + 0.018*good + 0.018*said + 0.017*appli + 0.016*leonard + 0.015*exist'), (8, u'0.060*univers + 0.056*comput + 0.034*inform + 0.026*piec + 0.021*way + 0.018*physic + 0.018*program + 0.017*area + 0.013*reproduc + 0.012*interest'), (9, u'0.040*place + 0.040*data + 0.028*re + 0.028*long + 0.025*mayb + 0.024*go + 0.023*understand + 0.021*realli + 0.021*relat + 0.018*ask')]
Out[40]:

In [42]:
filename = "topicogramtest60-10.png"
buildTopicogram(root, True, 60, 10, 5, filename)
ImageDisplay(filename=filename)


LdaModel(num_terms=1726, num_topics=10, decay=0.5, chunksize=100)
[(0, u'0.059*actual + 0.039*one + 0.036*teleport + 0.035*place + 0.026*whi + 0.025*univers + 0.023*comput + 0.022*well + 0.022*inform + 0.021*ca'), (1, u'0.159*inform + 0.036*bit + 0.033*get + 0.028*know + 0.026*piec + 0.025*correl + 0.019*think + 0.018*volum + 0.015*way + 0.012*well'), (2, u'0.070*state + 0.045*one + 0.024*certain + 0.024*possibl + 0.022*dead + 0.021*unknown + 0.019*observ + 0.018*done + 0.018*exampl + 0.017*way'), (3, u'0.049*univers + 0.032*like + 0.025*happen + 0.020*comput + 0.018*veri + 0.017*even + 0.014*area + 0.014*also + 0.014*get + 0.013*onlin'), (4, u'0.041*question + 0.037*electron + 0.032*sinc + 0.030*ca + 0.030*thank + 0.023*panel + 0.020*probabl + 0.019*zero + 0.019*one + 0.017*energi'), (5, u'0.040*think + 0.035*would + 0.033*someth + 0.030*paper + 0.026*know + 0.026*veri + 0.025*good + 0.025*differ + 0.022*come + 0.022*time'), (6, u'0.039*inform + 0.037*reproduc + 0.026*hole + 0.025*black + 0.023*found + 0.023*machin + 0.022*room + 0.022*interest + 0.021*surfac + 0.021*area'), (7, u'0.054*quantum + 0.035*mechan + 0.035*complet + 0.028*principl + 0.027*rule + 0.023*relat + 0.023*much + 0.020*kind + 0.020*veri + 0.019*look'), (8, u'0.031*thing + 0.030*inform + 0.026*one + 0.026*chang + 0.025*go + 0.022*system + 0.022*like + 0.021*say + 0.019*know + 0.019*make'), (9, u'0.083*comput + 0.036*univers + 0.029*go + 0.028*way + 0.028*program + 0.027*question + 0.025*biolog + 0.021*good + 0.021*physic + 0.020*ask')]
Out[42]:

In [39]:
filename = "topicogramtest240-10.png"
buildTopicogram(root, True, 240, 10, 5, filename)
ImageDisplay(filename=filename)


LdaModel(num_terms=1726, num_topics=10, decay=0.5, chunksize=100)
[(0, u'0.099*inform + 0.024*make + 0.022*reproduc + 0.021*one + 0.020*actual + 0.019*well + 0.019*take + 0.017*leonard + 0.017*anoth + 0.017*could'), (1, u'0.036*would + 0.034*thing + 0.033*think + 0.031*know + 0.030*inform + 0.025*differ + 0.024*talk + 0.024*quantum + 0.023*chang + 0.022*come'), (2, u'0.031*one + 0.020*look + 0.018*function + 0.018*home + 0.017*deviat + 0.017*zero + 0.014*doe + 0.014*two + 0.013*veri + 0.012*particular'), (3, u'0.036*dead + 0.032*see + 0.028*quantum + 0.023*make + 0.020*look + 0.019*write + 0.019*upon + 0.018*stori + 0.018*coher + 0.018*creat'), (4, u'0.046*place + 0.044*correl + 0.029*veri + 0.026*teleport + 0.026*like + 0.021*sinc + 0.021*mechan + 0.020*quantum + 0.020*right + 0.019*big'), (5, u'0.028*electron + 0.026*data + 0.020*one + 0.018*certain + 0.017*mean + 0.016*know + 0.015*game + 0.013*veri + 0.013*answer + 0.013*experi'), (6, u'0.051*inform + 0.029*get + 0.021*question + 0.019*someth + 0.017*know + 0.016*notebook + 0.016*energi + 0.014*ask + 0.014*veri + 0.013*panel'), (7, u'0.059*inform + 0.049*paper + 0.036*might + 0.034*ca + 0.027*much + 0.022*someth + 0.022*piec + 0.021*measur + 0.021*thank + 0.018*futur'), (8, u'0.047*think + 0.031*biolog + 0.027*univers + 0.025*comput + 0.025*way + 0.020*program + 0.020*black + 0.019*go + 0.019*physic + 0.019*abl'), (9, u'0.065*comput + 0.049*univers + 0.021*good + 0.018*law + 0.018*question + 0.017*whole + 0.015*go + 0.014*whi + 0.014*happen + 0.012*actual')]
Out[39]:

In [41]:
filename = "topicogramtest240-10.png"
buildTopicogram(root, True, 240, 10, 5, filename)
ImageDisplay(filename=filename)


LdaModel(num_terms=1726, num_topics=10, decay=0.5, chunksize=100)
[(0, u'0.069*comput + 0.052*univers + 0.020*whole + 0.019*good + 0.019*law + 0.015*question + 0.014*say + 0.013*program + 0.013*whi + 0.013*actual'), (1, u'0.051*paper + 0.038*might + 0.037*inform + 0.036*ca + 0.028*much + 0.027*someth + 0.023*measur + 0.022*mean + 0.022*thank + 0.020*actual'), (2, u'0.077*inform + 0.028*one + 0.022*actual + 0.021*make + 0.021*copi + 0.019*reproduc + 0.019*could + 0.019*take + 0.019*well + 0.019*teleport'), (3, u'0.061*electron + 0.041*one + 0.034*quantum + 0.032*write + 0.030*relat + 0.030*dead + 0.029*make + 0.028*alway + 0.022*ok + 0.019*mean'), (4, u'0.031*zero + 0.025*get + 0.025*two + 0.022*one + 0.021*number + 0.020*function + 0.018*doe + 0.017*experi + 0.015*went + 0.014*call'), (5, u'0.068*inform + 0.031*piec + 0.021*data + 0.017*question + 0.016*go + 0.014*said + 0.013*veri + 0.013*game + 0.013*perhap + 0.013*somebodi'), (6, u'0.044*correl + 0.040*place + 0.026*veri + 0.022*like + 0.021*teleport + 0.020*sinc + 0.018*big + 0.018*requir + 0.018*mechan + 0.018*appli'), (7, u'0.041*inform + 0.024*get + 0.018*someth + 0.017*question + 0.016*problem + 0.016*physic + 0.015*panel + 0.014*veri + 0.014*audienc + 0.013*bigger'), (8, u'0.039*think + 0.036*would + 0.033*know + 0.028*thing + 0.026*inform + 0.025*differ + 0.024*talk + 0.023*chang + 0.022*come + 0.021*system'), (9, u'0.027*comput + 0.023*look + 0.018*quantum + 0.017*see + 0.017*biolog + 0.017*make + 0.014*program + 0.013*re + 0.013*abl + 0.013*go')]
Out[41]:

In [48]:
filename = "topicogramtest15-15.png"
buildTopicogram(root, True, 15, 15, 5, filename)
ImageDisplay(filename=filename)


LdaModel(num_terms=1726, num_topics=15, decay=0.5, chunksize=100)
[(0, u'0.082*time + 0.069*univers + 0.066*right + 0.034*come + 0.032*whole + 0.030*idea + 0.027*perhap + 0.027*yessir + 0.026*almost + 0.017*record'), (1, u'0.088*one + 0.067*differ + 0.049*talk + 0.044*veri + 0.042*state + 0.034*much + 0.033*principl + 0.033*inform + 0.033*predict + 0.030*complet'), (2, u'0.080*paper + 0.042*futur + 0.032*made + 0.032*done + 0.027*person + 0.025*mathemat + 0.023*observ + 0.022*veri + 0.022*relat + 0.022*come'), (3, u'0.154*like + 0.095*someth + 0.051*peopl + 0.040*univers + 0.036*quantum + 0.029*veri + 0.022*random + 0.021*know + 0.021*far + 0.020*mysteri'), (4, u'0.081*go + 0.069*way + 0.049*look + 0.048*well + 0.043*piec + 0.041*know + 0.035*re + 0.035*exampl + 0.029*someth + 0.029*data'), (5, u'0.125*quantum + 0.075*said + 0.073*mechan + 0.055*let + 0.035*call + 0.030*leonard + 0.029*good + 0.027*probabl + 0.023*exist + 0.023*thing'), (6, u'0.066*doe + 0.063*think + 0.051*comput + 0.049*good + 0.048*know + 0.040*might + 0.034*particular + 0.034*even + 0.030*see + 0.026*realli'), (7, u'0.092*happen + 0.062*electron + 0.047*sinc + 0.041*point + 0.040*program + 0.023*histori + 0.023*spin + 0.022*nom + 0.020*unknown + 0.020*possibl'), (8, u'0.128*question + 0.092*make + 0.054*answer + 0.042*measur + 0.040*ca + 0.040*word + 0.036*reproduc + 0.034*interest + 0.023*audienc + 0.021*machin'), (9, u'0.076*inform + 0.043*arm + 0.043*know + 0.034*take + 0.032*mayb + 0.030*ask + 0.028*back + 0.025*would + 0.025*think + 0.024*bit'), (10, u'0.081*one + 0.076*chang + 0.070*say + 0.043*certain + 0.038*biolog + 0.030*system + 0.029*big + 0.028*law + 0.027*ok + 0.027*experi'), (11, u'0.066*inform + 0.065*univers + 0.047*also + 0.034*physic + 0.032*area + 0.022*hole + 0.022*black + 0.022*understand + 0.021*work + 0.021*talk'), (12, u'0.091*comput + 0.051*thing + 0.050*actual + 0.047*think + 0.041*could + 0.029*ani + 0.027*place + 0.025*teleport + 0.021*na + 0.021*anoth'), (13, u'0.047*whi + 0.040*thank + 0.037*game + 0.036*long + 0.034*competit + 0.030*around + 0.028*zero + 0.025*room + 0.023*surfac + 0.023*found'), (14, u'0.153*inform + 0.084*get + 0.063*mean + 0.044*correl + 0.042*would + 0.031*rule + 0.031*well + 0.023*tri + 0.021*seem + 0.020*mind')]
Out[48]:

In [50]:
filename = "topicogramtest30-15.png"
buildTopicogram(root, True, 30, 15, 5, filename)
ImageDisplay(filename=filename)


LdaModel(num_terms=1726, num_topics=15, decay=0.5, chunksize=100)
[(0, u'0.062*talk + 0.049*correl + 0.043*quantum + 0.043*inform + 0.037*mechan + 0.034*right + 0.033*say + 0.032*relat + 0.031*term + 0.025*thing'), (1, u'0.065*think + 0.061*inform + 0.060*veri + 0.045*paper + 0.040*differ + 0.039*mean + 0.038*good + 0.034*much + 0.028*futur + 0.028*doe'), (2, u'0.073*question + 0.060*univers + 0.032*answer + 0.027*program + 0.027*whole + 0.023*perhap + 0.021*theorem + 0.021*biolog + 0.019*anybodi + 0.019*physic'), (3, u'0.062*inform + 0.054*comput + 0.048*actual + 0.046*could + 0.045*bit + 0.038*say + 0.038*everyth + 0.025*piec + 0.023*fact + 0.021*measur'), (4, u'0.139*get + 0.070*inform + 0.052*teleport + 0.033*learn + 0.030*hole + 0.030*black + 0.027*area + 0.026*world + 0.023*super + 0.020*cram'), (5, u'0.038*one + 0.034*probabl + 0.026*level + 0.024*exist + 0.019*involv + 0.018*respons + 0.018*ad + 0.017*would + 0.017*idea + 0.017*veri'), (6, u'0.078*way + 0.058*go + 0.040*look + 0.039*know + 0.033*data + 0.031*ask + 0.027*re + 0.024*someth + 0.024*inform + 0.022*take'), (7, u'0.057*one + 0.052*inform + 0.038*chang + 0.034*system + 0.026*arm + 0.026*thing + 0.024*certain + 0.022*time + 0.019*would + 0.019*ani'), (8, u'0.109*might + 0.073*univers + 0.040*realli + 0.029*expand + 0.028*like + 0.025*yes + 0.022*surfac + 0.022*kind + 0.021*level + 0.018*found'), (9, u'0.050*electron + 0.044*mayb + 0.041*take + 0.035*whi + 0.035*reproduc + 0.032*around + 0.027*actual + 0.025*zero + 0.021*machin + 0.021*random'), (10, u'0.044*said + 0.030*leonard + 0.029*panel + 0.025*appli + 0.023*guess + 0.021*entir + 0.019*histori + 0.019*doctor + 0.016*number + 0.016*onlin'), (11, u'0.068*state + 0.061*one + 0.058*come + 0.049*predict + 0.041*made + 0.038*rule + 0.037*talk + 0.037*quantum + 0.033*mathemat + 0.033*general'), (12, u'0.086*well + 0.075*make + 0.063*know + 0.048*go + 0.038*univers + 0.031*look + 0.027*game + 0.025*good + 0.024*abl + 0.021*observ'), (13, u'0.067*ca + 0.058*happen + 0.054*thank + 0.048*sinc + 0.047*tri + 0.035*competit + 0.029*word + 0.025*re + 0.023*spin + 0.022*configur'), (14, u'0.114*comput + 0.087*like + 0.073*someth + 0.053*quantum + 0.030*mechan + 0.029*call + 0.028*physic + 0.026*know + 0.025*veri + 0.024*would')]
Out[50]:

In [51]:
filename = "topicogramtest60-15.png"
buildTopicogram(root, True, 60, 15, 5, filename)
ImageDisplay(filename=filename)


LdaModel(num_terms=1726, num_topics=15, decay=0.5, chunksize=100)
[(0, u'0.056*actual + 0.052*place + 0.048*measur + 0.034*sinc + 0.025*ca + 0.025*like + 0.025*one + 0.023*thank + 0.022*tri + 0.022*word'), (1, u'0.058*think + 0.048*might + 0.044*would + 0.037*time + 0.036*futur + 0.036*veri + 0.033*biolog + 0.030*look + 0.027*big + 0.021*differ'), (2, u'0.047*paper + 0.039*much + 0.033*think + 0.031*talk + 0.031*would + 0.031*quantum + 0.030*thing + 0.029*differ + 0.026*someth + 0.026*yeah'), (3, u'0.149*inform + 0.041*get + 0.040*know + 0.037*someth + 0.031*time + 0.026*mean + 0.024*volum + 0.023*molecul + 0.023*talk + 0.022*come'), (4, u'0.047*game + 0.038*inform + 0.037*probabl + 0.036*leonard + 0.034*said + 0.032*doe + 0.027*nom + 0.027*physic + 0.027*good + 0.023*made'), (5, u'0.052*know + 0.051*chang + 0.038*say + 0.036*law + 0.032*piec + 0.028*take + 0.028*comput + 0.027*go + 0.027*could + 0.026*ani'), (6, u'0.073*question + 0.063*electron + 0.044*competit + 0.043*panel + 0.043*good + 0.042*mayb + 0.040*make + 0.028*general + 0.022*spin + 0.020*tri'), (7, u'0.085*way + 0.047*go + 0.045*data + 0.027*inform + 0.025*bit + 0.021*long + 0.021*re + 0.020*quantum + 0.020*predict + 0.018*principl'), (8, u'0.034*zero + 0.032*dead + 0.029*alway + 0.027*paper + 0.026*thank + 0.024*good + 0.021*word + 0.020*aliv + 0.020*a0 + 0.018*coher'), (9, u'0.030*notebook + 0.022*someth + 0.022*histori + 0.019*human + 0.019*function + 0.016*find + 0.016*keep + 0.016*ignor + 0.014*re + 0.014*go'), (10, u'0.043*area + 0.026*hole + 0.026*black + 0.025*perhap + 0.022*audienc + 0.022*ve + 0.021*expand + 0.020*inform + 0.020*guess + 0.019*super'), (11, u'0.065*quantum + 0.063*one + 0.058*make + 0.051*certain + 0.047*teleport + 0.045*everyth + 0.036*get + 0.033*correl + 0.031*got + 0.030*somebodi'), (12, u'0.052*one + 0.041*inform + 0.033*mean + 0.030*arm + 0.029*thing + 0.026*like + 0.025*well + 0.024*system + 0.022*give + 0.021*doe'), (13, u'0.095*univers + 0.087*comput + 0.032*program + 0.030*happen + 0.024*like + 0.020*whi + 0.020*possibl + 0.018*even + 0.017*mechan + 0.017*theorem'), (14, u'0.046*reproduc + 0.043*mathemat + 0.034*inform + 0.032*around + 0.029*machin + 0.024*record + 0.024*softwar + 0.022*interest + 0.020*video + 0.019*space')]
Out[51]:

In [52]:
filename = "topicogramtest90-15.png"
buildTopicogram(root, True, 90, 15, 5, filename)
ImageDisplay(filename=filename)


LdaModel(num_terms=1726, num_topics=15, decay=0.5, chunksize=100)
[(0, u'0.063*teleport + 0.060*place + 0.054*actual + 0.038*copi + 0.029*inform + 0.020*sir + 0.019*system + 0.016*adam + 0.016*bit + 0.016*incomplet'), (1, u'0.057*one + 0.047*thing + 0.041*think + 0.041*say + 0.040*would + 0.033*chang + 0.028*time + 0.028*inform + 0.027*could + 0.027*go'), (2, u'0.182*inform + 0.033*get + 0.031*area + 0.025*reproduc + 0.022*veri + 0.022*bit + 0.021*interest + 0.021*way + 0.019*hole + 0.019*black'), (3, u'0.031*know + 0.030*ask + 0.030*question + 0.030*observ + 0.028*possibl + 0.022*room + 0.022*game + 0.020*panel + 0.020*video + 0.019*thought'), (4, u'0.050*veri + 0.042*sinc + 0.037*relat + 0.032*get + 0.026*one + 0.025*someth + 0.025*particular + 0.024*aliv + 0.024*box + 0.023*upon'), (5, u'0.031*guess + 0.026*specul + 0.026*entir + 0.024*even + 0.024*like + 0.021*thousand + 0.021*veri + 0.021*deviat + 0.021*get + 0.019*problem'), (6, u'0.059*paper + 0.041*differ + 0.038*piec + 0.032*someth + 0.032*talk + 0.030*futur + 0.030*physic + 0.028*correl + 0.027*inform + 0.027*come'), (7, u'0.063*electron + 0.031*zero + 0.029*written + 0.026*communiti + 0.020*far + 0.019*home + 0.018*function + 0.016*ad + 0.016*hun + 0.015*sinc'), (8, u'0.031*actual + 0.030*probabl + 0.026*fact + 0.024*leonard + 0.023*one + 0.022*surfac + 0.022*like + 0.021*inform + 0.019*thing + 0.018*get'), (9, u'0.082*comput + 0.076*univers + 0.033*make + 0.028*question + 0.026*whole + 0.024*program + 0.019*happen + 0.019*whi + 0.018*answer + 0.018*thank'), (10, u'0.052*think + 0.048*know + 0.034*like + 0.031*inform + 0.025*mean + 0.025*system + 0.021*lot + 0.020*learn + 0.019*molecul + 0.019*volum'), (11, u'0.096*might + 0.073*quantum + 0.054*term + 0.053*veri + 0.045*mechan + 0.038*least + 0.036*rule + 0.026*differ + 0.024*talk + 0.024*right'), (12, u'0.058*mathemat + 0.050*good + 0.037*appli + 0.036*theorem + 0.032*distant + 0.031*dead + 0.028*call + 0.023*like + 0.020*institut + 0.019*panel'), (13, u'0.067*quantum + 0.067*ca + 0.046*principl + 0.035*come + 0.034*much + 0.031*mechan + 0.026*veri + 0.025*way + 0.025*certain + 0.023*quit'), (14, u'0.050*data + 0.037*way + 0.034*go + 0.032*comput + 0.031*re + 0.024*notebook + 0.023*look + 0.020*ask + 0.017*see + 0.016*like')]
Out[52]:

In [ ]: