In [ ]:
# SEMANTIGRAM
# a visualization of topic change over time in transcript data
# @author cj carr
# http://cortexel.us
# http://github.com/cortexelus/semantigram
# todo
In [1]:
from xml.etree import cElementTree as ET
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3
import HTMLParser
h = HTMLParser.HTMLParser()
transcript = 'transcripts/physics-of-information2.xml'
In [2]:
tree = ET.parse(transcript)
root = tree.getroot()
root
Out[2]:
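In [ ]:
# The transcript is a YouTube-style caption track. An illustrative sketch of
# the structure the code below assumes (not the actual file):
#
#   <transcript>
#     <text start="9.98" dur="5.12">...</text>
#     <text start="15.1" dur="4.03">...</text>
#   </transcript>
#
# Each <text> element carries a start time and a duration in seconds,
# which is exactly what .attrib["start"] and .attrib["dur"] read below.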
In [4]:
# Print examples from transcript
i = 0
for child in root:
    print child.attrib
    print child.text
    if i > 10: break
    i += 1
In [5]:
def formatText(text):
    text = text.replace("\n", " ")  # captions wrap lines; flatten to one line
    text = h.unescape(text)         # decode HTML character references like &#39;
    return text
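In [ ]:
# quick check (illustrative; ElementTree already decoded the XML layer, so
# caption text may still contain HTML character references like &#39;)
print formatText("don&#39;t\nstop")  # -> don't stop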
In [6]:
def chunkByTexts(root, chunkSize):
    # slide a window of chunkSize consecutive captions across the transcript;
    # consecutive chunks overlap, since the window advances one caption at a time
    chunks = []
    for i in xrange(len(root) - chunkSize + 1):  # +1 so the final full window is included
        texts = []
        duration = 0
        for j in xrange(chunkSize):
            texts.append(formatText(root[i+j].text))
            duration += float(root[i+j].attrib["dur"])
        text = " ".join(texts)
        chunk = {"start": float(root[i].attrib["start"]), "duration": duration, "text": text}
        chunks.append(chunk)
    return chunks
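In [ ]:
# usage sketch (counts depend on the transcript)
chunks = chunkByTexts(root, 10)  # windows of 10 consecutive captions
print len(chunks)                # len(root) - 9 overlapping chunks
print chunks[0]["start"], chunks[0]["duration"]
print chunks[0]["text"][:80]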
In [10]:
def chunkByTime(root, minimumTime):
    # starting from every caption, accumulate following captions until the
    # chunk spans at least minimumTime seconds; chunks overlap heavily
    chunks = []
    for i in xrange(len(root)):
        texts = []
        duration = 0
        for j in xrange(i, len(root)):
            texts.append(formatText(root[j].text))
            duration += float(root[j].attrib["dur"])
            if duration >= minimumTime: break
        text = " ".join(texts)
        chunk = {"start": float(root[i].attrib["start"]), "duration": duration, "text": text}
        chunks.append(chunk)
    return chunks
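In [ ]:
# usage sketch: same idea by time, so the window length in captions varies
chunks = chunkByTime(root, 60)  # each chunk spans at least 60 seconds
print len(chunks)               # one chunk per caption
print chunks[0]["duration"]     # >= 60, except for chunks near the end of the transcript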
In [11]:
# load nltk's English stopwords as variable called 'stopwords'
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend([u"'s",u"n't",u"'m",u"'d"])
print stopwords
In [12]:
# load nltk's SnowballStemmer as variable 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
# define a tokenizer and stemmer which returns the list of stems in the text it is passed
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word, to ensure that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems
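In [ ]:
# usage sketch; output is approximate, from the English Snowball stemmer
print tokenize_and_stem("The physics of information, it's fascinating!")
# -> roughly [u'the', u'physic', u'of', u'inform', u'it', u"'s", u'fascin']
# note "'s" contains a letter so it survives the filter, which is why the
# stopword list was extended with contraction fragments above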
In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora, models, similarities
import time as time
import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d.axes3d as p3
from sklearn.cluster import AgglomerativeClustering
In [47]:
"""
#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.9, max_features=200000,
min_df=0.0, stop_words='english',
use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))
%time tfidf_matrix = tfidf_vectorizer.fit_transform(documents) #fit the vectorizer to synopses
print(tfidf_matrix.shape)
terms = tfidf_vectorizer.get_feature_names()
print terms """
Out[47]:
In [17]:
from PIL import Image, ImageDraw
from IPython.display import Image as ImageDisplay

def drawTopicogram(corpus, ntopics, topicOrder, lda, filename):
    # one pixel column per chunk, one horizontal band per topic:
    # hue encodes the topic, lightness encodes the topic's weight in that chunk
    W = len(corpus)
    H = 100  # image height in pixels
    blockHeight = H / ntopics  # integer division; any remainder rows stay black
    img = Image.new("RGB", (W, H), "black")
    draw = ImageDraw.Draw(img)
    for x in xrange(W):
        doc_lda = lda[corpus[x]]  # (topic id, weight) pairs for chunk x
        for t in doc_lda:
            topicNum = topicOrder.index(t[0])  # vertical position of this topic's band
            hue = int(360 * (topicNum * blockHeight / float(H)))
            saturation = 100
            lightness = int(t[1] * 100)  # topic weight -> lightness
            color = "hsl(" + str(hue) + "," + str(saturation) + "%," + str(lightness) + "%)"
            for y in xrange(topicNum * blockHeight, (topicNum + 1) * blockHeight):
                draw.point((x, y), fill=color)
    img.save(filename, "PNG")
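In [ ]:
# worked example of the color mapping, assuming ntopics = 5
blockHeight = 100 / 5                               # 20 pixel rows per topic band
hue = int(360 * (2 * blockHeight / 100.0))          # topic band 2 -> hue 144
print "hsl(%d,100%%,%d%%)" % (hue, int(0.8 * 100))  # weight 0.8 -> hsl(144,100%,80%)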
In [27]:
def buildTopicogram(root, time_chunking, size, ntopics, passes, filename):
    if time_chunking:
        chunks = chunkByTime(root, size)   # size: each chunk is at least size seconds long
    else:
        chunks = chunkByTexts(root, size)  # size: each chunk is the concatenation of this # of texts
    documents = [chunk["text"] for chunk in chunks]
    # tokenize and stem
    tokenized_text = [tokenize_and_stem(text) for text in documents]
    # remove stop words
    texts = [[word for word in text if word not in stopwords] for text in tokenized_text]
    # create a gensim dictionary from the texts
    dictionary = corpora.Dictionary(texts)
    # remove extremes (similar to the min/max df step used when creating a tf-idf matrix)
    #dictionary.filter_extremes(no_below=0, no_above=1)
    # convert the texts to a bag-of-words corpus
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda = models.LdaModel(corpus, num_topics=ntopics, id2word=dictionary, passes=passes, chunksize=100)
    print lda
    print lda.show_topics(ntopics)
    # sort topics by when they peak, so bands are stacked in order of first dominance
    # topicMaxs[k] = (topic id, max weight seen, chunk index of that max);
    # seed each entry with its own topic id so topics that never appear keep a valid id
    topicMaxs = [(k, 0, 0) for k in xrange(ntopics)]
    for x in xrange(len(corpus)):
        for (t, v) in lda[corpus[x]]:
            if topicMaxs[t][1] < v:
                topicMaxs[t] = (t, v, x)  # new max for topic t
    # order topics by the chunk index at which they peak
    topicOrder = sorted(topicMaxs, key=lambda s: s[2])
    topicOrder = map(lambda s: s[0], topicOrder)
    # draw it
    drawTopicogram(corpus, ntopics, topicOrder, lda, filename)
In [36]:
filename = "topicogramtest.png"
buildTopicogram(root, True, 60, 5, 10, filename)
ImageDisplay(filename=filename)
Out[36]:
In [37]:
filename = "topicogramtes60-5.png"
buildTopicogram(root, True, 60, 5, 10, filename)
ImageDisplay(filename=filename)
Out[37]:
In [45]:
filename = "topicogramtest5-8.png"
buildTopicogram(root, True, 5, 8, 5, filename)
ImageDisplay(filename=filename)
Out[45]:
In [43]:
filename = "topicogramtest15-8.png"
buildTopicogram(root, True, 15, 8, 5, filename)
ImageDisplay(filename=filename)
Out[43]:
In [35]:
filename = "topicogramtest.png"
buildTopicogram(root, True, 15, 10, 5, filename)
ImageDisplay(filename=filename)
Out[35]:
In [34]:
filename = "topicogramtest.png"
buildTopicogram(root, True, 30, 10, 5, filename)
ImageDisplay(filename=filename)
Out[34]:
In [38]:
filename = "topicogramtest30-10.png"
buildTopicogram(root, True, 30, 10, 5, filename)
ImageDisplay(filename=filename)
Out[38]:
In [ ]:
filename = "topicogramtest60-8.png"
%time buildTopicogram(root, True, 60, 8, 5, filename)
ImageDisplay(filename=filename)
In [40]:
filename = "topicogramtest60-10.png"
buildTopicogram(root, True, 60, 10, 5, filename)
ImageDisplay(filename=filename)
Out[40]:
In [39]:
filename = "topicogramtest240-10.png"
buildTopicogram(root, True, 240, 10, 5, filename)
ImageDisplay(filename=filename)
Out[39]:
In [48]:
filename = "topicogramtest15-15.png"
buildTopicogram(root, True, 15, 15, 5, filename)
ImageDisplay(filename=filename)
Out[48]:
In [50]:
filename = "topicogramtest30-15.png"
buildTopicogram(root, True, 30, 15, 5, filename)
ImageDisplay(filename=filename)
Out[50]:
In [51]:
filename = "topicogramtest60-15.png"
buildTopicogram(root, True, 60, 15, 5, filename)
ImageDisplay(filename=filename)
Out[51]:
In [52]:
filename = "topicogramtest90-15.png"
buildTopicogram(root, True, 90, 15, 5, filename)
ImageDisplay(filename=filename)
Out[52]:
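In [ ]:
# The hand-run cells above sweep chunk size and topic count; a sketch like
# this (same parameters) could render the whole grid in one loop. LdaModel
# training is randomly initialized, so re-running draws a different topicogram.
for size in [15, 30, 60, 90, 240]:
    for ntopics in [5, 8, 10, 15]:
        filename = "topicogramtest%d-%d.png" % (size, ntopics)
        buildTopicogram(root, True, size, ntopics, 5, filename)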
In [ ]: