In [ ]:
from nltk.tokenize import RegexpTokenizer
from gensim import corpora, models
import gensim
import math
import os
import gzip
from collections import namedtuple
%matplotlib inline

# An LDA topic term: the topic weight and the log-template cluster id it names.
Entry = namedtuple('Entry', ['value', 'cluster'])
# The best topic found for a cluster: the weight and the winning topic id.
TEntry = namedtuple('TEntry', ['value', 'topic'])
In [ ]:
def openFile(name, mode):
    """Open a file transparently, whether or not it is gzip-compressed."""
    if name.lower().endswith('.gz'):
        # Text mode so callers get str on Python 3, matching plain open().
        return gzip.open(name, mode + 't')
    else:
        return open(name, mode)
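In [ ]:
# A small self-contained check of openFile: write a gzipped line to a
# temp file (the path is hypothetical) and read it back through the helper.
import tempfile
tmpPath = tempfile.gettempdir() + '/openFileCheck.log.gz'
with gzip.open(tmpPath, 'wt') as fh:
    fh.write('hello from gzip\n')
with openFile(tmpPath, 'r') as fh:
    print(fh.readline().strip())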
In [ ]:
# Directory holding the preprocessed log data, and the file-name prefix
# that selects which run to analyze.
dataDir = '/Users/dgrossman/data/'
startsWith = 'tbirdBig'
In [ ]:
files = os.listdir(dataDir)
In [ ]:
filelist = list()
for f in files:
    # Keep only non-empty '.out' files from the selected run.
    if f.endswith('.out') and f.startswith(startsWith) and os.path.getsize(dataDir + f) > 0:
        filelist.append(f)
In [ ]:
documents = dict()
for f in filelist:
    # Flatten each file into a single whitespace-separated string.
    a = openFile(dataDir + f, 'r')
    words = ""
    for w in a.readlines():
        words += w.strip()
        words += ' '
    a.close()
    documents[f] = words.strip()
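In [ ]:
# Sanity check on what was loaded; the exact numbers depend on your data.
print(len(documents), 'documents')
print('smallest:', min(len(v) for v in documents.values()), 'chars,',
      'largest:', max(len(v) for v in documents.values()), 'chars')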
In [ ]:
doc_set = list(documents.values())
In [ ]:
# Tokenize on word characters; punctuation is discarded.
texts = []
tokenizer = RegexpTokenizer(r'\w+')
for d in doc_set:
    texts.append(tokenizer.tokenize(d))
In [ ]:
# Map each token to an integer id.
dictionary = corpora.Dictionary(texts)
In [ ]:
# Convert every document to a bag-of-words: a list of (token_id, count) pairs.
corpus = [dictionary.doc2bow(text) for text in texts]
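In [ ]:
# A minimal sanity check of the dictionary and bag-of-words corpus
# (illustrative only; ids and counts depend on your data).
print('vocabulary size:', len(dictionary))
print('documents:', len(corpus))
# The first few (token_id, count) pairs of the first document, mapped back
# to the tokens themselves.
for token_id, count in corpus[0][:5]:
    print(dictionary[token_id], count)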
In [ ]:
# Fit the LDA model; with 100 topics and 200 passes this can take a while
# on a large corpus.
topics = 100
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=topics, id2word=dictionary, passes=200)
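In [ ]:
# Training is expensive, so persisting the fitted model avoids a rerun.
# A minimal sketch using gensim's save/load; the file name is an assumption.
ldamodel.save(dataDir + 'tbirdBig.lda')
# Later: ldamodel = gensim.models.ldamodel.LdaModel.load(dataDir + 'tbirdBig.lda')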
In [ ]:
# For each topic, keep the highest-weight clusters until their cumulative
# weight ("energy") reaches 0.8, always keeping at least one entry.
# show_topics(formatted=False) returns (topic_id, [(term, weight), ...])
# pairs, which avoids string-parsing the formatted output; topics are
# stored by id because show_topics need not return them in id order.
outData = [None] * topics
for topicId, termList in ldamodel.show_topics(num_topics=topics, num_words=40, formatted=False):
    total = 0
    lines = list()
    empty = True
    for cluster, val in termList:
        if (total + float(val) < .8) or empty:
            total += float(val)
            empty = False
            lines.append(Entry(float(val), int(cluster)))
    outData[topicId] = (total, lines)
In [ ]:
import re
# Load the cluster templates produced by preprocessing and strip the
# regex escaping so each template reads as plain text.
templateFile = dataDir + 'tbird.log.preProc.200.supports.clusters'
tf = openFile(templateFile, 'r').readlines()
templateList = list()
for t in tf:
    unescaped = re.sub(r'[\^]', '', re.sub(r'[\\]', '', t)).strip()
    templateList.append(unescaped)
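In [ ]:
# Each template line is expected to read "clusterId,template text"; peek at
# the first entry to confirm (output depends on your data).
clusterId, template = templateList[0].split(',', 1)
print(clusterId, '->', template)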
In [ ]:
# For each topic, resolve its clusters to their log templates and collect
# the vocabulary the topic covers.
for topic, (s, ent) in enumerate(outData):
    print('topic=%i|count=%i|energy=%f' % (topic, len(ent), s))
    wordBag = set()
    for e in ent:
        cluster, string = templateList[e.cluster].split(',', 1)
        print('\t%5i| %1.4f| %s' % (int(cluster), e.value, string))
        for word in string.split():
            wordBag.add(word)
    print()
    print('words used : %s' % ' '.join(sorted(wordBag)))
    print()
In [ ]:
# Print a one-line summary per topic, then dump the "strong" topics
# (energy above 0.4) in detail while accumulating per-topic word
# frequencies for the word clouds below.
wc = list()
for topic, (s, ent) in enumerate(outData):
    print('topic=%i|count=%i|energy=%f' % (topic, len(ent), s))
print()
for topic, (strength, entries) in enumerate(outData):
    if strength > 0.4:
        print('topic=%i|count=%i|energy=%f' % (topic, len(entries), strength))
        wordBag = set()
        strongDict = dict()
        for e in entries:
            cluster, string = templateList[e.cluster].split(',', 1)
            print('\t%5i| %1.4f| %s' % (int(cluster), e.value, string))
            for value in string.split():
                if value not in wordBag:
                    wordBag.add(value)
                    # Scale the tiny topic weights up so the word cloud
                    # receives usable frequencies.
                    strongDict[value] = e.value * 100000 + 1
        wc.append((topic, strongDict))
        print()
        print('words used : %s' % ' '.join(sorted(wordBag)))
        print()
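In [ ]:
# wc now holds one (topic, word->frequency dict) pair per strong topic;
# a quick summary before plotting (output depends on your data).
print(len(wc), 'strong topics collected for word clouds')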
In [ ]:
# Make one word cloud per high-energy topic. All-uppercase tokens (the
# template placeholders) are filtered out first.
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def cloudFrequencies(wordFreqs):
    """Drop all-uppercase placeholder tokens from a word->frequency dict."""
    return {word: freq for word, freq in wordFreqs.items() if not word.isupper()}

# First pass: count how many topics get a cloud, to size the subplot grid.
f = 0
for topic, wordFreqs in wc:
    if len(cloudFrequencies(wordFreqs)) > 0:
        f += 1
numFigs = math.ceil(math.sqrt(f))
print(numFigs, f)

# Second pass: render each cloud into the grid.
fig = plt.figure()
temp = 1
for topic, wordFreqs in wc:
    cloudWords = cloudFrequencies(wordFreqs)
    if len(cloudWords) > 0:
        # generate_from_frequencies expects a word->frequency mapping.
        wordcloud = WordCloud(max_font_size=40, relative_scaling=.5).generate_from_frequencies(cloudWords)
        plt.subplot(numFigs, numFigs, temp)
        plt.title('topic %s' % topic)
        plt.imshow(wordcloud)
        plt.axis("off")
        temp = temp + 1
plt.show()
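In [ ]:
# Optionally persist the figure; the output path is an assumption.
fig.savefig(dataDir + 'topicClouds.png', dpi=150, bbox_inches='tight')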
In [ ]:
# Same extraction as above, but keep every term of every topic so the
# full per-cluster weight distribution is available.
outData2 = [None] * topics
for topicId, termList in ldamodel.show_topics(num_topics=topics, num_words=40000, formatted=False):
    total = 0
    lines = list()
    for cluster, val in termList:
        total += float(val)
        lines.append(Entry(float(val), int(cluster)))
    outData2[topicId] = (total, lines)
In [ ]:
# Show only the clusters that carry a non-trivial share of each topic.
for topic, (s, ent) in enumerate(outData2):
    print('topic=%i|count=%i|energy=%f' % (topic, len(ent), s))
    for e in ent:
        if e.value > .01:
            print('\t', topic, e)
In [ ]:
# Assign each cluster to the single topic that weights it most heavily.
bestCluster2Topic = dict()
for topic, (score, ent) in enumerate(outData2):
    for index in ent:
        if index.cluster not in bestCluster2Topic:
            bestCluster2Topic[index.cluster] = TEntry(index.value, topic)
        else:
            current = bestCluster2Topic[index.cluster]
            if index.value > current.value:
                bestCluster2Topic[index.cluster] = TEntry(index.value, topic)
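In [ ]:
# A quick look at the mapping: pick an arbitrary cluster and show the
# topic that claimed it (output depends on your data).
someCluster = next(iter(bestCluster2Topic))
print(someCluster, '->', bestCluster2Topic[someCluster])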
In [ ]:
# Invert the mapping: group the clusters claimed by each topic.
topic2Clust = dict()
c2t = dict()
for cluster, tval in bestCluster2Topic.items():
    c2t[cluster] = tval.topic
for cluster, topic in c2t.items():
    if topic in topic2Clust:
        topic2Clust[topic].add(cluster)
    else:
        topic2Clust[topic] = {cluster}
In [ ]:
# Flatten back into a direct cluster -> topic lookup table.
outMap = dict()
for k, t in topic2Clust.items():
    for i in t:
        outMap[i] = k
In [ ]:
print(outMap)
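In [ ]:
# Persisting the cluster -> topic map makes it reusable downstream.
# A minimal sketch; the output file name is an assumption. JSON keys must
# be strings, so the integer cluster ids are cast on the way out.
import json
with open(dataDir + 'cluster2topic.json', 'w') as fh:
    json.dump({str(k): v for k, v in outMap.items()}, fh)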