In [ ]:
from nltk.tokenize import RegexpTokenizer
from gensim import corpora, models
import gensim
import math
import os
import gzip
from collections import namedtuple
%matplotlib inline

# An LDA topic term: the topic weight and the log-template cluster id it names.
Entry = namedtuple('Entry', ['value', 'cluster'])
# The best topic found for a cluster: the weight and the winning topic id.
TEntry = namedtuple('TEntry', ['value', 'topic'])
In [ ]:
def openFile(name, mode):
    """Open a file transparently, whether or not it is gzip-compressed."""
    if name.lower().endswith('.gz'):
        # Text mode so callers get str on Python 3, matching plain open().
        return gzip.open(name, mode + 't')
    else:
        return open(name, mode)
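In [ ]:
# A small self-contained check of openFile: write a gzipped line to a
# temp file (the path is hypothetical) and read it back through the helper.
import tempfile
tmpPath = tempfile.gettempdir() + '/openFileCheck.log.gz'
with gzip.open(tmpPath, 'wt') as fh:
    fh.write('hello from gzip\n')
with openFile(tmpPath, 'r') as fh:
    print(fh.readline().strip())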
In [ ]:
# Directory holding the preprocessed log data, and the file-name prefix
# that selects which run to analyze.
dataDir = '/Users/dgrossman/data/'
startsWith = 'tbirdBig'
In [ ]:
files = os.listdir(dataDir)
In [ ]:
filelist = list()
for f in files:
    # Keep only non-empty '.out' files from the selected run.
    if f.endswith('.out') and f.startswith(startsWith) and os.path.getsize(dataDir + f) > 0:
        filelist.append(f)
In [ ]:
documents = dict()
for f in filelist:
    # Flatten each file into a single whitespace-separated string.
    a = openFile(dataDir + f, 'r')
    words = ""
    for w in a.readlines():
        words += w.strip()
        words += ' '
    a.close()
    documents[f] = words.strip()
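In [ ]:
# Sanity check on what was loaded; the exact numbers depend on your data.
print(len(documents), 'documents')
print('smallest:', min(len(v) for v in documents.values()), 'chars,',
      'largest:', max(len(v) for v in documents.values()), 'chars')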
In [ ]:
doc_set = list(documents.values())
In [ ]:
# Tokenize on word characters; punctuation is discarded.
texts = []
tokenizer = RegexpTokenizer(r'\w+')
for d in doc_set:
    texts.append(tokenizer.tokenize(d))
In [ ]:
# Map each token to an integer id.
dictionary = corpora.Dictionary(texts)
In [ ]:
# Convert every document to a bag-of-words: a list of (token_id, count) pairs.
corpus = [dictionary.doc2bow(text) for text in texts]
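In [ ]:
# A minimal sanity check of the dictionary and bag-of-words corpus
# (illustrative only; ids and counts depend on your data).
print('vocabulary size:', len(dictionary))
print('documents:', len(corpus))
# The first few (token_id, count) pairs of the first document, mapped back
# to the tokens themselves.
for token_id, count in corpus[0][:5]:
    print(dictionary[token_id], count)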
In [ ]:
# Fit the LDA model; with 100 topics and 200 passes this can take a while
# on a large corpus.
topics = 100
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=topics, id2word=dictionary, passes=200)
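In [ ]:
# Training is expensive, so persisting the fitted model avoids a rerun.
# A minimal sketch using gensim's save/load; the file name is an assumption.
ldamodel.save(dataDir + 'tbirdBig.lda')
# Later: ldamodel = gensim.models.ldamodel.LdaModel.load(dataDir + 'tbirdBig.lda')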
In [ ]:
# For each topic, keep the highest-weight clusters until their cumulative
# weight ("energy") reaches 0.8, always keeping at least one entry.
# show_topics(formatted=False) returns (topic_id, [(term, weight), ...])
# pairs, which avoids string-parsing the formatted output; topics are
# stored by id because show_topics need not return them in id order.
outData = [None] * topics
for topicId, termList in ldamodel.show_topics(num_topics=topics, num_words=40, formatted=False):
    total = 0
    lines = list()
    empty = True
    for cluster, val in termList:
        if (total + float(val) < .8) or empty:
            total += float(val)
            empty = False
            lines.append(Entry(float(val), int(cluster)))
    outData[topicId] = (total, lines)
In [ ]:
import re
# Load the cluster templates produced by preprocessing and strip the
# regex escaping so each template reads as plain text.
templateFile = dataDir + 'tbird.log.preProc.200.supports.clusters'
tf = openFile(templateFile, 'r').readlines()
templateList = list()
for t in tf:
    unescaped = re.sub(r'[\^]', '', re.sub(r'[\\]', '', t)).strip()
    templateList.append(unescaped)
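In [ ]:
# Each template line is expected to read "clusterId,template text"; peek at
# the first entry to confirm (output depends on your data).
clusterId, template = templateList[0].split(',', 1)
print(clusterId, '->', template)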
In [ ]:
# For each topic, resolve its clusters to their log templates and collect
# the vocabulary the topic covers.
for topic, (s, ent) in enumerate(outData):
    print('topic=%i|count=%i|energy=%f' % (topic, len(ent), s))
    wordBag = set()
    for e in ent:
        cluster, string = templateList[e.cluster].split(',', 1)
        print('\t%5i| %1.4f| %s' % (int(cluster), e.value, string))
        for word in string.split():
            wordBag.add(word)
    print()
    print('words used : %s' % ' '.join(sorted(wordBag)))
    print()
In [ ]:
# Print a one-line summary per topic, then dump the "strong" topics
# (energy above 0.4) in detail while accumulating per-topic word
# frequencies for the word clouds below.
wc = list()
for topic, (s, ent) in enumerate(outData):
    print('topic=%i|count=%i|energy=%f' % (topic, len(ent), s))
print()
for topic, (strength, entries) in enumerate(outData):
    if strength > 0.4:
        print('topic=%i|count=%i|energy=%f' % (topic, len(entries), strength))
        wordBag = set()
        strongDict = dict()
        for e in entries:
            cluster, string = templateList[e.cluster].split(',', 1)
            print('\t%5i| %1.4f| %s' % (int(cluster), e.value, string))
            for value in string.split():
                if value not in wordBag:
                    wordBag.add(value)
                    # Scale the tiny topic weights up so the word cloud
                    # receives usable frequencies.
                    strongDict[value] = e.value * 100000 + 1
        wc.append((topic, strongDict))
        print()
        print('words used : %s' % ' '.join(sorted(wordBag)))
        print()
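In [ ]:
# wc now holds one (topic, word->frequency dict) pair per strong topic;
# a quick summary before plotting (output depends on your data).
print(len(wc), 'strong topics collected for word clouds')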
In [ ]:
# Make one word cloud per high-energy topic. All-uppercase tokens (the
# template placeholders) are filtered out first.
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def cloudFrequencies(wordFreqs):
    """Drop all-uppercase placeholder tokens from a word->frequency dict."""
    return {word: freq for word, freq in wordFreqs.items() if not word.isupper()}

# First pass: count how many topics get a cloud, to size the subplot grid.
f = 0
for topic, wordFreqs in wc:
    if len(cloudFrequencies(wordFreqs)) > 0:
        f += 1
numFigs = math.ceil(math.sqrt(f))
print(numFigs, f)

# Second pass: render each cloud into the grid.
fig = plt.figure()
temp = 1
for topic, wordFreqs in wc:
    cloudWords = cloudFrequencies(wordFreqs)
    if len(cloudWords) > 0:
        # generate_from_frequencies expects a word->frequency mapping.
        wordcloud = WordCloud(max_font_size=40, relative_scaling=.5).generate_from_frequencies(cloudWords)
        plt.subplot(numFigs, numFigs, temp)
        plt.title('topic %s' % topic)
        plt.imshow(wordcloud)
        plt.axis("off")
        temp = temp + 1
plt.show()
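In [ ]:
# Optionally persist the figure; the output path is an assumption.
fig.savefig(dataDir + 'topicClouds.png', dpi=150, bbox_inches='tight')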
In [ ]:
# Same extraction as above, but keep every term of every topic so the
# full per-cluster weight distribution is available.
outData2 = [None] * topics
for topicId, termList in ldamodel.show_topics(num_topics=topics, num_words=40000, formatted=False):
    total = 0
    lines = list()
    for cluster, val in termList:
        total += float(val)
        lines.append(Entry(float(val), int(cluster)))
    outData2[topicId] = (total, lines)
In [ ]:
# Show only the clusters that carry a non-trivial share of each topic.
for topic, (s, ent) in enumerate(outData2):
    print('topic=%i|count=%i|energy=%f' % (topic, len(ent), s))
    for e in ent:
        if e.value > .01:
            print('\t', topic, e)
In [ ]:
# Assign each cluster to the single topic that weights it most heavily.
bestCluster2Topic = dict()
for topic, (score, ent) in enumerate(outData2):
    for index in ent:
        if index.cluster not in bestCluster2Topic:
            bestCluster2Topic[index.cluster] = TEntry(index.value, topic)
        else:
            current = bestCluster2Topic[index.cluster]
            if index.value > current.value:
                bestCluster2Topic[index.cluster] = TEntry(index.value, topic)
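In [ ]:
# A quick look at the mapping: pick an arbitrary cluster and show the
# topic that claimed it (output depends on your data).
someCluster = next(iter(bestCluster2Topic))
print(someCluster, '->', bestCluster2Topic[someCluster])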
In [ ]:
# Invert the mapping: group the clusters claimed by each topic.
topic2Clust = dict()
c2t = dict()
for cluster, tval in bestCluster2Topic.items():
    c2t[cluster] = tval.topic
for cluster, topic in c2t.items():
    if topic in topic2Clust:
        topic2Clust[topic].add(cluster)
    else:
        topic2Clust[topic] = {cluster}
In [ ]:
# Flatten back into a direct cluster -> topic lookup table.
outMap = dict()
for k, t in topic2Clust.items():
    for i in t:
        outMap[i] = k
In [ ]:
print(outMap)
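In [ ]:
# Persisting the cluster -> topic map makes it reusable downstream.
# A minimal sketch; the output file name is an assumption. JSON keys must
# be strings, so the integer cluster ids are cast on the way out.
import json
with open(dataDir + 'cluster2topic.json', 'w') as fh:
    json.dump({str(k): v for k, v in outMap.items()}, fh)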