In [37]:
from gensim import corpora, models, similarities
import collections,operator,sys,numpy,pandas
from jinja2 import Template
import logging
# Re-import logging before configuring it (Python 2 builtin reload);
# presumably so basicConfig takes effect in an already-running kernel.
reload(logging)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# The graphparser package lives inside the concordance checkout, so its
# directory has to be on sys.path before it can be imported.
sys.path.append('ghalib-concordance/graphparser/')
from graphparser import GraphParser
urdup = GraphParser('ghalib-concordance/graphparser/settings/urdu.yaml')

# The lemma file holds one verse per line.
with open('ghalib-concordance/output/lemma_documents.txt','r') as lemma_file:
    text = lemma_file.read()
verses = text.split('\n')

# Parser output for every verse; get_verses() returns these for display.
verses_orig = [urdup.parse(verse).output for verse in verses]
assert len(verses) == 1461

# Flat list of every space-separated token across all verses.
tokens = []
for verse in verses:
    tokens.extend(verse.split(' '))
# Lemmas excluded from the topic model (function words, very frequent
# lemmas, the poet's pen-names, ...). Every entry must be followed by a comma:
# adjacent string literals are silently concatenated by Python, which is how
# "yihii" and 'jitnaa' previously collapsed into the single bogus entry
# "yihiijitnaa" and were never filtered.
stoplist=['honaa','','karnaa',
          'kaa','se','me;n','nah','vuh','kih','ko','jaanaa','kii','nahii;n','mai;n','kyaa','meraa','jo','ham',
          'bhii','to','kahnaa','yih','aanaa','ne','teraa','dekhnaa','aur','par','denaa',';gaalib','ko))ii','kyuu;n',
          'hii','pah','bah','gar','rahnaa','tuu','phir','apnaa','har','ay','ik','kis','tum','kuchh',
          'agar','ek','asad','ab','chaahiye','puuchhnaa','yuu;n','hamaaraa',
          'mauj','yaa;n','nikalnaa','yaa','milnaa','liye','yak',"jaan'naa",'achchhaa','haa))e','vaa;n','tak','paanaa',
          'magar','taa','pa;rnaa','khe;nchnaa','kabhii','lekin','u;thnaa','varnah','chalnaa',
          'phir','lenaa','denaa','kahaa;n','sar','jab',"go","ban'naa","ya((nii","vuhii","aap","saknaa","kisii","yihii",
          'jitnaa','saa','pahle','lagnaa','vale','mat','sahii','kam']

# Treat every lemma ending in 'naa' as a verb infinitive and stop-list it too.
# The original guard compared against 'tamanna', which can never match because
# that string does not end in 'naa'; the noun it was meant to protect is
# presumably spelled 'tamannaa' in the lemma file -- TODO confirm.
non_verbs = ('tamanna', 'tamannaa')
verbs=[w for w in set(tokens) if w.endswith('naa') and w not in non_verbs]
stoplist+=verbs
In [ ]:
# Build the bag-of-words corpus from the verses.

# A set makes each stop-word lookup O(1); its contents equal those of stoplist.
stopwords = set(stoplist)
texts = [[word for word in verse.lower().split() if word not in stopwords] for verse in verses]

# Drop tokens that occur only once in the whole collection. One Counter pass
# replaces the quadratic sum(texts, []) + list.count() per distinct word.
all_tokens = [word for doc in texts for word in doc]
token_counts = collections.Counter(all_tokens)
tokens_once = set(word for word, n in token_counts.items() if n == 1)

# The loop variable is `doc`, not `text`: Python 2 list comprehensions leak
# their variable, which used to overwrite the module-level `text`.
texts = [[word for word in doc if word not in tokens_once] for doc in texts]

# Replace each remaining token with the parser's output for it.
texts = [[urdup.parse(word).output for word in doc] for doc in texts]

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(doc) for doc in texts]
In [39]:
In [42]:
def gen_model(num_topics=15, passes=10,iterations=500,chunksize=100,workers=3):
    """Train and return an LDA topic model over the module-level ``corpus``.

    Args:
        num_topics: number of topics to fit.
        passes: number of passes over the corpus.
        iterations: iteration cap forwarded to the model.
        chunksize: documents per training chunk. This was previously accepted
            but never forwarded, so the library default was silently used.
        workers: number of worker processes.

    Returns:
        The trained ``models.LdaMulticore`` instance.
    """
    return models.LdaMulticore(corpus, id2word=dictionary, num_topics=num_topics,
                               passes=passes, iterations=iterations,
                               chunksize=chunksize, workers=workers)

# No model is trained on import; the form further down (or a manual
# gen_model() call) assigns the real model.
model=None#gen_model()
In [43]:
# Jinja2 template for the word-cloud page rendered by show_clouds().
# Placeholders:
#   {{topic_words_json}} - JSON object with 'topic_words' (one list per topic;
#                          each entry d is read as d[0] = weight, d[1] = word)
#                          and 'topic_verses' (keyed by 'topic_<i>').
#   {{num_topics}}       - number of clouds to draw.
# Clicking a cloud lists that topic's verses in the right-hand column.
clouds_template='''
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<script type="text/javascript" src="d3/d3.js"></script>
<script type="text/javascript" src="d3-cloud/d3.layout.cloud.js"></script>
<script type="application/json" id="data">
{{topic_words_json}}
</script>
</head>
<body>
<div id="models" style="width:50%;float:left">
</div>
<div id="texts" style="width:50%;float:left">
</div>
<script>
var fill = d3.scale.category20();
var word_data;
function make_cloud(cloud,id){
  var words = cloud.map(function(d){
    return {text:d[1],size:d[0]*2000}
  }).sort(function(a,b){
    // A comparator must return a number; largest words first.
    return b.size - a.size;
  });
  word_data = words;
  d3.layout.cloud().size([800, 800])
      .words(words)
      .padding(1)
      .rotate(function() { return 0})//~~(Math.random() * 2) * 90; })
      .font("Impact")
      .fontSize(function(d) { return d.size; })
      .on("end", draw)
      .start();
  function show_text(id){
    d3.select("div#texts").html("");
    for (var i=0; i<topic_verses[id].length; i++){
      d3.select("div#texts").append("p").text(topic_verses[id][i]);
    }
  }
  function draw(words) {
    // The layout is computed on an 800x800 canvas centred at (400,400);
    // the viewBox scales that canvas into the 400x400 element so the
    // cloud is not clipped.
    d3.select("div#models").append("svg")
        .attr("width", 400)
        .attr("height", 400)
        .attr("viewBox", "0 0 800 800")
        .attr("id",id)
        .on("click",function(d) {show_text(this.id) } )
      .append("g")
        .attr("transform", "translate(400,400)")
      .selectAll("text")
        .data(words)
      .enter().append("text")
        .style("font-size", function(d) { return d.size + "px"; })
        .style("font-family", "Jameel Noori Nastaleeq")
        .style("fill", function(d, i) { return 0;})//fill(i); })
        .attr("text-anchor", "middle")
        .attr("transform", function(d) {
          return "translate(" + [d.x, d.y] + ")rotate(" + d.rotate + ")";
        })
        .text(function(d) { return d.text; });
  }
}
var num_topics = {{num_topics}};
var json_data = JSON.parse(document.getElementById('data').innerHTML);
var topic_words = json_data['topic_words'];
var topic_verses = json_data['topic_verses'];
for (var i=0;i<num_topics;i++) {
  var id = "topic_"+i;
  make_cloud(topic_words[i], id);
}
</script>
</body>
</html>
'''
from IPython.display import IFrame
import os
import json
num_words = 100  # words per topic cloud (show_clouds also sets its own local copy)
count = 0        # number of temp pages written so far; incremented by serve_html
last_fn = None   # name of the file most recently written by serve_html
last_fun = None  # original misspelling of last_fn, kept so old references still resolve
def serve_html(s,w,h):
    """Write the HTML string ``s`` to a new temp file and embed it in an IFrame.

    The file is created in the current working directory and named
    ``__tmp<pid>_<n>.html``, where ``n`` is the module-level ``count`` after
    being incremented. The chosen name is also stored in the module-level
    ``last_fn``.

    Args:
        s: page contents to write.
        w: IFrame width.
        h: IFrame height.

    Returns:
        ``IFrame('files/' + filename, w, h)``.
    """
    from os import getpid
    global count, last_fn
    count = count + 1
    fn = '__tmp%s_%s.html' % (getpid(), count)
    last_fn = fn
    with open(fn, 'w') as page_file:
        page_file.write(s)
    return IFrame('files/' + fn, w, h)
def get_verses():
    """Group the original verses by topic, strongest association first.

    Reads the module-level ``model``, ``corpus`` and ``verses_orig``. Each
    document in ``corpus`` is passed through ``model[...]`` to get its
    ``(topic, value)`` pairs; a verse is listed under every topic for which
    its value is greater than zero.

    Returns:
        dict mapping ``'topic_<i>'`` to a list of entries from ``verses_orig``,
        ordered by descending topic value (ties keep corpus order).
    """
    # Dense documents x topics matrix; topics the model does not report stay 0.
    weights = numpy.zeros((len(corpus), model.num_topics))
    for row, bow in enumerate(corpus):
        for topic, value in model[bow]:
            weights[row, topic] = value

    verses_out = {}
    for topic in range(model.num_topics):
        column = weights[:, topic]
        # argsort replaces DataFrame.sort(columns=...), which pandas removed
        # in 0.20; mergesort is stable, so equal weights keep corpus order.
        ranked = numpy.argsort(-column, kind='mergesort')
        verses_out['topic_' + str(topic)] = [
            verses_orig[row] for row in ranked if column[row] > 0
        ]
    return verses_out
def show_clouds():
    """Render the word-cloud page for the current ``model`` and display it.

    Collects the top 100 words of every topic via ``model.show_topic`` plus
    the per-topic verses from ``get_verses()``, serialises both to JSON and
    fills ``clouds_template`` with them. The page is written to
    ``test-cloud.html`` and also handed to ``serve_html``.

    Returns:
        Whatever ``serve_html(page, 1200, 800)`` returns.
    """
    top_n = 100
    topic_words = []
    for topic_id in range(model.num_topics):
        topic_words.append(model.show_topic(topic_id, topn=top_n))
    payload = {'topic_words': topic_words,
               'topic_verses': get_verses()}
    topic_words_json = json.dumps(payload)
    page = Template(clouds_template).render(num_topics=model.num_topics,
                                            topic_words_json=topic_words_json)
    # Keep a copy under a fixed name alongside the per-call temp file.
    with open('test-cloud.html', "w") as out:
        out.write(page)
    return serve_html(page, 1200, 800)
#show_clouds()
In [45]:
from IPython.display import HTML
# HTML form whose button trains a model in the kernel with the entered
# settings; the defaults mirror gen_model()'s defaults.
input_form = """
<div style="background-color:gainsboro; border:solid black; width:300px; padding:20px;">
Ready to Topic Model!<br>
Topics: <input type="text" id="num_topics" value="15"> <br>
Passes: <input type="text" id="passes" value="10"> <br>
Iterations: <input type="text" id="iterations" value="500"> <br>
Chunk Size: <input type="text" id="chunksize" value="100"> <br>
Workers: <input type="text" id="workers" value="3"> <br>
<button onclick="set_values()">Generate Model</button>
</div>
<div id = "clouds"></div>
"""

# Browser-side glue. set_values() builds a `model=gen_model(...)` statement
# and runs it in the kernel. Each field is parsed as a positive integer first,
# so arbitrary text typed into the form is never spliced into Python source,
# and the command no longer starts with a stray leading space.
javascript = """
<script type="text/Javascript">
    function read_positive_int(id){
        var value = parseInt(document.getElementById(id).value, 10);
        if (isNaN(value) || value < 1) {
            alert("Please enter a positive integer for '" + id + "'.");
            return null;
        }
        return value;
    }
    function set_values(){
        var names = ['num_topics', 'passes', 'iterations', 'chunksize', 'workers'];
        var args = [];
        for (var k = 0; k < names.length; k++) {
            var value = read_positive_int(names[k]);
            if (value === null) { return; }
            args.push(names[k] + "=" + value);
        }
        var command = "model=gen_model(" + args.join(",") + ")";
        console.log("Executing Command: " + command);
        var kernel = IPython.notebook.kernel;
        kernel.execute(command);
    }
    function show_clouds(){
        var kernel = IPython.notebook.kernel;
        kernel.execute('show_clouds()');
    }
</script>
"""

HTML(input_form + javascript)
Out[45]:
In [46]:
show_clouds()
Out[46]:
In [ ]: