In [1]:

    
%pylab inline









    



Populating the interactive namespace from numpy and matplotlib



In [2]:

    
import pandas as pd
from wordcloud import WordCloud
import gensim
from sklearn.decomposition import LatentDirichletAllocation
from scipy.sparse import csr_matrix

Load the preprocessed questions (`data_vraag`) from the pickle file `preprocessedData.pkl`.



In [3]:

    
import pickle

# NOTE(review): unpickling can execute arbitrary code, so only load a
# 'preprocessedData.pkl' that was produced by a trusted preprocessing run.
# The file is opened in binary mode because pickle streams are bytes, and the
# `with` block closes the handle as soon as loading has finished.
with open('preprocessedData.pkl', 'rb') as pickleFile:
    data_vraag = pickle.load(pickleFile)



In [4]:

    
# Split the questions on the 'individu of groep' column: rows marked
# 'mijzelf' go to data_ppl, every other row goes to data_org.
isMijzelf = data_vraag['individu of groep'] == 'mijzelf'
data_ppl = data_vraag[isMijzelf]
data_org = data_vraag[~isMijzelf]



In [5]:

    
# Pull the 'SentToks' column out as a plain Python list, one entry per row
# (presumably a list of tokens per question -- it is fed to gensim below).
sentTokColumn = data_vraag['SentToks']
vraagTokens = sentTokColumn.tolist()



In [6]:

    
# Build the token <-> integer id mapping over all documents, then turn every
# document into its bag-of-words form: a list of (word id, count) pairs.
dic = gensim.corpora.Dictionary(vraagTokens)
corpus = list(map(dic.doc2bow, vraagTokens))



In [7]:

    
# Flatten the bag-of-words corpus into coordinate lists for a sparse
# document-term matrix. Note the naming: `col` holds the document index
# (matrix row) and `row` holds the word id (matrix column); they are passed
# to csr_matrix as (col, row) so documents end up on the rows.
data, row, col = [], [], []
for docIdx, bow in enumerate(corpus):
    col.extend([docIdx] * len(bow))
    row.extend(wordId for wordId, _ in bow)
    data.extend(count for _, count in bow)

nSamples = len(corpus)
nFeatures = len(dic)
mm = csr_matrix((data, (col, row)), shape=(nSamples, nFeatures))



In [8]:

    
print "nSamples (docs) : ",nSamples
print "nFeatures(words): ",nFeatures
len(corpus),len(dic)









    



nSamples (docs) :  11700
nFeatures(words):  56266






    Out[8]:





(11700, 56266)



In [9]:

    
nTopics = 10

# Fit an LDA topic model on the document-term matrix `mm` using the online
# learning method. No random_state is passed, so the fitted topics (and
# everything derived from them below) can differ from one run to the next.
lda = LatentDirichletAllocation(
    n_topics=nTopics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.0
)
lda.fit(mm)









    Out[9]:





LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_jobs=1, n_topics=10, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)



In [10]:

    
def getDocumentTopics(docTokens, lda):
    """Return the normalised topic weights of a single tokenised document.

    The tokens are converted to bag-of-words counts with the notebook-level
    dictionary `dic`, packed into a 1 x len(dic) sparse matrix and passed to
    `lda.transform`. The resulting weight vector is rescaled to sum to one.

    Parameters
    ----------
    docTokens : tokens of one document, as accepted by `dic.doc2bow`.
    lda : fitted model exposing `transform`.

    Returns
    -------
    1-D array of topic weights that sums to 1.
    """
    bow = dic.doc2bow(docTokens)
    wordIds = [wordId for wordId, _ in bow]
    counts = [count for _, count in bow]
    # There is only one document, so every entry lives on matrix row 0.
    docRows = [0] * len(bow)

    oneDoc = csr_matrix((counts, (docRows, wordIds)), shape=(1, len(dic)))
    docWeights = lda.transform(oneDoc)[0]
    docWeights /= docWeights.sum()
    return docWeights



In [11]:

    
# For every fitted topic collect its ten heaviest words, both as plain words
# (topicWords) and as (normalised weight, word) pairs (topicWeightedWords).
topicWords = []
topicWeightedWords = []

for topic_idx, topic in enumerate(lda.components_):
    # Word ids sorted by descending weight, truncated to the top ten.
    topWordIds = topic.argsort()[::-1][:10]
    # Rescale the topic's word weights so they sum to one.
    weights = topic / topic.sum()

    wordsInTopic = [dic[wordId] for wordId in topWordIds]
    topicWeights = list(zip(weights[topWordIds], wordsInTopic))

    print("Topic #%d:" % topic_idx)
    print(" ".join(wordsInTopic))
    topicWords.append(wordsInTopic)
    topicWeightedWords.append(topicWeights)









    



Topic #0:
health care research disease patients new treatment people development netherlands
Topic #1:
kinderen onderzoek kind ziekte hersenen klachten behandeling hart patiënten ouders
Topic #2:
basisinkomen straling draadloze information lange-termijn oneindig knooppunten magnesium supergeleiding multimodale
Topic #3:
longziekten constructie gemengde bim antidepressiva schaliegas multi-scale longkanker response longaanval
Topic #4:
future fundamental physics leiding visiedocument nlr ruimtevaart insecten human hepatitis
Topic #5:
energie water duurzame brandstoffen zon elektrische fossiele elektriciteit wind warmte
Topic #6:
vrouwen mannen zeeland vliegtuigen lichaamseigen wkn getal neus top-tien bevalling
Topic #7:
mensen nieuwe maken mogelijk steeds vraag grote onderzoek leven nodig
Topic #8:
waarom muziek aarde mens massa wel stof komt zwaartekracht dood
Topic #9:
onderzoek sociale vraag samenleving ontwikkeling maatschappelijke onderwijs rol nederland kennis



In [12]:

    
def inRange(age, targetAge, delta):
    """Tell whether `age` lies in the closed interval targetAge +/- delta."""
    lowest = targetAge - delta
    highest = targetAge + delta
    return lowest <= age <= highest

def getPplCirca(targetAge, delta):
    """Select the rows of `data_ppl` whose 'Leeftijd' is close to `targetAge`.

    A row is kept when its 'Leeftijd' value lies in the closed interval
    [targetAge - delta, targetAge + delta]; missing values never match.

    Parameters
    ----------
    targetAge : centre of the age window.
    delta : half-width of the window (both ends inclusive).

    Returns
    -------
    The matching subset of the notebook-level DataFrame `data_ppl`.
    """
    # Vectorised comparison instead of a per-row Python lambda; it always
    # yields a boolean mask, also when `data_ppl` has no rows.
    nearTarget = data_ppl['Leeftijd'].between(targetAge - delta, targetAge + delta)
    return data_ppl[nearTarget]



In [13]:

    
# topicsByAge[a, t] is the mean weight of topic t over all questions asked by
# people whose age is within deltaAge years of a. Ages with no questions in
# their window keep an all-zero row.
deltaAge = 5
nAges = data_ppl['Leeftijd'].max() + 1
topicsByAge = np.zeros((nAges, nTopics))

for age in arange(nAges):
    groupTokens = getPplCirca(age, deltaAge)['SentToks'].tolist()
    groupSize = len(groupTokens)

    for qTokens in groupTokens:
        # Each question contributes its topic weights divided by the group
        # size, so the row ends up holding the group's average.
        topicsByAge[age, :] += getDocumentTopics(qTokens, lda) / groupSize



In [14]:

    
# One figure row per topic: a word cloud of its top words on the left and the
# topic's average weight as a function of age on the right.
figure(figsize=(16,40))
for idx, weightedWords in enumerate(topicWeightedWords):
    # The word cloud wants (word, weight) pairs; the stored pairs are
    # (weight, word), so swap them.
    frequencies = [(word, weight) for weight, word in weightedWords]
    cloud = WordCloud(background_color="white", relative_scaling=0.9)

    subplot(nTopics, 2, 2*idx + 1)
    imshow(cloud.generate_from_frequencies(frequencies))
    axis('off')

    subplot(nTopics, 2, 2*idx + 2)
    plot(topicsByAge[:, idx])
    # Fixed window: x from 10 to 100, y from 0 to 1.
    axis([10, 100, 0, 1.0])
    title('Topic #%2d'%(idx))