In [ ]:
import logging
from os import path
from pprint import pprint
import pandas as pd
from corputil import ListCorpus
from corputil.utils import load_stopwords
from gensim.models import LdaMulticore, TfidfModel
from gensim.models.phrases import Phrases
from gensim.corpora import Dictionary

stopwords = load_stopwords(path.join('data', 'german.txt'))

In [ ]:
num_topics = 45
chunksize, iterations, passes = 200, 500, 10
labels = ['2015-44', '2015-45', '2015-46', '2015-47', '2015-48', '2015-49', '2015-50', '2015-51', 
          '2015-52', '2015-53', '2016-01', '2016-02', '2016-03', '2016-04', '2016-05', '2016-06']
files = [path.join('data', 'CurrentNews', 's_{}.csv'.format(label)) for label in labels]
output_model = [path.join('models', 'lda', '{}.lda'.format(label)) for label in labels]
output_dict = path.join('models', 'lda', 'Words.dict')
output_bigram = path.join('models', 'lda', 'Bigram.phrase')

In [ ]:
dfs = [pd.read_csv(file, sep='|', encoding='utf-8') for file in files]

In [ ]:
corpora = [ListCorpus(list(df.loc[:, 'text'])) for df in dfs]

In [ ]:
def create_phrase():
    """Train a bigram phrase model on all sentences from every weekly corpus."""
    sentences = [sentence for corpus in corpora for sentence in corpus.sentences_token(stopwords=stopwords)]
    bigram = Phrases(sentences)
    return bigram


def create_dict():
    """Build a dictionary over all bigram-transformed documents and prune extreme frequencies."""
    docs = [bigram[doc] for corpus in corpora for doc in corpus.doc_token(stopwords)]
    dictionary = Dictionary(docs)
    dictionary.filter_extremes()
    dictionary.compactify()
    return dictionary


def train_lda(corpus):
    """Convert one weekly corpus to tf-idf-weighted bag-of-words vectors and train an LDA model on them."""
    bow = [dictionary.doc2bow(bigram[doc]) for doc in corpus]
    tfidf = TfidfModel(bow)
    bow = tfidf[bow]
    lda = LdaMulticore(bow, id2word=dictionary, chunksize=chunksize,  # batch=True,
                       num_topics=num_topics, workers=2, passes=passes, iterations=iterations)
    return bow, lda

In [ ]:
bigram = create_phrase()
dictionary = create_dict()

models = []
docs = []

for i, corpus in enumerate(corpora):
    mmCorpus, model = train_lda(corpus.doc_token(stopwords=stopwords))
    models.append(model)
    docs.append(mmCorpus)
    model.save(output_model[i])

bigram.save(output_bigram)
dictionary.save(output_dict)
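
The saved artifacts can be reloaded later without retraining, e.g. for the pre-computation below or in a separate notebook. A minimal sketch, assuming the output paths defined above; loading only the first weekly model is purely illustrative:

In [ ]:
from gensim.models import LdaMulticore
from gensim.models.phrases import Phrases
from gensim.corpora import Dictionary

# Reload the persisted dictionary, bigram phrase model and one weekly LDA model.
dictionary = Dictionary.load(output_dict)
bigram = Phrases.load(output_bigram)
lda_first_week = LdaMulticore.load(output_model[0])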

Web Visualization Pre-Computation for GitHub Pages


In [ ]:
from pprint import pprint
import json
import numpy as np
from gensim.matutils import sparse2full, cossim

In [ ]:
def permutations(coll, window):
    """Return consecutive windows of the given size as tuples (a sliding window over coll)."""
    perms = []
    for frame in range(len(coll) - (window - 1)):
        perm = [coll[frame + i] for i in range(window)]
        perms.append(tuple(perm))
    return perms

def topic_cluster(model, label, threshold):
    """Collect all pairs of distinct topics within one model whose similarity exceeds the threshold."""
    data = []
    for i1 in range(model.num_topics):
        for i2 in range(model.num_topics):
            if i1 != i2:
                similarity = cossim(model.show_topic(i1), model.show_topic(i2))
                if similarity >= threshold:
                    entry = {
                        'week': label,
                        's-topic': i1,
                        'e-topic': i2,
                        'sim': similarity
                    }
                    data.append(entry)
    return data

def topic_chains(models, threshold):
    data = []
    for i, (first, second) in enumerate(permutations(models, 2)):
        for i1 in range(first.num_topics):
            for i2 in range(second.num_topics):
                similarity = cossim(first.show_topic(i1), second.show_topic(i2))
                if similarity >= threshold:
                    entry = {
                        's-week': labels[i],
                        's-topic': i1,
                        'e-week': labels[i + 1],
                        'e-topic': i2,
                        'sim': similarity
                    }
                    data.append(entry)
    return data
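
These helpers are not called in the cells shown here. A minimal usage sketch for topic_chains, in which the similarity threshold of 0.3 and the output file name 'TopicChains.json' are purely illustrative:

In [ ]:
# Link similar topics across consecutive weeks and dump the result for the web visualization.
chains = topic_chains(models, 0.3)
with open(path.join('data', 'Web', 'TopicChains.json'), 'w', encoding='utf-8') as f:
    json.dump(chains, f, indent=4)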

In [ ]:
def topic_words(model):
    """Return, for every topic, its top words and their probabilities."""
    data = model.show_topics(-1, formatted=False)  # -1 returns all topics
    topics = []
    for i, c in data:
        words = []
        probs = []
        for word, prob in c:
            words.append(word)
            probs.append(prob)
        topics.append([words, probs])
    return topics

def topic_allocation(corpus):
    """Assign each document to its most probable topic."""
    acc = []
    for vec in corpus:
        t_id = -1
        t_prob = -1
        for topic, prob in vec:
            if prob > t_prob:
                t_id = topic
                t_prob = prob
        acc.append(t_id)
    return acc

def get_topics(df, model, doc):
    transform = model[doc]
    topics = topic_words(model)
    df['topic'] = topic_allocation(transform)
    d = []
    for i, (topic, prob) in enumerate(topics):
        dc = dict()
        dc['id'] = i
        dc['words'] = topic
        dc['probs'] = prob
        dc['articles'] = df[df['topic'] == i].count()['topic'].item()  # number of articles assigned to this topic ('topic' is just a convenient column to count)
        if dc['articles'] > 0:
            d.append(dc)
    return d

topicData = dict()
for i, (model, doc) in enumerate(zip(models, docs)):
    df = dfs[i]
    topicData[labels[i]] = get_topics(df, model, doc)
with open(path.join('data', 'Web', 'Topics.json'), 'w', encoding='utf-8') as f:
    json.dump(topicData, f, indent=4)
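
As a quick sanity check, the written file can be read back and inspected. A small sketch, assuming the file from the previous cell; picking the first week's first topic is purely illustrative:

In [ ]:
# Reload Topics.json and print the top words of the first topic of the first week.
with open(path.join('data', 'Web', 'Topics.json'), 'r', encoding='utf-8') as f:
    topics = json.load(f)
pprint(topics[labels[0]][0]['words'])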

In [ ]:
def order_data(d):
    return [
        d['LINKE'],
        d['SPD'],
        d['GRÜNE'],
        d['FDP'],
        d['CDU'],
        d['NPD']
    ]

def to_array(df):
    for key in df.keys():
        df[key] = order_data(df[key])
    return df

complete = pd.concat(dfs)
complete = complete.loc[:, ['site', 'LINKE', 'SPD', 'GRÜNE', 'FDP', 'CDU', 'NPD']]
grouped = complete.groupby('site').mean()
json_data = to_array(grouped.to_dict('index'))
json_data['All'] = order_data(grouped.mean())

with open(path.join('data', 'Web', 'SiteSentiment.json'), 'w', encoding='utf-8') as f:
    json.dump(json_data, f, indent=4)

time_data = dict()
for i, (label, df) in enumerate(zip(labels, dfs)):
    sentiment = df.loc[:, ['site', 'LINKE', 'SPD', 'GRÜNE', 'FDP', 'CDU', 'NPD']]
    sentiment = sentiment.groupby('site').mean()
    json_data = to_array(sentiment.to_dict('index'))
    json_data['All'] = order_data(sentiment.mean())
    time_data[label] = json_data
    
with open(path.join('data', 'Web', 'SiteSentimentTime.json'), 'w', encoding='utf-8') as f:
    json.dump(time_data, f, indent=4)