In [ ]:
import logging
from os import path
from pprint import pprint
import pandas as pd
from corputil import ListCorpus
from corputil.utils import load_stopwords
from gensim.models import LdaMulticore, TfidfModel
from gensim.models.phrases import Phrases
from gensim.corpora import Dictionary
stopwords = load_stopwords(path.join('data', 'german.txt'))
In [ ]:
num_topics = 45
chunksize, iterations, passes = 200, 500, 10
labels = ['2015-44', '2015-45', '2015-46', '2015-47', '2015-48', '2015-49', '2015-50', '2015-51',
'2015-52', '2015-53', '2016-01', '2016-02', '2016-03', '2016-04', '2016-05', '2016-06']
files = [path.join('data', 'CurrentNews', 's_{}.csv').format(label) for label in labels]
output_model = [path.join('models', 'lda', '{}.lda').format(label) for label in labels]
output_dict = path.join('models', 'lda', 'Words.dict')
output_bigram = path.join('models', 'lda', 'Bigram.phrase')
In [ ]:
dfs = [pd.read_csv(file, sep='|', encoding='utf-8') for file in files]
In [ ]:
corpora = [ListCorpus(list(df.loc[:, 'text'])) for df in dfs]
In [ ]:
def create_phrase():
    """Train a bigram Phrases model on the tokenized sentences of all corpora."""
    sentences = [sentence for corpus in corpora for sentence in corpus.sentences_token(stopwords=stopwords)]
    bigram = Phrases(sentences)
    return bigram

def create_dict():
    """Build a Dictionary over the bigram-transformed documents of all corpora."""
    docs = [bigram[doc] for corpus in corpora for doc in corpus.doc_token(stopwords)]
    dictionary = Dictionary(docs)
    dictionary.filter_extremes()  # drop very rare and very frequent tokens
    dictionary.compactify()       # reassign ids to close gaps left by filtering
    return dictionary

def train_lda(corpus):
    """Convert a tokenized corpus to TF-IDF-weighted bag-of-words vectors and train an LDA model on them."""
    bow = [dictionary.doc2bow(bigram[doc]) for doc in corpus]
    tfidf = TfidfModel(bow)
    bow = tfidf[bow]
    lda = LdaMulticore(bow, id2word=dictionary, chunksize=chunksize,  # batch=True,
                       num_topics=num_topics, workers=2, passes=passes, iterations=iterations)
    return bow, lda
In [ ]:
bigram = create_phrase()
dictionary = create_dict()
models = []
docs = []
for i, corpus in enumerate(corpora):
    mmCorpus, model = train_lda(corpus.doc_token(stopwords=stopwords))
    models.append(model)
    docs.append(mmCorpus)
    model.save(output_model[i])
bigram.save(output_bigram)
dictionary.save(output_dict)
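The saved artifacts can be restored in a later session without retraining; a minimal sketch, assuming the output paths defined above:
In [ ]:
# Sketch: reload the persisted bigram model, dictionary and weekly LDA models
# (not needed while the objects are still in memory).
from gensim.models import LdaMulticore
from gensim.models.phrases import Phrases
from gensim.corpora import Dictionary
bigram = Phrases.load(output_bigram)
dictionary = Dictionary.load(output_dict)
models = [LdaMulticore.load(p) for p in output_model]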
In [ ]:
from pprint import pprint
import json
import numpy as np
from gensim.matutils import sparse2full, cossim
In [ ]:
def permutations(coll, window):
    """Return all consecutive windows of length `window` as tuples (a sliding window, despite the name)."""
    perms = []
    for frame in range(len(coll) - (window - 1)):
        perm = [coll[frame + i] for i in range(window)]
        perms.append(tuple(perm))
    return perms

def topic_cluster(model, label, threshold):
    """Pairwise cosine similarities between the topics of a single model, keeping pairs above `threshold`."""
    data = []
    for i1 in range(model.num_topics):
        for i2 in range(model.num_topics):
            if i1 != i2:
                similarity = cossim(model.show_topic(i1), model.show_topic(i2))
                if similarity >= threshold:
                    entry = {
                        'week': label,
                        's-topic': i1,
                        'e-topic': i2,
                        'sim': similarity
                    }
                    data.append(entry)
    return data

def topic_chains(models, threshold):
    """Link topics of consecutive weekly models whose cosine similarity is at least `threshold`."""
    data = []
    for i, (first, second) in enumerate(permutations(models, 2)):
        for i1 in range(first.num_topics):
            for i2 in range(second.num_topics):
                similarity = cossim(first.show_topic(i1), second.show_topic(i2))
                if similarity >= threshold:
                    entry = {
                        's-week': labels[i],
                        's-topic': i1,
                        'e-week': labels[i + 1],
                        'e-topic': i2,
                        'sim': similarity
                    }
                    data.append(entry)
    return data
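The two helpers above are only defined here; a minimal usage sketch follows, where the 0.2 similarity threshold is an illustrative assumption rather than a value from this notebook:
In [ ]:
# Sketch: link topics across consecutive weeks and within each week (threshold 0.2 is assumed).
chains = topic_chains(models, 0.2)
clusters = [entry for model, label in zip(models, labels) for entry in topic_cluster(model, label, 0.2)]
pprint(chains[:5])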
In [ ]:
def topic_words(model):
    """Return, for every topic, its top words and their probabilities as [words, probs] pairs."""
    data = model.show_topics(-1, formatted=False)
    topics = []
    for i, c in data:
        words = []
        probs = []
        for word, prob in c:
            words.append(word)
            probs.append(prob)
        topics.append([words, probs])
    return topics

def topic_allocation(corpus):
    """Assign each document to its most probable topic."""
    acc = []
    for vec in corpus:
        t_id = -1
        t_prob = -1
        for topic, prob in vec:
            if prob > t_prob:
                t_id = topic
                t_prob = prob
        acc.append(t_id)
    return acc

def get_topics(df, model, doc):
    """Attach the dominant topic to each article and collect word/probability data for every non-empty topic."""
    transform = model[doc]
    topics = topic_words(model)
    df['topic'] = topic_allocation(transform)
    d = []
    for i, (topic, prob) in enumerate(topics):
        dc = dict()
        dc['id'] = i
        dc['words'] = topic
        dc['probs'] = prob
        dc['articles'] = df[df['topic'] == i].count()['topic'].item()  # just pick a column... here 'topic'
        if dc['articles'] > 0:
            d.append(dc)
    return d

topicData = dict()
for i, (model, doc) in enumerate(zip(models, docs)):
    df = dfs[i]
    topicData[labels[i]] = get_topics(df, model, doc)
with open(path.join('data', 'Web', 'Topics.json'), 'w', encoding='utf-8') as f:
    json.dump(topicData, f, indent=4)
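As an optional sanity check on the exported structure, the number of non-empty topics per week can be printed with the already-imported pprint:
In [ ]:
# Sketch: how many non-empty topics were kept per week (not part of the export).
pprint({label: len(topics) for label, topics in topicData.items()})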
In [ ]:
def order_data(d):
    """Return the party sentiment values in a fixed order for the web frontend."""
    return [
        d['LINKE'],
        d['SPD'],
        d['GRÜNE'],
        d['FDP'],
        d['CDU'],
        d['NPD']
    ]

def to_array(df):
    """Replace each site's dict of party values with an ordered list."""
    for key in df.keys():
        df[key] = order_data(df[key])
    return df

# Mean sentiment per party, grouped by site, over all weeks.
complete = pd.concat(dfs)
complete = complete.loc[:, ['site', 'LINKE', 'SPD', 'GRÜNE', 'FDP', 'CDU', 'NPD']]
grouped = complete.groupby('site').mean()
json_data = to_array(grouped.to_dict('index'))
json_data['All'] = order_data(grouped.mean())
with open(path.join('data', 'Web', 'SiteSentiment.json'), 'w', encoding='utf-8') as f:
    json.dump(json_data, f, indent=4)

# The same aggregation, computed separately for every week.
time_data = dict()
for i, (label, df) in enumerate(zip(labels, dfs)):
    sentiment = df.loc[:, ['site', 'LINKE', 'SPD', 'GRÜNE', 'FDP', 'CDU', 'NPD']]
    sentiment = sentiment.groupby('site').mean()
    json_data = to_array(sentiment.to_dict('index'))
    json_data['All'] = order_data(sentiment.mean())
    time_data[label] = json_data
with open(path.join('data', 'Web', 'SiteSentimentTime.json'), 'w', encoding='utf-8') as f:
    json.dump(time_data, f, indent=4)