Calculate Political Opinion Models


In [ ]:
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
# logging.root.level = logging.INFO

from os import path
from random import shuffle
from corputil import FileCorpus, ListCorpus
from corputil.utils import load_stopwords
from gensim.models.word2vec import LineSentence, Word2Vec

In [ ]:
def _party_files(*filenames):
    """Return the full paths of the given party text files under data/Politics."""
    return [path.join('data', 'Politics', name) for name in filenames]

# Source documents per party: general programme, EU programme, and
# parliamentary-group (Fraktion) texts where available.
spd = _party_files('SPD.txt', 'SPD_EU.txt', 'SPD_Fraktion.txt')

linke = _party_files('Linke.txt', 'Linke_EU.txt', 'Linke_Fraktion.txt')

gruene = _party_files('Grüne.txt', 'Grüne_EU.txt', 'Grüne_Fraktion.txt')

fdp = _party_files('FDP.txt', 'FDP_EU.txt', 'FDP_Fraktion.txt')

cdu = _party_files('CDU.txt', 'CDU_EU.txt', 'CDU_Fraktion.txt')

# The NPD has no EU programme file here; two state-level Fraktion corpora
# plus the youth organisation are used instead.
npd = _party_files('NPD_Fraktion_MV.txt', 'NPD_Fraktion_Sachsen.txt', 'NPD_Jung.txt')

# Display names for the parties; order must stay aligned with `corpora`
# below, as the two lists are zipped together during training.
parties = ['Linke', 'SPD', 'Gruene', 'FDP', 'CDU', 'NPD']

# One FileCorpus per party, built from the file lists defined above.
corpora = [FileCorpus(sources)
           for sources in (linke, spd, gruene, fdp, cdu, npd)]

Training the Base Model

Calculate the base model from the German Wikipedia corpus; it is later used as the starting point for training the per-party classification models.


In [ ]:
# Stream one sentence per line from the pre-tokenised German Wikipedia dump.
wiki_corpus = path.join('data', 'Archive', 'Corpus_Wiki.txt')
sentences = LineSentence(wiki_corpus)
# Skip-gram (sg=1), 100-dimensional vectors, context window of 2,
# 4 training iterations, 4 worker threads.
base = Word2Vec(sentences, sg=1, size=100, window=2, iter=4, workers=4)

Save the model to disk. Don't finalize the model, because we still need to train it with new data later!


In [ ]:
# Persist the (still trainable) base model, then drop the references so the
# kernel can free the memory before the per-party training below.
base_model_path = path.join('models', 'word2vec', 'Base.w2v')
base.save(base_model_path)
base = sentences = None

Training the Classifier


In [ ]:
base_path = path.join('models', 'word2vec', 'Base.w2v')
for party, corpus in zip(parties, corpora):
    # Materialise and shuffle the party's tokenised sentences.
    training_sentences = list(corpus.sentences_token())
    shuffle(training_sentences)
    # Start each party model from a fresh copy of the wiki base model so the
    # parties don't contaminate one another.
    party_model = Word2Vec.load(base_path)
    # NOTE(review): assumes a gensim train() signature without a required
    # `epochs` argument (pre-1.0 API) — confirm against the installed version.
    party_model.train(training_sentences, total_examples=len(training_sentences))
    party_model.save(path.join('models', 'word2vec', '{}.w2v'.format(party)))

Political Ideology Detection


In [ ]:
# Paths of the per-party models trained above (order matches `parties`).
models = [path.join('models', 'word2vec', '{}.w2v'.format(party)) for party in parties]
# ISO year-week labels of the news dumps to score.
labels = ['2015-44', '2015-45', '2015-46', '2015-47', '2015-48', '2015-49', '2015-50', '2015-51', 
          '2015-52', '2015-53', '2016-01', '2016-02', '2016-03', '2016-04', '2016-05', '2016-06']
# Input news CSVs and the matching scored output files ('s_' prefix).
# Note: .format() is applied to the path component (not the joined path) for
# consistency with the `models` comprehension above; the resulting strings
# are identical either way.
files = [path.join('data', 'CurrentNews', '{}.csv'.format(label)) for label in labels]
out = [path.join('data', 'CurrentNews', 's_{}.csv'.format(label)) for label in labels]

In [ ]:
import pandas as pd
import numpy as np


def calc_score(doc, mod):
    """Return the log-likelihood scores of the sentences in `doc` under the
    Word2Vec model stored at path `mod`.

    NOTE(review): gensim's score() presumably requires a model trained with
    hierarchical softmax (hs=1) — confirm against the base model settings.
    """
    w2v = Word2Vec.load(mod)
    return w2v.score(doc, len(doc))

# Taken from Matt Taddy: https://github.com/TaddyLab/gensim/blob/deepir/docs/notebooks/deepir.ipynb
def calc_probability(df, mods):
    docs = list(ListCorpus(list(df.loc[:, 'text'])).doc_sentences_token())
    sentlist = [s for d in docs for s in d]
    llhd = np.array( [ calc_score(sentlist, m) for m in mods ] )
    lhd = np.exp(llhd - llhd.max(axis=0))
    prob = pd.DataFrame( (lhd/lhd.sum(axis=0)).transpose() )
    prob["doc"] = [i for i,d in enumerate(docs) for s in d]
    prob = prob.groupby("doc").mean()
    return prob

# raw = pd.concat([pd.read_csv(file, sep='|', encoding='utf-8') for file in files], ignore_index=True)
# prob = calc_probability(raw, models)
# data = pd.concat([raw, prob], axis=1)
# data.groupby('site').mean()

# Score each weekly news dump and write it back out with one probability
# column per party appended.
for src_path, dest_path in zip(files, out):
    news = pd.read_csv(src_path, sep='|', encoding='utf-8')
    party_probs = calc_probability(news, models)
    scored = pd.concat([news, party_probs], axis=1)
    # The probability columns come back as integers 0..5; give them the
    # party names (order matches `models`).
    scored.rename(columns={ 0: 'LINKE', 1: 'SPD', 2: 'GRÜNE', 3: 'FDP', 4: 'CDU', 5: 'NPD'  }, inplace=True)
    scored.to_csv(dest_path, index=False, encoding='utf-8', sep='|')