In [37]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.level = logging.INFO

Activate logging so we can see that Gensim is working correctly. Gensim will, for example, warn us if no C compiler is installed, in which case Word2Vec training will be awfully slow.


In [38]:
import re
import nltk
import os.path as path
from random import shuffle
from gensim.models import Word2Vec

Import NLTK to split documents into sentences and Gensim to actually train the Word2Vec models.


In [39]:
pattern = re.compile(r'[\W\d]')  # matches any non-word character or digit

# Download the Punkt sentence tokenizer models if they are not available yet.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download('punkt')


def init_tokenizer(lang):
    """Load the pre-trained Punkt sentence tokenizer for the given language."""
    model = 'tokenizers/punkt/{}.pickle'.format(lang.lower())
    return nltk.data.load(model)


def sentences(doc):
    """Split a document into sentences using the German Punkt model."""
    tokenizer = init_tokenizer("german")
    return tokenizer.tokenize(doc.strip())


def token(sentence):
    """Lowercase a sentence, strip punctuation and digits, drop single-character tokens."""
    letters = pattern.sub(" ", sentence)
    words = letters.lower().split()
    return [word for word in words if len(word) > 1]


class FileCorpus:
    """Iterates over a list of text files, yielding one document (line) at a time."""

    def __init__(self, files, encoding='UTF-8'):
        self.files = files
        self.encoding = encoding

    def __iter__(self):
        for file in self.files:
            with open(file, encoding=self.encoding) as f:
                for doc in f:
                    yield doc

    def sentences(self):
        """Return the whole corpus as a list of tokenized sentences."""
        documents = [sentences(document) for document in self]
        return [token(sentence) for document in documents for sentence in document]
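
A quick sanity check of the preprocessing (the example sentence below is made up, not taken from the corpus): token() lowercases the text, replaces digits and punctuation with spaces and drops single-character tokens, while sentences() splits a document with the pre-trained German Punkt model.

token("Die 32 Flüchtlinge kamen im Jahr 2015 an.")
# expected: ['die', 'flüchtlinge', 'kamen', 'im', 'jahr', 'an']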

In [40]:
def build_model(name, coll):
    # Collect the tokenized sentences from all files and shuffle them,
    # so the training data is not ordered by source file.
    sentences = FileCorpus(coll).sentences()
    shuffle(sentences)
    # Skip-gram (sg=1) with hierarchical softmax (hs=1), 100-dimensional
    # vectors, a context window of 2 and 10 training iterations.
    model = Word2Vec(sentences, workers=4, iter=10, size=100, window=2, sg=1, hs=1)
    model.save(path.join('models', '{}.w2v'.format(name)))
    return model
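
Note that this notebook uses the pre-4.0 Gensim API. In Gensim 4.x the keyword arguments were renamed (size became vector_size, iter became epochs), and most_similar moved to the KeyedVectors object. A minimal sketch of the equivalent call, assuming Gensim >= 4.0:

model = Word2Vec(sentences, workers=4, epochs=10, vector_size=100, window=2, sg=1, hs=1)
model.wv.most_similar('flüchtlinge')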

In [41]:
npd = build_model('NPD', [path.join('data', file) 
                          for file in ['NPD.txt', 'NPD_MV.txt', 'NPD_Sachsen.txt']])
spd = build_model('SPD', [path.join('data', file) 
                          for file in ['SPD_Inland.txt', 'SPD_International.txt', 'SPD_Parteileben.txt']])
cdu = build_model('CDU', [path.join('data', file) 
                          for file in ['CDU.txt', 'CDU_EU.txt', 'CDU_Fraktion.txt']])
fdp = build_model('FDP', [path.join('data', file) 
                          for file in ['FDP.txt', 'FDP_Fraktion.txt']])
grüne = build_model('GRÜNE', [path.join('data', file) 
                          for file in ['Grüne.txt', 'Grüne_Fraktion.txt']])
linke = build_model('LINKE', [path.join('data', file) 
                          for file in ['Linke.txt', 'Linke_PR.txt', 'Linke_Fraktion.txt']])

In [52]:
npd.most_similar('flüchtlinge')


Out[52]:
[('asylanten', 0.5858976244926453),
 ('ausländer', 0.583103358745575),
 ('billiglöhner', 0.5816749930381775),
 ('zuwanderer', 0.5705763101577759),
 ('arbeitslose', 0.5674145817756653),
 ('kampfflugzeuge', 0.5511696934700012),
 ('jugendbanden', 0.5349843502044678),
 ('asylbewerber', 0.5338039398193359),
 ('arbeitssuchende', 0.5335099697113037),
 ('millionen', 0.5315791964530945)]

In [51]:
spd.most_similar('flüchtlinge')


Out[51]:
[('menschen', 0.6967843174934387),
 ('zuwanderer', 0.6696538925170898),
 ('unbegleitete', 0.6220597624778748),
 ('asylbewerber', 0.6165891885757446),
 ('einwanderer', 0.6147927045822144),
 ('kinder', 0.5995903015136719),
 ('ausländer', 0.5932613015174866),
 ('bürgerkriegsflüchtlinge', 0.5786679983139038),
 ('chinesen', 0.5778629779815674),
 ('flüchtlingskinder', 0.5742260217666626)]

In [50]:
cdu.most_similar('flüchtlinge')


Out[50]:
[('christen', 0.645157516002655),
 ('flüchtlingen', 0.6439053416252136),
 ('jesiden', 0.6176376342773438),
 ('flăźchtlinge', 0.6155807971954346),
 ('christinnen', 0.5799580812454224),
 ('menschen', 0.5666396617889404),
 ('binnenvertriebenen', 0.5659310817718506),
 ('bedrängter', 0.5651735067367554),
 ('asylsuchende', 0.5582743287086487),
 ('notleidenden', 0.5532867312431335)]

In [54]:
fdp.most_similar('flüchtlinge')


Out[54]:
[('asylbewerber', 0.6653411984443665),
 ('flüchtlingen', 0.659360408782959),
 ('zuwanderer', 0.6475093364715576),
 ('fachkräfte', 0.6308250427246094),
 ('jugendliche', 0.6167842149734497),
 ('menschen', 0.6118870973587036),
 ('lehrlinge', 0.6055551767349243),
 ('kinder', 0.5942135453224182),
 ('geringqualifizierten', 0.5866140723228455),
 ('jugendlichen', 0.5852203369140625)]

In [55]:
grüne.most_similar('flüchtlinge')


Out[55]:
[('menschen', 0.7058668732643127),
 ('flüchtlingen', 0.6527441740036011),
 ('unbegleitete', 0.6172116994857788),
 ('somalische', 0.6075341105461121),
 ('ghettos', 0.6022582650184631),
 ('christen', 0.5936412811279297),
 ('schutzbedürftige', 0.589066743850708),
 ('abschiebestopp', 0.5862842798233032),
 ('schutzsuchende', 0.5857851505279541),
 ('roma', 0.5851101279258728)]

In [56]:
linke.most_similar('flüchtlinge')


Out[56]:
[('schutzsuchende', 0.7579900622367859),
 ('schutzsuchenden', 0.7520430684089661),
 ('flüchtlingen', 0.7362326979637146),
 ('asylbewerber', 0.7226961851119995),
 ('asylsuchende', 0.6943240165710449),
 ('asylsuchenden', 0.6649428606033325),
 ('menschen', 0.6570167541503906),
 ('roma', 0.6551729440689087),
 ('staatenlosen', 0.6306562423706055),
 ('schutzsuchender', 0.6280955076217651)]