In [37]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.level = logging.INFO
Activate logging for Gensim so we can see that everything is working correctly. Gensim will, for example, warn if no C compiler is installed, in which case Word2Vec training will be awfully slow.
In [38]:
import re
import nltk
import os.path as path
from random import shuffle
from gensim.models import Word2Vec
Import NLTK to separate a document into sentences and import Gensim to actually train the Word2Vec models.
In [39]:
pattern = re.compile(r'[\W\d]')  # matches any non-word character or digit

# Make sure the Punkt sentence tokenizer models are available.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download('punkt')

def init_tokenizer(lang):
    model = 'tokenizers/punkt/{}.pickle'.format(lang.lower())
    return nltk.data.load(model)

def sentences(doc):
    """Split a document into sentences using the German Punkt model."""
    tokenizer = init_tokenizer("german")
    return tokenizer.tokenize(doc.strip())

def token(sentence):
    """Lowercase a sentence and split it into words,
    dropping punctuation, digits and one-letter tokens."""
    letters = pattern.sub(" ", sentence)
    words = letters.lower().split()
    return [word for word in words if len(word) > 1]

class FileCorpus:
    """Iterates over a list of text files, yielding one document per line."""
    def __init__(self, files, encoding='UTF-8'):
        self.files = files
        self.encoding = encoding

    def __iter__(self):
        for file in self.files:
            with open(file, encoding=self.encoding) as f:
                for doc in f:
                    yield doc

    def sentences(self):
        """Return all sentences of the corpus as lists of tokens."""
        documents = [sentences(document) for document in self]
        return [token(sentence) for document in documents for sentence in document]
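To sanity-check the pipeline before feeding it whole files, we can run it on a small inline string. This cell is a minimal sketch and not part of the training workflow; the sample text is invented:
In [ ]:
# Hypothetical smoke test: digits ("100") and one-letter tokens are dropped,
# everything is lowercased.
sample = "Die Partei stellte 100 Forderungen. Niemand las sie."
for s in sentences(sample):
    print(token(s))
# ['die', 'partei', 'stellte', 'forderungen']
# ['niemand', 'las', 'sie']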
In [40]:
def build_model(name, coll):
    sentences = FileCorpus(coll).sentences()
    shuffle(sentences)  # avoid ordering effects from concatenating the source files
    # Skip-gram (sg=1) with hierarchical softmax (hs=1), 100-dimensional vectors,
    # a context window of 2 and 10 training iterations.
    # (Gensim 3.x parameter names; in Gensim >= 4.0 'size' is 'vector_size' and 'iter' is 'epochs'.)
    model = Word2Vec(sentences, workers=4, iter=10, size=100, window=2, sg=1, hs=1)
    model.save(path.join('models', '{}.w2v'.format(name)))
    return model
In [41]:
npd = build_model('NPD', [path.join('data', file)
for file in ['NPD.txt', 'NPD_MV.txt', 'NPD_Sachsen.txt']])
spd = build_model('SPD', [path.join('data', file)
for file in ['SPD_Inland.txt', 'SPD_International.txt', 'SPD_Parteileben.txt']])
cdu = build_model('CDU', [path.join('data', file)
for file in ['CDU.txt', 'CDU_EU.txt', 'CDU_Fraktion.txt']])
fdp = build_model('FDP', [path.join('data', file)
for file in ['FDP.txt', 'FDP_Fraktion.txt']])
grüne = build_model('GRÜNE', [path.join('data', file)
for file in ['Grüne.txt', 'Grüne_Fraktion.txt']])
linke = build_model('LINKE', [path.join('data', file)
for file in ['Linke.txt', 'Linke_PR.txt', 'Linke_Fraktion.txt']])
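Because build_model persists every model with model.save, a later session can reload a trained model without retraining. A minimal sketch, assuming the models directory created above:
In [ ]:
from gensim.models import Word2Vec
# Reload the persisted NPD model from disk.
npd = Word2Vec.load(path.join('models', 'NPD.w2v'))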
In [52]:
npd.most_similar('flüchtlinge')
Out[52]:
In [51]:
spd.most_similar('flüchtlinge')
Out[51]:
In [50]:
cdu.most_similar('flüchtlinge')
Out[50]:
In [54]:
fdp.most_similar('flüchtlinge')
Out[54]:
In [55]:
grüne.most_similar('flüchtlinge')
Out[55]:
In [56]:
linke.most_similar('flüchtlinge')
Out[56]:
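Note that calling most_similar directly on the model is the Gensim 3.x API used throughout this notebook. Under Gensim >= 4.0 the word vectors live on the wv attribute, so the equivalent query would be:
In [ ]:
# Equivalent query for Gensim >= 4.0, where most_similar moved to KeyedVectors.
npd.wv.most_similar('flüchtlinge')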