In [1]:
from os import path
import pandas as pd
import numpy as np
from corputil import FileCorpus
from corputil.utils import load_stopwords
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, TruncatedSVD
from gensim.corpora import Dictionary
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
import joblib  # sklearn.externals.joblib is deprecated; recent scikit-learn removed it
import matplotlib.pyplot as plt

stopwords = load_stopwords(path.join('data', 'german.txt'))

In [2]:
files = [
    path.join('data', 'Archive', 'Zeit_Politik.txt'),
    path.join('data', 'Archive', 'Zeit_Wirtschaft.txt'),
    path.join('data', 'Archive', 'Zeit_Wissenschaft.txt'),
    path.join('data', 'Archive', 'Zeit_Auto.txt'),
    path.join('data', 'Archive', 'Zeit_Bildung.txt'),
    path.join('data', 'Archive', 'Zeit_Panorama.txt'),
    path.join('data', 'Archive', 'Spiegel_Politik.txt'),
    path.join('data', 'Archive', 'Spiegel_Wirtschaft.txt'),
    path.join('data', 'Archive', 'Spiegel_Wissenschaft.txt'),
    path.join('data', 'Archive', 'Spiegel_Auto.txt'),
    path.join('data', 'Archive', 'Spiegel_Bildung.txt'),
    path.join('data', 'Archive', 'Spiegel_Geschichte.txt'),
    path.join('data', 'Archive', 'Spiegel_Kultur.txt'),
    path.join('data', 'Archive', 'Spiegel_Panorama.txt'),
    path.join('data', 'Archive', 'Spiegel_Reise.txt'),
    path.join('data', 'Archive', 'Spiegel_Sport.txt'),
    path.join('data', 'Archive', 'Spiegel_Technik.txt'),
    path.join('data', 'Archive', 'Stern_Politik.txt'),
    path.join('data', 'Archive', 'Stern_Panorama.txt'),
    path.join('data', 'Archive', 'Stern_Wirtschaft.txt'),
    path.join('data', 'Archive', 'Handelsblatt_Politik.txt'),
    path.join('data', 'Archive', 'Handelsblatt_Wirtschaft.txt'),
    path.join('data', 'Archive', 'WiWo_Politik.txt'),
    path.join('data', 'Archive', 'WiWo_Wirtschaft.txt')]
# 1 = Politik feed, 0 = any other section (order matches the files list)
tags = [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0]
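
Since the tag vector mirrors the file list by position, deriving it from the file names keeps the two from drifting apart; an equivalent one-liner:

tags = [1 if 'Politik' in path.basename(f) else 0 for f in files]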

In [3]:
# Tokenize every document (stopwords removed) and record its binary tag.
X, y = [], []
for file, tag in zip(files, tags):
    for doc in FileCorpus(file).doc_token(stopwords=stopwords):
        X.append(' '.join(doc))
        y.append(tag)

df = pd.DataFrame()
df['doc'] = X
df['tag'] = y
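
Before training, it is worth checking how unbalanced the two classes are; a quick sketch:

print(df['tag'].value_counts())  # documents per class
print(df['tag'].mean())          # fraction tagged as Politik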

In [4]:
# Shuffle the rows so the positional train/test split below is random.
df = df.iloc[np.random.permutation(len(df))]

In [5]:
# Drop terms that occur in fewer than 20 documents or in more than half of them.
vectorizer = TfidfVectorizer(min_df=20, max_df=0.5)
vectorizer.fit(df['doc'])


Out[5]:
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=20,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
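
The fitted vocabulary shows how many features survive the min_df/max_df cut; a one-line check:

print(len(vectorizer.vocabulary_))  # number of distinct terms kept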

In [6]:
# Positional split: the first 600,000 shuffled documents for training, the rest for testing.
training = df[:600000]
test = df[600000:]
train_tfidf = vectorizer.transform(training['doc'])
test_tfidf = vectorizer.transform(test['doc'])
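
The positional split relies on the shuffle in In [4]; scikit-learn's train_test_split could do both steps at once and keep the class ratio stable via stratification. A sketch (the 80/20 ratio and the seed are assumptions, not taken from this notebook):

from sklearn.model_selection import train_test_split

training, test = train_test_split(df, test_size=0.2, stratify=df['tag'], random_state=42)
train_tfidf = vectorizer.transform(training['doc'])
test_tfidf = vectorizer.transform(test['doc'])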

In [7]:
classifier = LinearSVC()
classifier.fit(train_tfidf, training['tag'])


Out[7]:
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [8]:
classifier.score(test_tfidf, test['tag'])


Out[8]:
0.93381799223773387
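
Accuracy alone can be misleading when the classes are unbalanced; a confusion matrix and a per-class report give a fuller picture. A sketch:

from sklearn.metrics import classification_report, confusion_matrix

pred = classifier.predict(test_tfidf)
print(confusion_matrix(test['tag'], pred))
print(classification_report(test['tag'], pred))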

In [9]:
joblib.dump(vectorizer, path.join('models', 'classifier', 'Vectorizer.pkl'))
joblib.dump(classifier, path.join('models', 'classifier', 'Classifier.pkl'))


Out[9]:
['models\\classifier\\Classifier.pkl',
 'models\\classifier\\Classifier.pkl_01.npy',
 'models\\classifier\\Classifier.pkl_02.npy',
 'models\\classifier\\Classifier.pkl_03.npy']
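
To reuse the persisted models, load both files and push new text through the same pipeline. Note that incoming text has to be tokenized and stopword-filtered the same way as the training data (here via FileCorpus.doc_token). A minimal sketch with a made-up, already-preprocessed snippet:

vec = joblib.load(path.join('models', 'classifier', 'Vectorizer.pkl'))
clf = joblib.load(path.join('models', 'classifier', 'Classifier.pkl'))
sample = 'bundesregierung haushalt debatte'  # hypothetical pre-tokenized text
print(clf.predict(vec.transform([sample])))  # 1 = Politik, 0 = other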