In [1]:
# Imports for the whole notebook.
# NOTE(review): TSNE, KMeans, PCA, Dictionary, TruncatedSVD and plt are
# never used in the cells below — left in place in case other notebooks
# copy this cell, but they are candidates for removal.
from os import path
import pandas as pd
import numpy as np
from corputil import FileCorpus
from corputil.utils import load_stopwords
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from gensim.corpora import Dictionary
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed
# in 0.23 — prefer the standalone joblib package, falling back to the old
# location so the notebook still runs on legacy environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
import matplotlib.pyplot as plt
# German stopword list applied during tokenization below.
stopwords = load_stopwords(path.join('data', 'german.txt'))
In [2]:
# One (publisher, sections) entry per news source; files are named
# '<publisher>_<section>.txt' under data/Archive.  Deriving both `files`
# and `tags` from this single table replaces the original hand-maintained
# parallel tag list, which was easy to get out of sync with the file list.
SOURCES = [
    ('Zeit', ['Politik', 'Wirtschaft', 'Wissenschaft', 'Auto', 'Bildung',
              'Panorama']),
    ('Spiegel', ['Politik', 'Wirtschaft', 'Wissenschaft', 'Auto', 'Bildung',
                 'Geschichte', 'Kultur', 'Panorama', 'Reise', 'Sport',
                 'Technik']),
    ('Stern', ['Politik', 'Panorama', 'Wirtschaft']),
    ('Handelsblatt', ['Politik', 'Wirtschaft']),
    ('WiWo', ['Politik', 'Wirtschaft']),
]
files = [path.join('data', 'Archive', '{}_{}.txt'.format(publisher, section))
         for publisher, sections in SOURCES
         for section in sections]
# Binary target: 1 for politics sections, 0 for everything else
# (matches the original tag assignment exactly).
tags = [1 if section == 'Politik' else 0
        for _, sections in SOURCES
        for section in sections]
In [3]:
# Read every corpus file, tokenize each article (dropping the German
# stopwords), and collect one (space-joined text, label) pair per article.
X, y = [], []
for file, tag in zip(files, tags):
    docs = FileCorpus(file).doc_token(stopwords=stopwords)
    labelled = [(' '.join(tokens), tag) for tokens in docs]
    X.extend(text for text, _ in labelled)
    y.extend(label for _, label in labelled)
# One row per article: 'doc' holds the token string, 'tag' the target.
df = pd.DataFrame()
df['doc'] = X
df['tag'] = y
In [4]:
# Shuffle rows before the positional train/test split below.  The original
# used an unseeded np.random.permutation, so the split (and therefore the
# reported score) changed on every run; a fixed random_state makes the
# whole notebook reproducible under Restart & Run All.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
In [5]:
# TF-IDF features over the token strings: ignore terms appearing in fewer
# than 20 documents or in more than half of all documents.  fit() returns
# the vectorizer itself, so constructing and fitting can be chained.
# NOTE(review): this fits on the full corpus, including rows that later
# become the test split — document frequencies leak into the test
# features; consider fitting on the training split only.
vectorizer = TfidfVectorizer(min_df=20, max_df=0.5).fit(df['doc'])
Out[5]:
In [6]:
# Positional split of the shuffled frame: first TRAIN_SIZE rows train the
# model, the remainder is held out for evaluation.  Named constant
# replaces the magic number that previously appeared twice.
TRAIN_SIZE = 600000
training = df[:TRAIN_SIZE]
test = df[TRAIN_SIZE:]
# Transform only — the vectorizer was already fitted above; re-fitting
# here would produce inconsistent vocabularies between the two matrices.
train_tfidf = vectorizer.transform(training['doc'])
test_tfidf = vectorizer.transform(test['doc'])
In [7]:
# Linear SVM on the sparse TF-IDF matrix.  LinearSVC's underlying
# liblinear optimization is stochastic; a fixed random_state makes the
# fitted model (and the score below) reproducible across runs.
classifier = LinearSVC(random_state=42)
classifier.fit(train_tfidf, training['tag'])
Out[7]:
In [8]:
# Mean accuracy of the politics/other classifier on the held-out rows.
classifier.score(test_tfidf, test['tag'])
Out[8]:
In [9]:
# Persist the fitted vectorizer and classifier for reuse by other
# notebooks/scripts.  Create the target directory first so dump() does
# not fail with FileNotFoundError on a fresh checkout.
import os
model_dir = path.join('models', 'classifier')
os.makedirs(model_dir, exist_ok=True)
joblib.dump(vectorizer, path.join(model_dir, 'Vectorizer.pkl'))
joblib.dump(classifier, path.join(model_dir, 'Classifier.pkl'))
Out[9]: