In [ ]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Use a large default canvas and title font for every figure drawn below.
matplotlib.rcParams.update({
    'figure.figsize': (35.0, 15.0),
    'axes.titlesize': 30,
})
from os import path
import pandas as pd
import numpy as np
from corputil import FileCorpus, ListCorpus
from corputil.utils import load_stopwords
from sklearn.manifold import TSNE
from gensim.corpora import Dictionary
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.externals import joblib
# Load the stop-word list stored in data/german.txt.
# NOTE(review): `stopwords` is not referenced by any other visible cell (the
# TfidfVectorizer below is built without `stop_words`) - confirm whether it
# was meant to be passed to the vectorizer or can be dropped.
stopwords = load_stopwords(path.join('data', 'german.txt'))
In [ ]:
# Archive text files, one per news section. The order matters: `tags` below
# is aligned with this list position by position.
files = [
    path.join('data', 'Archive', file_name)
    for file_name in (
        'Spiegel_Politik.txt',
        'Spiegel_Wirtschaft.txt',
        'Spiegel_Wissenschaft.txt',
        'Spiegel_Auto.txt',
        'Spiegel_Bildung.txt',
        'Spiegel_Kultur.txt',
        'Spiegel_Panorama.txt',
        'Spiegel_Reise.txt',
        'Spiegel_Sport.txt',
        'Spiegel_Technik.txt',
        'Stern_Politik.txt',
        'Stern_Panorama.txt',
        'Stern_Wirtschaft.txt',
    )
]

# Class label assigned to every document of the file at the same index in
# `files`; several files share a label.
tags = [
    'Politics',    # Spiegel_Politik
    'Economy',     # Spiegel_Wirtschaft
    'Science',     # Spiegel_Wissenschaft
    'Car',         # Spiegel_Auto
    'Education',   # Spiegel_Bildung
    'Culture',     # Spiegel_Kultur
    'Society',     # Spiegel_Panorama
    'Travel',      # Spiegel_Reise
    'Sport',       # Spiegel_Sport
    'Technology',  # Spiegel_Technik
    'Politics',    # Stern_Politik
    'Society',     # Stern_Panorama
    'Economy',     # Stern_Wirtschaft
]

# Scatter-plot colour used for each class label.
colors = dict(
    Politics='red',
    Economy='blue',
    Science='green',
    Car='teal',
    Education='lime',
    Culture='purple',
    Society='orange',
    Travel='magenta',
    Sport='brown',
    Technology='cyan',
)
In [ ]:
# Read every archive file and collect one (text, label) pair per document.
# Each document comes back from `doc_token()` as an iterable of tokens and is
# re-joined into a single space-separated string for the vectorizer.
X, y = [], []
for source_file, tag in zip(files, tags):
    documents = [' '.join(tokens) for tokens in FileCorpus([source_file]).doc_token()]
    X.extend(documents)
    y.extend([tag] * len(documents))

# One row per document: the joined text in 'doc', its class label in 'tag'.
df = pd.DataFrame({'doc': X, 'tag': y}, columns=['doc', 'tag'])
In [ ]:
# Shuffle the rows so the positional train/test split below mixes all
# sections. A local, seeded generator is used so the split (and therefore the
# reported accuracy) is the same on every run, without touching NumPy's
# global random state.
df = df.iloc[np.random.RandomState(0).permutation(len(df))]
In [ ]:
# TF-IDF features: drop terms that occur in fewer than 5 documents or in more
# than half of them, and keep at most 100000 terms.
vectorizer = TfidfVectorizer(
    max_features=100000,
    max_df=0.5,
    min_df=5,
)
# The vocabulary and IDF weights are learned from every document in `df`.
vectorizer.fit(df.loc[:, 'doc'])
In [ ]:
# Positional train/test split of the shuffled corpus.
# The first TRAIN_SIZE rows are used for training and the remainder for
# testing. If the corpus does not have more than TRAIN_SIZE rows the original
# hard-coded cut would leave an empty test set, so fall back to an 80/20
# split in that case.
TRAIN_SIZE = 600000
n_train = TRAIN_SIZE if len(df) > TRAIN_SIZE else int(0.8 * len(df))
training = df.iloc[:n_train]
test = df.iloc[n_train:]

# NOTE(review): `vectorizer` was fitted on all of `df`, including the rows
# that end up in `test`, so the IDF weights have seen the test documents.
train_tfidf = vectorizer.transform(training['doc'])
test_tfidf = vectorizer.transform(test['doc'])
In [ ]:
# Linear SVM with default settings, trained on the TF-IDF matrix of the
# training rows against their section labels.
classifier = LinearSVC()
classifier.fit(X=train_tfidf, y=training['tag'])
In [ ]:
# Mean accuracy of the trained classifier on the held-out test rows; as the
# last expression of the cell its value is shown as the cell output.
classifier.score(test_tfidf, test['tag'])
In [ ]:
# joblib.dump(vectorizer, path.join('models', 'classifier', 'Vectorizer.pkl'))
# joblib.dump(classifier, path.join('models', 'classifier', 'Classifier.pkl'))
In [ ]:
def process_data(file):
    """Classify the articles in a CSV file and embed them in 2-D.

    The file is read as a '|'-separated, UTF-8 encoded CSV and must contain a
    'text' column. Each text is tokenised through `ListCorpus.doc_token()`,
    re-joined with spaces, vectorised with the module-level `vectorizer` and
    labelled with the module-level `classifier`. The TF-IDF matrix is reduced
    to 50 dimensions with TruncatedSVD and then to 2 dimensions with t-SNE.

    Both reduction steps use a fixed `random_state` so that repeated runs on
    the same file produce the same layout.

    Parameters
    ----------
    file : str
        Path of the CSV file to process.

    Returns
    -------
    pandas.DataFrame
        One row per article: columns 0 and 1 hold the t-SNE coordinates and
        column 'tag' holds the predicted label.
    """
    load = pd.read_csv(file, sep='|', encoding='utf-8')
    corpus = ListCorpus(list(load.loc[:, 'text']))
    # Join the tokens of each document back into one string, which is the
    # input format the fitted vectorizer was trained on.
    texts = [' '.join(tokens) for tokens in corpus.doc_token()]
    tfidf = vectorizer.transform(texts)
    labels = list(classifier.predict(tfidf))
    # Reduce the sparse TF-IDF matrix to 50 dense components first, then let
    # t-SNE work on that smaller representation.
    prep = TruncatedSVD(n_components=50, random_state=0).fit_transform(tfidf)
    reduced = TSNE(
        n_components=2,
        perplexity=40,
        verbose=2,
        random_state=0,
    ).fit_transform(prep)
    embedded = pd.DataFrame(reduced)
    embedded['tag'] = labels
    return embedded
def label_list(df):
    """Split a labelled frame into one sub-frame per tag.

    Tags are visited in a fixed order (Politics, Economy, Science, Car,
    Education, Travel, Sport, Technology, Society, Culture). Tags that have
    no rows in `df` are skipped, so every returned frame is non-empty and
    callers can safely read `points['tag'].iloc[0]`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'tag' column.

    Returns
    -------
    list of pandas.DataFrame
        The rows of `df` grouped by tag, in the fixed order above, without
        empty groups.
    """
    tag_order = (
        'Politics',
        'Economy',
        'Science',
        'Car',
        'Education',
        'Travel',
        'Sport',
        'Technology',
        'Society',
        'Culture',
    )
    groups = []
    for tag in tag_order:
        subset = df.loc[df['tag'] == tag]
        # A tag with no predicted rows would yield an empty frame and break
        # callers that look up the tag through `.iloc[0]`.
        if not subset.empty:
            groups.append(subset)
    return groups
In [ ]:
# Raw article dumps for calendar weeks 44 and 45 of 2015.
KW44 = path.join('data', 'CurrentNews', 'RAW_2015KW44.csv')
KW45 = path.join('data', 'CurrentNews', 'RAW_2015KW45.csv')

# Classify and embed each week (week 44 first), then split by predicted tag.
df_kw44, df_kw45 = [
    label_list(process_data(csv_path)) for csv_path in (KW44, KW45)
]
In [ ]:
# Side-by-side scatter plots of the 2-D embeddings for weeks 44 and 45, one
# colour per predicted tag, with a shared legend in the middle of the figure.
f = plt.figure()
plt.axis('off')
ax_kw44 = f.add_subplot(121)
ax_kw45 = f.add_subplot(122)
ax_kw44.set_title('Calendar Week 44\n26.10.2015 - 01.11.2015')
ax_kw45.set_title('Calendar Week 45\n02.11.2015 - 08.11.2015')

# Legend handles and labels, one entry per tag that appears in either week.
hx = []
lx = []
for ax, groups in ((ax_kw44, df_kw44), (ax_kw45, df_kw45)):
    # Booleans are required here: the string 'off' is truthy and does not
    # hide the tick labels on current matplotlib versions.
    ax.tick_params(labelbottom=False, labelleft=False)
    for points in groups:
        # A tag without any articles has no row to read the label from.
        if points.empty:
            continue
        tag = points['tag'].iloc[0]
        handle = ax.scatter(points[0], points[1], c=colors[tag], label=tag, marker='x')
        if tag not in lx:
            lx.append(tag)
            hx.append(handle)
f.legend(hx, lx, loc='center', fontsize=24)
plt.show()
In [ ]:
# Raw article dumps for calendar weeks 46 and 47 of 2015.
KW46 = path.join('data', 'CurrentNews', 'RAW_2015KW46.csv')
KW47 = path.join('data', 'CurrentNews', 'RAW_2015KW47.csv')

# Classify and embed each week (week 46 first), then split by predicted tag.
df_kw46, df_kw47 = [
    label_list(process_data(csv_path)) for csv_path in (KW46, KW47)
]
In [ ]:
# Side-by-side scatter plots of the 2-D embeddings for weeks 46 and 47, one
# colour per predicted tag, with a shared legend in the middle of the figure.
f = plt.figure()
plt.axis('off')
ax_kw46 = f.add_subplot(121)
ax_kw47 = f.add_subplot(122)
ax_kw46.set_title('Calendar Week 46\n09.11.2015 - 15.11.2015')
ax_kw47.set_title('Calendar Week 47\n16.11.2015 - 22.11.2015')

# Legend handles and labels, one entry per tag that appears in either week.
hx = []
lx = []
for ax, groups in ((ax_kw46, df_kw46), (ax_kw47, df_kw47)):
    # Booleans are required here: the string 'off' is truthy and does not
    # hide the tick labels on current matplotlib versions.
    ax.tick_params(labelbottom=False, labelleft=False)
    for points in groups:
        # A tag without any articles has no row to read the label from.
        if points.empty:
            continue
        tag = points['tag'].iloc[0]
        handle = ax.scatter(points[0], points[1], c=colors[tag], label=tag, marker='x')
        if tag not in lx:
            lx.append(tag)
            hx.append(handle)
f.legend(hx, lx, loc='center', fontsize=24)
plt.show()