notebook.community

Edit and run



In [ ]:

    
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Notebook-wide plot defaults: a very wide canvas (two panels side by side)
# and a large title font.
matplotlib.rcParams.update({
    'figure.figsize': (35.0, 15.0),
    'axes.titlesize': 30,
})


from os import path
import pandas as pd
import numpy as np
from corputil import FileCorpus, ListCorpus
from corputil.utils import load_stopwords
from sklearn.manifold import TSNE
from gensim.corpora import Dictionary
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.externals import joblib


# German stop-word list loaded via corputil.
# NOTE(review): `stopwords` is not referenced by any cell visible in this
# notebook (the TfidfVectorizer is built without it) - confirm whether that
# is intentional.
stopwords_path = path.join('data', 'german.txt')
stopwords = load_stopwords(stopwords_path)



In [ ]:

    
# Archive file -> category label. Every document read from a file receives
# that file's label; several files may share one label.
archive_sources = [
    ('Spiegel_Politik.txt', 'Politics'),
    ('Spiegel_Wirtschaft.txt', 'Economy'),
    ('Spiegel_Wissenschaft.txt', 'Science'),
    ('Spiegel_Auto.txt', 'Car'),
    ('Spiegel_Bildung.txt', 'Education'),
    ('Spiegel_Kultur.txt', 'Culture'),
    ('Spiegel_Panorama.txt', 'Society'),
    ('Spiegel_Reise.txt', 'Travel'),
    ('Spiegel_Sport.txt', 'Sport'),
    ('Spiegel_Technik.txt', 'Technology'),
    ('Stern_Politik.txt', 'Politics'),
    ('Stern_Panorama.txt', 'Society'),
    ('Stern_Wirtschaft.txt', 'Economy'),
]
files = [path.join('data', 'Archive', file_name) for file_name, _ in archive_sources]
tags = [label for _, label in archive_sources]

# Scatter-plot colour for each category label.
colors = {
    'Politics': 'red',
    'Economy': 'blue',
    'Science': 'green',
    'Car': 'teal',
    'Education': 'lime',
    'Culture': 'purple',
    'Society': 'orange',
    'Travel': 'magenta',
    'Sport': 'brown',
    'Technology': 'cyan',
}



In [ ]:

    
# Read every archive file and collect one whitespace-joined text per document
# (X) together with the label of the file it came from (y).
# presumably doc_token() yields one token sequence per document - confirm in corputil
X, y = [], []
for file, tag in zip(files, tags):
    docs = [' '.join(tokens) for tokens in FileCorpus([file]).doc_token()]
    X.extend(docs)
    y.extend([tag] * len(docs))

# Two-column frame: document text and its category label.
df = pd.DataFrame({'doc': X, 'tag': y})



In [ ]:

    
# Shuffle the rows so the positional train/test split taken later mixes all
# categories. A fixed seed keeps the split - and therefore the reported
# accuracy - identical across Restart & Run All.
shuffle_rng = np.random.RandomState(0)
df = df.iloc[shuffle_rng.permutation(len(df))]



In [ ]:

    
# TF-IDF model: ignore terms seen in fewer than 5 documents or in more than
# half of them, and keep at most 100000 features.
vectorizer = TfidfVectorizer(min_df=5, max_df=0.5, max_features=100000)
# Learn vocabulary and IDF weights from the training rows only (the first
# 600000 shuffled rows - the same boundary the train/test split uses), so the
# held-out documents do not influence the features they are evaluated with.
vectorizer.fit(df['doc'].iloc[:600000])



In [ ]:

    
# Positional split of the shuffled frame: the first 600000 rows train the
# classifier, everything after that is held out for evaluation.
training = df.iloc[:600000]
test = df.iloc[600000:]

# Turn both parts into TF-IDF matrices with the already fitted vectorizer.
train_tfidf, test_tfidf = (
    vectorizer.transform(part['doc']) for part in (training, test)
)



In [ ]:

    
# Linear SVM with default settings, trained on the TF-IDF features of the
# training rows against their category labels.
classifier = LinearSVC()
classifier.fit(X=train_tfidf, y=training['tag'])



In [ ]:

    
# Mean accuracy on the held-out rows (displayed as the cell output).
classifier.score(X=test_tfidf, y=test['tag'])



In [ ]:

    
# joblib.dump(vectorizer, path.join('models', 'classifier', 'Vectorizer.pkl'))
# joblib.dump(classifier, path.join('models', 'classifier', 'Classifier.pkl'))



In [ ]:

    
def process_data(file, random_state=0):
    """Classify the articles of one CSV export and embed them in 2-D.

    The file is read as a '|'-separated, UTF-8 encoded CSV and its 'text'
    column is tokenised through ``ListCorpus``. The resulting documents are
    vectorised with the notebook-level ``vectorizer``, labelled with the
    notebook-level ``classifier``, reduced to 50 dimensions with
    ``TruncatedSVD`` and finally projected to 2 dimensions with t-SNE.

    Parameters
    ----------
    file : str
        Path of the CSV file to process.
    random_state : int, optional
        Seed passed to both ``TruncatedSVD`` and ``TSNE``. The default of 0
        is the value the SVD step has always used; seeding t-SNE as well
        makes the resulting layout reproducible between runs.

    Returns
    -------
    pandas.DataFrame
        One row per document with the two t-SNE coordinates in columns
        ``0`` and ``1`` and the predicted category in column ``'tag'``.
    """
    raw = pd.read_csv(file, sep='|', encoding='utf-8')
    token_docs = ListCorpus(list(raw.loc[:, 'text'])).doc_token()
    texts = [' '.join(doc) for doc in token_docs]

    tfidf = vectorizer.transform(texts)
    labels = list(classifier.predict(tfidf))

    # SVD first: reduce the sparse TF-IDF matrix to 50 dense components
    # before handing it to t-SNE.
    prep = TruncatedSVD(n_components=50, random_state=random_state).fit_transform(tfidf)
    reduced = TSNE(n_components=2, perplexity=40, verbose=2,
                   random_state=random_state).fit_transform(prep)

    # Named `embedded` rather than `df` to avoid shadowing the notebook-level
    # training frame.
    embedded = pd.DataFrame(reduced)
    embedded['tag'] = labels
    return embedded

def label_list(df):
    """Split ``df`` into one sub-frame per category.

    Returns a list of ten DataFrames, each holding the rows of ``df`` whose
    'tag' column equals one category, in this fixed order: Politics, Economy,
    Science, Car, Education, Travel, Sport, Technology, Society, Culture.
    A category without matching rows yields an empty frame.
    """
    ordered_tags = ('Politics', 'Economy', 'Science', 'Car', 'Education',
                    'Travel', 'Sport', 'Technology', 'Society', 'Culture')
    tag_column = df['tag']
    return [df.loc[tag_column == name] for name in ordered_tags]



In [ ]:

    
# Raw news exports for calendar weeks 44 and 45 of 2015.
news_dir = path.join('data', 'CurrentNews')
KW44 = path.join(news_dir, 'RAW_2015KW44.csv')
KW45 = path.join(news_dir, 'RAW_2015KW45.csv')

# Classify + embed each week, then split the result into per-category frames.
df_kw44, df_kw45 = (
    label_list(process_data(csv_file)) for csv_file in (KW44, KW45)
)



In [ ]:

    
# Side-by-side t-SNE maps for calendar weeks 44 and 45, one colour per
# predicted category, with a shared legend in the centre of the figure.
f = plt.figure()
plt.axis('off')
ax_kw44 = f.add_subplot(121)
ax_kw45 = f.add_subplot(122)
ax_kw44.set_title('Calendar Week 44\n26.10.2015 - 01.11.2015')
ax_kw45.set_title('Calendar Week 45\n02.11.2015 - 08.11.2015')
# t-SNE coordinates carry no meaningful scale, so the tick labels are hidden.
# tick_params expects booleans here; the legacy string 'off' is a truthy
# value and is not honoured by current Matplotlib.
ax_kw44.tick_params(labelbottom=False, labelleft=False)
ax_kw45.tick_params(labelbottom=False, labelleft=False)
hx = []
lx = []
for ax, frames in ((ax_kw44, df_kw44), (ax_kw45, df_kw45)):
    for points in frames:
        if len(points) == 0:
            # No article was assigned to this category this week; indexing
            # .iloc[0] on the empty frame would raise IndexError.
            continue
        tag = points['tag'].iloc[0]
        handle = ax.scatter(points[0], points[1], c=colors[tag], label=tag, marker='x')
        # One legend entry per category, including categories that only
        # occur in the right-hand panel.
        if tag not in lx:
            lx.append(tag)
            hx.append(handle)
f.legend(hx, lx, loc='center', fontsize=24)
plt.show()



In [ ]:

    
# Raw news exports for calendar weeks 46 and 47 of 2015.
news_dir = path.join('data', 'CurrentNews')
KW46 = path.join(news_dir, 'RAW_2015KW46.csv')
KW47 = path.join(news_dir, 'RAW_2015KW47.csv')

# Classify + embed each week, then split the result into per-category frames.
df_kw46, df_kw47 = (
    label_list(process_data(csv_file)) for csv_file in (KW46, KW47)
)



In [ ]:

    
# Side-by-side t-SNE maps for calendar weeks 46 and 47, one colour per
# predicted category, with a shared legend in the centre of the figure.
f = plt.figure()
plt.axis('off')
ax_kw46 = f.add_subplot(121)
ax_kw47 = f.add_subplot(122)
ax_kw46.set_title('Calendar Week 46\n09.11.2015 - 15.11.2015')
ax_kw47.set_title('Calendar Week 47\n16.11.2015 - 22.11.2015')
# t-SNE coordinates carry no meaningful scale, so the tick labels are hidden.
# tick_params expects booleans here; the legacy string 'off' is a truthy
# value and is not honoured by current Matplotlib.
ax_kw46.tick_params(labelbottom=False, labelleft=False)
ax_kw47.tick_params(labelbottom=False, labelleft=False)
hx = []
lx = []
for ax, frames in ((ax_kw46, df_kw46), (ax_kw47, df_kw47)):
    for points in frames:
        if len(points) == 0:
            # No article was assigned to this category this week; indexing
            # .iloc[0] on the empty frame would raise IndexError.
            continue
        tag = points['tag'].iloc[0]
        handle = ax.scatter(points[0], points[1], c=colors[tag], label=tag, marker='x')
        # One legend entry per category, including categories that only
        # occur in the right-hand panel.
        if tag not in lx:
            lx.append(tag)
            hx.append(handle)
f.legend(hx, lx, loc='center', fontsize=24)
plt.show()