In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
!ls -liLah ../../share/Data
In [5]:
# Read the comma-separated SPON_complete file into raw_data.
# skipinitialspace drops the blanks that follow each delimiter.
raw_data = pd.read_csv(
    "../../share/Data/SPON_complete",
    sep=",",
    skipinitialspace=True,
)
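Die Stopwörter stammen aus dem Repository [solariz/german_stopwords](https://github.com/solariz/german_stopwords) und stehen unter der MIT-Lizenz.
Die Liste wird im Notebook dynamisch um weitere Wörter erweitert.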
In [6]:
# Stopwords are downloaded and defined here
try:
with open("../german_stopwords_full.txt") as f:
STOPWORDS = [line.strip() for line in f if not line.startswith(";")]
except FileNotFoundError:
!wget https://raw.githubusercontent.com/solariz/german_stopwords/master/german_stopwords_full.txt
with open("../german_stopwords_full.txt") as f:
STOPWORDS = [line.strip() for line in f if not line.startswith(";")]
dynamic_stopwords = ["dass", "", " ", "worden", "jahren", "jahre", "jahr",
"heißt", "heißen", "müsse", "prozent"]
STOPWORDS += dynamic_stopwords
In [9]:
# Prepare the data for later processing.
## TODO: process as plain text instead of a DataFrame -- time- and memory-critical
# Set to True to rebuild the cleaned CSV; the cleaning is skipped otherwise.
clean_articles = False
data = raw_data
if clean_articles:
    # Work on a copy so that raw_data keeps the untouched article text.
    data = raw_data.copy()
    stopword_set = set(STOPWORDS)
    # Replace every run of non-word characters with a single blank so that the
    # words stay separated for the split below. The flag must be passed by
    # keyword: the fourth positional argument of re.sub is `count`.
    data['article'] = data['article'].map(
        lambda x: re.sub(r'\W+', ' ', str(x), flags=re.UNICODE))
    # Remove the stopwords; compare lower-cased because the list is lower-case.
    data['article'] = data['article'].map(
        lambda x: [item for item in x.split() if item.lower() not in stopword_set])
    # NOTE(review): this path is relative to the working directory, whereas the
    # input is read from ../../share/Data -- confirm the intended location.
    data.to_csv("./share/Data/SPON_complete_clean.csv", sep=',')
# Show only the first rows instead of dumping the whole frame.
data.head()
In [7]:
def count_words(source, stopwords=None):
    """Count how often each word occurs in the ``article`` column of ``source``.

    Every string in ``source['article']`` is split on whitespace. Each token
    is lower-cased and stripped of all non-word characters before counting.
    Entries that are not strings (e.g. missing articles) contribute no words.

    Parameters
    ----------
    source : pandas.DataFrame
        Frame with an ``article`` column.
    stopwords : iterable of str, optional
        Words that are excluded from the result. Defaults to the notebook-wide
        ``STOPWORDS`` list.

    Returns
    -------
    pandas.DataFrame
        Columns ``words`` and ``count``, sorted ascending by ``count``.
    """
    if stopwords is None:
        stopwords = STOPWORDS
    stopword_set = set(stopwords)

    # Tokenise row by row instead of split(expand=True).stack(): that builds a
    # frame as wide as the longest article, and rows without tokens ended up
    # being counted as the literal word "nan".
    non_word = re.compile(r'\W+')
    tokens = [
        non_word.sub('', token.lower())
        for text in source['article']
        if isinstance(text, str)
        for token in text.split()
    ]

    words = pd.DataFrame({'words': pd.Series(tokens, dtype=object)})
    kept = words[~words['words'].isin(stopword_set)]
    return (
        kept.groupby(['words'])
        .size()
        .reset_index(name='count')
        .sort_values(by='count')
    )
In [8]:
def wordcounter(source, word):
    """Return the word counts of ``source`` for all words containing ``word``.

    ``source`` is passed unchanged to ``count_words``. ``word`` is used as a
    regular-expression pattern and matched anywhere inside each counted word.
    """
    counts = count_words(source)
    if isinstance(word, str):
        # count_words lower-cases every word, so a case-sensitive search for
        # e.g. "Merkel" could never match.
        mask = counts['words'].str.contains(word, case=False)
    else:
        # Compiled patterns carry their own flags; pandas rejects `case` here.
        mask = counts['words'].str.contains(word)
    return counts[mask]
In [9]:
# Month suffixes of the form ".MM.YYYY", one per month from .01.2001 to .12.2016,
# ordered by year first and month second.
datestrings = [
    ".{:02d}.20{:02d}".format(month, year)
    for year in range(1, 17)
    for month in range(1, 13)
]
In [10]:
def top25words_ofCategory(m, c):
    """Return the 25 most frequent words of the matching rows of ``raw_data``.

    A row matches when its ``day`` value contains ``m`` and its ``cats`` value
    contains ``c``; missing values never match. Both arguments are handed to
    ``str.contains`` and are therefore interpreted as regular expressions
    (a "." in ``m`` matches any character).

    The result is indexed by ``words`` and holds the ``count`` column.
    """
    day_matches = raw_data.day.str.contains(m, na=False)
    category_matches = raw_data.cats.str.contains(c, na=False)
    selection = raw_data[day_matches & category_matches]
    word_counts = count_words(selection)
    return word_counts.nlargest(25, columns=['count', ]).set_index('words')
In [13]:
# Set gen to True to (re)write one CSV per year (2001-2016) holding the 25 most
# frequent words of the category 'Politik'.
gen = False
if gen:
    # Plain loop instead of a throw-away list comprehension assigned to `_`,
    # which would shadow IPython's last-output variable.
    for year in range(2001, 2017):
        # Written to ../../share/Data, the directory the notebook reads from;
        # the previous spelling "DATA" is a different directory on
        # case-sensitive file systems.
        top25words_ofCategory(str(year), 'Politik').to_csv(
            "../../share/Data/politics_top25words_{}.csv".format(year))
Out[13]: