In [1]:
import pandas as pd

# Read the git log export: two columns (timestamp, message text) separated by
# the \u0012 control character, no header row, decoded as latin-1.
msg = pd.read_csv(
    "../../linux/git_messages.log",
    sep="\u0012",
    encoding='latin-1',
    header=None,
    names=['timestamp', 'text'])
# The raw timestamp column is interpreted as seconds since the Unix epoch.
msg['timestamp'] = pd.to_datetime(msg['timestamp'], unit='s')
# determining the first real commit timestamp (taken from the last row of the log;
# NOTE(review): this assumes the log is ordered newest-first -- confirm)
first_commit_timestamp = msg.iloc[-1]['timestamp']
# determining the last sensible commit timestamp
last_commit_timestamp = pd.to_datetime('today')
# filtering out wrong timestamps.
# .copy() turns the filtered rows into an independent frame, so the column
# assignment below does not raise SettingWithCopyWarning.
msg = msg[
    (msg['timestamp'] >= first_commit_timestamp) &
    (msg['timestamp'] <= last_commit_timestamp)].copy()
# Force every message to a plain string (missing values become "nan").
msg['text'] = msg['text'].apply(str)
msg.head()
Out[1]:
In [2]:
# Words that are removed from every commit message (compared case-insensitively).
stopwords = ["add", "use", "fix", "patch", "make", "support", "update"]


def _drop_stopwords(message):
    """Return `message` without the space-separated words listed in `stopwords`."""
    kept = []
    for word in message.split(" "):
        if word.lower() not in stopwords:
            kept.append(word)
    return " ".join(kept)


msg['text'] = msg['text'].apply(_drop_stopwords)
msg.head()
Out[2]:
In [8]:
# Join all commit messages of each calendar year into one space-separated string.
# * "YS" (year start) is used instead of the alias "AS", which newer pandas
#   versions deprecate.
# * The 'text' column is selected before apply() so the applied function only
#   sees the messages and never the grouping column; applying over the whole
#   grouped frame is deprecated in newer pandas.
messages_per_year = msg.groupby(pd.Grouper(key='timestamp', freq="YS"))['text']
group_by_year = messages_per_year.apply(lambda texts: " ".join(map(str, texts)))
group_by_year.head()
Out[8]:
In [9]:
%matplotlib inline
from wordcloud import WordCloud
w=300
h=800
generate_image = lambda text : WordCloud(width=w, height=h, margin=0, prefer_horizontal=0.5).generate(text)
images = group_by_year.apply(generate_image)
images
Out[9]:
In [10]:
%matplotlib inline
import matplotlib.pyplot as plt
n = len(images)
my_dpi=96
#f, axarr = plt.subplots(1,n, figsize=(h/my_dpi, w*n/my_dpi), dpi=my_dpi)
f, axarr = plt.subplots(1,n, figsize=(40,80), dpi=my_dpi)
for i, image in enumerate(images):
a = axarr[i]
a.imshow(image)
a.set_xticklabels([])
a.set_yticklabels([])
plt.subplots_adjust(wspace=0, hspace=0)