In [1]:
cd ~/projekte/openbsd/openbsd-src/
In [2]:
# Write one line per commit to ../commits: the author date (%ai) and the
# author name (%an), separated by ", ".
!git log --format=format:"%ai, %an" > ../commits
In [3]:
cd ..
In [4]:
ls
In [5]:
# Peek at the first lines of the generated file.
!head commits
In [6]:
import pandas as pd
In [7]:
# Load the commit list. The file has no header row, so the two column names
# are supplied explicitly; %time reports how long the parse takes.
%time df=pd.read_csv("commits", header=None, names=["time", "author"])
In [8]:
# Display the loaded frame.
df
Out[8]:
Es gibt mehr als 142000 Commits seit Oktober 1995.
In [9]:
# First rows, as a quick sanity check of the parsed columns.
df.head()
Out[9]:
In [10]:
# Commits per author name (the names have not been whitespace-stripped yet).
df.author.value_counts()
Out[10]:
In echte Zeiten konvertieren (dauert lange):
In [11]:
%time df.index = pd.to_datetime(df['time'])
In [12]:
# Inspect the last rows after the index assignment.
df.tail()
Out[12]:
In [13]:
# Put the commits into chronological order.
df = df.sort_index()
In [14]:
# Inspect the last rows again, now that the index is sorted.
df.tail()
Out[14]:
In [15]:
# The raw timestamp strings are no longer needed once they are the index.
df.drop('time', axis=1, inplace=True)
In [16]:
# Constant 1 per commit, so that sums and cumulative sums over this column
# count commits.
df["c"]=1
In [17]:
# Look at one raw author value by position. The frame is indexed by
# timestamps, so .iloc is used instead of a bare integer key.
df['author'].iloc[2343]
Out[17]:
Leerzeichen am Anfang und Ende von Autorennamen entfernen:
In [18]:
# Remove leading/trailing whitespace from the author names. The vectorised
# string accessor also tolerates missing values, unlike a per-element lambda.
df['author'] = df['author'].str.strip()
In [19]:
# Check the frame after stripping the author names.
df.head()
Out[19]:
In [20]:
# Number of commits for each distinct author name.
commits_per_person = df["author"].value_counts()
In [21]:
# Summary statistics of the per-author commit counts.
commits_per_person.describe()
Out[21]:
307 Committer? (zumindest 307 verschiedene Commit-Autorennamen)
In [22]:
# Enable inline plotting. %pylab also pulls numpy and matplotlib names into
# the interactive namespace; later cells use np, arange, plot, title, legend,
# savefig and xlabel unqualified and depend on this.
%pylab inline
Größe der Grafik heraufsetzen:
In [23]:
import matplotlib.pyplot as plt
# Default figure size (width, height in inches) for all following plots.
# matplotlib.pyplot has no figsize() function; the rcParams entry is the
# supported way to set it.
plt.rcParams["figure.figsize"] = (10, 6)
In [24]:
# Plot the commit counts of all authors, in the order returned by value_counts().
commits_per_person.plot()
Out[24]:
In [25]:
# The first 30 entries of the per-author counts.
top30 = commits_per_person.head(30)
top30
Out[25]:
In [26]:
# Horizontal bar chart of the top30 selection.
top30.plot(kind="barh")
Out[26]:
In [27]:
# Commit count of a single author, looked up by label.
commits_per_person.loc['markus']
Out[27]:
In [28]:
# Commit counts for a hand-picked list of author names.
# .loc replaces the .ix indexer, which has been removed from pandas.
genuesen = ["markus", "bluhm", "mpf", "hshoexer", "grunk"]
genu_commits = commits_per_person.loc[genuesen]
genu_commits
Out[28]:
In [29]:
# All commits whose author name is exactly "bluhm".
bluhm = df.loc[df["author"] == "bluhm"]
In [30]:
# All commits whose author name is exactly "mpf".
mpf = df.loc[df["author"] == "mpf"]
In [31]:
# Cumulative commit counts of the two authors over time, in one figure.
bluhm["c"].cumsum().plot(style="r", label="bluhm")
mpf["c"].cumsum().plot(style="--", label="mpf")
plt.title("mpf vs. bluhm")
plt.legend(loc="best")
Out[31]:
In [32]:
# Per-author subsets for three more author names.
markus = df.loc[df["author"] == "markus"]
hshoexer = df.loc[df["author"] == "hshoexer"]
grunk = df.loc[df["author"] == "grunk"]
In [33]:
# Cumulative commit counts of all five selected authors in one figure,
# which is also written to a PDF file.
bluhm["c"].cumsum().plot(style="r.", label="bluhm", alpha=0.2)
mpf["c"].cumsum().plot(style="--.", label="mpf", alpha=0.2)
hshoexer["c"].cumsum().plot(style="g-..", label="hshoexer")
grunk["c"].cumsum().plot(style="m--.", label="grunk")
markus["c"].cumsum().plot(style='k:.', label="markus", alpha=0.2)
plt.legend(loc=0)
plt.savefig("genucommits.pdf")
In [34]:
# Earliest bluhm commit (the frame was sorted by timestamp before subsetting).
bluhm.head(1)
Out[34]:
In [35]:
# Earliest mpf commit (the frame was sorted by timestamp before subsetting).
mpf.head(1)
Out[35]:
In [36]:
# Earliest markus commit (the frame was sorted by timestamp before subsetting).
markus.head(1)
Out[36]:
In [37]:
# Most recent hshoexer commit (the frame was sorted by timestamp before subsetting).
hshoexer.tail(1)
Out[37]:
In [38]:
# Most recent grunk commit (the frame was sorted by timestamp before subsetting).
grunk.tail(1)
Out[38]:
In [39]:
# All commits from 2013, selected by partial-string indexing on the
# timestamp index. .loc replaces the .ix indexer, which has been removed
# from pandas.
df.loc['2013']
Out[39]:
In [40]:
# Cumulative number of commits over time, all authors together.
df.c.cumsum().plot()
Out[40]:
Recht stetiges Wachstum. Wird aber langsamer. Hmm...??
In [41]:
# Cumulative commit count at monthly resolution. Only the counter column is
# resampled and the aggregation is a method call: the how= keyword of
# resample() has been removed from pandas.
cmon = df["c"].resample("M").sum().cumsum()
In [42]:
# Cumulative monthly counts as a plain array; this is the target of the
# polynomial fit below.
y=cmon.values
y
Out[42]:
In [43]:
# Month number 0..n-1 for each monthly data point.
x = np.arange(len(cmon))
x
Out[43]:
In [44]:
# Least-squares fit of a degree-3 polynomial to the cumulative monthly
# counts; p holds the coefficients, highest power first.
p=np.polyfit(x,y, 3)
p
Out[44]:
In [45]:
# Evaluation range for the fitted polynomial: months 0..299
# (presumably longer than the observed range, to extrapolate - TODO confirm).
x1 = np.arange(300)
In [46]:
# Observed cumulative counts together with the fitted polynomial (red).
plt.plot(x, y)
plt.plot(x1, np.polyval(p, x1), "r")
plt.xlabel("months")
Out[46]:
In [47]:
# Maximum of the fitted polynomial over the evaluation range x1.
np.polyval(p,x1).max()
Out[47]:
In [48]:
# Largest observed cumulative count, for comparison with the fit.
y.max()
Out[48]:
In [49]:
# Month number at which the fitted polynomial reaches its maximum.
pd.Series(np.polyval(p,x1)).idxmax()
Out[49]:
In [50]:
# Position of the largest x value, i.e. the number of the last observed month.
pd.Series(x).idxmax()
Out[50]:
In [51]:
# Commits per calendar day. Only the counter column is resampled and the
# aggregation is a method call: the how= keyword of resample() has been
# removed from pandas.
per_day = df[["c"]].resample("D").sum()
In [52]:
# First rows of the daily sums.
per_day.head()
Out[52]:
In [53]:
# Highest number of commits on a single day.
per_day.c.max()
Out[53]:
In [54]:
# The day on which that maximum occurred.
per_day.c.idxmax()
Out[54]:
In [55]:
# All commits of one specific day, selected by partial-string indexing.
# .loc replaces the .ix indexer, which has been removed from pandas.
df.loc["2001-06-25"]
Out[55]:
In [56]:
# Commits per calendar month and the largest monthly total. Only the counter
# column is resampled and the aggregation is a method call: the how= keyword
# of resample() has been removed from pandas.
per_month = df[["c"]].resample("M").sum()
per_month.max()
Out[56]:
In [57]:
# Month bin with the highest commit count.
per_month.idxmax()
Out[57]:
Die meisten Commits gab es im Juni 2001: 1699 Stück
In [58]:
# Summary statistics of the monthly commit counts.
per_month.c.describe()
Out[58]:
Die Anzahl der Commits schwankt stark.
In [59]:
# Monthly commit counts over time.
per_month.c.plot()
Out[59]:
In [60]:
# 15-month moving average of the monthly commit counts. The .rolling()
# method replaces pd.rolling_mean, which has been removed from pandas.
per_month.c.rolling(15).mean().plot()
Out[60]:
Neue Spalte erzeugen mit dem Zeitstempel, denn der Index lässt sich nicht so gut auswerten.
In [61]:
# Copy the timestamp index into a regular column so that it can be processed
# with column methods.
df["datetime"]=df.index
df.head()
Out[61]:
In [62]:
# ISO weekday number of every commit timestamp.
df['weekday'] = df['datetime'].map(lambda stamp: stamp.isoweekday())
Montag = 1, Sonntag = 7
In [63]:
# Check the new weekday column.
df.head()
Out[63]:
In [64]:
# Keep only the commit counter and the weekday number.
# .loc replaces the .ix indexer, which has been removed from pandas.
wd = df.loc[:, ["c", "weekday"]]
In [65]:
# Group the commit counter by weekday number.
per_weekday=wd.groupby("weekday")
In [66]:
# Total number of commits per weekday.
per_weekday.sum()
Out[66]:
Die meisten Commits am Wochenanfang. Am Wochenende weniger (insbesondere Samstag).
In [67]:
# Bar chart of the commits per weekday.
per_weekday.sum().plot(kind="bar")
Out[67]:
In [68]:
# Hour of day (0-23) of every commit timestamp.
df['hour'] = df['datetime'].map(lambda stamp: stamp.hour)
In [69]:
# Total number of commits for each hour of the day, shown as a bar chart.
per_hour = df['c'].groupby(df['hour']).sum()
per_hour.plot(kind="bar")
Out[69]:
Die meisten Commits abends.