In [4]:
import pandas as pd

# Tab-separated git log export; each row looks like
# "<epoch seconds> <UTC offset>\t<author>" and the file has no header row.
URL = "https://raw.githubusercontent.com/feststelltaste/software-data/master/projects/buschmais-spring-petclinic-joa/git_log_unixtimestamp-author.log"

# Column names are supplied here because the file itself carries none.
df = pd.read_csv(URL, names=["time", "author"], sep="\t")

# Break "1518180584 +0100" apart at the blank into two new string columns.
df[["timestamp", "timezone"]] = df["time"].str.split(" ", expand=True)

# Preview the first rows.
df.head()


Out[4]:
time author timestamp timezone
0 1518180584 +0100 JavaOnAutobahn 1518180584 +0100
1 1518180229 +0100 JavaOnAutobahn 1518180229 +0100
2 1518179666 +0100 JavaOnAutobahn 1518179666 +0100
3 1518104859 +0100 Markus Harrer 1518104859 +0100
4 1518104723 +0100 Markus Harrer 1518104723 +0100

In [47]:
import pandas as pd

# Load the git log export. The file has no header row, so the column names
# are supplied explicitly; 'unix_timestamp' holds "<epoch seconds> <+/-HHMM>".
raw = pd.read_csv(
    URL,
    sep="\t",
    encoding="latin-1",
    header=None,
    names=['unix_timestamp', 'author'])

# create separate columns for time data
raw[['timestamp', 'timezone']] = raw['unix_timestamp'].str.split(" ", expand=True)

# convert timestamp data; the epoch seconds are cast to a number first
# because combining string input with `unit=` is deprecated in recent pandas
raw['timestamp'] = pd.to_datetime(pd.to_numeric(raw['timestamp']), unit="s")

# add hourly offset data: the offset is written as +/-HHMM, so "+0530" means
# 5 h 30 min (5.5 h), not 5.3 h -- hours and minutes must be split apart
# instead of dividing the whole number by 100
tz_hhmm = pd.to_numeric(raw['timezone'])
tz_sign = (tz_hhmm >= 0) * 2 - 1  # +1 for offsets east of UTC, -1 for west
tz_minutes = tz_sign * ((tz_hhmm.abs() // 100) * 60 + tz_hhmm.abs() % 100)
raw['timezone_offset'] = tz_minutes / 60.0

# calculate the local time; whole minutes are used so that offsets such as
# +0545 are applied exactly, without floating-point rounding of hours
raw["timestamp_local"] = raw['timestamp'] + pd.to_timedelta(tz_minutes, unit='m')

# filter out wrong timestamps: keep rows between the timestamp of the last
# row in the file and "now". 'timestamp' is naive UTC (it comes from epoch
# seconds), so the upper bound is taken in UTC too, not local wall-clock time.
utc_now = pd.Timestamp.now(tz='UTC').tz_localize(None)
raw = raw[
    (raw['timestamp'] >= raw.iloc[-1]['timestamp']) &
    (raw['timestamp'] <= utc_now)]

# Keep only the columns used by the analysis; copy() detaches the result
# from the filtered frame.
git_authors = raw[['timestamp_local', 'timezone', 'author']].copy()
git_authors.head()


Out[47]:
timestamp_local timezone author
3 2018-02-08 16:47:39 +0100 Markus Harrer
4 2018-02-08 16:45:23 +0100 Markus Harrer
5 2018-02-08 16:44:16 +0100 Markus Harrer
6 2018-02-08 15:04:42 +0100 Markus Harrer
7 2017-11-22 17:54:34 +0100 Markus Harrer

In [48]:
%matplotlib inline
df.timezone.value_counts().plot.pie(figsize=(7,7), label="")


Out[48]:
<matplotlib.axes._subplots.AxesSubplot at 0x1dcbfb8f6d8>

In [49]:
# Labels of the five most frequent timezone values in pie_df.
# NOTE(review): pie_df is not defined in the cells shown here -- confirm it
# exists after Restart & Run All.
pie_df["timezone"].value_counts().index[:5]


Out[49]:
Index(['+0800', '+0200', '+0100', '+0000', '+0900'], dtype='object')

In [45]:
# Count the rows per timezone first: nlargest() needs numeric data and raises
# TypeError when called directly on the object-dtype timezone column.
# n=5 is the default of nlargest(), spelled out here for readability.
top5 = pie_df["timezone"].value_counts().nlargest(n=5)
top5


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-45-d0b66ee45931> in <module>()
----> 1 top5 = pie_df.timezone.nlargest()
      2 top5

C:\dev\apps\Anaconda3\lib\site-packages\pandas\core\series.py in nlargest(self, n, keep)
   2061         dtype: float64
   2062         """
-> 2063         return algorithms.SelectNSeries(self, n=n, keep=keep).nlargest()
   2064 
   2065     def nsmallest(self, n=5, keep='first'):

C:\dev\apps\Anaconda3\lib\site-packages\pandas\core\algorithms.py in nlargest(self)
    915 
    916     def nlargest(self):
--> 917         return self.compute('nlargest')
    918 
    919     def nsmallest(self):

C:\dev\apps\Anaconda3\lib\site-packages\pandas\core\algorithms.py in compute(self, method)
    952             raise TypeError("Cannot use method '{method}' with "
    953                             "dtype {dtype}".format(method=method,
--> 954                                                    dtype=dtype))
    955 
    956         if n <= 0:

TypeError: Cannot use method 'nlargest' with dtype object

In [42]:
# Build the category column for the pie chart: keep the timezone label when it
# is one of the most frequent ones, otherwise lump the row into "Other".
# top5 is a value_counts() Series, so its *index* holds the timezone labels;
# isin(top5) would compare the labels against the counts and match nothing.
# where() fills every row, so the cell is idempotent and leaves no NaN
# entries that value_counts() would silently drop.
is_top_timezone = df['timezone'].isin(top5.index)
df['plot_data'] = df['timezone'].where(is_top_timezone, "Other")
df['plot_data'].value_counts().plot.pie(figsize=[7,7])


Out[42]:
<matplotlib.axes._subplots.AxesSubplot at 0x1dcbfa7fdd8>