notebook.community

Edit and run



In [4]:

    
import pandas as pd

# Tab-separated git log export: one commit per line, holding a combined
# "<unix timestamp> <UTC offset>" field followed by the author name.
URL = "https://raw.githubusercontent.com/feststelltaste/software-data/master/projects/buschmais-spring-petclinic-joa/git_log_unixtimestamp-author.log"
df = pd.read_csv(
    URL,
    sep="\t",
    names=["time", "author"],
)
# Break the combined field apart into the epoch seconds and the offset string.
df[["timestamp", "timezone"]] = df["time"].str.split(" ", expand=True)
df.head()









    Out[4]:







  
    
      
      time
      author
      timestamp
      timezone
    
  
  
    
      0
      1518180584 +0100
      JavaOnAutobahn
      1518180584
      +0100
    
    
      1
      1518180229 +0100
      JavaOnAutobahn
      1518180229
      +0100
    
    
      2
      1518179666 +0100
      JavaOnAutobahn
      1518179666
      +0100
    
    
      3
      1518104859 +0100
      Markus Harrer
      1518104859
      +0100
    
    
      4
      1518104723 +0100
      Markus Harrer
      1518104723
      +0100



In [47]:

    
import pandas as pd

# Load the git log: each line is "<unix timestamp> <UTC offset>\t<author>".
# NOTE(review): the first cell reads the same file with the default encoding;
# latin-1 is presumably a workaround for undecodable bytes -- confirm.
raw = pd.read_csv(
    URL,
    sep="\t",
    encoding="latin-1",
    header=None,
    names=['unix_timestamp', 'author'])

# create separate columns for time data
raw[['timestamp', 'timezone']] = raw['unix_timestamp'].str.split(" ", expand=True)
# convert timestamp data (epoch seconds -> naive UTC datetimes); cast to numbers
# first because parsing numeric *strings* together with `unit` is deprecated
raw['timestamp'] = pd.to_datetime(pd.to_numeric(raw['timestamp']), unit="s")
# add hourly offset data: the offset is formatted as +-hhmm, so the last two
# digits are minutes, not hundredths of an hour ("+0530" is 5.5 h, not 5.3 h)
offset_hhmm = pd.to_numeric(raw['timezone'])
offset_abs = offset_hhmm.abs()
offset_hours = offset_abs // 100 + (offset_abs % 100) / 60.0
raw['timezone_offset'] = offset_hours.where(offset_hhmm >= 0, -offset_hours)
# calculate the local time
raw["timestamp_local"] = raw['timestamp'] + pd.to_timedelta(raw['timezone_offset'], unit='h')

# filter out wrong timestamps: nothing may be older than the last row of the
# log or lie in the future. The timestamps are UTC, so the upper bound is the
# current UTC time rather than the local wall-clock time.
now_utc = pd.Timestamp.now(tz="UTC").tz_localize(None)
raw = raw[
    (raw['timestamp'] >= raw.iloc[-1]['timestamp']) &
    (raw['timestamp'] <= now_utc)]

git_authors = raw[['timestamp_local', 'timezone', 'author']].copy()
git_authors.head()









    Out[47]:







  
    
      
      timestamp_local
      timezone
      author
    
  
  
    
      3
      2018-02-08 16:47:39
      +0100
      Markus Harrer
    
    
      4
      2018-02-08 16:45:23
      +0100
      Markus Harrer
    
    
      5
      2018-02-08 16:44:16
      +0100
      Markus Harrer
    
    
      6
      2018-02-08 15:04:42
      +0100
      Markus Harrer
    
    
      7
      2017-11-22 17:54:34
      +0100
      Markus Harrer



In [48]:

    
%matplotlib inline
df.timezone.value_counts().plot.pie(figsize=(7,7), label="")









    Out[48]:





<matplotlib.axes._subplots.AxesSubplot at 0x1dcbfb8f6d8>



In [49]:

    
# Labels of the five most frequent timezones (value_counts sorts by frequency, descending).
pie_df["timezone"].value_counts().index[:5]









    Out[49]:





Index(['+0800', '+0200', '+0100', '+0000', '+0900'], dtype='object')



In [45]:

    
# Commit counts of the five most frequent timezones, indexed by offset string.
# nlargest needs numeric data, so it runs on the counts rather than on the raw
# (object-dtype) timezone column.
top5 = pie_df["timezone"].value_counts().nlargest(n=5)
top5









    



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-45-d0b66ee45931> in <module>()
----> 1 top5 = pie_df.timezone.nlargest()
      2 top5

C:\dev\apps\Anaconda3\lib\site-packages\pandas\core\series.py in nlargest(self, n, keep)
   2061         dtype: float64
   2062         """
-> 2063         return algorithms.SelectNSeries(self, n=n, keep=keep).nlargest()
   2064 
   2065     def nsmallest(self, n=5, keep='first'):

C:\dev\apps\Anaconda3\lib\site-packages\pandas\core\algorithms.py in nlargest(self)
    915 
    916     def nlargest(self):
--> 917         return self.compute('nlargest')
    918 
    919     def nsmallest(self):

C:\dev\apps\Anaconda3\lib\site-packages\pandas\core\algorithms.py in compute(self, method)
    952             raise TypeError("Cannot use method '{method}' with "
    953                             "dtype {dtype}".format(method=method,
--> 954                                                    dtype=dtype))
    955 
    956         if n <= 0:

TypeError: Cannot use method 'nlargest' with dtype object



In [42]:

    
# Collapse every timezone outside the top five into a single "Other" slice so
# the pie chart stays readable.
# `top5` holds commit counts indexed by timezone string, so membership has to
# be tested against its index; testing against the Series itself compares the
# timezone strings with the counts and never matches.
# Building the column with `where` keeps the timezone label for the top five
# and also creates `plot_data` on a fresh kernel instead of relying on a
# column left over from an earlier run.
df['plot_data'] = df['timezone'].where(df['timezone'].isin(top5.index), "Other")
df['plot_data'].value_counts().plot.pie(figsize=[7,7])









    Out[42]:





<matplotlib.axes._subplots.AxesSubplot at 0x1dcbfa7fdd8>

	time	author	timestamp	timezone
0	1518180584 +0100	JavaOnAutobahn	1518180584	+0100
1	1518180229 +0100	JavaOnAutobahn	1518180229	+0100
2	1518179666 +0100	JavaOnAutobahn	1518179666	+0100
3	1518104859 +0100	Markus Harrer	1518104859	+0100
4	1518104723 +0100	Markus Harrer	1518104723	+0100

	timestamp_local	timezone	author
3	2018-02-08 16:47:39	+0100	Markus Harrer
4	2018-02-08 16:45:23	+0100	Markus Harrer
5	2018-02-08 16:44:16	+0100	Markus Harrer
6	2018-02-08 15:04:42	+0100	Markus Harrer
7	2017-11-22 17:54:34	+0100	Markus Harrer