In [4]:
import pandas as pd

# Location of the tab-separated log file that is loaded below.
URL = "https://raw.githubusercontent.com/feststelltaste/software-data/master/projects/buschmais-spring-petclinic-joa/git_log_unixtimestamp-author.log"

# Read the two tab-separated fields into named columns.
df = pd.read_csv(URL, sep="\t", names=["time", "author"])

# The "time" field carries two space-separated parts; give each its own column.
time_parts = df["time"].str.split(" ", expand=True)
df[["timestamp", "timezone"]] = time_parts

df.head()
Out[4]:
In [47]:
import pandas as pd

# Load the tab-separated log into two columns: a combined time field and the author.
raw = pd.read_csv(
    URL,
    sep="\t",
    encoding="latin-1",
    header=None,
    names=['unix_timestamp', 'author'])

# create separate columns for time data
raw[['timestamp', 'timezone']] = raw['unix_timestamp'].str.split(" ", expand=True)

# convert timestamp data
# The split yields strings; cast to numbers first, because parsing strings
# together with `unit=` is deprecated in recent pandas versions.
raw['timestamp'] = pd.to_datetime(pd.to_numeric(raw['timestamp']), unit="s")

# add hourly offset data
# The offset is written as +-HHMM, so the last two digits are minutes, not
# hundredths of an hour: "+0530" must become 5.5 hours, not 5.3.
offset_hhmm = pd.to_numeric(raw['timezone'])
offset_abs = offset_hhmm.abs()
offset_minutes = (offset_abs // 100) * 60 + offset_abs % 100
offset_minutes = offset_minutes.where(offset_hhmm >= 0, -offset_minutes)
raw['timezone_offset'] = offset_minutes / 60.0

# calculate the local time (using whole minutes avoids float rounding of hours)
raw["timestamp_local"] = raw['timestamp'] + pd.to_timedelta(offset_minutes, unit='m')

# filter out wrong timestamps
# Lower bound: the timestamp of the last row.
# NOTE(review): this presumably is the first commit (log sorted newest first) - confirm.
# Upper bound: the current time in UTC, since the parsed timestamps are UTC-based.
first_timestamp = raw.iloc[-1]['timestamp']
now_utc = pd.Timestamp.now(tz="UTC").tz_localize(None)
raw = raw[
    (raw['timestamp'] >= first_timestamp) &
    (raw['timestamp'] <= now_utc)]

git_authors = raw[['timestamp_local', 'timezone', 'author']].copy()
git_authors.head()
Out[47]:
In [48]:
%matplotlib inline
df.timezone.value_counts().plot.pie(figsize=(7,7), label="")
Out[48]:
In [49]:
# Show the five most frequent timezone values.
# Uses `df` (the frame the other timezone cells work on); the previously
# referenced `pie_df` is not defined in any visible cell, so the cell failed
# on a fresh kernel.
df["timezone"].value_counts().head(5).index
Out[49]:
In [45]:
# Counts of the five most frequent timezone values (index: timezone, value: count).
# Built from `df`, the frame the other timezone cells work on; the previously
# referenced `pie_df` is not defined in any visible cell.
top5 = df["timezone"].value_counts().nlargest(5)
top5
In [42]:
# Keep the timezone for rows whose timezone is one of the top entries and
# collapse every other row into "Other".
# `top5` is a value_counts() result: its index holds the timezone values and
# its values hold counts, so membership must be tested against `top5.index`
# (isin(top5) would compare timezones against the counts and match nothing).
# Assigning the whole column also makes the cell work when 'plot_data' does
# not exist yet.
is_top_timezone = df['timezone'].isin(top5.index)
df['plot_data'] = df['timezone'].where(is_top_timezone, "Other")
df.plot_data.value_counts().plot.pie(figsize=[7,7])
Out[42]: