In [3]:
# pandas is the only dependency of this notebook.
import pandas as pd
In [4]:
# Read the blame log from the CSV export (path is relative to this notebook)
# and preview the first rows.
BLAME_LOG_CSV = "../dataset/linux_blame_log.csv"
log = pd.read_csv(BLAME_LOG_CSV)
log.head()
Out[4]:
In [6]:
# Baseline memory footprint; 'deep' also measures the contents of object columns.
log.info(memory_usage="deep")
In [7]:
# Store the author and path columns as categoricals, then re-measure memory
# so the result can be compared with the previous log.info() output.
for column in ("author", "path"):
    log[column] = pd.Categorical(log[column])
log.info(memory_usage="deep")
In [10]:
# The ten most frequent authors (value_counts sorts by count, descending).
author_counts = log["author"].value_counts()
author_counts.iloc[:10]
Out[10]:
In [12]:
# Convert the timestamp column to pandas datetimes so date arithmetic works.
log["timestamp"] = pd.to_datetime(log["timestamp"])
log.head()
Out[12]:
In [22]:
# Age of every entry relative to the moment this cell runs.
# NOTE: the result depends on the execution date, so it changes between runs.
now = pd.Timestamp('today')
log['age'] = now - log['timestamp']
log.head()
Out[22]:
In [23]:
# A component is identified by the first two segments of the path,
# joined with ":" (e.g. "a/b/c.x" -> "a:b").
path_segments = log['path'].str.split("/")
leading_segments = path_segments.str[:2]
log['component'] = leading_segments.str.join(":")
log.head()
Out[23]:
In [24]:
# Smallest age within each component, i.e. the age of its youngest entry.
# GroupBy.min() is the built-in reduction; it replaces apply(min), which
# routes the Python builtin through the generic apply machinery and triggers
# a FutureWarning for builtin callables on recent pandas versions.
components_age = log.groupby('component').age.min()
components_age.head()
Out[24]:
In [26]:
%matplotlib inline
components_age.sort_values().plot.bar(figsize=[15,5])
Out[26]:
In [ ]:
In [ ]: