notebook.community

Edit and run



In [ ]:

    
import pandas as pd

# Read the blame log export (path is relative to the notebook's location)
# into a DataFrame and preview the first rows.
log = pd.read_csv(filepath_or_buffer="../dataset/linux_blame_log.csv")
log.head(n=5)



In [ ]:

    
# Summarise the columns of the freshly loaded frame; memory_usage='deep' asks
# pandas to measure the real footprint of object columns instead of estimating it.
log.info(memory_usage='deep')



In [ ]:

    
# Store the repetitive `author` and `path` values as categoricals; the
# memory report in the following cell shows the resulting footprint.
log["author"] = pd.Categorical(log["author"])
log["path"] = pd.Categorical(log["path"])
log.head(n=5)



In [ ]:

    
# Re-run the deep memory report after the categorical conversion so the
# footprint can be compared with the earlier report.
log.info(memory_usage='deep')



In [ ]:

    
%matplotlib inline
log.author.value_counts().head(10).plot.pie();



In [ ]:

    
# Parse the raw `timestamp` column into pandas datetimes so it can be
# compared against dates and subtracted in later cells.
log["timestamp"] = pd.to_datetime(log["timestamp"])
log.head(n=5)



In [ ]:

    
# Boolean mask: True for rows whose timestamp falls within the last six
# months, measured from the moment this cell is executed.
knowledge = log["timestamp"] > (pd.Timestamp("today") - pd.DateOffset(months=6))
knowledge.value_counts()



In [ ]:

    
# Mean of the boolean mask = fraction of rows changed within the last six months.
knowledge.mean()



In [ ]:

    
# Pie chart of recent (True) versus older (False) rows.
knowledge.value_counts().plot(kind="pie");



In [ ]:

    
# Derive a coarse "component" label from each path: keep the first two
# "/"-separated segments and glue them together with ":".
log["component"] = log["path"].str.split("/").str[:2].str.join(":")
log.head(n=5)



In [ ]:

    
# Row counts per component, most frequent first; head() shows the top five.
log.component.value_counts().head()



In [ ]:

    
# Age of each row as a timedelta between "now" (cell execution time)
# and the row's timestamp.
log["age"] = pd.Timestamp("today") - log["timestamp"]
log.head(n=5)



In [ ]:

    
# Average age across all rows of the log.
log.age.mean()



In [ ]:

    
# Summary statistics (count, mean, std, min, quartiles, max) of the age column.
log.age.describe()



In [ ]:

    
# Smallest age per component (i.e. the time since that component's most
# recent row), sorted from youngest to oldest.
# Uses the native GroupBy.min() reduction rather than apply(min): passing the
# Python builtin to apply/agg is deprecated in pandas >= 2.1 (FutureWarning).
age_per_component = log.groupby(['component'])['age'].min().sort_values()
age_per_component.head()



In [ ]:

    
# The series is sorted ascending, so the tail holds the ten components
# whose most recent row is the oldest.
age_per_component.tail(10)



In [ ]:

    
# Bar chart of the minimum age per component, in ascending order.
age_per_component.plot(kind="bar", figsize=[15, 5]);



In [ ]:

    
# Copy of the log indexed by timestamp, previewed below.
log_timed = log.set_index(keys="timestamp")
log_timed.head(n=5)



In [ ]:

    
# Count non-null `line` entries per (calendar month, component) pair.
# NOTE(review): pandas >= 2.2 deprecates the "M" alias in favour of "ME";
# kept as-is here so older pandas versions keep working.
log_timed = (
    log
    .groupby([pd.Grouper(key="timestamp", freq="M"), "component"])["line"]
    .count()
)
log_timed.head(n=5)



In [ ]:

    
# Pivot the innermost index level (component) into columns: one row per
# month, one column per component. Missing combinations become 0.
component_history = log_timed.unstack(level=-1).fillna(value=0)
component_history.head(n=5)



In [ ]:

    
# Convert the monthly counts into shares: divide every row by its own total
# so that each month sums to 1 across components.
# The row totals are computed once and broadcast with div(axis=0); the
# previous column-wise apply recomputed the same totals for every column.
relative_history = component_history.div(
    component_history.sum(axis=1), axis=0)
relative_history.head()



In [ ]:

    
# Stacked area chart of each component's monthly share. The legend is hidden
# via legend=False; the trailing semicolon suppresses the Axes repr, matching
# the other plotting cells in this notebook.
relative_history.plot.area(legend=False, figsize=[15,8]);