In [1]:
import pandas as pd

# Load the blame log CSV into a DataFrame and preview its first rows.
BLAME_LOG_CSV = "../dataset/linux_blame_log.csv"
log = pd.read_csv(BLAME_LOG_CSV)
log.head()


Out[1]:
path author timestamp line
0 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 1448528085000000000 1
1 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 1448528085000000000 2
2 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 1448528085000000000 3
3 drivers/scsi/bfa/bfad_drv.h Jing Huang 1253753175000000000 4
4 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 1448528085000000000 5

In [2]:
# Report column dtypes and the memory footprint of the freshly loaded frame;
# 'deep' makes pandas measure the contents of object columns as well.
log.info(memory_usage='deep')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5665947 entries, 0 to 5665946
Data columns (total 4 columns):
path         object
author       object
timestamp    int64
line         int64
dtypes: int64(2), object(2)
memory usage: 971.8 MB

In [3]:
# Re-encode the two text columns as categoricals, then preview the frame.
for text_column in ("author", "path"):
    log[text_column] = pd.Categorical(log[text_column])
log.head()


Out[3]:
path author timestamp line
0 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 1448528085000000000 1
1 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 1448528085000000000 2
2 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 1448528085000000000 3
3 drivers/scsi/bfa/bfad_drv.h Jing Huang 1253753175000000000 4
4 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 1448528085000000000 5

In [4]:
# Same report as before the categorical conversion, to compare the memory
# footprint of the frame.
log.info(memory_usage='deep')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5665947 entries, 0 to 5665946
Data columns (total 4 columns):
path         category
author       category
timestamp    int64
line         int64
dtypes: category(2), int64(2)
memory usage: 109.9 MB

In [5]:
%matplotlib inline
# Pie chart of the ten authors with the most rows in the log.
top_authors = log.author.value_counts().head(10)
top_authors.plot.pie();



In [6]:
# Parse the integer timestamp column into pandas datetimes (overwrites the
# column on `log`), then preview the frame.
log['timestamp'] = pd.to_datetime(log['timestamp'])
log.head()


Out[6]:
path author timestamp line
0 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 1
1 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 2
2 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 3
3 drivers/scsi/bfa/bfad_drv.h Jing Huang 2009-09-24 00:46:15 4
4 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 5

In [7]:
# Flag rows whose timestamp falls within the six months before "today".
# NOTE(review): the cutoff is evaluated at run time, so the counts change
# with the date on which the notebook is executed.
six_months_ago = pd.Timestamp('today') - pd.DateOffset(months=6)
knowledge = log.timestamp > six_months_ago
knowledge.value_counts()


Out[7]:
False    5534009
True      131938
Name: timestamp, dtype: int64

In [8]:
# Mean of the boolean mask = fraction of rows flagged True (inside the
# six-month window).
knowledge.mean()


Out[8]:
0.023286133809582051

In [9]:
# Pie chart of rows inside (True) versus outside (False) the six-month window.
knowledge_counts = knowledge.value_counts()
knowledge_counts.plot.pie();



In [10]:
# Derive a component label from the first two path segments, joined with ':'
# (e.g. "drivers/scsi/bfa/bfad_drv.h" -> "drivers:scsi").
log['component'] = (
    log.path
    .str.split("/")
    .str[:2]
    .str.join(":")
)
log.head()


Out[10]:
path author timestamp line component
0 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 1 drivers:scsi
1 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 2 drivers:scsi
2 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 3 drivers:scsi
3 drivers/scsi/bfa/bfad_drv.h Jing Huang 2009-09-24 00:46:15 4 drivers:scsi
4 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 5 drivers:scsi

In [11]:
# Components with the most rows in the log (largest first).
log['component'].value_counts().head()


Out[11]:
drivers:net           1747273
drivers:media         1147546
drivers:scsi           720596
drivers:infiniband     357729
drivers:input          187545
Name: component, dtype: int64

In [12]:
# Age of each row: time elapsed between "today" (evaluated at run time) and
# the row's timestamp.
reference_time = pd.Timestamp('today')
log['age'] = reference_time - log.timestamp
log.head()


Out[12]:
path author timestamp line component age
0 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 1 drivers:scsi 955 days 05:53:42.898728
1 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 2 drivers:scsi 955 days 05:53:42.898728
2 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 3 drivers:scsi 955 days 05:53:42.898728
3 drivers/scsi/bfa/bfad_drv.h Jing Huang 2009-09-24 00:46:15 4 drivers:scsi 3209 days 14:02:12.898728
4 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 5 drivers:scsi 955 days 05:53:42.898728

In [13]:
# Average age over all rows of the log.
log['age'].mean()


Out[13]:
Timedelta('2516 days 04:30:11.360844')

In [14]:
# Summary statistics (count, mean, std, quartiles, min/max) of the row ages.
log['age'].describe()


Out[14]:
count                      5665947
mean     2516 days 04:30:11.360844
std      1563 days 11:40:19.258776
min        87 days 21:22:18.898728
25%      1052 days 06:19:00.898728
50%      2517 days 20:16:42.898728
75%      3831 days 01:19:37.898728
max      5933 days 19:33:53.898728
Name: age, dtype: object

In [15]:
# Smallest `age` within each `component` group, i.e. the age of the most
# recently timestamped row per component, ordered from youngest to oldest.
# The built-in groupby reduction `.min()` is used rather than `.apply(min)`,
# which invokes a Python-level callable once per group.
age_per_component = log.groupby('component').age.min().sort_values()
age_per_component.head()


Out[15]:
component
drivers:scsi   87 days 21:22:18.898728
drivers:i2c    87 days 22:42:53.898728
drivers:net    87 days 22:58:09.898728
drivers:of     88 days 15:17:11.898728
drivers:pci    88 days 19:04:06.898728
Name: age, dtype: timedelta64[ns]

In [16]:
# The series is sorted ascending by age, so the tail holds the ten components
# whose most recent row is the oldest.
age_per_component.tail(10)


Out[16]:
component
arch:ia64         2660 days 01:43:41.898728
arch:unicore32    2689 days 01:27:09.898728
drivers:char      2700 days 22:21:05.898728
drivers:serial    2725 days 07:28:34.898728
drivers:mfd       2748 days 16:25:55.898728
include:scsi      3789 days 15:34:51.898728
arch:i386         3927 days 07:23:55.898728
drivers:usb       4084 days 09:50:58.898728
include:asm-arm   4238 days 01:13:34.898728
arch:sparc64      4532 days 03:55:43.898728
Name: age, dtype: timedelta64[ns]

In [17]:
# Bar chart of the minimum age per component, in the sorted (ascending) order;
# the trailing ';' suppresses the Axes repr in the cell output.
age_per_component.plot.bar(figsize=[15,5]);


Bonus section


In [18]:
# Preview the frame indexed by timestamp. The re-indexed frame is only
# displayed, not bound to a name: `log_timed` is assigned in the following
# cell to the monthly counts, so keeping a full re-indexed copy of `log`
# under that same name here was dead state of a different kind.
log.set_index('timestamp').head()


Out[18]:
path author line component age
timestamp
2015-11-26 08:54:45 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 1 drivers:scsi 955 days 05:53:42.898728
2015-11-26 08:54:45 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2 drivers:scsi 955 days 05:53:42.898728
2015-11-26 08:54:45 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 3 drivers:scsi 955 days 05:53:42.898728
2009-09-24 00:46:15 drivers/scsi/bfa/bfad_drv.h Jing Huang 4 drivers:scsi 3209 days 14:02:12.898728
2015-11-26 08:54:45 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 5 drivers:scsi 955 days 05:53:42.898728

In [19]:
# Count rows (non-null `line` values) per (month, component) pair.
# NOTE(review): newer pandas releases deprecate the 'M' alias in favour of
# 'ME' — confirm against the pandas version in use before changing it.
month_bins = pd.Grouper(key='timestamp', freq='M')
log_timed = log.groupby([month_bins, 'component'])['line'].count()
log_timed.head()


Out[19]:
timestamp   component         
2002-04-30  drivers:block          8
            drivers:infiniband    17
2003-02-28  drivers:net           40
2005-03-31  drivers:block          9
            drivers:char           3
Name: line, dtype: int64

In [20]:
# Pivot the component level into columns (one row per month) and replace the
# missing month/component combinations with 0.
component_history = (
    log_timed
    .unstack()
    .fillna(0)
)
component_history.head()


Out[20]:
component arch:arc arch:arm arch:i386 arch:ia64 arch:mips arch:powerpc arch:s390 arch:sh arch:sparc arch:sparc64 ... drivers:uio drivers:usb drivers:uwb drivers:w1 drivers:xen include:asm-arm include:linux include:media include:scsi sound:i2c
timestamp
2002-04-30 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2003-02-28 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2005-03-31 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2005-04-30 0.0 497.0 5045.0 0.0 0.0 0.0 0.0 53.0 0.0 494.0 ... 0.0 4333.0 0.0 1571.0 0.0 46.0 200.0 0.0 0.0 88.0
2005-05-31 0.0 0.0 6.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 41.0 0.0 67.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 73 columns


In [21]:
# Convert the monthly counts into each component's share of that month's total.
# The row totals are computed once and divided row-wise (axis=0 aligns the
# totals with the timestamp index), instead of being recomputed inside a
# lambda for every component column.
monthly_totals = component_history.sum(axis=1)
relative_history = component_history.div(monthly_totals, axis=0)
relative_history.head()


Out[21]:
component arch:arc arch:arm arch:i386 arch:ia64 arch:mips arch:powerpc arch:s390 arch:sh arch:sparc arch:sparc64 ... drivers:uio drivers:usb drivers:uwb drivers:w1 drivers:xen include:asm-arm include:linux include:media include:scsi sound:i2c
timestamp
2002-04-30 0.0 0.00000 0.000000 0.0 0.0 0.0 0.0 0.000000 0.0 0.000000 ... 0.0 0.000000 0.0 0.000000 0.0 0.000000 0.000000 0.0 0.0 0.000000
2003-02-28 0.0 0.00000 0.000000 0.0 0.0 0.0 0.0 0.000000 0.0 0.000000 ... 0.0 0.000000 0.0 0.000000 0.0 0.000000 0.000000 0.0 0.0 0.000000
2005-03-31 0.0 0.00000 0.000000 0.0 0.0 0.0 0.0 0.000000 0.0 0.000000 ... 0.0 0.000000 0.0 0.000000 0.0 0.000000 0.000000 0.0 0.0 0.000000
2005-04-30 0.0 0.00059 0.005992 0.0 0.0 0.0 0.0 0.000063 0.0 0.000587 ... 0.0 0.005147 0.0 0.001866 0.0 0.000055 0.000238 0.0 0.0 0.000105
2005-05-31 0.0 0.00000 0.000276 0.0 0.0 0.0 0.0 0.000000 0.0 0.000000 ... 0.0 0.001885 0.0 0.003081 0.0 0.000000 0.000000 0.0 0.0 0.000000

5 rows × 73 columns


In [22]:
# Stacked area chart of each component's monthly share over time; the legend
# is hidden. The trailing ';' suppresses the Axes repr in the cell output,
# matching the other plotting cells in this notebook.
relative_history.plot.area(legend=False, figsize=[15,8]);


Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x22f5670bb38>