No-Go

  • adsfsadf
  • asdfasdf

In [5]:
# pandas is the only dependency of this notebook.
# (The interactive `pd?` help lookup was development residue and has been
# removed so the notebook runs cleanly under Restart & Run All.)
import pandas as pd

In [6]:
# Read the blame export (columns: path, author, timestamp, line) into a
# DataFrame and preview its first rows.
log = pd.read_csv(filepath_or_buffer="../dataset/linux_blame_log.csv")
log.head(n=5)


Out[6]:
path author timestamp line
0 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 1448528085000000000 1
1 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 1448528085000000000 2
2 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 1448528085000000000 3
3 drivers/scsi/bfa/bfad_drv.h Jing Huang 1253753175000000000 4
4 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 1448528085000000000 5

In [7]:
# Print a structural summary of the frame: index range, column dtypes and
# memory usage.
log.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5665947 entries, 0 to 5665946
Data columns (total 4 columns):
path         object
author       object
timestamp    int64
line         int64
dtypes: int64(2), object(2)
memory usage: 172.9+ MB

In [10]:
# Count how many rows each author has (value_counts orders the result from
# most to fewest) and keep the first ten entries.
top10 = log["author"].value_counts().head(n=10)
top10


Out[10]:
Linus Torvalds           838200
Hans Verkuil             118432
Mauro Carvalho Chehab    102107
Michael Chan              53945
Mike Marciniszyn          44843
Ralph Campbell            42453
Nicholas Bellinger        41823
Laurent Pinchart          40438
Antti Palosaari           40390
Alexander Duyck           39307
Name: author, dtype: int64

In [13]:
%matplotlib inline
top10.plot.pie();



In [15]:
# Convert the integer timestamp column to datetime values (integers are
# interpreted by pandas as nanoseconds since the Unix epoch by default),
# then preview the result.
log["timestamp"] = pd.to_datetime(arg=log["timestamp"])
log.head(n=5)


Out[15]:
path author timestamp line
0 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 1
1 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 2
2 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 3
3 drivers/scsi/bfa/bfad_drv.h Jing Huang 2009-09-24 00:46:15 4
4 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 5

In [16]:
# Age of every line = current time minus the timestamp of its last change.
#
# The timestamp column was parsed from epoch integers, so its values are
# timezone-naive UTC. pd.Timestamp('today') returns naive *local* time, which
# would skew every age by the machine's UTC offset. Take "now" in UTC instead
# and drop the tz info so it can be subtracted from the naive column.
now_utc = pd.Timestamp.now(tz="UTC").tz_localize(None)
log["age"] = now_utc - log["timestamp"]
log.head()


Out[16]:
path author timestamp line age
0 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 1 1054 days 00:59:53.681968
1 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 2 1054 days 00:59:53.681968
2 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 3 1054 days 00:59:53.681968
3 drivers/scsi/bfa/bfad_drv.h Jing Huang 2009-09-24 00:46:15 4 3308 days 09:08:23.681968
4 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 5 1054 days 00:59:53.681968

In [18]:
# Derive a coarse "component" label from the file path: split on "/", keep
# the first two segments and glue them together with ":"
# (e.g. "drivers/scsi/bfa/bfad_drv.h" -> "drivers:scsi").
log["component"] = (
    log["path"]
    .str.split("/")
    .str.slice(stop=2)
    .str.join(":")
)
log.head(n=5)


Out[18]:
path author timestamp line age component
0 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 1 1054 days 00:59:53.681968 drivers:scsi
1 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 2 1054 days 00:59:53.681968 drivers:scsi
2 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 3 1054 days 00:59:53.681968 drivers:scsi
3 drivers/scsi/bfa/bfad_drv.h Jing Huang 2009-09-24 00:46:15 4 3308 days 09:08:23.681968 drivers:scsi
4 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 5 1054 days 00:59:53.681968 drivers:scsi

In [19]:
# For each component take the smallest age among its lines (i.e. the time
# since its most recent change) and order the components from youngest to
# oldest.
age_for_component = (
    log.groupby("component")["age"]
    .min()
    .sort_values()
)
age_for_component.head(n=5)


Out[19]:
component
drivers:scsi   186 days 16:28:29.681968
drivers:i2c    186 days 17:49:04.681968
drivers:net    186 days 18:04:20.681968
drivers:of     187 days 10:23:22.681968
drivers:pci    187 days 14:10:17.681968
Name: age, dtype: timedelta64[ns]

In [21]:
# Bar chart of the minimum age per component, ordered as in age_for_component.
# The timedeltas are converted to whole days so the y-axis is readable, the
# figure gets a title and axis label so it stands on its own, and the trailing
# semicolon suppresses the textual repr of the returned matplotlib object.
age_for_component.dt.days.plot.bar(
    figsize=(15, 5),
    title="Time since the most recent change per component",
).set_ylabel("age (days)");


Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x24093ba2978>