In [1]:
import pandas as pd

# Relative location of the gzip-compressed CSV export of the blame log.
path = "../../../software-data/projects/linux/linux_blame_log.csv.gz"
# pandas infers the gzip compression from the ".gz" file extension.
log = pd.read_csv(filepath_or_buffer=path)
# Preview the first five rows.
log.head(5)


Out[1]:
path author timestamp line
0 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 1448528085000000000 1
1 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 1448528085000000000 2
2 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 1448528085000000000 3
3 drivers/scsi/bfa/bfad_drv.h Jing Huang 1253753175000000000 4
4 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 1448528085000000000 5

In [2]:
# Print a summary of the frame: row count, column names, dtypes and memory usage.
log.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5665947 entries, 0 to 5665946
Data columns (total 4 columns):
path         object
author       object
timestamp    int64
line         int64
dtypes: int64(2), object(2)
memory usage: 172.9+ MB

In [3]:
# Count rows (one row per blamed source line) per author, most frequent first.
lines_per_author = log['author'].value_counts()
# Keep the ten authors with the most lines.
top10 = lines_per_author.iloc[:10]
top10


Out[3]:
Linus Torvalds           838200
Hans Verkuil             118432
Mauro Carvalho Chehab    102107
Michael Chan              53945
Mike Marciniszyn          44843
Ralph Campbell            42453
Nicholas Bellinger        41823
Laurent Pinchart          40438
Antti Palosaari           40390
Alexander Duyck           39307
Name: author, dtype: int64

In [4]:
%matplotlib inline
top10.plot.pie();


No-Go Areas


In [5]:
# Interpret the integer column as nanoseconds since the Unix epoch
# (pandas' default unit for integers) and replace it with datetime values.
parsed_timestamps = pd.to_datetime(log['timestamp'])
log['timestamp'] = parsed_timestamps
log.head(5)


Out[5]:
path author timestamp line
0 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 1
1 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 2
2 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 3
3 drivers/scsi/bfa/bfad_drv.h Jing Huang 2009-09-24 00:46:15 4
4 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 5

In [6]:
# Age of every line = time elapsed since the change that last touched it.
#
# The 'timestamp' column was converted from epoch nanoseconds, so its naive
# datetimes are UTC wall-clock times. pd.Timestamp("today") would return the
# *local* wall-clock time and skew every age by the machine's UTC offset.
# Take the current time in UTC and drop the tz info so it can be subtracted
# from the naive column.
# NOTE: the result still depends on when the cell is executed.
now_utc = pd.Timestamp.now(tz="UTC").tz_localize(None)
log['age'] = now_utc - log['timestamp']
log.head()


Out[6]:
path author timestamp line age
0 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 1 1097 days 01:12:16.922742
1 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 2 1097 days 01:12:16.922742
2 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 3 1097 days 01:12:16.922742
3 drivers/scsi/bfa/bfad_drv.h Jing Huang 2009-09-24 00:46:15 4 3351 days 09:20:46.922742
4 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 5 1097 days 01:12:16.922742

In [7]:
# Derive a coarse "component" from the first two directory levels of the
# file path, e.g. "drivers/scsi/bfa/bfad_drv.h" -> "drivers:scsi".
path_parts = log['path'].str.split("/")
first_two_levels = path_parts.str[:2]
log['component'] = first_two_levels.str.join(":")
log.head(5)


Out[7]:
path author timestamp line age component
0 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 1 1097 days 01:12:16.922742 drivers:scsi
1 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 2 1097 days 01:12:16.922742 drivers:scsi
2 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 3 1097 days 01:12:16.922742 drivers:scsi
3 drivers/scsi/bfa/bfad_drv.h Jing Huang 2009-09-24 00:46:15 4 3351 days 09:20:46.922742 drivers:scsi
4 drivers/scsi/bfa/bfad_drv.h Anil Gurumurthy 2015-11-26 08:54:45 5 1097 days 01:12:16.922742 drivers:scsi

In [8]:
# For each component take the smallest age, i.e. the age of its most
# recently changed line, and order the components from youngest to oldest.
min_age_by_component = log.groupby('component')['age'].agg('min')
age_per_component = min_age_by_component.sort_values(ascending=True)
age_per_component.head(5)


Out[8]:
component
drivers:scsi   229 days 16:40:52.922742
drivers:i2c    229 days 18:01:27.922742
drivers:net    229 days 18:16:43.922742
drivers:of     230 days 10:35:45.922742
drivers:pci    230 days 14:22:40.922742
Name: age, dtype: timedelta64[ns]

In [9]:
# The title promises years, but `age_per_component` holds raw timedeltas.
# Convert to fractional years (365.25 days per year) before plotting so the
# y-axis matches the title. The original series is left untouched.
age_in_years = age_per_component / pd.Timedelta(days=365.25)
age_in_years.plot.bar(
    title="Alter pro Komponente (in Jahren)",
    figsize=[15,5]);


Wissensinseln

Bewertung des vorhandenen Wissens

...anhand der zuletzt geänderten Quellcodezeilen

Gruppieren mit minimalem Zeitstempel und Zeilenanzahl

=> Älteste Änderung und Anzahl geänderter Zeilen pro Datei und Autor


In [10]:
# One group per (file, author) pair.
by_file_and_author = log.groupby(['path', 'author'])
# Per group: the smallest (oldest) timestamp and the number of lines
# attributed to that author in that file.
aggregation_spec = {'timestamp': 'min', 'line': 'count'}
knowledge = by_file_and_author.agg(aggregation_spec)
knowledge.head(5)


Out[10]:
timestamp line
path author
arch/arc/kernel/time.c Anna-Maria Gleixner 2016-07-13 17:17:07 13
Daniel Lezcano 2016-06-15 12:50:12 31
Noam Camus 2016-01-01 10:18:49 18
Vineet Gupta 2013-01-18 09:42:18 243
Viresh Kumar 2015-07-16 11:26:14 6

Wissensanteile berechnen

=> Prozentualer Anteil der zuletzt geänderten Zeilen pro Datei und Autor


In [11]:
# Total number of lines per file, broadcast back onto every (file, author) row.
total_lines_per_file = knowledge.groupby('path')['line'].transform('sum')
knowledge['all_lines'] = total_lines_per_file
# Share of the file's lines that was last changed by the respective author.
knowledge['knowing'] = knowledge['line'].div(knowledge['all_lines'])
knowledge.head(5)


Out[11]:
timestamp line all_lines knowing
path author
arch/arc/kernel/time.c Anna-Maria Gleixner 2016-07-13 17:17:07 13 311 0.041801
Daniel Lezcano 2016-06-15 12:50:12 31 311 0.099678
Noam Camus 2016-01-01 10:18:49 18 311 0.057878
Vineet Gupta 2013-01-18 09:42:18 243 311 0.781350
Viresh Kumar 2015-07-16 11:26:14 6 311 0.019293

Maximales Wissen pro Datei identifizieren

=> Hauptautor pro Datei


In [12]:
# Highest knowledge share within each file, aligned to every (file, author) row.
# The aggregation is passed by name ('max') instead of the Python builtin:
# recent pandas versions deprecate passing builtins such as `max` to
# groupby transform/agg.
max_knowledge_per_file = knowledge.groupby(['path'])['knowing'].transform('max')
# Keep only the rows whose share equals the file's maximum, i.e. the main
# author(s). If several authors tie for the maximum, all of them are kept,
# so a file can appear more than once.
knowledge_carriers = knowledge[knowledge['knowing'] == max_knowledge_per_file]
# Move the 'author' index level back into a regular column; 'path' stays the index.
knowledge_carriers = knowledge_carriers.reset_index(level='author')
knowledge_carriers.head()


Out[12]:
author timestamp line all_lines knowing
path
arch/arc/kernel/time.c Vineet Gupta 2013-01-18 09:42:18 243 311 0.781350
arch/arm/common/timer-sp.c Rob Herring 2011-12-12 21:29:08 111 169 0.656805
arch/arm/include/asm/hardware/arm_timer.h Russell King 2010-01-16 15:07:08 24 29 0.827586
arch/arm/kernel/perf_event.c Jamie Iles 2010-02-02 19:25:44 176 523 0.336520
arch/arm/mach-at91/at91rm9200_time.c David Brownell 2007-07-31 00:41:26 81 95 0.852632

Visualisierung erstellen

=> Export in D3 Visualisierung "Zoomable Circle Packing"


In [14]:
from ausi import d3

# Export the main-author data for the D3 "Zoomable Circle Packing" visualization.
# According to the cell output, this call writes 'linux_circle_packing.json' and
# 'linux_circle_packing.html'.
# NOTE(review): the meaning of the positional arguments is not visible in this
# notebook; the hints below are assumptions derived from the values passed and
# should be confirmed against the `ausi.d3` implementation.
d3.create_json_for_zoomable_circle_packing(
    # reset_index() turns the 'path' index back into a regular column
    knowledge_carriers.reset_index(),
    # 'author' is passed twice, presumably once for coloring and once for labeling, TODO confirm
    'author',
    'author',
    # presumably the column holding the hierarchy and the separator to split it on
    'path',
    '/',
    # presumably the column used for circle size
    'all_lines',
    # presumably the column used for a second visual property (e.g. opacity), TODO confirm
    'knowing',
    # base name of the generated .json / .html files (see cell output)
    'linux_circle_packing'
)


JSON file produced in 'C:\dev\repos\software-analytics\demos\20181127_Nuremberg\linux_circle_packing.json'
HTML file produced in 'C:\dev\repos\software-analytics\demos\20181127_Nuremberg\linux_circle_packing.html'

Ende