In [1]:
import sys;sys.path.append("..")
import pandas as pd
from pandas import DataFrame
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
%matplotlib inline
# IPython.display is the public home of display/HTML; importing `display`
# from IPython.core.display is deprecated.
from IPython.display import display, HTML
# Inject CSS so the notebook container uses the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))



In [2]:
from devml.post_processing import (git_churn_df, file_len, git_populate_file_metatdata)

In [3]:
# Build the per-file churn table for the checkout at `path`. Per the log line
# emitted below, this runs `git log --name-only --pretty=format:` there.
df = git_churn_df(path="/Users/noahgift/src/linux")


2017-10-22 10:02:53,055 - devml.post_processing - INFO - Running churn cmd: [git log --name-only --pretty=format:] at path [/Users/noahgift/src/linux]

In [4]:
# Derive the per-file metadata frame from the churn table. The table displayed
# by the next cell shows its columns: files, churn_count, line_count,
# extension and relative_churn.
metadata_df = git_populate_file_metatdata(df)

In [5]:
# Ten rows with the highest churn_count, largest first.
(
    metadata_df
    .sort_values(by="churn_count", ascending=False)
    .head(10)
)


Out[5]:
files churn_count line_count extension relative_churn
1 b'MAINTAINERS' 4595 14942.0 0.31
2 b'drivers/gpu/drm/i915/intel_display.c' 3296 15493.0 .c 0.21
3 b'drivers/gpu/drm/i915/i915_drv.h' 2315 4343.0 .h 0.53
4 b'drivers/gpu/drm/i915/i915_gem.c' 1678 5392.0 .c 0.31
5 b'sound/pci/hda/patch_realtek.c' 1542 7931.0 .c 0.19
6 b'kernel/sched.c' 1359 NaN .c NaN
7 b'include/linux/sched.h' 1331 1669.0 .h 0.80
8 b'arch/x86/kvm/x86.c' 1287 8817.0 .c 0.15
9 b'drivers/gpu/drm/i915/intel_pm.c' 1287 9262.0 .c 0.14
10 b'arch/arm/Kconfig' 1286 2217.0 0.58

In [6]:
# Median of every numeric column. numeric_only=True keeps the non-numeric
# `files` / `extension` columns out of the reduction; pandas >= 2.0 raises on
# them instead of silently skipping them as older releases did.
metadata_df.median(numeric_only=True)


Out[6]:
churn_count         4.00
line_count        143.00
relative_churn      0.05
dtype: float64

In [7]:
# Collect the files that were deleted at some point in the repo's history.
# Per the log line emitted below, the helper runs
# `git log --diff-filter=D --summary | grep delete` at the given path.
from devml.post_processing import git_deleted_files
deletion_counts = git_deleted_files("/Users/noahgift/src/linux")


2017-10-22 10:04:12,282 - devml.post_processing - INFO - Running del cmd: [git log --diff-filter=D --summary | grep delete] at path [/Users/noahgift/src/linux]

In [8]:
# Boolean mask over the metadata rows: True where the file name also
# appears in the deletion table.
deleted_files = deletion_counts["files"]
all_files = metadata_df["files"]
membership = all_files.isin(deleted_files)

In [9]:
# Store the membership mask on the metadata frame so later cells can filter
# and colour by it. This mutates metadata_df in place.
metadata_df["deleted_files"] = membership

In [10]:
# Medians for files that were deleted at some point. The boolean column is
# used directly as the row mask, and numeric_only=True skips the non-numeric
# `files` / `extension` columns (pandas >= 2.0 raises on them otherwise).
metadata_df.loc[metadata_df["deleted_files"]].median(numeric_only=True)


Out[10]:
churn_count        3.00
line_count        79.00
relative_churn     0.12
deleted_files      1.00
dtype: float64

In [11]:
# Medians for files that were never deleted. `~` inverts the boolean mask, and
# numeric_only=True skips the non-numeric `files` / `extension` columns
# (pandas >= 2.0 raises on them otherwise).
metadata_df.loc[~metadata_df["deleted_files"]].median(numeric_only=True)


Out[11]:
churn_count         5.00
line_count        144.00
relative_churn      0.05
deleted_files       0.00
dtype: float64

In [12]:
# Non-null count per column. In the output below line_count and relative_churn
# have fewer entries than files -- presumably files that no longer exist in the
# working tree (kernel/sched.c shows NaN in the top-10 table); TODO confirm.
metadata_df.count()


Out[12]:
files             106491
churn_count       106491
line_count         61276
extension         106491
relative_churn     61276
deleted_files     106491
dtype: int64

In [13]:
# Correlation heatmap of the numeric and boolean columns. They are selected
# explicitly because pandas >= 2.0 no longer drops the non-numeric `files` /
# `extension` columns inside corr() and raises instead.
sns.heatmap(
    metadata_df.select_dtypes(include=["number", "bool"]).corr(),
    annot=True,
)


Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x11cc5fcc0>

In [14]:
# Scatter plot with a linear fit of line_count against churn_count,
# coloured and fitted separately by the deleted_files flag.
sns.lmplot(
    data=metadata_df,
    x="churn_count",
    y="line_count",
    hue="deleted_files",
)


Out[14]:
<seaborn.axisgrid.FacetGrid at 0x11cc8ca90>

In [15]:
# Load the commit log into a DataFrame.
# NOTE(review): create_org_df appears to treat every sub-directory of `path`
# as its own repository. The log below shows 23 "Found repo" entries (init,
# crypto, ..., .git, ..., block), each reporting the same 656006 messages, for
# 15088138 rows in total (23 x 656006). The commit log is therefore likely
# duplicated 23 times in `df`; consider pointing `path` at the directory that
# contains the checkout, or de-duplicating -- TODO confirm against devml.mkdata.
# NOTE(review): this rebinds `df`, which earlier cells used for the churn table.
import devml
df = devml.mkdata.create_org_df(path="/Users/noahgift/src/linux/")


2017-10-22 10:05:00,830 - devml.mkdata - INFO - Found repo: /Users/noahgift/src/linux/init
2017-10-22 10:05:00,831 - devml.mkdata - INFO - Processing Repo: /Users/noahgift/src/linux/init
2017-10-22 10:05:00,853 - devml.mkdata - INFO - Repo Name: b'linux'
2017-10-22 10:05:10,075 - devml.mkdata - ERROR - utf8 encoding is incorrect, trying ISO-8859-1
Traceback (most recent call last):
  File "../devml/mkdata.py", line 46, in log_to_dict
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 69960305: invalid continuation byte
2017-10-22 10:05:12,829 - devml.mkdata - INFO - Found 656006 Messages For Repo: b'linux'
2017-10-22 10:05:12,843 - devml.mkdata - INFO - Found repo: /Users/noahgift/src/linux/crypto
2017-10-22 10:05:12,844 - devml.mkdata - INFO - Processing Repo: /Users/noahgift/src/linux/crypto
2017-10-22 10:05:12,875 - devml.mkdata - INFO - Repo Name: b'linux'
2017-10-22 10:05:21,953 - devml.mkdata - ERROR - utf8 encoding is incorrect, trying ISO-8859-1
Traceback (most recent call last):
  File "../devml/mkdata.py", line 46, in log_to_dict
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 69960305: invalid continuation byte
2017-10-22 10:05:25,372 - devml.mkdata - INFO - Found 656006 Messages For Repo: b'linux'
2017-10-22 10:05:25,384 - devml.mkdata - INFO - Found repo: /Users/noahgift/src/linux/firmware
2017-10-22 10:05:25,385 - devml.mkdata - INFO - Processing Repo: /Users/noahgift/src/linux/firmware
2017-10-22 10:05:25,417 - devml.mkdata - INFO - Repo Name: b'linux'
2017-10-22 10:05:34,412 - devml.mkdata - ERROR - utf8 encoding is incorrect, trying ISO-8859-1
Traceback (most recent call last):
  File "../devml/mkdata.py", line 46, in log_to_dict
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 69960305: invalid continuation byte
2017-10-22 10:05:38,071 - devml.mkdata - INFO - Found 656006 Messages For Repo: b'linux'
2017-10-22 10:05:38,085 - devml.mkdata - INFO - Found repo: /Users/noahgift/src/linux/Documentation
2017-10-22 10:05:38,086 - devml.mkdata - INFO - Processing Repo: /Users/noahgift/src/linux/Documentation
2017-10-22 10:05:38,129 - devml.mkdata - INFO - Repo Name: b'linux'
2017-10-22 10:05:47,380 - devml.mkdata - ERROR - utf8 encoding is incorrect, trying ISO-8859-1
Traceback (most recent call last):
  File "../devml/mkdata.py", line 46, in log_to_dict
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 69960305: invalid continuation byte
2017-10-22 10:05:51,235 - devml.mkdata - INFO - Found 656006 Messages For Repo: b'linux'
2017-10-22 10:05:51,247 - devml.mkdata - INFO - Found repo: /Users/noahgift/src/linux/usr
2017-10-22 10:05:51,248 - devml.mkdata - INFO - Processing Repo: /Users/noahgift/src/linux/usr
2017-10-22 10:05:51,294 - devml.mkdata - INFO - Repo Name: b'linux'
2017-10-22 10:06:00,288 - devml.mkdata - ERROR - utf8 encoding is incorrect, trying ISO-8859-1
Traceback (most recent call last):
  File "../devml/mkdata.py", line 46, in log_to_dict
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 69960305: invalid continuation byte
2017-10-22 10:06:04,412 - devml.mkdata - INFO - Found 656006 Messages For Repo: b'linux'
2017-10-22 10:06:04,425 - devml.mkdata - INFO - Found repo: /Users/noahgift/src/linux/tools
2017-10-22 10:06:04,426 - devml.mkdata - INFO - Processing Repo: /Users/noahgift/src/linux/tools
2017-10-22 10:06:04,472 - devml.mkdata - INFO - Repo Name: b'linux'
2017-10-22 10:06:13,592 - devml.mkdata - ERROR - utf8 encoding is incorrect, trying ISO-8859-1
Traceback (most recent call last):
  File "../devml/mkdata.py", line 46, in log_to_dict
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 69960305: invalid continuation byte
2017-10-22 10:06:17,618 - devml.mkdata - INFO - Found 656006 Messages For Repo: b'linux'
2017-10-22 10:06:17,631 - devml.mkdata - INFO - Found repo: /Users/noahgift/src/linux/net
2017-10-22 10:06:17,632 - devml.mkdata - INFO - Processing Repo: /Users/noahgift/src/linux/net
2017-10-22 10:06:17,682 - devml.mkdata - INFO - Repo Name: b'linux'
2017-10-22 10:06:27,165 - devml.mkdata - ERROR - utf8 encoding is incorrect, trying ISO-8859-1
Traceback (most recent call last):
  File "../devml/mkdata.py", line 46, in log_to_dict
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 69960305: invalid continuation byte
2017-10-22 10:06:31,720 - devml.mkdata - INFO - Found 656006 Messages For Repo: b'linux'
2017-10-22 10:06:31,733 - devml.mkdata - INFO - Found repo: /Users/noahgift/src/linux/drivers
2017-10-22 10:06:31,734 - devml.mkdata - INFO - Processing Repo: /Users/noahgift/src/linux/drivers
2017-10-22 10:06:31,796 - devml.mkdata - INFO - Repo Name: b'linux'
2017-10-22 10:06:41,759 - devml.mkdata - ERROR - utf8 encoding is incorrect, trying ISO-8859-1
Traceback (most recent call last):
  File "../devml/mkdata.py", line 46, in log_to_dict
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 69960305: invalid continuation byte
2017-10-22 10:06:45,882 - devml.mkdata - INFO - Found 656006 Messages For Repo: b'linux'
2017-10-22 10:06:45,897 - devml.mkdata - INFO - Found repo: /Users/noahgift/src/linux/security
2017-10-22 10:06:45,898 - devml.mkdata - INFO - Processing Repo: /Users/noahgift/src/linux/security
2017-10-22 10:06:45,987 - devml.mkdata - INFO - Repo Name: b'linux'
2017-10-22 10:06:55,472 - devml.mkdata - ERROR - utf8 encoding is incorrect, trying ISO-8859-1
Traceback (most recent call last):
  File "../devml/mkdata.py", line 46, in log_to_dict
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 69960305: invalid continuation byte
2017-10-22 10:06:59,966 - devml.mkdata - INFO - Found 656006 Messages For Repo: b'linux'
2017-10-22 10:06:59,978 - devml.mkdata - INFO - Found repo: /Users/noahgift/src/linux/include
2017-10-22 10:06:59,979 - devml.mkdata - INFO - Processing Repo: /Users/noahgift/src/linux/include
2017-10-22 10:07:00,047 - devml.mkdata - INFO - Repo Name: b'linux'
2017-10-22 10:07:08,951 - devml.mkdata - ERROR - utf8 encoding is incorrect, trying ISO-8859-1
Traceback (most recent call last):
  File "../devml/mkdata.py", line 46, in log_to_dict
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 69960305: invalid continuation byte
2017-10-22 10:07:13,248 - devml.mkdata - INFO - Found 656006 Messages For Repo: b'linux'
2017-10-22 10:07:13,260 - devml.mkdata - INFO - Found repo: /Users/noahgift/src/linux/virt
2017-10-22 10:07:13,261 - devml.mkdata - INFO - Processing Repo: /Users/noahgift/src/linux/virt
2017-10-22 10:07:13,349 - devml.mkdata - INFO - Repo Name: b'linux'
2017-10-22 10:07:22,226 - devml.mkdata - ERROR - utf8 encoding is incorrect, trying ISO-8859-1
Traceback (most recent call last):
  File "../devml/mkdata.py", line 46, in log_to_dict
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 69960305: invalid continuation byte
2017-10-22 10:07:26,891 - devml.mkdata - INFO - Found 656006 Messages For Repo: b'linux'
2017-10-22 10:07:26,904 - devml.mkdata - INFO - Found repo: /Users/noahgift/src/linux/samples
2017-10-22 10:07:26,905 - devml.mkdata - INFO - Processing Repo: /Users/noahgift/src/linux/samples
2017-10-22 10:07:27,015 - devml.mkdata - INFO - Repo Name: b'linux'
2017-10-22 10:07:36,074 - devml.mkdata - ERROR - utf8 encoding is incorrect, trying ISO-8859-1
Traceback (most recent call last):
  File "../devml/mkdata.py", line 46, in log_to_dict
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 69960305: invalid continuation byte
2017-10-22 10:07:40,471 - devml.mkdata - INFO - Found 656006 Messages For Repo: b'linux'
2017-10-22 10:07:40,483 - devml.mkdata - INFO - Found repo: /Users/noahgift/src/linux/certs
2017-10-22 10:07:40,484 - devml.mkdata - INFO - Processing Repo: /Users/noahgift/src/linux/certs
2017-10-22 10:07:40,612 - devml.mkdata - INFO - Repo Name: b'linux'
2017-10-22 10:07:49,651 - devml.mkdata - ERROR - utf8 encoding is incorrect, trying ISO-8859-1
Traceback (most recent call last):
  File "../devml/mkdata.py", line 46, in log_to_dict
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 69960305: invalid continuation byte
2017-10-22 10:07:54,295 - devml.mkdata - INFO - Found 656006 Messages For Repo: b'linux'
2017-10-22 10:07:54,308 - devml.mkdata - INFO - Found repo: /Users/noahgift/src/linux/arch
2017-10-22 10:07:54,308 - devml.mkdata - INFO - Processing Repo: /Users/noahgift/src/linux/arch
2017-10-22 10:07:54,440 - devml.mkdata - INFO - Repo Name: b'linux'
2017-10-22 10:08:03,664 - devml.mkdata - ERROR - utf8 encoding is incorrect, trying ISO-8859-1
Traceback (most recent call last):
  File "../devml/mkdata.py", line 46, in log_to_dict
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 69960305: invalid continuation byte
2017-10-22 10:08:08,088 - devml.mkdata - INFO - Found 656006 Messages For Repo: b'linux'
2017-10-22 10:08:08,101 - devml.mkdata - INFO - Found repo: /Users/noahgift/src/linux/scripts
2017-10-22 10:08:08,101 - devml.mkdata - INFO - Processing Repo: /Users/noahgift/src/linux/scripts
2017-10-22 10:08:08,230 - devml.mkdata - INFO - Repo Name: b'linux'
2017-10-22 10:08:17,511 - devml.mkdata - ERROR - utf8 encoding is incorrect, trying ISO-8859-1
Traceback (most recent call last):
  File "../devml/mkdata.py", line 46, in log_to_dict
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 69960305: invalid continuation byte
2017-10-22 10:08:21,883 - devml.mkdata - INFO - Found 656006 Messages For Repo: b'linux'
2017-10-22 10:08:21,895 - devml.mkdata - INFO - Found repo: /Users/noahgift/src/linux/lib
2017-10-22 10:08:21,895 - devml.mkdata - INFO - Processing Repo: /Users/noahgift/src/linux/lib
2017-10-22 10:08:22,026 - devml.mkdata - INFO - Repo Name: b'linux'
2017-10-22 10:08:31,197 - devml.mkdata - ERROR - utf8 encoding is incorrect, trying ISO-8859-1
Traceback (most recent call last):
  File "../devml/mkdata.py", line 46, in log_to_dict
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 69960305: invalid continuation byte
2017-10-22 10:08:35,491 - devml.mkdata - INFO - Found 656006 Messages For Repo: b'linux'
2017-10-22 10:08:35,503 - devml.mkdata - INFO - Found repo: /Users/noahgift/src/linux/mm
2017-10-22 10:08:35,504 - devml.mkdata - INFO - Processing Repo: /Users/noahgift/src/linux/mm
2017-10-22 10:08:35,666 - devml.mkdata - INFO - Repo Name: b'linux'
2017-10-22 10:08:45,967 - devml.mkdata - ERROR - utf8 encoding is incorrect, trying ISO-8859-1
Traceback (most recent call last):
  File "../devml/mkdata.py", line 46, in log_to_dict
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 69960305: invalid continuation byte
2017-10-22 10:08:50,974 - devml.mkdata - INFO - Found 656006 Messages For Repo: b'linux'
2017-10-22 10:08:50,986 - devml.mkdata - INFO - Found repo: /Users/noahgift/src/linux/ipc
2017-10-22 10:08:50,987 - devml.mkdata - INFO - Processing Repo: /Users/noahgift/src/linux/ipc
2017-10-22 10:08:51,162 - devml.mkdata - INFO - Repo Name: b'linux'
2017-10-22 10:09:00,294 - devml.mkdata - ERROR - utf8 encoding is incorrect, trying ISO-8859-1
Traceback (most recent call last):
  File "../devml/mkdata.py", line 46, in log_to_dict
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 69960305: invalid continuation byte
2017-10-22 10:09:04,716 - devml.mkdata - INFO - Found 656006 Messages For Repo: b'linux'
2017-10-22 10:09:04,729 - devml.mkdata - INFO - Found repo: /Users/noahgift/src/linux/.git
2017-10-22 10:09:04,730 - devml.mkdata - INFO - Processing Repo: /Users/noahgift/src/linux/.git
2017-10-22 10:09:04,919 - devml.mkdata - INFO - Repo Name: b''
2017-10-22 10:09:14,384 - devml.mkdata - ERROR - utf8 encoding is incorrect, trying ISO-8859-1
Traceback (most recent call last):
  File "../devml/mkdata.py", line 46, in log_to_dict
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 69960305: invalid continuation byte
2017-10-22 10:09:19,986 - devml.mkdata - INFO - Found 656006 Messages For Repo: b''
2017-10-22 10:09:19,998 - devml.mkdata - INFO - Found repo: /Users/noahgift/src/linux/fs
2017-10-22 10:09:19,999 - devml.mkdata - INFO - Processing Repo: /Users/noahgift/src/linux/fs
2017-10-22 10:09:20,175 - devml.mkdata - INFO - Repo Name: b'linux'
2017-10-22 10:09:30,567 - devml.mkdata - ERROR - utf8 encoding is incorrect, trying ISO-8859-1
Traceback (most recent call last):
  File "../devml/mkdata.py", line 46, in log_to_dict
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 69960305: invalid continuation byte
2017-10-22 10:09:35,468 - devml.mkdata - INFO - Found 656006 Messages For Repo: b'linux'
2017-10-22 10:09:35,481 - devml.mkdata - INFO - Found repo: /Users/noahgift/src/linux/sound
2017-10-22 10:09:35,482 - devml.mkdata - INFO - Processing Repo: /Users/noahgift/src/linux/sound
2017-10-22 10:09:35,669 - devml.mkdata - INFO - Repo Name: b'linux'
2017-10-22 10:09:45,675 - devml.mkdata - ERROR - utf8 encoding is incorrect, trying ISO-8859-1
Traceback (most recent call last):
  File "../devml/mkdata.py", line 46, in log_to_dict
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 69960305: invalid continuation byte
2017-10-22 10:09:51,218 - devml.mkdata - INFO - Found 656006 Messages For Repo: b'linux'
2017-10-22 10:09:51,230 - devml.mkdata - INFO - Found repo: /Users/noahgift/src/linux/kernel
2017-10-22 10:09:51,231 - devml.mkdata - INFO - Processing Repo: /Users/noahgift/src/linux/kernel
2017-10-22 10:09:51,453 - devml.mkdata - INFO - Repo Name: b'linux'
2017-10-22 10:10:01,581 - devml.mkdata - ERROR - utf8 encoding is incorrect, trying ISO-8859-1
Traceback (most recent call last):
  File "../devml/mkdata.py", line 46, in log_to_dict
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 69960305: invalid continuation byte
2017-10-22 10:10:06,696 - devml.mkdata - INFO - Found 656006 Messages For Repo: b'linux'
2017-10-22 10:10:06,709 - devml.mkdata - INFO - Found repo: /Users/noahgift/src/linux/block
2017-10-22 10:10:06,709 - devml.mkdata - INFO - Processing Repo: /Users/noahgift/src/linux/block
2017-10-22 10:10:06,934 - devml.mkdata - INFO - Repo Name: b'linux'
2017-10-22 10:10:16,418 - devml.mkdata - ERROR - utf8 encoding is incorrect, trying ISO-8859-1
Traceback (most recent call last):
  File "../devml/mkdata.py", line 46, in log_to_dict
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 69960305: invalid continuation byte
2017-10-22 10:10:22,127 - devml.mkdata - INFO - Found 656006 Messages For Repo: b'linux'
2017-10-22 10:10:22,140 - devml.mkdata - INFO - Found a total log entries: 15088138
2017-10-22 10:12:37,844 - devml.mkdata - INFO - Changing back to original cwd: /Users/noahgift/src/linux from /Users/noahgift/src/linux/block

In [23]:
def author_active_days(df):
    """Collect the active days of every author into one wide DataFrame.

    The input is grouped by its ``author_name`` column and ``unique_days`` is
    called on each group (presumably yielding the distinct days on which that
    author committed -- the helper is defined outside this cell). The result
    has one row per author, indexed by ``author_name``; the values in a row
    are that author's days.

    Example of how to query this:
    ad = author_active_days(df)
    active_days = ad.loc["Armin Ronacher"].count()
    Out[98]: 960

    """

    days_by_author = {
        author: unique_days(commits)
        for author, commits in df.groupby("author_name")
    }
    # orient='index' makes each dict key (author) a row label.
    wide = DataFrame.from_dict(days_by_author, orient='index')
    wide.index.name = "author_name"
    return wide

def author_unique_active_days(df, sort_by="active_days"):
    """Summarise each author's distinct active days, sorted descending.

    Built on ``author_active_days(df)``. The result has one row per author:

    * ``author_name``     -- row label taken from the active-days frame
    * ``active_days``     -- number of distinct, non-missing days
    * ``active_duration`` -- last active day minus first active day
    * ``active_ratio``    -- ``active_days / active_duration.dt.days``,
      rounded to two decimals

    Illustrative output:

        author_name	active_days
        46	Armin Ronacher	271
        260	Markus Unterwaditzer	145

    Parameters
    ----------
    df : DataFrame
        Commit data; passed unchanged to ``author_active_days``.
    sort_by : str, default "active_days"
        Column used for the final descending sort.

    Returns
    -------
    DataFrame
        The per-author summary. When there are no authors an empty frame with
        the same four columns is returned instead of raising.

    Notes
    -----
    * The first row of the unsorted summary is always discarded. The original
      comment says that row is the author "=" (apparently a junk entry in the
      data this was written for -- TODO confirm it is still wanted).
    * An author whose first and last active day coincide has a zero-day
      duration, so the ratio is a division by zero (pandas yields ``inf``).
    """

    ad = author_active_days(df)

    author_list = []
    count_list = []
    duration_active_list = []
    for author in ad.index:
        # Chain the clean-up steps instead of mutating the row slice in place.
        days = ad.loc[author].dropna().drop_duplicates().sort_values()
        author_list.append(author)
        count_list.append(len(days))
        # Positional access avoids a label lookup that fails on an empty
        # Series; an author with no valid day gets a missing duration.
        duration_active_list.append(
            days.iloc[-1] - days.iloc[0] if len(days) else pd.NaT
        )

    df_author_ud = DataFrame({
        "author_name": author_list,
        "active_days": count_list,
        # Explicit conversion keeps the `.dt` accessor usable even when the
        # list is empty or holds only missing values.
        "active_duration": pd.to_timedelta(duration_active_list),
    })
    # An empty list would otherwise leave this column with object dtype.
    df_author_ud["active_days"] = df_author_ud["active_days"].astype("int64")
    df_author_ud["active_ratio"] = (
        df_author_ud["active_days"] / df_author_ud["active_duration"].dt.days
    ).round(2)
    df_author_ud = df_author_ud.iloc[1:]  # first row is =
    return df_author_ud.sort_values(by=sort_by, ascending=False)

In [27]:
# unique_days has to exist as a notebook global before the call below:
# author_active_days looks the name up when it runs.
from devml.ts import unique_days

activity_counts = author_unique_active_days(df, sort_by="active_days")

In [50]:
# Horizontal bar chart of the first 25 rows of activity_counts (sorted by
# active_days, descending, in the cell above).
ac_25 = activity_counts.head(25)
# Create the figure and its single Axes together and draw on that Axes
# explicitly. Calling plt.axes() after plt.subplots() adds a second,
# overlapping Axes on current matplotlib releases.
fig, ax = plt.subplots(figsize=(20, 15))
sns.barplot(x="active_days", y="author_name", data=ac_25,
            label="Active Days", color="b", ax=ax)
# Add a legend and informative axis label
ax.legend(ncol=2, loc="lower right", frameon=True)
# NOTE: the x-range is hard-coded for this dataset; bars outside 500-2000
# active days are clipped.
ax.set(xlim=(500, 2000), ylabel="",
       xlabel="Active Days on Linux Development")
sns.despine(left=True, bottom=True)



In [ ]: