In [1]:
%matplotlib inline
from bigbang.archive import Archive
import bigbang.entity_resolution
from git_data import Repos;
import matplotlib.pyplot as plt
from matplotlib import animation, colors
import pylab
import numpy as np
import pandas as pd
from IPython.display import display # Used to display widgets in the notebook
from IPython.display import clear_output

Introduction

In group efforts, there is sometimes the impression that there are those who work, and those who talk. A naive question to ask is whether or not the people that tend to talk a lot actually get any work done. This is an obviously and purposefully obtuse question with an interesting answer.

We can use BigBang's newest feature, git data collection, to compare all of the contributors to a project, in this case Scipy, based on their email and git commit activity. The hypothesis in this case was that people who commit a lot will also tend to email a lot, and vice versa, since their involvement in a project would usually require them to do both. This hypothesis was proven to be correct. However, the data reveals many more interesting phenomenon.


In [2]:
# Load the raw email and git data
url = "http://mail.scipy.org/pipermail/scipy-dev/"
arx = Archive(url,archive_dir="../archives")
mailInfo = arx.data
repo = Repos.get_repo("scipy")
gitInfo = repo.commit_data;

Entity Resolution

Git and Email data comes from two different datatables. To observe a single person's git and email data, we need a way to identify that person across the two different datatables.

To solve this problem, I wrote an entity resolution client that will parse a Pandas dataframe and add a new column to it called "Person-ID" which gives each row an ID that represents one unique contributor. A person may go by many names ("Robert Smith, Rob B. Smith, Bob S., etc.) and use many different emails. However, this client will read through these data tables in one pass and consolidate these identities based on a few strategies.


In [3]:
entityResolve = bigbang.entity_resolution.entityResolve
mailAct = mailInfo.apply(entityResolve, axis=1, args =("From",None))
gitAct = gitInfo.apply(entityResolve, axis=1, args =("Committer Email","Committer Name"))

After we've run entity resolution on our dataframes, we split the dataframe into slices based on time. So for the entire life-span of the project, we will have NUM_SLICES different segments to analyze. We will be able to look at the git and email data up until that certain date, which can let us analyze these changes over time.


In [4]:
NUM_SLICES = 1500 # Number of animation frames. More means more loading time

In [5]:
mailAct.sort("Date")
gitAct.sort("Time")

def getSlices(df, numSlices):
    sliceSize = len(df)/numSlices
    slices = []
    for i in range(1, numSlices + 1):
        start = 0
        next = (i)*sliceSize;
        next = min(next, len(df)-1) # make sure we don't go out of bounds
        
        slice = df.iloc[start:next]
        slices.append(slice)
    return slices

mailSlices = getSlices(mailAct, NUM_SLICES)
gitSlices = getSlices(gitAct, NUM_SLICES)

Merging Data Tables

Now we want to merge these two tables based on their Person-ID values. Basically, we first count how many emails / commits a certain contributor had in a certain slice. We then join all the rows with the same Person-ID to each other, so that we have the number of emails and the number of commits of each person in one row per person in one consolidated dataframe. We then delete all the rows where both of these values aren't defined. These represent people for whom we have git data but not mail data, or vice versa.


In [6]:
def processSlices(slices) :
    for i in range(len(slices)):
        slice = slices[i]
        slice = slice.groupby("Person-ID").size()
        slice.sort()
        slices[i] = slice

def concatSlices(slicesA, slicesB) :
    # assumes they have the same number of slices
    # First is emails, second is commits
    ansSlices = []
    for i in range(len(slicesA)):
        sliceA = slicesA[i]
        sliceB = slicesB[i]
        ans = pd.concat({"Emails" : sliceA, "Commits": sliceB}, axis = 1)
        ans = ans[pd.notnull(ans["Emails"])]
        ans = ans[pd.notnull(ans["Commits"])]
        ansSlices.append(ans);
    return ansSlices

processSlices(mailSlices)
processSlices(gitSlices)

finalSlices = concatSlices(mailSlices, gitSlices)

Coloring

We now assign a float value [0 --> 1] to each person. This isn't neccesary, but can let us graph these changes in a scatter plot and give each contributor a unique color to differentiate them. This will help us track an individual as their dot travels over time.


In [7]:
def idToFloat(id):
    return id*1.0/400.0;

for i in range(len(finalSlices)):
    slice = finalSlices[i]
    toSet = []
    
    for i in slice.index.values:
        i = idToFloat(i)
        toSet.append(i)
    slice["color"] = toSet

Here we graph our data. Each dot represents a unique contributor's number of emails and commits. As you'll notice, the graph is on a log-log scale.


In [8]:
data = finalSlices[len(finalSlices)-1] # Will break if there are 0 slices
fig = plt.figure(figsize=(8, 8))

d = data
x = d["Emails"]
y = d["Commits"]
c = d["color"]
ax = plt.axes(xscale='log', yscale = 'log')


plt.scatter(x, y, c=c, s=75)
plt.ylim(0, 10000)
plt.xlim(0, 10000)
ax.set_xlabel("Emails")
ax.set_ylabel("Commits")
plt.plot([0, 1000],[0, 1000], linewidth=5)

plt.show()


Animations

Below this point, you'll find the code for generating animations. This can take a long time (~30 mins) for a large number of slices. However, the pre-generated videos are below.

The first video just shows all the contributors over time without unique colors. The second video has a color for each contributor, but also contains a Matplotlib bug where the minimum x and y values for the axes is not followed.

There is a lot to observe. As to our hypothesis, it's clear that people who email more commit more. In our static graph, we could see many contributors on the x-axis -- people who only email -- but this dynamic graph allows us to see the truth. While it may seem that they're people who only email, the video shows that even these contributors eventually start committing. Most committers don't really get past 10 commits without starting to email the rest of the project, for pretty clear reasons. However, the emailers can "get away with" exclusively emailing for longer, but eventually they too start to commit. In general, not only is there a positive correlation, there's a general trend of everyone edging close to having a stable and relatively equal ratio of commits to emails.


In [9]:
from IPython.display import YouTubeVideo
display(YouTubeVideo('GCcYJBq1Bcc', width=500, height=500))
display(YouTubeVideo('uP-z4jJqxmI', width=500, height=500))



In [10]:
fig = plt.figure(figsize=(8, 8))
a = finalSlices[0]

print type(plt)
ax = plt.axes(xscale='log', yscale = 'log')
graph, = ax.plot(x ,y, 'o', c='red', alpha=1, markeredgecolor='none')
ax.set_xlabel("Emails")
ax.set_ylabel("Commits")
plt.ylim(0, 10000)
plt.xlim(0, 10000)

def init():
    graph.set_data([],[]);
    return graph,

def animate(i):
    a = finalSlices[i]
    x = a["Emails"]
    y = a["Commits"]
    graph.set_data(x, y)
    return graph,

anim = animation.FuncAnimation(fig, animate, init_func=init,
                               frames=NUM_SLICES, interval=1, blit=True)

anim.save('t1.mp4', fps=15)


<type 'module'>
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-10-ae198c0477b0> in <module>()
     25                                frames=NUM_SLICES, interval=1, blit=True)
     26 
---> 27 anim.save('t1.mp4', fps=15)

/home/aryan/anaconda/envs/bigbang/lib/python2.7/site-packages/matplotlib/animation.pyc in save(self, filename, writer, fps, dpi, codec, bitrate, extra_args, metadata, extra_anim, savefig_kwargs)
    716                     #TODO: Need to see if turning off blit is really necessary
    717                     anim._draw_next_frame(d, blit=False)
--> 718                 writer.grab_frame(**savefig_kwargs)
    719 
    720         # Reconnect signal for first draw if necessary

/home/aryan/anaconda/envs/bigbang/lib/python2.7/site-packages/matplotlib/animation.pyc in grab_frame(self, **savefig_kwargs)
    202             # frame format and dpi.
    203             self.fig.savefig(self._frame_sink(), format=self.frame_format,
--> 204                              dpi=self.dpi, **savefig_kwargs)
    205         except RuntimeError:
    206             out, err = self._proc.communicate()

/home/aryan/anaconda/envs/bigbang/lib/python2.7/site-packages/matplotlib/figure.pyc in savefig(self, *args, **kwargs)
   1419             self.set_frameon(frameon)
   1420 
-> 1421         self.canvas.print_figure(*args, **kwargs)
   1422 
   1423         if frameon:

/home/aryan/anaconda/envs/bigbang/lib/python2.7/site-packages/matplotlib/backend_bases.pyc in print_figure(self, filename, dpi, facecolor, edgecolor, orientation, format, **kwargs)
   2218                 orientation=orientation,
   2219                 bbox_inches_restore=_bbox_inches_restore,
-> 2220                 **kwargs)
   2221         finally:
   2222             if bbox_inches and restore_bbox:

/home/aryan/anaconda/envs/bigbang/lib/python2.7/site-packages/matplotlib/backends/backend_agg.pyc in print_raw(self, filename_or_obj, *args, **kwargs)
    485 
    486     def print_raw(self, filename_or_obj, *args, **kwargs):
--> 487         FigureCanvasAgg.draw(self)
    488         renderer = self.get_renderer()
    489         original_dpi = renderer.dpi

/home/aryan/anaconda/envs/bigbang/lib/python2.7/site-packages/matplotlib/backends/backend_agg.pyc in draw(self)
    449 
    450         try:
--> 451             self.figure.draw(self.renderer)
    452         finally:
    453             RendererAgg.lock.release()

/home/aryan/anaconda/envs/bigbang/lib/python2.7/site-packages/matplotlib/artist.pyc in draw_wrapper(artist, renderer, *args, **kwargs)
     53     def draw_wrapper(artist, renderer, *args, **kwargs):
     54         before(artist, renderer)
---> 55         draw(artist, renderer, *args, **kwargs)
     56         after(artist, renderer)
     57 

/home/aryan/anaconda/envs/bigbang/lib/python2.7/site-packages/matplotlib/figure.pyc in draw(self, renderer)
   1032         dsu.sort(key=itemgetter(0))
   1033         for zorder, a, func, args in dsu:
-> 1034             func(*args)
   1035 
   1036         renderer.close_group('figure')

/home/aryan/anaconda/envs/bigbang/lib/python2.7/site-packages/matplotlib/artist.pyc in draw_wrapper(artist, renderer, *args, **kwargs)
     53     def draw_wrapper(artist, renderer, *args, **kwargs):
     54         before(artist, renderer)
---> 55         draw(artist, renderer, *args, **kwargs)
     56         after(artist, renderer)
     57 

/home/aryan/anaconda/envs/bigbang/lib/python2.7/site-packages/matplotlib/axes.pyc in draw(self, renderer, inframe)
   2084 
   2085         for zorder, a in dsu:
-> 2086             a.draw(renderer)
   2087 
   2088         renderer.close_group('axes')

/home/aryan/anaconda/envs/bigbang/lib/python2.7/site-packages/matplotlib/artist.pyc in draw_wrapper(artist, renderer, *args, **kwargs)
     53     def draw_wrapper(artist, renderer, *args, **kwargs):
     54         before(artist, renderer)
---> 55         draw(artist, renderer, *args, **kwargs)
     56         after(artist, renderer)
     57 

/home/aryan/anaconda/envs/bigbang/lib/python2.7/site-packages/matplotlib/axis.pyc in draw(self, renderer, *args, **kwargs)
   1094 
   1095         for tick in ticks_to_draw:
-> 1096             tick.draw(renderer)
   1097 
   1098         # scale up the axis label box to also find the neighbors, not

/home/aryan/anaconda/envs/bigbang/lib/python2.7/site-packages/matplotlib/artist.pyc in draw_wrapper(artist, renderer, *args, **kwargs)
     53     def draw_wrapper(artist, renderer, *args, **kwargs):
     54         before(artist, renderer)
---> 55         draw(artist, renderer, *args, **kwargs)
     56         after(artist, renderer)
     57 

/home/aryan/anaconda/envs/bigbang/lib/python2.7/site-packages/matplotlib/axis.pyc in draw(self, renderer)
    234                 self.gridline.draw(renderer)
    235             if self.tick1On:
--> 236                 self.tick1line.draw(renderer)
    237             if self.tick2On:
    238                 self.tick2line.draw(renderer)

/home/aryan/anaconda/envs/bigbang/lib/python2.7/site-packages/matplotlib/artist.pyc in draw_wrapper(artist, renderer, *args, **kwargs)
     53     def draw_wrapper(artist, renderer, *args, **kwargs):
     54         before(artist, renderer)
---> 55         draw(artist, renderer, *args, **kwargs)
     56         after(artist, renderer)
     57 

/home/aryan/anaconda/envs/bigbang/lib/python2.7/site-packages/matplotlib/lines.pyc in draw(self, renderer)
    576 
    577             marker = self._marker
--> 578             tpath, affine = transf_path.get_transformed_points_and_affine()
    579             if len(tpath.vertices):
    580                 # subsample the markers if markevery is not None

/home/aryan/anaconda/envs/bigbang/lib/python2.7/site-packages/matplotlib/transforms.pyc in get_transformed_points_and_affine(self)
   2551         be performed.
   2552         """
-> 2553         self._revalidate()
   2554         return self._transformed_points, self.get_affine()
   2555 

/home/aryan/anaconda/envs/bigbang/lib/python2.7/site-packages/matplotlib/transforms.pyc in _revalidate(self)
   2537             or self._transformed_path is None):
   2538             self._transformed_path = \
-> 2539                 self._transform.transform_path_non_affine(self._path)
   2540             self._transformed_points = \
   2541                 Path(self._transform.transform_non_affine(self._path.vertices),

/home/aryan/anaconda/envs/bigbang/lib/python2.7/site-packages/matplotlib/transforms.pyc in transform_path_non_affine(self, path)
   1365         ``transform_path_affine(transform_path_non_affine(values))``.
   1366         """
-> 1367         return Path(self.transform_non_affine(path.vertices), path.codes,
   1368                     path._interpolation_steps)
   1369 

/home/aryan/anaconda/envs/bigbang/lib/python2.7/site-packages/matplotlib/transforms.pyc in transform_non_affine(self, points)
   2006 
   2007         if y.input_dims == 2:
-> 2008             y_points = y.transform_non_affine(points)[:, 1:]
   2009         else:
   2010             y_points = y.transform_non_affine(points[:, 1])

/home/aryan/anaconda/envs/bigbang/lib/python2.7/site-packages/matplotlib/transforms.pyc in transform_non_affine(self, points)
   2215             return points
   2216         elif not self._a.is_affine and self._b.is_affine:
-> 2217             return self._a.transform_non_affine(points)
   2218         else:
   2219             return self._b.transform_non_affine(

/home/aryan/anaconda/envs/bigbang/lib/python2.7/site-packages/matplotlib/transforms.pyc in transform_non_affine(self, points)
   2002             x_points = x.transform_non_affine(points)[:, 0:1]
   2003         else:
-> 2004             x_points = x.transform_non_affine(points[:, 0])
   2005             x_points = x_points.reshape((len(x_points), 1))
   2006 

/home/aryan/anaconda/envs/bigbang/lib/python2.7/site-packages/matplotlib/scale.pyc in transform_non_affine(self, a)
    117 
    118     def transform_non_affine(self, a):
--> 119         a = self._handle_nonpos(a * 10.0)
    120         if isinstance(a, ma.MaskedArray):
    121             return ma.log10(a)

/home/aryan/anaconda/envs/bigbang/lib/python2.7/site-packages/matplotlib/scale.pyc in _mask_non_positives(a)
     89     """
     90     mask = a <= 0.0
---> 91     if mask.any():
     92         return ma.MaskedArray(a, mask=mask)
     93     return a

/home/aryan/anaconda/envs/bigbang/lib/python2.7/site-packages/numpy/core/_methods.pyc in _any(a, axis, dtype, out, keepdims)
     24 def _any(a, axis=None, dtype=None, out=None, keepdims=False):
     25     return um.logical_or.reduce(a, axis=axis, dtype=dtype, out=out,
---> 26                                 keepdims=keepdims)
     27 
     28 def _all(a, axis=None, dtype=None, out=None, keepdims=False):

KeyboardInterrupt: 

In [ ]:
def main():

    data = finalSlices
    first = finalSlices[0]
    fig = plt.figure(figsize=(8, 8))

    d = data
    x = d[0]["Emails"]
    y = d[0]["Commits"]
    c = d[0]["color"]
    
    ax = plt.axes(xscale='log', yscale='log')
    scat = plt.scatter(x, y, c=c, s=100)
    plt.ylim(0, 10000)
    plt.xlim(0, 10000)
    plt.xscale('log')
    plt.yscale('log')
    ani = animation.FuncAnimation(fig, update_plot, frames=NUM_SLICES,
                                          fargs=(data, scat), blit=True)
    ani.save('test.mp4', fps=10)
    #plt.show()


def update_plot(i, d, scat):
    x = d[i]["Emails"]
    y = d[i]["Commits"]
    c = d[i]["color"]
    plt.cla()
    ax = plt.axes()
    ax.set_xscale('log')
    ax.set_yscale('log')

    scat = plt.scatter(x, y, c=c, s=100)
    
    plt.ylim(0, 10000)
    plt.xlim(0, 10000)
    plt.xlabel("Emails")
    plt.ylabel("Commits")
    return scat,

main()