Productive Bursts

Load Modules and Main Variables


In [73]:
from urllib2 import urlopen
import json
import re
import pandas as pd
from datetime import datetime
import time
import requests
import matplotlib
from scipy import stats as S
import sys
import numpy as np

sys.path.append("./")
#from tm_python_lib import *


#from settings import github_token
github_token = "819cb7cace7f4a01e41ac5d4e3a9f033c4c6bc15"

ISO8601 = "%Y-%m-%dT%H:%M:%SZ"

settings = {
            'project': 'prabhamatta',
            'repo': 'nlp_information_extraction',
            'github_token': github_token
            }

GitHub API Functions


In [62]:
# from https://github.com/nipy/dipy/blob/master/tools/github_stats.py

element_pat = re.compile(r'<(.+?)>')
rel_pat = re.compile(r'rel=[\'"](\w+)[\'"]')

def parseLinkHeaders(headers):
    link_s = headers.get('link', '')
    urls = element_pat.findall(link_s)
    rels = rel_pat.findall(link_s)
    d = {}
    for rel,url in zip(rels, urls):
        d[rel] = url
    return d

def getPagedRequest(url):
    """get a full list, handling APIv3's paging"""
    results = []
    while url:
        print("fetching %s" % url)
        f = urlopen(url)
        results_json = json.load(f)
        if type(results_json) == list:
            results.extend(results_json)
        else:
            results.extend([results_json])
        
        links = parseLinkHeaders(f.headers)
        url = links.get('next')
        time.sleep(0.25)
    return results

Functions to Fetch from GitHub API

Functions to Make Data Frames


In [63]:
def createCommitsDataframe(commits_json=None, refresh=False):
    
    if not refresh:
        try:
            print 'trying to open commits.json'
            with open('commits.json') as infile:
                print 'reading commits.json'
                df = pd.read_json(infile)
                
                # put timestamp column datatime object
                df['timestamp'] = pd.to_datetime(df.timestamp)
                df.index = df.timestamp
                return df
        except:
            print 'no commits.json found, getting commits and creating DataFrame'
            
    if not commits_json:
        commits_json = getCommits(settings, project_url)
    
    #return  
    df = pd.DataFrame(commits_json)
    
    def addCommitAuthor(row):
        # Commit can have different committer and author
        if row['author']:
            if row['author']['login']:
                return row['author']['login']
            
    def addCommitStats(sha):
        ## API limit is 5000 per hour - TODO: slow  down our calls if we're going to hit
        # the limit
        commit = getCommitInfo(settings, project_url, sha)[0]
        return commit['stats']['additions'], commit['stats']['deletions']
    
    def addCommitDate(commit):
        return commit['author']['date']

    df['author_name'] = df.apply(addCommitAuthor, axis=1)
    ## Out until we can control staying within the API Limit
    #df['additions'], df['deletions'] = zip(*df.sha.apply(addCommitStats))
    df['timestamp'] = df['commit'].apply(addCommitDate)
    df['timestamp'] = pd.to_datetime(df.timestamp)
    df.index = df.timestamp
    print df.head()
    print "returning from createCommitsDataframe..."
    return df

In [64]:
def createBurstsDataframe(commits_df,t_resol='5D',periods='250D'):
    
    count_commits = commits_df.commit.resample(t_resol,how='count')
    count_authors = commits_df.author_name.resample(t_resol,how=uniquerAuthors)
    Time  = count_commits.index.values
    commits250 = commits_df.commit.resample('250D',how='count')
    authors250 = commits_df.author_name.resample('250D',how=uniquerAuthors)
    Time250  = commits250.index.values
    
    
    contributors = []
    commits = []

    beta = []
    p = []
    r = []
    intercept = []
    
    
    for i,ix in enumerate(Time250[:-1]):
        c = (count_commits.index > ix)*(count_commits.index < Time250[i+1])
        cc = count_commits[c].values
        ca = count_authors[c].values
        c = (cc > 0)*(ca > 0)

        lcc = np.log10(cc[c])
        lca = np.log10(ca[c])

        fit =  S.linregress(lca,lcc)
        #print i,ix,"beta=%.2f (p=%.2f,r=%.2f)"%(fit[0],fit[3],fit[1])
    
        commits.append(cc)
        contributors.append(ca)
        beta.append(fit[0])
        p.append(fit[3])
        r.append(fit[2])
        intercept.append(fit[1])
        
    dico = { 't1' : Time250[:-1],'t2': Time250[1:], "beta" : beta, "p" : p, "r" : r, "commits" : commits, "contributors" : contributors,"intercept":intercept}
    #dico2 = { 'timestamp' : Time250[:-1], "beta" : beta, "p" : p, "r" : r}
    bursts_df = pd.DataFrame(dico,index=dico['t1'])
    
    return bursts_df

Some Functions for Analysis


In [65]:
def uniquerAuthors(auth):
    return len(set([a for a in auth if a != None]))

Load and Fetch Data + Create DataFrame


In [66]:
#print datetime.fromtimestamp(0).strftime(ISO8601) # for github since param
project_url = 'https://api.github.com/repos/{project}/{repo}'.format(**settings)

def getIssues(settings, project_url, state='all'):
    url = project_url + '/issues?access_token={0}&state={1}'.format(settings['github_token'],state)
    return getPagedRequest(url)

def getIssueComments(settings, project_url, issue_id):
    # GET /repos/:owner/:repo/issues/:number/comments
    url = project_url + '/issues/{0}/comments?access_token={1}'.format(
                                                                        issue_id,
                                                                        settings['github_token'])
    return getPagedRequest(url)

def getCommits(settings, project_url):
    # GET /repos/:owner/:repo/commits
    url = project_url + '/commits?access_token={0}'.format(settings['github_token'])
    return getPagedRequest(url)

def getCommitInfo(settings, project_url, sha):
    # GET /repos/:owner/:repo/commits/:sha
    url = project_url + '/commits/{0}?access_token={1}'.format(sha, settings['github_token'])
    #print json.load(urlopen(url))
    return getPagedRequest(url)
    
def getReleases(settings, project_url):
    # GET /repos/:owner/:repo/releases
    pass

In [67]:
project_url


Out[67]:
'https://api.github.com/repos/prabhamatta/nlp_information_extraction'

In [68]:
#xxx = getCommitInfo(settings, project_url, 'd31f0a266863e23ddfb5f68d48059a5d67f6d683')[0]

In [69]:
commits_df = createCommitsDataframe()
#commits_df.head()


trying to open commits.json
no commits.json found, getting commits and creating DataFrame
fetching https://api.github.com/repos/prabhamatta/nlp_information_extraction/commits?access_token=819cb7cace7f4a01e41ac5d4e3a9f033c4c6bc15
                                                                author  \
timestamp                                                                
2013-11-13 17:19:58  {u'following_url': u'https://api.github.com/us...   
2013-11-13 08:39:49  {u'following_url': u'https://api.github.com/us...   
2013-11-08 11:02:11  {u'following_url': u'https://api.github.com/us...   
2013-11-08 10:59:45  {u'following_url': u'https://api.github.com/us...   
2013-11-08 09:11:43  {u'following_url': u'https://api.github.com/us...   

                                                          comments_url  \
timestamp                                                                
2013-11-13 17:19:58  https://api.github.com/repos/prabhamatta/nlp_i...   
2013-11-13 08:39:49  https://api.github.com/repos/prabhamatta/nlp_i...   
2013-11-08 11:02:11  https://api.github.com/repos/prabhamatta/nlp_i...   
2013-11-08 10:59:45  https://api.github.com/repos/prabhamatta/nlp_i...   
2013-11-08 09:11:43  https://api.github.com/repos/prabhamatta/nlp_i...   

                                                                commit  \
timestamp                                                                
2013-11-13 17:19:58  {u'committer': {u'date': u'2013-11-13T17:19:58...   
2013-11-13 08:39:49  {u'committer': {u'date': u'2013-11-13T08:39:49...   
2013-11-08 11:02:11  {u'committer': {u'date': u'2013-11-08T11:02:11...   
2013-11-08 10:59:45  {u'committer': {u'date': u'2013-11-08T10:59:45...   
2013-11-08 09:11:43  {u'committer': {u'date': u'2013-11-08T09:11:43...   

                                                             committer  \
timestamp                                                                
2013-11-13 17:19:58  {u'following_url': u'https://api.github.com/us...   
2013-11-13 08:39:49  {u'following_url': u'https://api.github.com/us...   
2013-11-08 11:02:11  {u'following_url': u'https://api.github.com/us...   
2013-11-08 10:59:45  {u'following_url': u'https://api.github.com/us...   
2013-11-08 09:11:43  {u'following_url': u'https://api.github.com/us...   

                                                              html_url  \
timestamp                                                                
2013-11-13 17:19:58  https://github.com/prabhamatta/nlp_information...   
2013-11-13 08:39:49  https://github.com/prabhamatta/nlp_information...   
2013-11-08 11:02:11  https://github.com/prabhamatta/nlp_information...   
2013-11-08 10:59:45  https://github.com/prabhamatta/nlp_information...   
2013-11-08 09:11:43  https://github.com/prabhamatta/nlp_information...   

                                                               parents  \
timestamp                                                                
2013-11-13 17:19:58  [{u'url': u'https://api.github.com/repos/prabh...   
2013-11-13 08:39:49  [{u'url': u'https://api.github.com/repos/prabh...   
2013-11-08 11:02:11  [{u'url': u'https://api.github.com/repos/prabh...   
2013-11-08 10:59:45  [{u'url': u'https://api.github.com/repos/prabh...   
2013-11-08 09:11:43  [{u'url': u'https://api.github.com/repos/prabh...   

                                                          sha  \
timestamp                                                       
2013-11-13 17:19:58  95c607f5802954ce3468285a5121cbd7bf3eaa6a   
2013-11-13 08:39:49  d7738690c76ac6c0b9d444d360b030dc7057e939   
2013-11-08 11:02:11  a80a619fb23e5c71f3687da3b983ffced5b0feea   
2013-11-08 10:59:45  40e8f43a3b7cb927fd9149b2835998c63fd15afd   
2013-11-08 09:11:43  c6f346f5791110cd302906504aa79b0c8cafbe09   

                                                                   url  \
timestamp                                                                
2013-11-13 17:19:58  https://api.github.com/repos/prabhamatta/nlp_i...   
2013-11-13 08:39:49  https://api.github.com/repos/prabhamatta/nlp_i...   
2013-11-08 11:02:11  https://api.github.com/repos/prabhamatta/nlp_i...   
2013-11-08 10:59:45  https://api.github.com/repos/prabhamatta/nlp_i...   
2013-11-08 09:11:43  https://api.github.com/repos/prabhamatta/nlp_i...   

                     author_name           timestamp  
timestamp                                             
2013-11-13 17:19:58  prabhamatta 2013-11-13 17:19:58  
2013-11-13 08:39:49  prabhamatta 2013-11-13 08:39:49  
2013-11-08 11:02:11  prabhamatta 2013-11-08 11:02:11  
2013-11-08 10:59:45  prabhamatta 2013-11-08 10:59:45  
2013-11-08 09:11:43  prabhamatta 2013-11-08 09:11:43  

[5 rows x 10 columns]
returning from createCommitsDataframe...

In [70]:
type(commits_df)


Out[70]:
pandas.core.frame.DataFrame

In [71]:
period = 200

bursts_df = createBurstsDataframe(commits_df,t_resol='5D',periods='%sD'%period)
bursts_df.head()


Out[71]:
<class 'pandas.tseries.index.DatetimeIndex'> Length: 0, Freq: None, Timezone: None Empty DataFrame

0 rows × 8 columns

Add greybar colummn commits_df


In [74]:
timestamps =  np.array(commits_df.index.values.astype('int'))

greybar = np.zeros_like(timestamps).astype('string')

bursts_timestamps = bursts_df.index.values.astype('int')[:-1]

for i,ix in enumerate(bursts_timestamps[:-1]):
    
    c = ( ix < timestamps )*( timestamps < bursts_timestamps[i+1])
    index = np.argwhere(timestamps[c])
    
    #print c[:1000]
    beta = bursts_df.beta[i]
    p = bursts_df.p[i]

    #print i,ix,bursts_timestamps[i+1],p,beta
    
    if beta > 1 and p < 0.1:
        greybar[index] = "#909090"
        #print "good"
    
    elif p >= 0.1:
        greybar[index] = "#FFFFFF"
        #print "blah"
    
    else:
        greybar[index] = "#FFFFFF"

In [75]:
commits_df['greybar'] = greybar
commits_df.head()


Out[75]:
author comments_url commit committer html_url parents sha url author_name timestamp greybar
timestamp
2013-11-13 17:19:58 {u'following_url': u'https://api.github.com/us... https://api.github.com/repos/prabhamatta/nlp_i... {u'committer': {u'date': u'2013-11-13T17:19:58... {u'following_url': u'https://api.github.com/us... https://github.com/prabhamatta/nlp_information... [{u'url': u'https://api.github.com/repos/prabh... 95c607f5802954ce3468285a5121cbd7bf3eaa6a https://api.github.com/repos/prabhamatta/nlp_i... prabhamatta 2013-11-13 17:19:58 0
2013-11-13 08:39:49 {u'following_url': u'https://api.github.com/us... https://api.github.com/repos/prabhamatta/nlp_i... {u'committer': {u'date': u'2013-11-13T08:39:49... {u'following_url': u'https://api.github.com/us... https://github.com/prabhamatta/nlp_information... [{u'url': u'https://api.github.com/repos/prabh... d7738690c76ac6c0b9d444d360b030dc7057e939 https://api.github.com/repos/prabhamatta/nlp_i... prabhamatta 2013-11-13 08:39:49 0
2013-11-08 11:02:11 {u'following_url': u'https://api.github.com/us... https://api.github.com/repos/prabhamatta/nlp_i... {u'committer': {u'date': u'2013-11-08T11:02:11... {u'following_url': u'https://api.github.com/us... https://github.com/prabhamatta/nlp_information... [{u'url': u'https://api.github.com/repos/prabh... a80a619fb23e5c71f3687da3b983ffced5b0feea https://api.github.com/repos/prabhamatta/nlp_i... prabhamatta 2013-11-08 11:02:11 0
2013-11-08 10:59:45 {u'following_url': u'https://api.github.com/us... https://api.github.com/repos/prabhamatta/nlp_i... {u'committer': {u'date': u'2013-11-08T10:59:45... {u'following_url': u'https://api.github.com/us... https://github.com/prabhamatta/nlp_information... [{u'url': u'https://api.github.com/repos/prabh... 40e8f43a3b7cb927fd9149b2835998c63fd15afd https://api.github.com/repos/prabhamatta/nlp_i... prabhamatta 2013-11-08 10:59:45 0
2013-11-08 09:11:43 {u'following_url': u'https://api.github.com/us... https://api.github.com/repos/prabhamatta/nlp_i... {u'committer': {u'date': u'2013-11-08T09:11:43... {u'following_url': u'https://api.github.com/us... https://github.com/prabhamatta/nlp_information... [{u'url': u'https://api.github.com/repos/prabh... c6f346f5791110cd302906504aa79b0c8cafbe09 https://api.github.com/repos/prabhamatta/nlp_i... prabhamatta 2013-11-08 09:11:43 0

5 rows × 11 columns

Plot Time Series


In [77]:
resampled_authors = commits_df.author_name.resample('5D',how=uniquerAuthors)
resampled_commits = commits_df.commit.resample('5D',how='count')
Time  = resampled_authors.index.values.astype('int')/1.e9/3600/24
Time = Time - Time.min()

Time = np.array(np.concatenate([[Time[0]],Time,[Time[-1]]]))
resampled_authors = np.concatenate([[0],resampled_authors,[0]])
resampled_commits = np.concatenate([[0],resampled_commits,[0]])

greybar = np.zeros_like(Time).astype('string')



#figure(1,(15,7))

# Plot a grey bar when a productive burst is detected 
timestamps= bursts_df.t1.astype('int')/1.e9/3600/24
timestamps = timestamps - timestamps.min()
for i,ix in enumerate(timestamps):
    beta = bursts_df.beta[i]
    p = bursts_df.p[i]
    
    ix2 = ix + period*1.e9*3600*24
    print ix,ix2
    c = ( ix <= Time )*( Time <= ix2 )
    #index = np.argwhere(Time[c])
    #print Time[c]
    
    if beta > 1 and p < 0.1:
        bar(ix,140,period,color='#909090',linewidth=0.5,alpha=0.6)
        greybar[c] = '#909090'
        
    else:
        greybar[c] = "#FFFFFF"
        
        
# Plot Time Series
fill(Time,resampled_commits,color="red")
fill(Time,resampled_authors,color="green")
xlim(xmax=Time[-1])
xlabel("Time [days]")
ylabel("Active Contributors (green), Commits (red)")


#print greybar


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-77-e45b69f310a8> in <module>()
     15 
     16 # Plot a grey bar when a productive burst is detected
---> 17 timestamps= bursts_df.t1.astype('int')/1.e9/3600/24
     18 timestamps = timestamps - timestamps.min()
     19 for i,ix in enumerate(timestamps):

/Library/Frameworks/Python.framework/Versions/7.3/lib/python2.7/site-packages/pandas/core/generic.pyc in astype(self, dtype, copy, raise_on_error)
   1781 
   1782         mgr = self._data.astype(
-> 1783             dtype, copy=copy, raise_on_error=raise_on_error)
   1784         return self._constructor(mgr).__finalize__(self)
   1785 

/Library/Frameworks/Python.framework/Versions/7.3/lib/python2.7/site-packages/pandas/core/internals.pyc in astype(self, *args, **kwargs)
   2306 
   2307     def astype(self, *args, **kwargs):
-> 2308         return self.apply('astype', *args, **kwargs)
   2309 
   2310     def convert(self, *args, **kwargs):

/Library/Frameworks/Python.framework/Versions/7.3/lib/python2.7/site-packages/pandas/core/internals.pyc in apply(self, f, *args, **kwargs)
   2265 
   2266             else:
-> 2267                 applied = getattr(blk, f)(*args, **kwargs)
   2268 
   2269             if isinstance(applied, list):

/Library/Frameworks/Python.framework/Versions/7.3/lib/python2.7/site-packages/pandas/core/internals.pyc in astype(self, dtype, copy, raise_on_error)
   1594             klass = ObjectBlock
   1595         return self._astype(dtype, copy=copy, raise_on_error=raise_on_error,
-> 1596                             klass=klass)
   1597 
   1598     def set(self, item, value):

/Library/Frameworks/Python.framework/Versions/7.3/lib/python2.7/site-packages/pandas/core/internals.pyc in _astype(self, dtype, copy, raise_on_error, values, klass)
    419             # force the copy here
    420             if values is None:
--> 421                 values = com._astype_nansafe(self.values, dtype, copy=True)
    422             newb = make_block(values, self.items, self.ref_items,
    423                               ndim=self.ndim, placement=self._ref_locs,

/Library/Frameworks/Python.framework/Versions/7.3/lib/python2.7/site-packages/pandas/core/common.pyc in _astype_nansafe(arr, dtype, copy)
   2098         elif dtype != _NS_DTYPE:
   2099             raise TypeError("cannot astype a datetimelike from [%s] to [%s]" %
-> 2100                             (arr.dtype, dtype))
   2101         return arr.astype(_NS_DTYPE)
   2102     elif is_timedelta64_dtype(arr):

TypeError: cannot astype a datetimelike from [datetime64[ns]] to [int32]

In [ ]:
S.spearmanr(resampled_authors,resampled_commits)

Export CSCV


In [ ]:
# Time Series

output = open("timeseries.csv",'wb')
output.write("time,commits,authors,greybar\n") 
for i,ix in enumerate(Time):
   output.write("%s,%s,%s,%s\n" %(ix,resampled_authors[i],resampled_commits[i],greybar[i]))

output.close()

In [ ]: