In [16]:
####Load Modules & Build DataFrames
project_dir = "/home/ubuntu/github/AstroWeekStudy/python_code/"

import sys
sys.path.append("/home/ubuntu/github/AstroWeekStudy/python_code/")

try:
    reload(loadData)
    reload(astroWeekLib)
except:
    import loadData
    import astroWeekLib
    
from loadData import *
from astroWeekLib import *


df,df2014,df2015,resampled = build_main_df()
df_users_created = prepareUserDf(df)
df_repos_created = build_df_repos_created(df2014)

In [25]:
def burstPlot(x,y):
    c = (y >0)*(x>0)
    lx = np.log10(x[c])
    ly = np.log10(y[c])

    fit = S.linregress(lx,ly)
    print fit

    plot(lx,ly,'o')
    plot(lx,lx*fit[0]+fit[1],'k-')
    
    return fit

In [18]:
x = resampled['activity']['repos'].values
y = resampled['activity']['events'].values
slope = burstPlot(x,y)[0]


(1.0715792063815721, 0.3692101491851032, 0.82470300793085649, 1.2607185455902662e-122, 0.033300100524602319)

In [19]:
sAE = 1.14
sAR = 1.05
sRE = 1.07

print 1.05*1.07


1.1235

In [20]:
activity_type = ['events','actors','repos']
tseries = {}
for at in activity_type:
    print "%s activity"%at
    test = resampled['activity'][at]
    before = test[test.index >= astroweek[0]].values
    during = test[(test.index >= astroweek[0]) & (test.index < astroweek[1])].values
    after = test[(test.index > astroweek[1]) & (test.index <= time_boundaries[1])].values
    after_start = test[(test.index >= astroweek[0]) & (test.index <= "2015-04-20")].values
    
    tseries[at] = {'before' : before, 'during' : during, 'after' : after, 'after_start' : after_start}


events activity
actors activity
repos activity

In [26]:
period = 'before'
x_type = 'actors'
y_type = 'events'
pl.figure(1,(10,9))
burstPlot(tseries[x_type][period],tseries[y_type][period])
xlabel("log10(%s)"%x_type)
ylabel("log10(%s)"%y_type)
pl.savefig(project_dir + "figures/productive_burst_before.eps")


(1.1824403683076576, 0.46654594582292952, 0.76972281645412033, 1.0021892527750094e-46, 0.064663476424208069)

In [15]:
4/3.


Out[15]:
1.3333333333333333

In [9]:
period = 'after'
x_type = 'actors'
y_type = 'events'
burstPlot(tseries[x_type][period],tseries[y_type][period])
xlabel("log10(%s)"%x_type)
ylabel("log10(%s)"%y_type)


(1.1796446161705079, 0.46756456566523052, 0.7650070371396096, 4.8420204348094903e-41, 0.069360062914704299)
Out[9]:
<matplotlib.text.Text at 0x43d8b90>

In [10]:
period = 'after_start'
x_type = 'actors'
y_type = 'events'
burstPlot(tseries[x_type][period],tseries[y_type][period])
xlabel("log10(%s)"%x_type)
ylabel("log10(%s)"%y_type)


(1.1672450236125842, 0.47701031156016183, 0.7702948411943914, 4.3809144935886741e-44, 0.065748574679360169)
Out[10]:
<matplotlib.text.Text at 0x4364910>

In [107]:
def PrepareRepoDf(df):
    u_repo_url = np.unique(df.repo_url.values)
    repo_dic = {}
    for repo_url in u_repo_url[:]:
        #print repo_url
        created_at = df[df['repo_url']==repo_url]['repo_created_at'][0]
        event_count = len(df[df['repo_url']==repo_url])
        user_count = len(np.unique(df[df['repo_url']==repo_url]['actor'].values))
        type_freq = dict(zip(*np.unique(df[df['repo_url']==repo_url]['type'].values, return_counts=True)))
        repo_dic[repo_url] = {"event_count": event_count, "user_count" : user_count,'created_at':created_at,'type_freq':type_freq}

    return repo_dic

In [109]:
repo_dic = PrepareRepoDf(df)

In [126]:
uc = []
ec = []
len_type_freq = []
for key in repo_dic.keys():
    uc.append(repo_dic[key]['user_count'])
    ec.append(repo_dic[key]['event_count'])
    len_type_freq.append(len(repo_dic[key]['type_freq']))

In [121]:
len(repo_dic[key]['type_freq'])


Out[121]:
1

In [133]:
loglog(len_type_freq,ec,'o')
pl.xlabel("number of different event types")
pl.ylabel("total event count")


Out[133]:
<matplotlib.text.Text at 0x8a9b050>