In [2]:
import pylab as pl

import pandas as pd
from datetime import datetime,timedelta
figureDir = "/home/ubuntu/github/patchwork/figures/"

sys.path.append("/home/ubuntu/github/patchwork/")
try:
    reload(parseUserNames)
    reload(analyze_patchwork)
    reload(parseEvents)
except:
    import parseUserNames
    import analyze_patchwork
    import parseEvents
    
from parseUserNames import *
from analyze_patchwork import *
from parseEvents import *

matplotlib.rcParams.update({'font.size': 18,'legend.fontsize': 10})
#df,df2014,df2015,resampled = build_main_df()

In [3]:
dic = makeEventList()
df = pd.DataFrame(dic)
df.index = df['date']
userDic,pwDates = parseUsers()

x. Activity per User Pre- and Post-Event


In [4]:
def userTimeline(username,resol = 1,plot=False):
    date = time.mktime(userDic[username]['pwDate'][0].timetuple())/3600/24
    x = df[df['user']==username].date.resample("%sD"%resol,how="count").index.astype(np.int64)// 10**9 /3600/24
    x = x - date
    y = df[df['user']==username].date.resample("%sD"%resol,how="count").values

    if plot:
        pl.bar(x,y,width=resol)
        pl.xlabel("Delta t [days]")
        pl.ylabel("Events per day")
    
    return x,y

In [5]:
#pl.close("all")
#pl.figure(1,(13,9))
resol = 1

X = np.arange(-1000,1000)
Y = np.zeros_like(X)
uY = np.zeros_like(X)

for i,username in enumerate(userDic.keys()):
    try:
        x,y = userTimeline(username,resol = resol,plot=False)
        Y[map(int,x+ 1000)] += y
        uY[map(int,x+ 1000)] += 1
    except:
        print i,username, "error"
        continue

        
#pl.xlabel("Delta t [days]")
#pl.ylabel("Events per day")


7 sspT3 error
8 blacksintechnology error
11 Kisi78 error
27 walterreade error
35 victorczar error
46 vandna118 error
73 nikocal17 error
77 stiphan error
103 katez error
113 junuseu error
116 CodeShambles error
135 Casprawr error
166 egreenberg error
180 laj error
183 rcassani error
197 harrypilot error
210 rmcd-home error
217 SanyamYadav error
260 daryl-walsh error
298 kdpascal error
301 Bentonmaru error
305 nisarg1184 error
316 rainbowkenz error
331 alfalfasalads error
366 jchappell82 error
397 amlehman error
400 rkarlis error
401 Gold-of-Danae error
405 mariangemarcano error
418 hutchinc error
424 californiakat error
449 bmweeden error
460 atrudeau error
461 sfmina error
466 agoddardnash error
487 dmdreon error
519 rpingili3 error
522 ericparkpro error
528 robbieglab error
540 sweenepe error
549 kennyminsight error
556 matsumonkie error
557 tvvinkle error
559 troyconquer1 error
560 bpranav96 error
572 jimbeaudoin error
579 alexandraqin error
586 extraface error
587 wrennie error
601 cgilet error
612 Clover-M error
622 srihan error
653 AbhijithMahadi error
657 mourawaldson error
661 nabpackagist error
674 kmb232 error
677 JoyceLz error
686 mbetz08 error
709 Leontodon error
724 AndrewHayne error
735 Palleas error
736 p-stewart error
744 campbellmclaughlin error
764 y82 error
775 qweixi341 error
785 narahari2 error

In [10]:
r = Y/map(float,uY)

In [55]:
lim =70
c = (X > -lim)*(X < lim)

pl.close("all")
pl.figure(1,(12,15))

pl.subplot(311)
pl.bar(X[c],Y[c])
pl.xlabel("time - tc [days]")
pl.ylabel("Activity")

pl.subplot(312)
pl.bar(X[c],r[c])
pl.xlabel("time - tc [days]")
pl.ylabel("Activity/nDevs")

pl.subplot(313)
pl.bar(X[c],uY[c])
pl.xlabel("time - tc [days]")
pl.ylabel("uActivity")
pl.ylim(ymin=400)


Out[55]:
(400, 600.0)

In [91]:
pl.figure(1,(15,9))
pl.subplot(121)
fit = S.linregress(uY,Y)
print "linear fit: ", fit
pl.plot(uY,Y,'o')
pl.plot(uY,uY*fit[0] + fit[1],'r-')

pl.xlabel("daily users")
pl.ylabel("daily events")

pl.subplot(122)

cond = (uY > 0)*(Y > 0)
lx = np.log10(uY[cond])
ly = np.log10(Y[cond])              

fit = S.linregress(lx,ly)
print "scaling fit: ", fit
pl.plot(lx,ly,'bo')
pl.plot(lx,lx*fit[0] + fit[1],'r-')
pl.xlabel("log10(daily users)")
pl.ylabel("lgo10(daily events)")


linear fit:  (0.90947570420477797, 5.7109442621157598, 0.84968526043895931, 0.0, 0.012626557986232271)
scaling fit:  (0.95178315972986827, 0.064037442626691199, 0.89710220108963445, 0.0, 0.011008942375568804)
Out[91]:
<matplotlib.text.Text at 0x1ab9cfd0>

In [67]:
lim =30
c = (X > -lim)*(X < lim)

x = X[c]+1
y = uY[c]
y = y - y[-1] + 1

pl.figure(1,(13,7))
pl.subplot(121)
c1 = x < 0
pl.plot(-x[c1],y[c1],'o-')
pl.subplot(122)
c2 = x > 0
lx = np.log10(x[c2])
ly = np.log10(y[c2])
fit = S.linregress(lx,ly)
print fit
pl.plot(lx,ly,'o-')
pl.plot(lx,lx*fit[0]+fit[1],'r-')

pl.figure(2)
#pl.plot(10**lx,10**ly,'o-')
pl.plot(x,y)
pl.plot(10**lx,y[-1] + 10**(lx*fit[0]+fit[1]),'r-')


(-1.097551493995282, 2.1897066225155006, -0.93566270235808291, 3.4310158769877382e-14, 0.078229908143218188)
Out[67]:
[<matplotlib.lines.Line2D at 0x1776b290>]

In [102]:
upper_limit = 500
print upper_limit
c1 = (X>=0)*(Y>0)*(X < upper_limit)
lx = np.log10(X[c1]+1)
ly = np.log10(Y[c1])

c2= (X<=0)*(Y>0)*(X > -upper_limit)
lx2 = np.log10(-X[c2]+1)
ly2 = np.log10(Y[c2])


fit2 = S.linregress(lx2,ly2)
print "before tc: ", fit2
fit1 = S.linregress(lx,ly)
print "after tc: ", fit1

pl.figure(1,(13,9))
pl.plot(lx,ly,'b-.')
pl.plot(lx,lx*fit1[0]+fit1[1],'b-')
pl.plot(lx2,ly2,'r-.')
pl.plot(lx2,lx2*fit2[0]+fit2[1],'r-')
pl.xlabel("log10(time - tc [days])")
pl.ylabel("log10(Activity)")


500
before tc:  (-0.25342360448287859, 3.0752083672330195, -0.75734804006361633, 3.1895038024347384e-94, 0.0097916905278095951)
after tc:  (-0.48283333784706778, 3.5252145161761996, -0.72676010810155911, 2.8919823835233697e-83, 0.020449352091246983)
Out[102]:
<matplotlib.text.Text at 0x2cada2d0>

In [81]:
c1 = (X>0)*(Y>0)*(X>1)*(X < 600)
c2 = (X<0)
print S.linregress(X[c1],Y[c1])
plot(X[c1],Y[c1])
plot(-X[c2],Y[c2],'r-')


(-0.91604481586603914, 548.96226822596373, -0.90108207507009619, 2.3672896361471027e-209, 0.018448975273422114)
Out[81]:
[<matplotlib.lines.Line2D at 0x270706d0>]

In [22]:
jDates = []
for item in userDic.items():
    jDates.append(item[1]['jDate'])
    #print item[0],item[1]['jDate']
    
jDates = np.sort(jDates)

#bD0 = datetime(2008, 2, 1, 0, 0, 0)
bD0 = pwDates[0] - timedelta(days=60)
bD1 = datetime(2015, 8, 30, 0, 0, 0)
dayRes = 7
hourRes = 24*3600*dayRes
dateBins = [bD0]
while dateBins[-1] < bD1:
    dateBins.append(dateBins[-1] + timedelta(seconds = hourRes))
dateBins = np.array([time.mktime(datetime.timetuple(t)) for t in dateBins])/(24*3600)

jDatesTS = np.array([time.mktime(datetime.timetuple(t)) for t in jDates])/(24*3600)
pwDatesTS = np.array([time.mktime(datetime.timetuple(t)) for t in pwDates])/(24*3600)

H = np.histogram(jDatesTS,bins=dateBins)

In [42]:
pl.figure(1,(14,12))
pl.subplot(221)
y = np.arange(1,len(jDates)+1)
pl.plot(jDates,y,lw=2)
y = range(0,850)
for date in pwDates:
    pl.plot([date]*len(y),y,'r-',lw=1)

#Count Events
resol = "1W"
x = df.user.resample(resol,how="count").index
y = df.user.resample(resol,how="count").values
pl.plot(x,y/20.,'-',lw=2)
    
pl.xlabel("Time [years]")
pl.ylabel("Cumulative Joining")
pl.ylabel("CumJoin (blue) / wEvents/20 (green)")
pl.xticks(rotation=90)


pl.xlim(xmin=x[0])
pl.ylim(ymax=850)

pl.subplot(222)
#Joining Rate
y = np.arange(1,len(jDates)+1)
pl.plot(jDates,y,lw=2)

#Count Events
resol = "1W"
x = df.user.resample(resol,how="count").index
y = df.user.resample(resol,how="count").values
pl.plot(x,y/10.,'-',lw=2)

#Patchwork Events
y = range(0,max(y)/10+20)
for date in pwDates:
    pl.plot([date]*len(y),y,'r-',lw=1)
    pl.xlim(xmin=pwDates[0]-timedelta(days=30),xmax=pwDates[-1]+timedelta(days=30))
    pl.ylim(0,max(y))

pl.xlabel("Time [years]")
pl.ylabel("CumJoin (blue) / wEvents/10 (green)")
pl.xticks(rotation=90)


#pl.subplot(223)
#bar(H[1][:-1]-H[1][0],H[0],width=dayRes,lw=0)
#pl.xlabel("Time [days since 2 months before 1st patchwork]")
#pl.ylabel("Joining per %s days"%dayRes)




y= range(0,15)
for date in pwDates:
    pl.plot([pwDatesTS - H[1][0]]*len(y),y,'r-',lw=1)
    

pl.xticks(rotation=90)
pl.savefig(figureDir + "cumulativeJoining.png")



In [34]:
#Count Events
resol = "1W"
x = df.user.resample(resol,how="count").index.astype(np.int64) // 10 ** 9 / 3600 /24.
minx = min(x)
x = x - minx

pl.figure(1,(9,9))

pw0 = time.mktime(pwDates[0].timetuple()) / 3600 /24. - minx
ypw0 = np.arange(0,5000)
plot(np.zeros_like(ypw0)+pw0,ypw0,'r-')
y = df.user.resample(resol,how="count").values
pl.plot(x,y,'-',lw=2)

c0 = (x>400)*(x < pw0)
fit0 = S.linregress(x[c0],y[c0])
print fit0
pl.plot(x[c0],x[c0]*fit0[0] + fit0[1],'r-')
pl.text(450,450,"slope = %.2f \n (std. err. = %.2f)"%(fit0[0],fit0[-1]),verticalalignment='center',horizontalalignment='center')

print pwDates[0],pw0

c1 = (x > pw0)*(x < 1350)
fit1 = S.linregress(x[c1],y[c1])
print fit1
pl.plot(x[c1],x[c1]*fit1[0] + fit1[1],'r-')
pl.text(1200,1500,"slope = %.2f \n (std. err. = %.2f)"%(fit1[0],fit1[-1]),verticalalignment='center',horizontalalignment='center')

jump = (x[c1][0]*fit1[0] + fit1[1]) - (x[c0][-1]*fit0[0] + fit0[1])
pl.text(850,3500,"jump: \n %.0f events \n per day"%jump,verticalalignment='center',horizontalalignment='center')

pl.xlabel("time [days since 2012]")
pl.ylabel("daily events")
pl.savefig(figureDir + "regime_shift.png")


(2.7576194656967266, -310.57374890254619, 0.87190704022706456, 7.8547284153609566e-22, 0.19209331111393577)
2014-01-29 00:00:00 871.0
(3.5990953162575865, -764.53448868190981, 0.78292486362777658, 3.0787916587017185e-15, 0.3520244732267564)

1. Joining Curve

  • Represented by first contribution made by an actor since 2014. Counts per day. It may not be accurate, but still this is suprising that we have a rather constant flow of first contributions.
  • Actors are taken from the list: http://reporobot.jlord.us/data
  • No clear signal. I checked for each patchwork meetup, there is a sometimes a spike on the day of a meetup, but sometimes, there a slight spike before or after.
  • Some spikes are unrelated to the events I have found on https://github.com/blog/category/meetup?page=1
  • The initial spike early January suggests that some people are on the http://reporobot.jlord.us/data, but they had joined previously, and perhaps long before. I surmise that some early organizers populated the list.
  • There is a strange cyclicity, which does not even look like seasonal

2. Time series of Events

  • There is a clear increase of events generated by all users from 2000 to 10000 in 1.5 years
  • still sometimes, an increase is clearly associated with a patchwork meetup, but not always.
  • At least for the first and third events, I have the impression that we see a clear decay after both meetups
  • The frequency of events is high between Sept and Dec 2014, and indeed, the rate of events tend to be larger, but still it's hard to attribute beside the peak right at the moment of highest patchwork meetup frequency.

In [388]:
figure(1,(12,8))
resol = "1W"
x = df.user.resample(resol,how="count").index
y = df.user.resample(resol,how="count").values

pl.plot(x,y,'-')

y = range(0,max(y))
for date in dateEvents:
    pl.plot([date]*len(y),y,'r-',lw=1)

pl.xticks(rotation=90)


Out[388]:
(array([ 734563.,  734747.,  734928.,  735112.,  735293.,  735477.,  735658.]),
 <a list of 7 Text xticklabel objects>)

In [265]:
L = []
for user in userDic.keys():
    l = len(df.user[df['user'] == user])
    if l > 0:
        print user,l
        L.append(l)

In [6]:
deltaJ = []
pwDate = []
pwPlace = []
pwUsers = []
for item in userDic.items():
    #print item[1]['jDate'],np.min(item[1]['pwDate'])
    jDate = time.mktime(datetime.timetuple(item[1]['jDate']))
    pwd = time.mktime(datetime.timetuple(np.min(item[1]['pwDate'])))

    dt = pwd - jDate
    deltaJ = np.append(deltaJ,-float(dt)/(3600*24.*365.))
    pwDate = np.append(pwDate,pwd)
    pwPlace = np.append(pwPlace,item[1]['pwPlace'])
    pwUsers = np.append(pwUsers,item[0])

In [242]:
pl.figure(1,(10,5))
H = np.histogram(deltaJ,bins=100)
pl.bar(H[1][:-1],H[0],width=H[1][1] - H[1][0],lw=0)
pl.xlabel("Delta time [years] between joining \n and (first) patchwork attendance")
pl.ylabel("(unnormed) Density")
pl.xlim(-7.5,2)
pl.savefig(figureDir + "density_lag_joining_attendance.png")



In [417]:
print len(uPwDate)

In [ ]:
uPwDate = np.sort(np.unique(pwDate))

for d,dx in enumerate(uPwDate):
    pl.figure(d,(6,3))
    c = (pwDate == dx)
    pl.title("%s (%s)"%(pwPlace[c][0],datetime.fromtimestamp(dx).strftime("%Y/%m/%d")))
    H = np.histogram(deltaJ[c],bins=50)
    pl.bar(H[1][:-1],H[0],width=H[1][1] - H[1][0],lw=0)
    #pl.xlabel("Delta time [years] between joining \n and (first) patchwork attendance")
    #pl.ylabel("(unnormed) Density")
    pl.xlim(-7.5,2)
    
    #print "('" + "','".join(list(pwUsers[c])) +"')"

3. Distribution of Events per User

  • The distribution is overall conform to what we would expect from the theory of proportional growth.
  • I wonder though whether those with super high number of events ( $ events \geqslant 5.10^4$), are actually newbies who started contributing the day they attended a patchwork meetup.

In [414]:
euCDF = eventsPerUserCDF(dic)
x,y = rankorder(euCDF['events'])
loglog(x,y,'.')
xlabel("Counts events per user")
ylabel("Rank Ordering (CCDF)")


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-414-53bd1924231c> in <module>()
----> 1 euCDF = eventsPerUserCDF(dic)
      2 x,y = rankorder(euCDF['events'])
      3 loglog(x,y,'.')
      4 xlabel("Counts events per user")
      5 ylabel("Rank Ordering (CCDF)")

/home/ubuntu/github/patchwork/analyze_patchwork.pyc in eventsPerUserCDF(dic)
     98 
     99 def eventsPerUserCDF(dic):
--> 100     uusers = np.unique(dic['user'])
    101     users = np.array(dic['user'])
    102     events = []

TypeError: list indices must be integers, not str

4. Misc

Patchwork Meetups

  • January 29th, 2014: GitHub HQ
  • March 27th, 2014 : GitHub HQ
  • June 19, 2014 : London (The Electricity Showrooms)
  • July 10, 2014 : GitHub HQ
  • August 14, 2014 : GitHub Boulder
  • September 18, 2014: New York
  • September 30, 2014: GitHub HQ
  • October 9, 2014 : Phoenix
  • October 14, 2014 : Taipei
  • October 20, 2014: Tokyo
  • October 23, 2014: Edimburgh
  • October 29, 2014: Nashville
  • November 5, 2014 : Chicago
  • November 24, 2014: Columbus, Ohio
  • December 10, 2014: Atlanta

  • February 2, 2015, Melbourne

  • February 4, 2015, Cape Town
  • March 12, 2015, Ann Arbor
  • April 13, 2015, Montreal, Ca
  • April 14, 2015, Wellington, NZ
  • May 20, 2015 : Detroit
  • May 21, 2015 : Irvine
  • May 28, 2015 : Washington DC
  • June 23, 2015 : Nashville
  • August 18, 2015 : GitHub Boulder

Productive Bursts


In [68]:
def burstPlot(x,y):
    c = (y >0)*(x>0)
    lx = np.log10(x[c])
    ly = np.log10(y[c])

    fit = S.linregress(lx,ly)
    print fit

    pl.plot(lx,ly,'.')
    pl.plot(lx,lx*fit[0]+fit[1],'k-')
    
    return fit