notebook.community

Edit and run



In [1]:

    
import os,sys
from dateutil.parser import parse
import numpy as np
from datetime import datetime
import time
from collections import Counter

sys.path.append("/Users/maithoma/work/python/")
from tm_python_lib import * 
from fitting_tools import *
from adaptive_kernel_tom import adaptive_kernel_quantile_pdf_tom

%matplotlib inline



In [2]:

    
fig_width_pt = 420.0  # Get this from LaTeX using \showthe\columnwidth
inches_per_pt = 1.0 / 72.27  # Convert pt to inch
golden_mean = (np.sqrt(5) - 1.0) / 2.0  # Aesthetic ratio
fig_width = fig_width_pt * inches_per_pt  # width in inches
fig_height = fig_width  # *golden_mean      # height in inches
fig_size = [fig_width, fig_height]


params = {'backend': 'ps',
          'axes.labelsize': 25,
          'text.fontsize': 32,
          'legend.fontsize': 16,
          'xtick.labelsize': 20,
          'ytick.labelsize': 20,
          'text.usetex': False,
          'figure.figsize': fig_size}
pl.rcParams.update(params)



In [3]:

    
def plotFit(loss,xmin=1,label=""):
    
    pl.close("all")
    pl.figure(1,(25,9))
    
    index,mu,error,llh,points = vary_threshold(loss,xmin=0.05,index=100,type='cont')

    pl.subplot(131)
    pl.semilogx(index,mu)
    pl.semilogx(index,mu + error)
    pl.semilogx(index,mu - error)
    pl.xlabel("threshold")
    pl.ylabel("exponent mu")

    pl.subplot(132)
    pl.semilogx(index,llh)
    pl.xlabel("threshold")
    pl.ylabel("Loglihelihood")

    pl.subplot(133)
    #xmin = index[np.argmax(llh)]
    mu,confidence,n = pwlaw_fit_cont(loss,xmin)
    
    print "nEvents: %.2f, %.2f , %.2f"%(len(loss),len(loss[loss<xmin]),len(loss[loss>=xmin]))
    print "sumLoss: %.2f, %.2f , %.2f"%(sum(loss),sum(loss[loss<xmin]),sum(loss[loss>=xmin]))
    print "min max: %.2f, %.2f"%(min(loss),max(loss))
    whatever = bootstrapping(loss,xmin,confint=.01,plot=True)
    print "fit: %s (p-value = %s, std.err = %.2f)"%(mu,whatever[-1],confidence)
    
    pl.xlabel("Loss")
    pl.ylabel("Rank Ordering")
    pl.title(label)
    
    
def bootstrapping(data,xmin,confint=.05,numiter = -1,plot=False,plotconfint=False):
    '''Bootstrapping power law distribution'''
    data = np.array(data) # make sure the input is an array
    sample = data[data >= xmin]
    mu,confidence,nPoints = pwlaw_fit_cont(sample,xmin) #fit original power law
        
    f = 1/(sample/float(xmin))**mu 
    ksInit = kstest(sample,f)
    #print ksInit
    
    if nPoints==0:
        print "no value larger than %s"%xmin
        return
    
    if numiter == -1:
        numiter = round(1./4*(confint)**-2)
    
    m = np.zeros([numiter,nPoints])
    i = 0
    k = 0
    while i < numiter:
        q2 = pwlaw(len(sample),xmin,mu)[0]
        m[i]=np.sort(q2)
        ks = kstest(q2,f)
        
        if ks > ksInit:
            k += 1
        
        i+=1
                    
    pvalue = k/float(numiter)
    U=np.percentile(m,100-confint*100,0)
    L=np.percentile(m,confint,0)
    
    if plot:
        x,y = rankorder(data)
        yNorm = y/float(y[-1])
        yMin = yNorm[x <= xmin][0]
        
        pl.loglog(x,yNorm,'k.')
        
        xFit = np.logspace(np.log10(xmin),np.log10(max(sample)))
        yFit = yMin*1/(xFit/float(xmin))**mu 
        
        pl.loglog(xFit,yFit,'r-')
        
        if plotconfint:
            x,y = rankorder(L)
            yLowerNorm = y/float(y[-1])
            pl.loglog(x,yMin*yLowerNorm,'m')
            x,y = rankorder(U)
            yUpperNorm = y/float(y[-1])
            pl.loglog(x,yMin*yUpperNorm,'b')
    
    
    
    return m,L,U,pvalue

def kstest(sample1,sample2):
    return np.max(np.abs(sample1 - sample2))



In [86]:

    
def crossLagCorr(x,y,lagspan=35):

    rho = []
    L = range(-lagspan,lagspan)

    for l in L:
        if l==0:
            rho.append(S.spearmanr(x,y)[0])
        elif l < 0:
             rho.append(S.spearmanr(x[-l:],y[:l])[0])
        else:
            rho.append(S.spearmanr(x[:-l],y[l:])[0])

    return L,rho



In [4]:

    
rootdir = "/Users/maithoma/work/untrackedProjects/mingyi/"
datadir = rootdir + "data/"



In [184]:

    
bprogram = ['Twitter','Coinbase','Slack','Square']
index = 0
print bprogram[index]
data = open(datadir + bprogram[index] + ".csv",'rb').read().split("\r")

date = []
reward = []
user = []

for i,line in enumerate(data):
    if i==0:
        continue
    line = line.split(",")
    date = np.append(date,time.mktime(parse(line[1]).timetuple()))
    reward = np.append(reward,float(line[2]))
    user = np.append(user,line[3])
    
date = (date - min(date) )/3600/24. + 1

pl.figure(1,(17,5))

pl.subplot(121)
x,y = rankorder(reward)
pl.loglog(x,y)
thresholds = [100,100,100,200]
pwlaw_fit_cont(x,thresholds[index])
pl.xlabel("Bounty Rewards")
pl.ylabel("Rank Ordering")

pl.subplot(122)
userCount = Counter(user)
x,y = rankorder(userCount.values())
pl.loglog(x,y)
x = x + np.random.rand(len(y)) - 0.5
print pwlaw_fit_cont(x,1)

pl.xlabel("Count Vulns / Researcher")
pl.ylabel("Rank Ordering")









    



Twitter
(1.2139493423773073, 0.21199957600127201, 89)






    Out[184]:





<matplotlib.text.Text at 0x125a38c90>

CDF(X > x) = 1/x^mu = x^-mu PDF(X=x) = 1/x^(mu+1) = x^-(mu +1) alpha = mu +1

mu = 1.5 # coinbase

CDF (X > x = 5) = 1/5^1.5



In [185]:

    
1000*(1/10**1.5)









    Out[185]:





31.622776601683793

Arrival of new vulnerabilities



In [186]:

    
bins = np.arange(1,max(date)+1,7)
H = pl.histogram(date,bins = bins)
x = H[1][:-1]
y = H[0]

c = (y>0)*(x > 30.0)

lx = np.log10(x[c] - min(x[c])+1)
ly = np.log10(y[c])

B = binning(lx,ly,20)
c = (B[0] >= -1)*(B[0] < 3.0)*(B[1] > 0.1)

fit = S.linregress(B[0][c],B[1][c])
print fit

pl.plot(lx,ly,'b.')
pl.plot(B[0],B[1],'ro')
pl.plot(B[0][c],B[0][c]*fit[0]+fit[1],'k-')

pl.xlabel("Time [days]")
pl.ylabel("Count new Vulnerabilities")









    



LinregressResult(slope=0.037286297577823146, intercept=0.73583664735993071, rvalue=0.1002501122731051, pvalue=0.75656691180238533, stderr=0.11702294004458212)






    Out[186]:





<matplotlib.text.Text at 0x124613690>

Arrival of New Researchers



In [187]:

    
arrivals = []
uuser = []

for u in np.unique(user):
    index = np.argwhere(user==u)[0][0]
    arrivals = np.append(arrivals,index)
    uuser = np.append(uuser,u)
    #print u,date[arrivals[-1]]

o = np.argsort(arrivals)
uuser = uuser[o]
arrivals = arrivals[o]

binsize = 7
bins = np.arange(1,max(date)+1,binsize)
H = pl.histogram(date[list(arrivals)],bins = bins)
x = H[1][:-1]
y = H[0]

c = (y>0)*(x > 0)

lx = np.log10(x[c] - min(x[c])+1)
ly = np.log10(y[c])

B = binning(lx,ly,20)
c = (B[0] >= -1)*(B[0] < 3.0)#*(B[1] > .4)

fit = S.linregress(B[0][c],B[1][c])
print fit

pl.plot(lx,ly,'b.')
pl.plot(B[0],B[1],'ro')
pl.plot(B[0][c],B[0][c]*fit[0]+fit[1],'k-')

pl.xlabel("log10(Time [days])")
pl.ylabel("log10(Count new researchers)")









    



LinregressResult(slope=-0.404158586149515, intercept=1.2064453078524888, rvalue=-0.87262393296825946, pvalue=0.00021241767487512744, stderr=0.071530933372645586)






    



/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipykernel/__main__.py:16: DeprecationWarning: non integer (and non boolean) array-likes will not be accepted as indices in the future






    Out[187]:





<matplotlib.text.Text at 0x125c89f90>

Researcher Arrival following rewards



In [188]:

    
x = H[1][:-1]
y = H[0]

c = y > 0
x = x[c]
y = y[c]
ly = np.log10(y)

Hr = pl.histogram(date,weights=reward,bins=bins)
xr = Hr[1][:-1]
yr = Hr[0]

c = yr > 0
xr = xr[c]
yr = yr[c]
lyr = np.log10(yr)

Hra = pl.histogram(date[list(arrivals)],weights=reward[list(arrivals)],bins=bins)
xra = Hra[1][:-1]
yra = Hra[0]

c = yra > 0
xra = xra[c]
yra = yra[c]
lyra = np.log10(yra)

print len(lyr),len(lyra)

pl.bar(x,ly,binsize,color="blue",alpha=0.3)
pl.bar(xr,lyr,binsize,color = "red",alpha=0.3)
#pl.bar(xra,lyra,binsize,color = "purple",alpha=0.3)
pl.xlabel("Time")
pl.ylabel("log10(values)")









    



51 47






    



/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/ipykernel/__main__.py:18: DeprecationWarning: non integer (and non boolean) array-likes will not be accepted as indices in the future






    Out[188]:





<matplotlib.text.Text at 0x125ea4550>



In [189]:

    
h = H[0] + np.random.rand(len(H[0])) - 0.5
hr = Hr[0] + np.random.rand(len(H[0])) - 0.5

print S.spearmanr(h,hr),len(h),len(hr)









    



SpearmanrResult(correlation=0.70061458718992964, pvalue=3.6856055040839574e-12) 74 74



In [190]:

    
L,rho = crossLagCorr(hr[:],h[:],lagspan=15)
pl.plot(np.array(L)*binsize,rho,'o-')
pl.ylim(ymax=1)
pl.ylabel("cross-correlation hr -> h")
pl.xlabel("Time [days]")









    Out[190]:





<matplotlib.text.Text at 0x12619f850>

Inter-time between 2 awards



In [191]:

    
test = []
for u in uuser:
    index = np.argwhere(user==u)
    df = np.diff(date[index].flatten())
    test = np.append(test,df)



In [192]:

    
test[test < 1] = 1. 
x,y = rankorder(test)
pl.loglog(x,y,'o')
print np.median(test),np.mean(test),np.std(test),np.percentile(test,90)

pl.xlabel("delta t between two awards by one user")
pl.ylabel("rank ordering")









    



10.9281481481 30.6054131257 51.3618638427 84.6650810185






    Out[192]:





<matplotlib.text.Text at 0x1262f8250>



In [196]:

    
x = []
y = []

X = []
Y = []



for u in uuser:
    index = np.argwhere(user==u)
    x = np.append(x,len(reward[index].flatten()))
    y = np.append(y,np.sum(reward[index].flatten()))
    
    #print len(reward[index].flatten())
    Y = np.append(Y,reward[index].flatten())
    X = np.append(X,np.arange(1,len(reward[index].flatten())+1))

print len(X),len(Y)
    
pl.figure(1,(8,8))
c = (X > 0)*(Y > 0)
lX = np.log10(X[c])
lY = np.log10(Y[c])

B = binning(X[c],lY[c],30)
print S.linregress(X[c],lY[c])
fit = S.linregress(B[0],B[1])
print fit
pl.plot(X[c],lY[c],'.')
pl.plot(B[0],B[1],'ro')
pl.plot(B[0],B[0]*fit[0]+fit[1],'k-')
pl.xlabel("rank submission with award")
pl.ylabel("log10(bounty)")

pl.figure(2,(8,8))    
c = (x > 0)*(y > 0)
lx = np.log10(x[c])
ly = np.log10(y[c])

B = binning(lx,ly,30)
fit = S.linregress(B[0],B[1])
print fit
pl.plot(lx,ly,'.')
pl.plot(B[0],B[1],'ro')
pl.plot(B[0],B[0]*fit[0] + fit[1],'k-')
pl.xlabel("log10(count rewards)")
pl.ylabel("lgo10(sum rewards)")









    



359 359
LinregressResult(slope=0.021686184969807929, intercept=2.4749626709201658, rvalue=0.33958133234524129, pvalue=3.8659760296789711e-11, stderr=0.0031790653496956457)
LinregressResult(slope=0.033098926111610742, intercept=2.2856456483077592, rvalue=0.78636612156817498, pvalue=1.1589370155236749e-06, stderr=0.0052004317798130719)
LinregressResult(slope=1.3164029643673456, intercept=2.4708817657594846, rvalue=0.93741045566352499, pvalue=2.266860775746369e-06, stderr=0.14744309216013557)






    Out[196]:





<matplotlib.text.Text at 0x126e5bdd0>



In [ ]:

Remarks:

there is clearly a learning effect for those who stay in the race.
survivaly bias
from this compute optimal strategy

Jens' Ideas:

two-sided market
duplication : how competition goes given easyness of finding a vuln?



In [ ]: