In [10]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
# Some nice default configuration for plots
plt.rcParams['figure.figsize'] = 10, 7.5
plt.rcParams['axes.grid'] = True
plt.gray()
The problem with weighting the interactions in the curated edgelist used to build our protein-protein interaction network is that the classifier is not perfect: it assigns extremely low probabilities to some interactions we expect to exist. We can encode this prior knowledge into our model by treating inclusion in a database, inclusion in the edgelist, and the classifier's output as events that all depend on an underlying interaction event. If we further assume these events are conditionally independent given the interaction, we have a Naive Bayes model.
To define the conditional distributions for continuous scores we can use kernel density estimation (KDE), as in Weka's Naive Bayes implementation. For binary variables we can simply use Bernoulli distributions with explicitly defined parameters corresponding to estimates of the true positive and false positive rates. Various interaction databases can also be included through iRefIndex by explicitly defining conservative true positive and false positive rates.
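Concretely, writing $I$ for the underlying interaction event and $D$, $E$, $C$ for database inclusion, edgelist inclusion and the classifier output (notation introduced here for illustration), conditional independence factorizes the posterior as:
$$ P(I{=}1 \mid D,E,C) = \frac{P(D \mid I{=}1)\,P(E \mid I{=}1)\,P(C \mid I{=}1)\,P(I{=}1)}{\sum_{i\in\{0,1\}} P(D \mid I{=}i)\,P(E \mid I{=}i)\,P(C \mid I{=}i)\,P(I{=}i)} $$
This is the quantity the `posterior` function defined below computes in log space.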
First we have to load the data:
In [1]:
import scipy.stats
In [68]:
import sklearn.neighbors
In [2]:
cd ../../forGAVIN/mergecode/OUT/
In [3]:
interactions = np.loadtxt("edgelist_weighted.txt", dtype=str)
In [4]:
# drop the header row
interactions = interactions[1:]
In [5]:
weights = interactions[:,2]
weights = weights.astype(float)
Now, finding the indexes of positive and negative examples:
In [6]:
import pickle
In [7]:
#unpickle classifier's samples
f = open("../../../features/random.forest.bayes.dist.samples.pickle")
samples = pickle.load(f)
f.close()
In [11]:
h = plt.hist(samples[1][:,1], bins=50)
In [12]:
h = plt.hist(samples[0][:,1], bins=100)
In [83]:
x = np.linspace(0, 1, 500)
In [77]:
def scotts_factor(X):
    """Scott's rule bandwidth for 1d data: n**(-1/(d+4)) with d=1."""
    return float(len(X))**(-1./5)
In [79]:
kernel_1 = sklearn.neighbors.KernelDensity(bandwidth=scotts_factor(samples[1][:,1]))
kernel_1.fit(samples[1][:,1][:, np.newaxis])
Out[79]:
In [85]:
# high bandwidth for smooth distribution
kernel_0 = sklearn.neighbors.KernelDensity(bandwidth=0.4)
kernel_0.fit(samples[0][:,1][:, np.newaxis])
Out[85]:
In [86]:
plt.plot(x, np.exp(kernel_1.score_samples(x[:, np.newaxis])), label="interaction")
plt.plot(x, np.exp(kernel_0.score_samples(x[:, np.newaxis])), label="non-interaction")
plt.yscale('log')
plt.legend(loc=0)
Out[86]:
In [21]:
import os
In [22]:
plotpath = os.path.abspath("../../../plots/bayes/")
In [24]:
# score_samples requires a 2d array, as when plotting above
np.savez(os.path.join(plotpath, "rf.kde.npz"),
         x,
         kernel_1.score_samples(x[:, np.newaxis]),
         kernel_0.score_samples(x[:, np.newaxis]))
In [25]:
import pickle
In [26]:
import sys
sys.path.append("../../../opencast-bio/")
In [27]:
import ocbio.extract
In [28]:
f = open("../../../string/human.Entrez.string.pickle")
stringdb = pickle.load(f)
f.close()
In [29]:
f = open("../../../iRefIndex/human.iRefIndex.Entrez.1ofk.pickle")
irefdb = pickle.load(f)
f.close()
In [50]:
stringlabelled = {}
for k in stringdb.featuredict:
    # group STRING scores (rescaled to [0,1]) by iRefIndex label
    try:
        stringlabelled[max(irefdb[k])] += [float(stringdb[k][-1])/1000]
    except KeyError:
        stringlabelled[max(irefdb[k])] = [float(stringdb[k][-1])/1000]
In [52]:
# pad the non-interaction class with zero-confidence scores (nmissing is an estimate)
nmissing = 500000
stringlabelled['0'] = np.concatenate([stringlabelled['0'], np.zeros(int(nmissing))])
In [94]:
# convert to arrays
stringlabelled['1'] = np.array(stringlabelled['1'])
stringlabelled['0'] = np.array(stringlabelled['0'])
In [53]:
h = plt.hist(stringlabelled['1'], bins=50, label='interaction')
h = plt.hist(stringlabelled['0'], bins=50, label='non-interaction')
l = plt.legend()
In [95]:
strkernel_1 = sklearn.neighbors.KernelDensity(bandwidth=scotts_factor(stringlabelled['1']))
strkernel_1.fit(stringlabelled['1'][:, np.newaxis])
Out[95]:
In [98]:
stringlabelled['1']
Out[98]:
In [96]:
strkernel_0 = sklearn.neighbors.KernelDensity(bandwidth=scotts_factor(stringlabelled['0']))
strkernel_0.fit(stringlabelled['0'][:, np.newaxis])
Out[96]:
In [97]:
plt.plot(x, np.exp(strkernel_1.score_samples(x[:, np.newaxis])), label="interaction")
plt.plot(x, np.exp(strkernel_0.score_samples(x[:, np.newaxis])), label="non-interaction")
l = plt.legend(loc=0)
In [63]:
def posterior(prior, logprobdict):
    """Takes a dictionary of class-conditional log probabilities and a prior
    probability of interaction; produces the corresponding posterior probabilities."""
    # dictionary should have keys 0 and 1
    post = []
    logp1, logp0 = np.log(prior), np.log(1.0 - prior)
    for l1, l0 in zip(logprobdict[1], logprobdict[0]):
        numerator = l1 + logp1
        denominator = np.logaddexp(numerator, l0 + logp0)
        post.append(np.exp(numerator - denominator))
    return post
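As a quick sanity check of `posterior` (a sketch, not part of the original run): with equal class-conditional likelihoods the posterior should collapse back to the prior.
In [ ]:
# equal log-likelihoods for both classes with a prior of 0.5
# should return [0.5]
posterior(0.5, {1: [np.log(0.5)], 0: [np.log(0.5)]})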
In the cell below we explicitly define probabilities for the different data sources. The values used are estimates, chosen to be conservative. For each source two probabilities must be defined: the true positive rate (TPR) and the false positive rate (FPR).
The four rates in each dictionary only amount to two free parameters, as the rates are related within each class:
$$ TPR = 1 - FNR, \qquad TNR = 1 - FPR $$
We define these probabilities for iRefIndex and edgelist inclusion before we begin.
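For a binary observation $x$ (equal to 1 when the pair appears in the source), these rates give the class-conditional likelihoods directly, which is exactly how the `piref` and `pedgelist` terms are computed further below:
$$ P(x{=}1 \mid I{=}1) = TPR, \quad P(x{=}0 \mid I{=}1) = FNR, \quad P(x{=}1 \mid I{=}0) = FPR, \quad P(x{=}0 \mid I{=}0) = TNR $$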
In [66]:
irefprob = {'tpr':0.9,'tnr':0.9,'fpr':0.1,'fnr':0.1}
edgelistprob = {'tpr':0.9,'tnr':0.9,'fpr':0.1,'fnr':0.1}
In [64]:
rfkernel = {0:kernel_0,1:kernel_1}
strkernel = {0:strkernel_0,1:strkernel_1}
In [67]:
# making logprobdict from classifier, STRING and iRefIndex
logprobdict = {}
for cls in [0, 1]:
    for l in interactions:
        pair = frozenset(l[:2])
        weight = float(l[2])
        # score_samples returns the log-density directly
        pclass = rfkernel[cls].score_samples(np.array([[weight]]))[0]
        # rescale the STRING score to [0,1], matching the KDE training data
        wstring = float(stringdb[pair][-1])/1000
        pstring = strkernel[cls].score_samples(np.array([[wstring]]))[0]
        irefresponse = max(map(float, irefdb[pair]))
        if cls == 1:
            piref = np.log(irefresponse*irefprob['tpr'] + (1.0-irefresponse)*irefprob['fnr'])
            pedgelist = np.log(edgelistprob['tpr'])
        else:
            piref = np.log(irefresponse*irefprob['fpr'] + (1.0-irefresponse)*irefprob['tnr'])
            pedgelist = np.log(edgelistprob['fpr'])
        try:
            logprobdict[cls] += [sum([pclass, pstring, pedgelist, piref])]
        except KeyError:
            logprobdict[cls] = [sum([pclass, pstring, pedgelist, piref])]
In [101]:
# 1/600 is the assumed prior probability that a random protein pair interacts
postweightings = posterior(1.0/600, logprobdict)
In [119]:
h = plt.hist(postweightings, bins=100)
The weights form quite a discrete set, probably due to the binary features we are using.
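As a quick check (a sketch; this cell was not executed in the original notebook) we can count the distinct posterior values:
In [ ]:
# number of distinct posterior weights, rounded to suppress float noise
len(set(np.round(postweightings, 6)))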
In [103]:
import csv
In [104]:
f = open("edgelist_update_weighted.txt","w")
c = csv.writer(f, delimiter="\t")
for i,p in enumerate(interactions[:,:2]):
c.writerow(list(p)+[postweightings[i]])
f.close()