Analysing data Gefitinib Src experiment on February 20 2015. Full experiment description in google docs 'Fluorescence assay protocol notes and lab notebook'.


In [1]:
import numpy as np
import matplotlib.pyplot as plt
from lxml import etree
import pandas as pd
import os
import matplotlib.cm as cm 
import seaborn as sns
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
def get_wells_from_section(path):
    reads = path.xpath("*/Well")
    wellIDs = [read.attrib['Pos'] for read in reads]

    data = [(float(s.text), r.attrib['Pos'])
         for r in reads
         for s in r]

    datalist = {
      well : value
      for (value, well) in data
    }
    
    welllist = [
                [
                 datalist[chr(64 + row) + str(col)]          
                 if chr(64 + row) + str(col) in datalist else None
                 for row in range(1,9)
                ]
                for col in range(1,13)
                ]
                
    return welllist

In [3]:
file_GEF = "data/Gef_WIP_SMH_SrcBos_Extend_013015_mdfx_20150220_18.xml"
file_name = os.path.splitext(file_GEF)[0]

In [4]:
root = etree.parse(file_GEF)

In [5]:
Sections = root.xpath("/*/Section")
much = len(Sections)
print "****The xml file " + file_GEF + " has %s data sections:****" % much
for sect in Sections:
    print sect.attrib['Name']


****The xml file data/Gef_WIP_SMH_SrcBos_Extend_013015_mdfx_20150220_18.xml has 7 data sections:****
280_TopRead
280_BottomRead
Abs_280
350_TopRead
350_BottomRead
Abs_350
Abs_480

In [6]:
#Just going to work with topread for now
TopRead = root.xpath("/*/Section")[0]
welllist = get_wells_from_section(TopRead)
dataframe = pd.DataFrame(welllist, columns = ['A - Src','B - Buffer','C - Src','D - Buffer', 'E - Src','F - Buffer','G - Src','H - Buffer'])

In [7]:
sns.set_palette("Paired", 10)
sns.set_context("notebook", rc={"lines.linewidth": 2.5})

dataframe.plot(figsize=(6, 4), title=file_name)
plt.xlim(-0.5,11.5)


Out[7]:
(-0.5, 11.5)

In [8]:
dataframe


Out[8]:
A - Src B - Buffer C - Src D - Buffer E - Src F - Buffer G - Src H - Buffer
0 399 140 408 137 408 136 410 56
1 365 114 377 112 388 112 383 55
2 308 79 327 78 328 78 318 55
3 251 67 261 65 268 64 177 60
4 191 60 189 62 191 59 127 55
5 130 59 125 60 131 61 97 57
6 93 63 88 60 91 62 71 57
7 78 59 75 60 73 60 63 55
8 69 61 63 61 64 59 58 56
9 61 63 61 61 59 60 57 59
10 62 60 64 59 57 59 58 57
11 61 62 59 58 56 60 55 57

In [9]:
dataframe['A - Src'].plot()


Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x10b6f4cd0>

In [10]:
to_excl = ['D - Buffer', 'E - Src','F - Buffer','G - Src','H - Buffer']
dataframe.ix[:, dataframe.columns - to_excl].plot()


Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x10bc022d0>

Section II

Can I just pick the best single Buffer and Src Data sets from each, and see how they compare?


In [11]:
root = etree.parse(file_GEF)
#Just going to keep working with topread for now
TopRead = root.xpath("/*/Section")[0]
welllist = get_wells_from_section(TopRead)
dataframe = pd.DataFrame(welllist, columns = ['A - Src','B - Buffer','C - Src','D - Buffer', 'E - Src','F - Buffer','G - Src','H - Buffer'])

In [12]:
Complex_Fluorescence = dataframe['A - Src']
Ligand_Fluorescence = dataframe['B - Buffer']
F_i = np.array(Complex_Fluorescence, np.float64)
Fligand_i = np.array(Ligand_Fluorescence, np.float64)

In [13]:
#Gefitinib concentration (M):
Lstated = np.array([20.0e-6,9.15e-6,4.18e-6,1.91e-6,0.875e-6,0.4e-6,0.183e-6,0.0837e-6,0.0383e-6,0.0175e-6,0.008e-6,0], np.float64)
#Protein concentration (M):
Pstated = 0.5e-6 * np.ones([12],np.float64)

In [14]:
# Plot fluorescence intensity vs ligand concentration.
fig = figure(figsize=(10,5))
rcParams['figure.figsize'] = [10, 5];
clf();
semilogx(Lstated, F_i, 'ko');
hold(True)
semilogx(Lstated, Fligand_i, 'ro');
xlabel('$[L]_T$ (M)');
ylabel('rel. fluorescence');



In [15]:
# Plot fluorescence intensity vs ligand concentration on a log plot to see if we are running into inner filter effects.
fig2 = pyplot.figure(figsize=(10,5))
rcParams['figure.figsize'] = [10, 5];
clf();
loglog(Lstated, Fligand_i, 'ro');
xlabel('$[L]_T$ (M)');
ylabel('rel. fluorescence');



In [16]:
# Remove highest concentrations due to aggregation issues.
nskip = 0
F_i = F_i[nskip:]
Fligand_i = Fligand_i[nskip:]
Pstated = Pstated[nskip:]
Lstated = Lstated[nskip:]

# Uncertainties in protein and ligand concentrations.
dPstated = 0.10 * Pstated # protein concentration uncertainty
dLstated = 0.08 * Lstated # ligand concentraiton uncertainty (due to gravimetric preparation and HP D300 dispensing)

dLstated[-1] = dLstated[-2]

print dPstated
print dLstated


[  5.00000000e-08   5.00000000e-08   5.00000000e-08   5.00000000e-08
   5.00000000e-08   5.00000000e-08   5.00000000e-08   5.00000000e-08
   5.00000000e-08   5.00000000e-08   5.00000000e-08   5.00000000e-08]
[  1.60000000e-06   7.32000000e-07   3.34400000e-07   1.52800000e-07
   7.00000000e-08   3.20000000e-08   1.46400000e-08   6.69600000e-09
   3.06400000e-09   1.40000000e-09   6.40000000e-10   6.40000000e-10]

In [17]:
# Plot fluorescence intensity vs ligand concentration.
fig2 = pyplot.figure(figsize=(10,5))
rcParams['figure.figsize'] = [10, 5];
clf();
semilogx(Lstated, F_i, 'ko');
hold(True)
semilogx(Lstated, Fligand_i, 'ro');
xlabel('$[L]_T$ (M)');
ylabel('rel. fluorescence');



In [18]:
import pymc

# Two-component binding.
def two_component_binding(DeltaG, P, L):
    Kd = np.exp(DeltaG)
    PL = 0.5 * ((P + L + Kd) - np.sqrt((P + L + Kd)**2 - 4*P*L));  # complex concentration (M)                                                                                                                                                                                                         
    P = P - PL; # free protein concentration in sample cell after n injections (M)                                                                                                                                                                                                                          
    L = L - PL; # free ligand concentration in sample cell after n injections (M)                                                                                                                                                                                                                           
    return [P, L, PL]

# Create a pymc model
def make_model(Pstated, dPstated, Lstated, dLstated, Fobs_i, Fligand_i):
    N = len(Lstated)
    
    # Prior on binding free energies.
    DeltaG = pymc.Uniform('DeltaG', lower=-40, upper=+40, value=0.0) # binding free energy (kT), uniform over huge range
    #DeltaG = pymc.Normal('DeltaG', mu=0, tau=1./(12.5**2)) # binding free energy (kT), using a Gaussian prior inspured by ChEMBL
        
    # Priors on true concentrations of protein and ligand.
    #Ptrue = pymc.Lognormal('Ptrue', mu=np.log(Pstated**2 / np.sqrt(dPstated**2 + Pstated**2)), tau=np.sqrt(np.log(1.0 + dPstated**2/Pstated**2))**(-2)) # protein concentration (M)
    #Ltrue = pymc.Lognormal('Ltrue', mu=np.log(Lstated**2 / np.sqrt(dLstated**2 + Lstated**2)), tau=np.sqrt(np.log(1.0 + dLstated**2/Lstated**2))**(-2)) # ligand concentration (M)
    Ptrue = pymc.Normal('Ptrue', mu=Pstated, tau=dPstated**(-2)) # protein concentration (M)
    Ltrue = pymc.Normal('Ltrue', mu=Lstated, tau=dLstated**(-2)) # ligand concentration (M)
    Ltrue_control = pymc.Normal('Ltrue_control', mu=Lstated, tau=dLstated**(-2)) # ligand concentration (M)

    # Priors on fluorescence intensities of complexes (later divided by a factor of Pstated for scale).
    Fmax = max(Fobs_i.max(), Fligand_i.max())
    F_background = pymc.Uniform('F_background', lower=0.0, upper=Fmax) # background fluorescence
    F_PL = pymc.Uniform('F_PL', lower=0.0, upper=Fmax/min(Pstated.max(),Lstated.max())) # complex fluorescence
    F_L = pymc.Uniform('F_L', lower=0.0, upper=Fmax/Lstated.max()) # ligand fluorescence

    # Unknown experimental measurement error.
    log_sigma = pymc.Uniform('log_sigma', lower=-10, upper=+2, value=0.0) 
    @pymc.deterministic
    def precision(log_sigma=log_sigma): # measurement precision
        return 1.0 / np.exp(log_sigma)**2

    # Fluorescence model.
    @pymc.deterministic
    def Fmodel(F_background=F_background, F_PL=F_PL, F_L=F_L, Ptrue=Ptrue, Ltrue=Ltrue, DeltaG=DeltaG):
        Fmodel_i = np.zeros([N])
        for i in range(N):
            [P, L, PL] = two_component_binding(DeltaG, Ptrue[i], Ltrue[i])
            Fmodel_i[i] = (F_PL*PL + F_L*L) + F_background
        return Fmodel_i

    # Fluorescence model, ligand only.
    @pymc.deterministic
    def Fligand(F_background=F_background, F_L=F_L, Ltrue_control=Ltrue_control):
        Fmodel_i = np.zeros([N])
        for i in range(N):
            Fmodel_i[i] = (F_L*Ltrue_control[i]) + F_background
        return Fmodel_i
    
    # Experimental error on fluorescence observations.
    Fobs_model = pymc.Normal('Fobs_i', mu=Fmodel, tau=precision, size=[N], observed=True, value=Fobs_i) # observed data
    Fligand_model = pymc.Normal('Fligand_i', mu=Fligand, tau=precision, size=[N], observed=True, value=Fligand_i) # ligand only data
    
    # Construct dictionary of model variables.
    pymc_model = { 'Ptrue' : Ptrue, 
                   'Ltrue' : Ltrue, 
                   'Ltrue_control' : Ltrue_control, 
                   'log_sigma' : log_sigma, 
                   'precision' : precision, 
                   'F_PL' : F_PL, 
                   'F_L' : F_L, 
                   'F_background' : F_background,
                   'Fmodel_i' : Fmodel, 
                   'Fligand_i' : Fligand, 
                   'Fobs_model' : Fobs_model, 
                   'Fligand_model' : Fligand_model, 
                   'DeltaG' : DeltaG # binding free energy
                   }
    return pymc_model


# Build model.
pymc_model = pymc.Model(make_model(Pstated, dPstated, Lstated, dLstated, F_i, Fligand_i))

# Sample with MCMC
mcmc = pymc.MCMC(pymc_model, db='ram', name='Sampler', verbose=True)
mcmc.sample(iter=100000, burn=10000, thin=50, progress_bar=False)


Couldn't import dot_parser, loading of dot files will not be possible.

In [19]:
# Plot trace of DeltaG.
rcParams['figure.figsize'] = [15, 3]
plot(mcmc.DeltaG.trace(), 'o');
xlabel('MCMC sample');
ylabel('$\Delta G$ ($k_B T$)');



In [20]:
# Plot trace of true protein concentration.
rcParams['figure.figsize'] = [15, 3]
plot(mcmc.Ptrue.trace()*1e6, 'o');
xlabel('MCMC sample');
ylabel('$[P]$ ($\mu$M)');



In [21]:
# Plot trace of true protein concentration.
rcParams['figure.figsize'] = [15, 3]
plot(mcmc.Ltrue.trace()*1e6, 'o');
xlabel('MCMC sample');
ylabel('$[L]$ ($\mu$M)');
print mcmc.Ltrue.trace().min()


-2.2549506087e-09

In [22]:
# Plot histogram of DeltaG.
rcParams['figure.figsize'] = [15, 3]
hist(mcmc.DeltaG.trace(), 40);
xlabel('$\Delta G$ ($k_B T$)');
ylabel('$P(\Delta G)$');



In [23]:
# Plot trace of intrinsic fluorescence parameters.
rcParams['figure.figsize'] = [15, 3]
semilogy(mcmc.F_PL.trace(), 'o', mcmc.F_L.trace(), 'o', mcmc.F_background.trace(), 'o');
legend(['complex fluorescence', 'ligand fluorescence', 'background fluorescence']);
xlabel('MCMC sample');
ylabel('relative fluorescence intensity');



In [24]:
# Plot model fit.
rcParams['figure.figsize'] = [15, 3]
figure = pyplot.gcf() # get current figure
Fmodels = mcmc.Fmodel_i.trace()
Fligands = mcmc.Fligand_i.trace()
clf();
hold(True)
for Fmodel in Fmodels:
    semilogx(Lstated, Fmodel, 'k-')
semilogx(Lstated, F_i, 'ro')
for Fligand in Fligands:
    semilogx(Lstated, Fligand, 'k-')
semilogx(Lstated, Fligand_i, 'ro')
hold(False)
xlabel('$[L]_s$ (M)');
ylabel('fluorescence units');



In [ ]:


In [ ]: