Good chapter on determining G+C content from CsCl gradient analysis
http://www.analyticalultracentrifugation.com/dynamic_density_gradients.htm
Meselson et al. - 1957 - Equilibrium Sedimentation of Macromolecules in Den
Vinograd et al. - 1963 - Band-Centrifugation of Macromolecules and Viruses
http://onlinelibrary.wiley.com.proxy.library.cornell.edu/doi/10.1002/bip.360101011/pdf
Vinograd et al., 1963; (band-centrifugation):
\begin{align} \sigma^2 = \frac{r_0}{r_0^0} \left\{ \frac{r_0}{r_0^0} + 2D \left( t - t^0 \right) \right\} \end{align}isoconcentration point
\begin{equation} r_c = \sqrt{(r_t^2 + r_t * r_b + r_b^2)/3} \end{equation}rp in relation to the particle's buoyant density
\begin{equation} r_p = \sqrt{ ((p_p-p_m)\frac{2\beta^{\circ}}{w}) + r_c^2 } \\ p_p = \textrm{buoyant density} \end{equation}buoyant density of a DNA fragment in CsCl
\begin{equation} p_p = 0.098F + 1.66, \quad \textrm{where F = G+C molar fraction} \end{equation}info needed on a DNA fragment to determine it's sigma of the Guassian distribution
In [178]:
    
%load_ext rpy2.ipython
    
    
In [179]:
    
%%R
library(dplyr)
library(tidyr)
library(ggplot2)
library(gridExtra)
    
In [498]:
    
%%R
GC2MW = function(x){ 
    A = 313.2
    T = 304.2
    C = 289.2
    G = 329.2
    GC = G + C
    AT = A + T
    x = x / 100
    x*GC + (1-x)*AT  
}
GC2BD = function(GC){
    # GC = percentage
    BD = GC / 100 * 0.098 + 1.66
    return(BD)
}
calc_BD_macro = function(p_i, w, B, r)
rpm2w2 = function(rpm){
    x = 2 * pi * rpm / 60
    return(x**2)
}
calc_R_c = function(r_t, r_b){
    x = r_t**2 + r_t * r_b + r_b**2
    return(sqrt(x/3))
}
calc_R_p = function(p_p, p_m, B, w, r_c){
    # distance of the particle from the axis of rotation (at equilibrium)
    x = ((p_p - p_m) * (2 * B / w)) + r_c**2
    return(sqrt(x))
}
calc_S = function(l, GC){
    # l = dsDNA length (bp)
    MW = GC2MW(GC)
    S = 0.00834 * (l * MW)**0.479 + 2.8
    S = S * 1e-13
    return(S)
}
calc_dif_sigma_OLD = function(L, w, r_p, S, t, B, p_p, p_m){
    nom = w**2 * r_p**2 * S 
    denom = B * (p_p - p_m)
    x = nom / denom * t - 1.26
    sigma = L / exp(x)
    return(sigma)
}
    
calc_dif_sigma = function(L, w, r_c, S, t, B, p_p, p_m){
    nom = w**2 * r_c**2 * S 
    denom = B * (p_p - p_m)
    x = nom / denom * t - 1.26
    sigma = L / exp(x)
    return(sigma)
}
R_p2BD = function(r_p, p_m, B, w, r_c){
    # converting a distance from center of rotation of a particle to buoyant density
    ## inverse of `calc_R_p`
    nom = (r_p**2 - r_c**2) * w
    return(nom / (2 * B) + p_m)
}
sigma2BD = function(r_p, sigma, p_m, B, w, r_c){
    BD_low = R_p2BD(r_p - sigma, p_m, B, w, r_c)
    BD_high = R_p2BD(r_p + sigma, p_m, B, w, r_c)
    return(BD_high - BD_low)
}
    
time2eq = function(B, p_p, p_m, w, r_c, s, L, sigma){
    x = (B * (p_p - p_m)) / (w**2 * r_c**2 * s) 
    y = 1.26 + log(L / sigma)
    return(x * y)
}
    
In [460]:
    
%%R -w 450 -h 300
# time to eq 
calc_time2eq = function(x, B, L, rpm, r_t, r_b, sigma, p_m){
    l = x[1]
    GC = x[2]
    s = calc_S(l, GC)
    w = rpm2w2(rpm)
    p_p = GC2BD(GC)
    r_c = calc_R_c(r_t, r_b)
    #r_p = calc_R_p(p_p, p_m, B, w, r_c)
    t = time2eq(B, p_p, p_m, w, r_c, s, L, sigma)
    t = t / 360
    return(t)
} 
rpm = 55000 
B = 1.14e9
r_b = 4.85
r_t = 2.6
L = r_b - r_t
p_m = 1.7
l = seq(100,20000,100)          # bp
GC = 1:100               # percent
sigma = 0.01
df = expand.grid(l, GC)
df$t = apply(df, 1, calc_time2eq, B=B, L=L, rpm=rpm, r_t=r_t, r_b=r_b, sigma=sigma, p_m=p_m)
colnames(df) = c('length', 'GC', 'time')
df %>% head
cols = rev(rainbow(12))
p1 = ggplot(df, aes(GC, length, fill=time)) +
    geom_tile() +
    scale_x_continuous(expand=c(0,0)) +
    scale_y_continuous(expand=c(0,0)) +
    scale_fill_gradientn(colors=cols) +
    geom_hline(yintercept=4000, linetype='dashed', color='black') +
    #geom_vline(xintercept=60*60*66, linetype='dashed', color='black') +
    labs(x='GC (%)', y='dsDNA length (bp)') +
    theme_bw() +
    theme(
        text = element_text(size=16)
    )
p1
    
    
In [404]:
    
%%R
rpm = 55000
B = 1.14e9
r_b = 4.85
r_t = 2.6
L = r_b - r_t
p_m = 1.7
l = 500          # bp
GC = 50           # pebrcent
t = 60 * 60 * 66     # sec
S = calc_S(l, GC)
w2 = rpm2w2(rpm)
p_p = GC2BD(GC)
r_c = calc_R_c(r_t, r_b)
r_p = calc_R_p(p_p, p_m, B, w2, r_c)
sigma = calc_dif_sigma(L, w2, r_p, S, t, B, p_p, p_m)
print(sigma)
#sigma_BD = sigma2BD(r_p, sigma, p_m, B, w2, r_c)
#print(sigma_BD)
    
    
In [405]:
    
%%R
#-- alternative calculation
p_p = 1.7
M = l * 882
R = 8.3144598   #J mol^-1 K^-1
T = 293.15
calc_stdev(p_p, M, R, T, w2, r_c, B, r_p)
    
    
In [447]:
    
%%R -h 300 -w 850
calc_sigma_BD = function(x, rpm, GC, r_t, r_b, p_m, B, L){
    l = x[1]
    t = x[2]
    S = calc_S(l, GC)
    w2 = rpm2w2(rpm)
    p_p = GC2BD(GC)
    r_c = calc_R_c(r_t, r_b)
    r_p = calc_R_p(p_p, p_m, B, w2, r_c)
    sigma = calc_dif_sigma(L, w2, r_p, S, t, B, p_p, p_m)
    if (sigma > L){
        return(NA)
    } else {
        return(sigma)
    }
}
# params
GC = 50
rpm = 55000
B = 1.14e9
r_b = 4.85
r_t = 2.6
L = r_b - r_t
p_m = 1.66
# pairwise calculations of all parameters
l = 50**seq(1,3, by=0.05)
t = 6**seq(3,8, by=0.05)
df = expand.grid(l, t)
df$sigma = apply(df, 1, calc_sigma_BD, rpm=rpm, GC=GC, r_t=r_t, r_b=r_b, p_m=p_m, B=B, L=L)
colnames(df) = c('length', 'time', 'sigma')
df= df %>%
    mutate(sigma = ifelse((sigma < 1e-20 | sigma > 1e20), NA, sigma))
# plotting
cols = rev(rainbow(12))
p1 = ggplot(df, aes(time, length, fill=sigma)) +
    geom_tile() +
    scale_x_log10(expand=c(0,0)) +
    scale_y_log10(expand=c(0,0)) +
    scale_fill_gradientn(colors=cols) +
    #geom_hline(yintercept=4000, linetype='dashed', color='black') +
    geom_vline(xintercept=60*60*66, linetype='dashed', color='black') +
    labs(x='Time', y='Length') +
    theme_bw() +
    theme(
        text = element_text(size=16)
    )
p2 = p1 + scale_fill_gradientn(colors=cols, trans='log10')
grid.arrange(p1, p2, ncol=2)
    
    
In [453]:
    
%%R -h 300 -w 850
# params
GC = 20
rpm = 55000
B = 1.14e9
r_b = 4.85
r_t = 2.6
L = r_b - r_t
p_m = 1.66
# pairwise calculations of all parameters
l = 50**seq(1,3, by=0.05)
t = 6**seq(3,8, by=0.05)
df = expand.grid(l, t)
df$sigma = apply(df, 1, calc_sigma_BD, rpm=rpm, GC=GC, r_t=r_t, r_b=r_b, p_m=p_m, B=B, L=L)
colnames(df) = c('length', 'time', 'sigma')
df= df %>%
    mutate(sigma = ifelse((sigma < 1e-20 | sigma > 1e20), NA, sigma))
# plotting
cols = rev(rainbow(12))
p1 = ggplot(df, aes(time, length, fill=sigma)) +
    geom_tile() +
    scale_x_log10(expand=c(0,0)) +
    scale_y_log10(expand=c(0,0)) +
    scale_fill_gradientn(colors=cols) +
    #geom_hline(yintercept=4000, linetype='dashed', color='black') +
    geom_vline(xintercept=60*60*66, linetype='dashed', color='black') +
    labs(x='Time', y='Length') +
    theme_bw() +
    theme(
        text = element_text(size=16)
    )
p2 = p1 + scale_fill_gradientn(colors=cols, trans='log10')
grid.arrange(p1, p2, ncol=2)
    
    
In [454]:
    
%%R -h 300 -w 850
# params
GC = 80
rpm = 55000
B = 1.14e9
r_b = 4.85
r_t = 2.6
L = r_b - r_t
p_m = 1.66
# pairwise calculations of all parameters
l = 50**seq(1,3, by=0.05)
t = 6**seq(3,8, by=0.05)
df = expand.grid(l, t)
df$sigma = apply(df, 1, calc_sigma_BD, rpm=rpm, GC=GC, r_t=r_t, r_b=r_b, p_m=p_m, B=B, L=L)
colnames(df) = c('length', 'time', 'sigma')
df= df %>%
    mutate(sigma = ifelse((sigma < 1e-20 | sigma > 1e20), NA, sigma))
# plotting
cols = rev(rainbow(12))
p1 = ggplot(df, aes(time, length, fill=sigma)) +
    geom_tile() +
    scale_x_log10(expand=c(0,0)) +
    scale_y_log10(expand=c(0,0)) +
    scale_fill_gradientn(colors=cols) +
    #geom_hline(yintercept=4000, linetype='dashed', color='black') +
    geom_vline(xintercept=60*60*66, linetype='dashed', color='black') +
    labs(x='Time', y='Length') +
    theme_bw() +
    theme(
        text = element_text(size=16)
    )
p2 = p1 + scale_fill_gradientn(colors=cols, trans='log10')
grid.arrange(p1, p2, ncol=2)
    
    
In [502]:
    
%%R
calc_dif_sigma_Clay = function(rho, R, T, B, G, M, l){
    sigma = sqrt((rho*R*T)/(B**2*G*M*l))
    return(sigma)
}
    
In [511]:
    
%%R -w 850 -h 300
wrap_calc_sigma_Clay = function(x, R, T, B, G, m){
    l= x[1]
    GC = x[2]
    rho = GC2BD(GC)
    sigma = calc_dif_sigma_Clay(rho, R, T, B, G, m, l)
    return(sigma)
}
# params
R = 8.3145e7
T = 293.15
G = 7.87e-10
M = 882
B = 1.14e9
l = 50**seq(1,3, by=0.05)
GC = 1:100
# pairwise calculations of all parameters
df = expand.grid(l, GC)
df$sigma = apply(df, 1, wrap_calc_sigma_Clay, R=R, T=T, B=B, G=G, m=M)
colnames(df) = c('length', 'GC', 'sigma')
# plotting
cols = rev(rainbow(12))
p1 = ggplot(df, aes(GC, length, fill=sigma)) +
    geom_tile() +
    scale_y_log10(expand=c(0,0)) +
    scale_x_continuous(expand=c(0,0)) +
    scale_fill_gradientn(colors=cols) +
    labs(y='length (bp)', x='G+C') +
    theme_bw() +
    theme(
        text = element_text(size=16)
    )
p2 = p1 + scale_fill_gradientn(colors=cols, trans='log10')
grid.arrange(p1, p2, ncol=2)
    
    
In [ ]:
    
    
In [ ]:
    
    
In [ ]:
    
%pylab inline
    
In [ ]:
    
import scipy as sp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import mixture
#import sklearn.mixture as mixture
    
In [ ]:
    
n_frags = 10000
frag_GC = np.random.normal(0.5,0.1,n_frags)
frag_GC[frag_GC < 0] = 0
frag_GC[frag_GC > 1] = 1
frag_len = np.random.normal(10000,1000,n_frags)
    
In [ ]:
    
ret = plt.hist2d(frag_GC, frag_len, bins=100)
    
In [ ]:
    
RPM = 55000
omega = (2 * np.pi * RPM) / 60
beta_o = 1.14 * 10**9
radius_bottom = 4.85 
radius_top = 2.6 
col_len = radius_bottom - radius_top
density_medium = 1.7
    
In [ ]:
    
# BD from GC
frag_BD = 0.098 * frag_GC + 1.66
ret = plt.hist(frag_BD, bins=100)
    
In [ ]:
    
sedimentation = (frag_len*666)**0.479 * 0.00834 + 2.8   # l = length of fragment
ret = plt.hist(sedimentation, bins=100)
    
In [ ]:
    
# sedimentation as a function of fragment length 
len_range = np.arange(1,10000, 100)
ret = plt.scatter(len_range, 2.8 + 0.00834 * (len_range*666)**0.479 )
    
In [ ]:
    
# isoconcentration point
iso_point = sqrt((radius_top**2 + radius_top * radius_bottom + radius_bottom**2)/3)
iso_point
    
In [ ]:
    
# radius of particle
#radius_particle = np.sqrt( (frag_BD - density_medium)*2*(beta_o/omega) + iso_point**2 )
#ret = plt.hist(radius_particle)
    
In [ ]:
    
    
In [ ]:
    
n_dists = 10
n_samp = 10000
    
In [ ]:
    
def make_mm(n_dists):
    dist_loc = np.random.uniform(0,1,n_dists)
    dist_scale = np.random.uniform(0,0.1, n_dists)
    dists = [mixture.NormalDistribution(x,y) for x,y in zip(dist_loc, dist_scale)]
    eq_weights = np.array([1.0 / n_dists] * n_dists)
    eq_weights[0] += 1.0 - np.sum(eq_weights)
    return mixture.MixtureModel(n_dists, eq_weights, dists)
    
In [ ]:
    
mm = make_mm(n_dists)
    
In [ ]:
    
%%timeit
smp = mm.sampleDataSet(n_samp).getInternalFeature(0).flatten()
    
In [ ]:
    
%%timeit
smp = np.array([mm.sample() for i in arange(n_samp)])
    
In [ ]:
    
n_dists = 1000
mm = make_mm(n_dists)
    
In [ ]:
    
%%timeit
smp = mm.sampleDataSet(n_samp).getInternalFeature(0).flatten()
    
In [ ]:
    
%%timeit
smp = np.array([mm.sample() for i in arange(n_samp)])
    
In [ ]:
    
n_dists = 10000
mm = make_mm(n_dists)
    
In [ ]:
    
%%timeit
smp = mm.sampleDataSet(n_samp).getInternalFeature(0).flatten()
    
In [ ]:
    
%%timeit
smp = np.array([mm.sample() for i in arange(n_samp)])
    
In [ ]:
    
n_samp = 100000
    
In [ ]:
    
%%timeit
smp = mm.sampleDataSet(n_samp).getInternalFeature(0).flatten()
    
In [ ]:
    
%%timeit
smp = np.array([mm.sample() for i in arange(n_samp)])
    
Notes:
In [ ]:
    
x = np.random.normal(3, 1, 100)
y = np.random.normal(1, 1, 100)
H, xedges, yedges = np.histogram2d(y, x, bins=100)
    
In [ ]:
    
H
    
For each genome in mock community, simulate N fragments and calculate their Guassian distributions in the gradient. Create a mixture model of those Guassian distributions to sample Aa fragments, where Aa = the absolute abundance of the taxon in the mock community. One mixture model per genome.
In [ ]: