In [4]:
%load_ext rpy2.ipython
In [77]:
workDir = '/home/nick/notebook/SIPSim/dev/fullCyc/frag_norm_9_2.5_n5/default_run/'
In [75]:
%%R
sigmas = seq(1, 50, 1)
means = seq(30, 70, 1) # mean GC content of 30 to 70%
## max 13C shift
max_13C_shift_in_BD = 0.036
## min BD (that we care about)
min_GC = 13.5
min_BD = min_GC/100.0 * 0.098 + 1.66
## max BD (that we care about)
max_GC = 80
max_BD = max_GC / 100.0 * 0.098 + 1.66 # 80.0% G+C
max_BD = max_BD + max_13C_shift_in_BD
In [52]:
%%R
library(dplyr)
library(tidyr)
library(ggplot2)
library(gridExtra)
In [79]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import dill
In [54]:
%%R
GC2BD = function(GC) GC / 100.0 * 0.098 + 1.66
GC2BD(50) %>% print
BD2GC = function(BD) (BD - 1.66) / 0.098 * 100
BD2GC(1.709) %>% print
In [55]:
%%R
min_GC = BD2GC(min_BD)
max_GC = BD2GC(max_BD)
cat('Min-max GC:', min_GC, max_GC, '\n')
In [69]:
%%R
# where is density > 1e-7
detect_thresh = function(mean, sd){
dens = dnorm(0:117, mean=mean, sd=sd)
all(dens > 1e-7)
}
df = expand.grid(means, sigmas)
colnames(df) = c('mean', 'sigma')
df$detect = mapply(detect_thresh, mean=df$mean, sd=df$sigma)
df %>% head(n=4)
In [74]:
%%R -w 600
# plotting
ggplot(df, aes(mean, sigma, fill=detect)) +
geom_tile(color='black') +
theme_bw() +
theme(
text = element_text(size=16)
)
In [124]:
# loading fragments
F = os.path.join(workDir, '1', 'fragsParsed.pkl')
with open(F, 'rb') as inFH:
frags = dill.load(inFH)
In [166]:
stds = []
for x in frags:
otu = x[0]
for scaf,arr in x[1].items():
arr = np.array(arr)
sd = np.std(arr[:,2]) # fragment GC
stds.append([otu, scaf, sd])
stds = np.array(stds)
In [116]:
%%R -i stds -w 500 -h 300
stds = stds %>% as.data.frame
colnames(stds) = c('taxon', 'scaffold', 'sigma')
stds = stds %>%
mutate(sigma = sigma %>% as.character %>% as.numeric)
ggplot(stds, aes(sigma)) +
geom_histogram() +
theme_bw() +
theme(
text = element_text(size=16)
)
In [130]:
%%R
# using 10% quantile
## a relatively small, but not totally outlier of a sigma
## this will require a lot of diffusion
q10 = quantile(stds$sigma, probs=c(0.1)) %>% as.vector
q10
In [156]:
%%R
# function for sigma diffusion (Clay et al., 2003)
sigma_dif = function(L){
sqrt(44.5 / L)
}
# function for calculating total sigma (fragment buoyant density) based on mean fragment length
total_sigma = function(L, sigma_start){
# L = fragment length (kb)
# start_sigma = genome sigma prior to diffusion
sigma_D = sigma_dif(L)
sqrt(sigma_D**2 + sigma_start**2)
}
frag_lens = seq(0.1, 20, 0.1)
total_sd = sapply(frag_lens, total_sigma, sigma_start=q10)
df = data.frame('length__kb' = frag_lens, 'sigma' = total_sd)
df %>% head
In [157]:
%%R -w 600 -h 350
# plotting
ggplot(df, aes(length__kb, sigma)) +
geom_point() +
geom_line() +
geom_hline(yintercept=18, linetype='dashed', alpha=0.7) +
labs(x='Fragment length (kb)', y='Standard deviation of fragment BD\n(+diffusion)') +
theme_bw() +
theme(
text = element_text(size=16)
)
In [158]:
%%R
sigma_thresh = 18
frag_lens = seq(0.1, 20, 0.1)
df = expand.grid(stds$sigma, frag_lens)
colnames(df) = c('sigma', 'length__kb')
df$total_sd = mapply(total_sigma, df$length__kb, df$sigma)
df$detect = ifelse(df$total_sd >= sigma_thresh, 1, 0)
df = df %>%
group_by(length__kb) %>%
summarize(n = n(),
detected = sum(detect),
detect_perc = detected / n * 100)
df %>% head(n=3)
In [167]:
%%R -w 600 -h 350
# plotting
ggplot(df, aes(length__kb, detect_perc)) +
geom_point() +
geom_line() +
labs(x='Fragment length (kb)', y='% of taxa detected in all\ngradient fractions') +
theme_bw() +
theme(
text = element_text(size=16)
)
In [ ]: