In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats
import scipy.optimize
import sys
import PaSDqc
%matplotlib inline
In [4]:
p_1465 = "/home/mas138/orchestra/data/BSM/Cell/1465/MDAqc/1x/psd/"
freq, nd_1465, sl_14165 = extra_tools.mk_ndarray(p_1465)
In [7]:
p_4638 = "/home/mas138/orchestra/data/BSM/Cell/4638/MDAqc/1x/psd/"
freq, nd_4638, sl_4638 = extra_tools.mk_ndarray(p_4638)
In [8]:
p_4643 = "/home/mas138/orchestra/data/BSM/Cell/4643/MDAqc/1x/psd/"
freq, nd_4643, sl_4643 = extra_tools.mk_ndarray(p_4643)
In [10]:
p_cz = "/home/mas138/orchestra/data/BSM/Pellman/CZ_nat/PaSDqc/1x/psd/"
freq, nd_cz, sl_cz = extra_tools.mk_ndarray(p_cz)
In [19]:
def func_erf(x, inter, asym, mu, sigma):
return inter + asym * scipy.special.erf((x-mu) / (np.sqrt(2) * sigma))
In [ ]:
def func_erf2(x, inter, asym, mu, sigma):
return inter + asym * scipy.special.erf((x) / (np.sqrt(2) * sigma))
In [11]:
print(sl_14165)
print(sl_4638)
print(sl_4643)
print(sl_cz)
In [13]:
bulk_norm = pd.Series.from_csv("/home/mas138/sandbox/dev/MDAqc/db/bulk_1x.smooth3.spec", index_col=0, sep="\t").as_matrix()
In [37]:
def fit_curve(freq, nd):
l_params = []
freq_cut = freq[freq < 1e-3]
freq_cut_scale = -np.log10(freq_cut)
for psd in nd:
psd_mda = 10*np.log10(psd / bulk_norm)
psd_mda_cut = psd_mda[freq < 1e-3]
try:
l_params.append(scipy.optimize.curve_fit(func_erf, freq_cut_scale, psd_mda_cut)[0])
except:
pass
return l_params
In [41]:
lp_1465 = fit_curve(freq, nd_1465)
lp_4638 = fit_curve(freq, nd_4638)
lp_4643 = fit_curve(freq, nd_4643[1:, :])
In [47]:
lp_cz = fit_curve(freq, nd_cz)
In [48]:
lp_cz
Out[48]:
In [43]:
lp_4643
Out[43]:
In [55]:
def gaussian(l_params):
l_pdf = []
for p in l_params:
pdf = scipy.stats.norm(loc=p[2], scale=p[3])
low, high = pdf.ppf([0.001, 0.999])
freq_eval = np.linspace(low, high, 100)
l_pdf.append([freq_eval, pdf.pdf(freq_eval)])
return l_pdf
In [58]:
def gaussian2(l_params, freq_eval):
l_pdf = []
for p in l_params:
pdf = scipy.stats.norm(loc=p[2], scale=p[3])
# low, high = pdf.ppf([0.001, 0.999])
# freq_eval = np.linspace(low, high, 100)
l_pdf.append(pdf.pdf(freq_eval))
return l_pdf
In [98]:
freq_eval = np.arange(3, 5.5, 0.01)
l_pdf_1465 = gaussian2(lp_1465, freq_eval)
l_pdf_4638 = gaussian2(lp_4638, freq_eval)
l_pdf_4643 = gaussian2(lp_4643, freq_eval)
freq_eval2 = np.arange(2.5, 5, 0.01)
l_pdf_cz = gaussian2(lp_cz, freq_eval2)
In [60]:
for s in l_pdf_1465+l_pdf_4638+l_pdf_4643+l_pdf_cz:
plt.plot(freq_eval, s)
In [99]:
pdf_alk = np.array(l_pdf_1465+l_pdf_4638+l_pdf_4643)
pdf_alk_avg = np.mean(pdf_alk, axis=0)
pdf_alk_std = np.std(pdf_alk, axis=0)
pdf_1465 = np.array(l_pdf_1465)
pdf_1465_avg = np.mean(pdf_1465, axis=0)
pdf_1465_std = np.std(pdf_1465, axis=0)
pdf_4638 = np.array(l_pdf_4638)
pdf_4638_avg = np.mean(pdf_4638, axis=0)
pdf_4638_std = np.std(pdf_4638, axis=0)
pdf_4643 = np.array(l_pdf_4643)
pdf_4643_avg = np.mean(pdf_4643, axis=0)
pdf_4643_std = np.std(pdf_4643, axis=0)
pdf_cz = np.array(l_pdf_cz)
pdf_cz_avg = np.mean(pdf_cz, axis=0)
pdf_cz_std = np.std(pdf_cz, axis=0)
In [113]:
pd.DataFrame(np.array([pdf_alk_avg, pdf_alk_std]).T, columns=['avg', 'std']).to_csv("Ampl_dist_lodato.txt", sep="\t", index=False)
pd.DataFrame(np.array([pdf_cz_avg, pdf_cz_std]).T, columns=['avg', 'std']).to_csv("Ampl_dist_CZ.txt", sep="\t", index=False)
In [111]:
df_alk[0:5]
Out[111]:
In [102]:
sns.set_style('ticks', {'ytick.minor.size': 0.0, 'xtick.minor.size': 0.0})
sns.set_context('talk')
In [107]:
cp = sns.color_palette()
plt.figure(figsize=(10, 4))
plt.plot(10**freq_eval, pdf_alk_avg, label='Lodato et al, 2015')
plt.fill_between(10**freq_eval, pdf_alk_avg-pdf_alk_std, pdf_alk_avg+pdf_alk_std, alpha=0.3)
plt.plot(10**freq_eval2, pdf_cz_avg, label='Zhang et al, 2015')
plt.fill_between(10**freq_eval2, pdf_cz_avg-pdf_cz_std, pdf_cz_avg+pdf_cz_std, alpha=0.3, color=cp[1])
plt.legend()
plt.xscale('log')
plt.xlabel('Amplicon size (bp)')
plt.ylabel('Density')
Out[107]:
In [85]:
def calc_median(l_params):
l_med = []
for p in l_params:
obs = scipy.stats.norm.rvs(loc=p[2], scale=p[3], size=100000)
l_med.append(np.median(10**obs))
return l_med
In [90]:
med_1465 = calc_median(lp_1465)
med_4638 = calc_median(lp_4638)
med_4643 = calc_median(lp_4643)
med_cz = calc_median(lp_cz)
In [91]:
scipy.stats.ks_2samp(med_cz, med_1465+med_4638+med_4643)
Out[91]:
In [94]:
print(np.mean(med_1465+med_4638+med_4643))
print(np.std(med_1465+med_4638+med_4643))
In [95]:
print(np.mean(med_cz))
print(np.std(med_cz))
In [96]:
10**3.8
Out[96]:
In [114]:
len(sl_cz)
Out[114]:
In [ ]: