notebook.community

Edit and run



In [2]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats
import scipy.optimize
import sys

import PaSDqc

%matplotlib inline



In [4]:

    
p_1465 = "/home/mas138/orchestra/data/BSM/Cell/1465/MDAqc/1x/psd/"
freq, nd_1465, sl_14165 = extra_tools.mk_ndarray(p_1465)



In [7]:

    
p_4638 = "/home/mas138/orchestra/data/BSM/Cell/4638/MDAqc/1x/psd/"
freq, nd_4638, sl_4638 = extra_tools.mk_ndarray(p_4638)



In [8]:

    
p_4643 = "/home/mas138/orchestra/data/BSM/Cell/4643/MDAqc/1x/psd/"
freq, nd_4643, sl_4643 = extra_tools.mk_ndarray(p_4643)



In [10]:

    
p_cz = "/home/mas138/orchestra/data/BSM/Pellman/CZ_nat/PaSDqc/1x/psd/"
freq, nd_cz, sl_cz = extra_tools.mk_ndarray(p_cz)



In [19]:

    
def func_erf(x, inter, asym, mu, sigma):
    return inter + asym * scipy.special.erf((x-mu) / (np.sqrt(2) * sigma))



In [ ]:

    
def func_erf2(x, inter, asym, mu, sigma):
    return inter + asym * scipy.special.erf((x) / (np.sqrt(2) * sigma))



In [11]:

    
print(sl_14165)
print(sl_4638)
print(sl_4643)
print(sl_cz)









    



['1465-cortex_1-neuron_MDA_12.1x', '1465-cortex_1-neuron_MDA_18.1x', '1465-cortex_1-neuron_MDA_20.1x', '1465-cortex_1-neuron_MDA_24.1x', '1465-cortex_1-neuron_MDA_25.1x', '1465-cortex_1-neuron_MDA_2_WGSb.1x', '1465-cortex_1-neuron_MDA_30.1x', '1465-cortex_1-neuron_MDA_39.1x', '1465-cortex_1-neuron_MDA_3_WGSb.1x', '1465-cortex_1-neuron_MDA_43.1x', '1465-cortex_1-neuron_MDA_46.1x', '1465-cortex_1-neuron_MDA_47.1x', '1465-cortex_1-neuron_MDA_5.1x', '1465-cortex_1-neuron_MDA_51_WGSb.1x', '1465-cortex_1-neuron_MDA_6_WGSb.1x', '1465-cortex_1-neuron_MDA_8.1x']
['4638-MDA-03-final-all.1x', '4638-MDA-11-final-all.1x', '4638-MDA-12-final-all.1x', '4638-MDA-13-final-all.1x', '4638-MDA-15-final-all.1x', '4638-MDA-2-final-all.1x', '4638-MDA-20-final-all.1x', '4638-MDA-24-final-all.1x', '4638-MDA-4-final-all.1x', '4638-MDA-7-final-all.1x']
['4643_Bulk-Liver.1x', '4643_MDA_1.1x', '4643_MDA_2.1x', '4643_MDA_23.1x', '4643_MDA_24.1x', '4643_MDA_26.1x', '4643_MDA_3.1x', '4643_MDA_31.1x', '4643_MDA_32.1x', '4643_MDA_4.1x', '4643_MDA_5.1x']
['C1_D2.1x', 'C1_D3.1x', 'C2_H4.1x', 'C2_H5.1x', 'C3_I9.1x', 'N1_A3.1x', 'N1_A6.1x', 'N2_E3.1x', 'N2_E5.1x', 'N3_I6.1x', 'N4_A2.1x', 'N4_A3.1x', 'N5_F3.1x', 'N5_F5.1x']



In [13]:

    
bulk_norm = pd.Series.from_csv("/home/mas138/sandbox/dev/MDAqc/db/bulk_1x.smooth3.spec", index_col=0, sep="\t").as_matrix()



In [37]:

    
def fit_curve(freq, nd):
    l_params = []
    freq_cut = freq[freq < 1e-3]
    freq_cut_scale = -np.log10(freq_cut)
    
    for psd in nd:
        psd_mda = 10*np.log10(psd / bulk_norm)
        psd_mda_cut = psd_mda[freq < 1e-3]
        try:
            l_params.append(scipy.optimize.curve_fit(func_erf, freq_cut_scale, psd_mda_cut)[0])
        except:
            pass
        
    return l_params



In [41]:

    
lp_1465 = fit_curve(freq, nd_1465)
lp_4638 = fit_curve(freq, nd_4638)
lp_4643 = fit_curve(freq, nd_4643[1:, :])



In [47]:

    
lp_cz = fit_curve(freq, nd_cz)



In [48]:

    
lp_cz









    Out[48]:





[array([ 11.49626949,   7.29462124,   3.74068369,   0.38063083]),
 array([ 11.35292109,   6.78439323,   3.76885117,   0.41827411]),
 array([ 12.01932588,   6.61789303,   3.66436152,   0.35457833]),
 array([ 11.52763513,   7.22078589,   3.83736751,   0.44134933]),
 array([ 11.69414097,   6.50441685,   3.8084103 ,   0.40578742]),
 array([ 11.63278144,   6.48048083,   3.74367007,   0.4560825 ]),
 array([ 11.28062587,   5.83684715,   3.78460821,   0.41503917]),
 array([ 12.2961098 ,   6.31310621,   3.80485578,   0.39919702]),
 array([ 11.57973424,   6.01422893,   3.81608131,   0.38160109]),
 array([ 12.16692346,   5.76955548,   3.73306978,   0.32858983]),
 array([ 12.80589804,   6.46543958,   3.75213598,   0.36784062]),
 array([ 12.52487128,   6.35988859,   3.75143312,   0.36708796]),
 array([ 11.86558225,   6.21645651,   3.78159151,   0.43630748]),
 array([ 11.02426704,   6.11579265,   3.811597  ,   0.46154502])]



In [43]:

    
lp_4643









    Out[43]:





[array([ 9.00049456,  5.85497326,  4.20055748,  0.42818105]),
 array([ 9.24785813,  5.97421431,  4.21145708,  0.44029458]),
 array([ 9.4246057 ,  5.75148808,  4.19653226,  0.39223627]),
 array([ 9.65539523,  6.27570879,  4.21711106,  0.44868351]),
 array([ 9.81066724,  5.78831809,  4.20410166,  0.36718286]),
 array([ 9.4774427 ,  6.1461352 ,  4.20645511,  0.45772457]),
 array([ 9.13485415,  5.93624798,  4.19764849,  0.43943928])]



In [55]:

    
def gaussian(l_params):
    l_pdf = []
    for p in l_params:
        pdf = scipy.stats.norm(loc=p[2], scale=p[3])
        low, high = pdf.ppf([0.001, 0.999])
        freq_eval = np.linspace(low, high, 100)
        l_pdf.append([freq_eval, pdf.pdf(freq_eval)])
        
    return l_pdf



In [58]:

    
def gaussian2(l_params, freq_eval):
    l_pdf = []
    for p in l_params:
        pdf = scipy.stats.norm(loc=p[2], scale=p[3])
        # low, high = pdf.ppf([0.001, 0.999])
        # freq_eval = np.linspace(low, high, 100)
        l_pdf.append(pdf.pdf(freq_eval))
        
    return l_pdf



In [98]:

    
freq_eval = np.arange(3, 5.5, 0.01)
l_pdf_1465 = gaussian2(lp_1465, freq_eval)
l_pdf_4638 = gaussian2(lp_4638, freq_eval)
l_pdf_4643 = gaussian2(lp_4643, freq_eval)

freq_eval2 = np.arange(2.5, 5, 0.01)
l_pdf_cz = gaussian2(lp_cz, freq_eval2)



In [60]:

    
for s in l_pdf_1465+l_pdf_4638+l_pdf_4643+l_pdf_cz:
    plt.plot(freq_eval, s)



In [99]:

    
pdf_alk = np.array(l_pdf_1465+l_pdf_4638+l_pdf_4643)
pdf_alk_avg = np.mean(pdf_alk, axis=0)
pdf_alk_std = np.std(pdf_alk, axis=0)

pdf_1465 = np.array(l_pdf_1465)
pdf_1465_avg = np.mean(pdf_1465, axis=0)
pdf_1465_std = np.std(pdf_1465, axis=0)

pdf_4638 = np.array(l_pdf_4638)
pdf_4638_avg = np.mean(pdf_4638, axis=0)
pdf_4638_std = np.std(pdf_4638, axis=0)

pdf_4643 = np.array(l_pdf_4643)
pdf_4643_avg = np.mean(pdf_4643, axis=0)
pdf_4643_std = np.std(pdf_4643, axis=0)

pdf_cz = np.array(l_pdf_cz)
pdf_cz_avg = np.mean(pdf_cz, axis=0)
pdf_cz_std = np.std(pdf_cz, axis=0)



In [113]:

    
pd.DataFrame(np.array([pdf_alk_avg, pdf_alk_std]).T, columns=['avg', 'std']).to_csv("Ampl_dist_lodato.txt", sep="\t", index=False)
pd.DataFrame(np.array([pdf_cz_avg, pdf_cz_std]).T, columns=['avg', 'std']).to_csv("Ampl_dist_CZ.txt", sep="\t", index=False)



In [111]:

    
df_alk[0:5]



In [102]:

    
sns.set_style('ticks', {'ytick.minor.size': 0.0, 'xtick.minor.size': 0.0})
sns.set_context('talk')



In [107]:

    
cp = sns.color_palette()
plt.figure(figsize=(10, 4))
plt.plot(10**freq_eval, pdf_alk_avg, label='Lodato et al, 2015')
plt.fill_between(10**freq_eval, pdf_alk_avg-pdf_alk_std, pdf_alk_avg+pdf_alk_std, alpha=0.3)

plt.plot(10**freq_eval2, pdf_cz_avg, label='Zhang et al, 2015')
plt.fill_between(10**freq_eval2, pdf_cz_avg-pdf_cz_std, pdf_cz_avg+pdf_cz_std, alpha=0.3, color=cp[1])
plt.legend()
plt.xscale('log')
plt.xlabel('Amplicon size (bp)')
plt.ylabel('Density')









    Out[107]:





<matplotlib.text.Text at 0x2b7c4952acc0>



In [85]:

    
def calc_median(l_params):
    l_med = []
    for p in l_params:
        obs = scipy.stats.norm.rvs(loc=p[2], scale=p[3], size=100000)
        l_med.append(np.median(10**obs))
        
    return l_med



In [90]:

    
med_1465 = calc_median(lp_1465)
med_4638 = calc_median(lp_4638)
med_4643 = calc_median(lp_4643)
med_cz = calc_median(lp_cz)



In [91]:

    
scipy.stats.ks_2samp(med_cz, med_1465+med_4638+med_4643)









    Out[91]:





Ks_2sampResult(statistic=1.0, pvalue=1.1601494284614622e-09)



In [94]:

    
print(np.mean(med_1465+med_4638+med_4643))
print(np.std(med_1465+med_4638+med_4643))









    



17478.6809314
1343.4548016



In [95]:

    
print(np.mean(med_cz))
print(np.std(med_cz))









    



5934.75494528
577.79817705



In [96]:

    
10**3.8









    Out[96]:





6309.57344480193



In [114]:

    
len(sl_cz)









    Out[114]:





14



In [ ]:

	avg	std
0	0.013750	0.006066
1	0.014690	0.006379
2	0.015686	0.006703
3	0.016742	0.007040
4	0.017861	0.007389