In [914]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import matplotlib.mlab as mlab
import pandas as pd
import pylab
%matplotlib inline
snps_folder="/Users/ernesto/projects/IGSR/11_04_17/LCOVERAGE/SNPS/"
We read-in the data:
In [915]:
good_set=None
bad_set=None
good_set=np.fromfile(snps_folder+"/quals.good.corr.txt",dtype=float,sep="\n")
bad_set=np.fromfile(snps_folder+"/quals.bad.corr.txt",dtype=float,sep="\n")
And we calulate the AVG values for the good set (in GIAB) and the bad set (not in GIAB)
In [916]:
print "Avg Good set: {0}".format(np.mean(good_set))
print "Avg Bad set: {0}".format(np.mean(bad_set))
In [917]:
print(np.var(good_set))
print(np.var(bad_set))
And we perform the homogeinity of variances test:
In [918]:
out=stats.levene(good_set,bad_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
Variances are not equal, we set the t-test accordingly:
In [919]:
stats.ttest_ind(a=good_set,b=bad_set,equal_var=False)
Out[919]:
In [920]:
#normality test
out=stats.normaltest(good_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
In [921]:
#Q-Q plot
stats.probplot(good_set, dist="norm", plot=pylab)
pylab.show()
In [922]:
data=[good_set,bad_set]
plt.boxplot(data,labels=['known','novel'])
plt.ylabel('QUAL')
Out[922]:
In [923]:
# plot normed histogram
plt.figure(1)
plt.hist(good_set, normed=True,facecolor='green')
plt.xlabel('qual')
plt.title('qual distribution: Known')
plt.grid(True)
plt.figure(2)
plt.hist(bad_set, normed=True,facecolor='red')
plt.xlabel('qual')
plt.title('qual distribution: Novel')
plt.grid(True)
plt.show
Out[923]:
We see a huge peak of SNPs having a quality of 999. Let's adjust the range for not displaying this big peak
In [924]:
plt.hist(good_set, bins=100, histtype='step', normed=True, color='b', label='known',range=[0,990])
plt.hist(bad_set, bins=100, histtype='step', normed=True, color='r', alpha=0.5, label='novel',range=[0,990])
plt.title("QUAL Histogram")
plt.xlabel("QUAL")
plt.ylabel("Probability")
plt.legend()
plt.show()
We read-in the data:
In [925]:
good_set=None
bad_set=None
good_set=np.fromfile(snps_folder+"/DP.good.corr.txt",dtype=float,sep="\n")
bad_set=np.fromfile(snps_folder+"/DP.bad.corr.txt",dtype=float,sep="\n")
And we calulate the AVG values for the good set (in GIAB) and the bad set (not in GIAB)
In [926]:
print "Avg Good set: {0}".format(np.mean(good_set))
print "Avg Bad set: {0}".format(np.mean(bad_set))
In [927]:
print(np.var(good_set))
print(np.var(bad_set))
And we perform the homogeinity of variances test:
In [928]:
out=stats.levene(good_set,bad_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
Variances are not equal, we set the t-test accordingly:
In [929]:
stats.ttest_ind(a=good_set,b=bad_set,equal_var=False)
Out[929]:
In [930]:
#normality test
out=stats.normaltest(good_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
In [931]:
#Q-Q plot
stats.probplot(good_set, dist="norm", plot=pylab)
pylab.show()
In [932]:
data=[good_set,bad_set]
bplot=plt.boxplot(data,labels=['known','novel'])
In [933]:
plt.hist(good_set, bins=20, histtype='step', normed=True, color='b', label='known')
plt.hist(bad_set, bins=20, histtype='step', normed=True, color='r', alpha=0.5, label='novel')
plt.title("DP Histogram")
plt.xlabel("DP")
plt.ylabel("Probability")
plt.legend()
plt.show()
In [934]:
cutoff=np.percentile(good_set,99)
print(cutoff)
We read-in the data:
In [935]:
good_set=None
bad_set=None
good_set=np.fromfile(snps_folder+"/MQ.good.corr.txt",dtype=float,sep="\n")
bad_set=np.fromfile(snps_folder+"/MQ.bad.corr.txt",dtype=float,sep="\n")
And we calulate the AVG values for the good set (in GIAB) and the bad set (not in GIAB)
In [936]:
print "Avg Good set: {0}".format(np.mean(good_set))
print "Avg Bad set: {0}".format(np.mean(bad_set))
In [937]:
print(np.var(good_set))
print(np.var(bad_set))
And we perform the homogeinity of variances test:
In [938]:
out=stats.levene(good_set,bad_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
Variances are not equal, we set the t-test accordingly:
In [939]:
stats.ttest_ind(a=good_set,b=bad_set,equal_var=False)
Out[939]:
In [940]:
#normality test
out=stats.normaltest(good_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
In [941]:
#Q-Q plot
stats.probplot(good_set, dist="norm", plot=pylab)
pylab.show()
In [942]:
data=[good_set,bad_set]
bp=plt.boxplot(data,labels=['known','novel'])
In [943]:
plt.hist(good_set, bins=20, histtype='step', normed=True, color='b', label='known')
plt.hist(bad_set, bins=20, histtype='step', normed=True, color='r', alpha=0.5, label='novel')
plt.title("MQ Histogram")
plt.xlabel("MQ")
plt.ylabel("Probability")
plt.legend()
plt.show()
In [944]:
cutoff=np.percentile(good_set,1)
print(cutoff)
In [945]:
skewed = pd.DataFrame(good_set) # Convert to DF
## square root transformation
sqrt_transformed = skewed.apply(np.sqrt) # Get the square root of data points*
sqrt_transformed.hist(figsize=(8,8), bins=50) # Plot histogram
Out[945]:
Square root transformation does not work
In [946]:
log_transformed = (skewed+1).apply(np.log) # Get the log of the data
log_transformed.hist(figsize = (8,8),bins=50) # Plot histogram
Out[946]:
Log transformation does not work
In [947]:
#data reflection:
constant=(max(skewed[0]))+1
skewed['reflected']=constant-skewed[0]
log_transformed_r = (skewed['reflected']).apply(np.log)
log_transformed_r.hist(figsize = (8,8))
Out[947]:
In [948]:
out=stats.normaltest(log_transformed_r)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
Data reflection and then log transformation does not work
In [949]:
good_set=None
bad_set=None
good_set=np.fromfile(snps_folder+"/MQ0F.good.corr.txt",dtype=float,sep="\n")
bad_set=np.fromfile(snps_folder+"/MQ0F.bad.corr.txt",dtype=float,sep="\n")
And we calulate the AVG values for the good set (in GIAB) and the bad set (not in GIAB)
In [950]:
print "Avg Good set: {0}".format(np.mean(good_set))
print "Avg Bad set: {0}".format(np.mean(bad_set))
In [951]:
print(np.var(good_set))
print(np.var(bad_set))
And we perform the homogeinity of variances test:
In [952]:
out=stats.levene(good_set,bad_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
Variances are not equal, we set the t-test accordingly:
In [953]:
stats.ttest_ind(a=good_set,b=bad_set,equal_var=False)
Out[953]:
In [954]:
#normality test
out=stats.normaltest(good_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
In [955]:
#Q-Q plot
stats.probplot(good_set, dist="norm", plot=pylab)
pylab.show()
In [956]:
data=[good_set,bad_set]
bp=plt.boxplot(data,labels=['known','novel'])
In [957]:
plt.hist(good_set, bins=20, histtype='step', normed=True, color='b', label='known')
plt.hist(bad_set, bins=20, histtype='step', normed=True, color='r', alpha=0.5, label='novel')
plt.title("MQ0F Histogram")
plt.xlabel("MQ0F")
plt.ylabel("Probability")
plt.legend()
plt.show()
In [958]:
cutoff=np.percentile(good_set,99)
print(cutoff)
In [959]:
skewed = pd.DataFrame(good_set) # Convert to DF
## square root transformation
sqrt_transformed = skewed.apply(np.sqrt) # Get the square root of data points*
sqrt_transformed.hist(figsize=(8,8), bins=50) # Plot histogram
Out[959]:
Square root transformation does not work
In [960]:
log_transformed = (skewed+1).apply(np.log) # Get the log of the data
log_transformed.hist(figsize = (8,8),bins=50) # Plot histogram
Out[960]:
Log transformation does not work
In [961]:
#data reflection:
constant=(max(skewed[0]))+1
skewed['reflected']=constant-skewed[0]
log_transformed_r = (skewed['reflected']).apply(np.log)
log_transformed_r.hist(figsize = (8,8))
Out[961]:
In [962]:
out=stats.normaltest(log_transformed_r)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
Data reflection and then log transformation does not work
In [963]:
good_set=None
bad_set=None
good_set=np.fromfile(snps_folder+"/BQB.good.corr.txt",dtype=float,sep="\n")
bad_set=np.fromfile(snps_folder+"/BQB.bad.corr.txt",dtype=float,sep="\n")
And we calulate the AVG values for the good set (in GIAB) and the bad set (not in GIAB)
In [964]:
print "Avg Good set: {0}".format(np.mean(good_set))
print "Avg Bad set: {0}".format(np.mean(bad_set))
In [965]:
print(np.var(good_set))
print(np.var(bad_set))
And we perform the homogeinity of variances test:
In [966]:
out=stats.levene(good_set,bad_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
Variances are not equal, we set the t-test accordingly:
In [967]:
stats.ttest_ind(a=good_set,b=bad_set,equal_var=False)
Out[967]:
In [968]:
#normality test
out=stats.normaltest(good_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
In [969]:
#Q-Q plot
stats.probplot(good_set, dist="norm", plot=pylab)
pylab.show()
In [970]:
data=[good_set,bad_set]
bp=plt.boxplot(data,labels=['known','novel'])
In [971]:
plt.hist(good_set, bins=20, histtype='step', normed=True, color='b', label='known')
plt.hist(bad_set, bins=20, histtype='step', normed=True, color='r', alpha=0.5, label='novel')
plt.title("BQB Histogram")
plt.xlabel("BQB")
plt.ylabel("Probability")
plt.legend()
plt.show()
In [972]:
good_set=None
bad_set=None
good_set=np.fromfile(snps_folder+"/HOB.good.corr.txt",dtype=float,sep="\n")
bad_set=np.fromfile(snps_folder+"/HOB.bad.corr.txt",dtype=float,sep="\n")
And we calulate the AVG values for the good set (in GIAB) and the bad set (not in GIAB)
In [973]:
print "Avg Good set: {0}".format(np.mean(good_set))
print "Avg Bad set: {0}".format(np.mean(bad_set))
In [974]:
print(np.var(good_set))
print(np.var(bad_set))
And we perform the homogeinity of variances test:
In [975]:
out=stats.levene(good_set,bad_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
Variances are not equal, we set the t-test accordingly:
In [976]:
stats.ttest_ind(a=good_set,b=bad_set,equal_var=False)
Out[976]:
In [977]:
#normality test
out=stats.normaltest(good_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
In [978]:
#Q-Q plot
stats.probplot(good_set, dist="norm", plot=pylab)
pylab.show()
In [979]:
data=[good_set,bad_set]
bp=plt.boxplot(data,labels=['known','novel'])
In [980]:
plt.hist(good_set, bins=20, histtype='step', normed=True, color='b', label='known')
plt.hist(bad_set, bins=20, histtype='step', normed=True, color='r', alpha=0.5, label='novel')
plt.title("HOB Histogram")
plt.xlabel("HOB")
plt.ylabel("Probability")
plt.legend()
plt.show()
In [981]:
cutoff=np.percentile(good_set,99)
print(cutoff)
In [982]:
good_set=None
bad_set=None
good_set=np.fromfile(snps_folder+"/ICB.good.corr.txt",dtype=float,sep="\n")
bad_set=np.fromfile(snps_folder+"/ICB.bad.corr.txt",dtype=float,sep="\n")
And we calulate the AVG values for the good set (in GIAB) and the bad set (not in GIAB)
In [983]:
print "Avg Good set: {0}".format(np.mean(good_set))
print "Avg Bad set: {0}".format(np.mean(bad_set))
In [984]:
print(np.var(good_set))
print(np.var(bad_set))
And we perform the homogeinity of variances test:
In [985]:
out=stats.levene(good_set,bad_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
Variances are not equal, we set the t-test accordingly:
In [986]:
stats.ttest_ind(a=good_set,b=bad_set,equal_var=False)
Out[986]:
In [987]:
#normality test
out=stats.normaltest(good_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
In [988]:
#Q-Q plot
stats.probplot(good_set, dist="norm", plot=pylab)
pylab.show()
In [989]:
data=[good_set,bad_set]
bp=plt.boxplot(data,labels=['known','novel'])
In [990]:
plt.hist(good_set, bins=20, histtype='step', normed=True, color='b', label='known')
plt.hist(bad_set, bins=20, histtype='step', normed=True, color='r', alpha=0.5, label='novel')
plt.title("ICB Histogram")
plt.xlabel("ICB")
plt.ylabel("Probability")
plt.legend()
plt.show()
In [991]:
good_set=None
bad_set=None
good_set=np.fromfile(snps_folder+"/MQSB.good.corr.txt",dtype=float,sep="\n")
bad_set=np.fromfile(snps_folder+"/MQSB.bad.corr.txt",dtype=float,sep="\n")
And we calulate the AVG MQSB for the good set (in GIAB) and the bad set (not in GIAB)
In [992]:
print "Avg Good set: {0}".format(np.mean(good_set))
print "Avg Bad set: {0}".format(np.mean(bad_set))
In [993]:
print(np.var(good_set))
print(np.var(bad_set))
And we perform the homogeinity of variances test:
In [994]:
out=stats.levene(good_set,bad_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
Variances are not equal, we set the t-test accordingly:
In [995]:
stats.ttest_ind(a=good_set,b=bad_set,equal_var=False)
Out[995]:
In [996]:
#normality test
out=stats.normaltest(good_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
In [997]:
#Q-Q plot
stats.probplot(good_set, dist="norm", plot=pylab)
pylab.show()
In [998]:
data=[good_set,bad_set]
bp=plt.boxplot(data,labels=['known','novel'])
In [999]:
plt.hist(good_set, bins=20, histtype='step', normed=True, color='b', label='known')
plt.hist(bad_set, bins=20, histtype='step', normed=True, color='r', alpha=0.5, label='novel')
plt.title("MQSB Histogram")
plt.xlabel("MQSB")
plt.ylabel("Probability")
plt.legend()
plt.show()
In [1000]:
good_set=None
bad_set=None
good_set=np.fromfile(snps_folder+"/MQB.good.corr.txt",dtype=float,sep="\n")
bad_set=np.fromfile(snps_folder+"/MQB.bad.corr.txt",dtype=float,sep="\n")
And we calulate the AVG values for the good set (in GIAB) and the bad set (not in GIAB)
In [1001]:
print "Avg Good set: {0}".format(np.mean(good_set))
print "Avg Bad set: {0}".format(np.mean(bad_set))
In [1002]:
print(np.var(good_set))
print(np.var(bad_set))
And we perform the homogeinity of variances test:
In [1003]:
out=stats.levene(good_set,bad_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
Variances are not equal, we set the t-test accordingly:
In [1004]:
stats.ttest_ind(a=good_set,b=bad_set,equal_var=False)
Out[1004]:
In [1005]:
#normality test
out=stats.normaltest(good_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
In [1006]:
#Q-Q plot
stats.probplot(good_set, dist="norm", plot=pylab)
pylab.show()
In [1007]:
data=[good_set,bad_set]
bp=plt.boxplot(data,labels=['known','novel'])
In [1008]:
plt.hist(good_set, bins=20, histtype='step', normed=True, color='b', label='known')
plt.hist(bad_set, bins=20, histtype='step', normed=True, color='r', alpha=0.5, label='novel')
plt.title("MQB Histogram")
plt.xlabel("MQB")
plt.ylabel("Probability")
plt.legend()
plt.show()
In [1009]:
good_set=None
bad_set=None
good_set=np.fromfile(snps_folder+"/RPB.good.corr.txt",dtype=float,sep="\n")
bad_set=np.fromfile(snps_folder+"/RPB.bad.corr.txt",dtype=float,sep="\n")
And we calulate the AVG values for the good set (in GIAB) and the bad set (not in GIAB)
In [1010]:
print "Avg Good set: {0}".format(np.mean(good_set))
print "Avg Bad set: {0}".format(np.mean(bad_set))
In [1011]:
print(np.var(good_set))
print(np.var(bad_set))
And we perform the homogeinity of variances test:
In [1012]:
out=stats.levene(good_set,bad_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
Variances are not equal, we set the t-test accordingly:
In [1013]:
stats.ttest_ind(a=good_set,b=bad_set,equal_var=False)
Out[1013]:
In [1014]:
#normality test
out=stats.normaltest(good_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
In [1015]:
#Q-Q plot
stats.probplot(good_set, dist="norm", plot=pylab)
pylab.show()
In [1016]:
data=[good_set,bad_set]
bp=plt.boxplot(data,labels=['known','novel'])
In [1017]:
plt.hist(good_set, bins=20, histtype='step', normed=True, color='b', label='known')
plt.hist(bad_set, bins=20, histtype='step', normed=True, color='r', alpha=0.5, label='novel')
plt.title("RPB Histogram")
plt.xlabel("RPB")
plt.ylabel("Probability")
plt.legend()
plt.show()
In [1018]:
good_set=None
bad_set=None
good_set=np.fromfile(snps_folder+"/SGB.good.corr.txt",dtype=float,sep="\n")
bad_set=np.fromfile(snps_folder+"/SGB.bad.corr.txt",dtype=float,sep="\n")
And we calulate the AVG values for the good set (in GIAB) and the bad set (not in GIAB)
In [1019]:
print "Avg Good set: {0}".format(np.mean(good_set))
print "Avg Bad set: {0}".format(np.mean(bad_set))
In [1020]:
print(np.var(good_set))
print(np.var(bad_set))
And we perform the homogeinity of variances test:
In [1021]:
out=stats.levene(good_set,bad_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
Variances are not equal, we set the t-test accordingly:
In [1022]:
stats.ttest_ind(a=good_set,b=bad_set,equal_var=False)
Out[1022]:
In [1023]:
#normality test
out=stats.normaltest(good_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
In [1024]:
#Q-Q plot
stats.probplot(good_set, dist="norm", plot=pylab)
pylab.show()
In [1025]:
data=[good_set,bad_set]
bp=plt.boxplot(data,labels=['known','novel'])
In [1026]:
plt.hist(good_set, bins=20, histtype='step', normed=True, color='b', label='known')
plt.hist(bad_set, bins=20, histtype='step', normed=True, color='r', alpha=0.5, label='novel')
plt.title("SGB Histogram")
plt.xlabel("SGB")
plt.ylabel("Probability")
plt.legend()
plt.show()
A double filter is going to be used, for variants in the Novel set having SGB> than the max of Known SGB and variants in the Novel set having SGB< than the min of Known SGB
In [1027]:
cutoff_upper=np.percentile(good_set,99)
cutoff_lower=np.percentile(bad_set,1)
print(cutoff_upper)
print(cutoff_lower)
In [1028]:
good_set=None
bad_set=None
good_set=np.fromfile(snps_folder+"/GT_DP.good.corr.txt",dtype=float,sep="\n")
bad_set=np.fromfile(snps_folder+"/GT_DP.bad.corr.txt",dtype=float,sep="\n")
And we calulate the AVG values for the good set (in GIAB) and the bad set (not in GIAB)
In [1029]:
print "Avg Good set: {0}".format(np.mean(good_set))
print "Avg Bad set: {0}".format(np.mean(bad_set))
In [1030]:
print(np.var(good_set))
print(np.var(bad_set))
And we perform the homogeinity of variances test:
In [1031]:
out=stats.levene(good_set,bad_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
Variances are not equal, we set the t-test accordingly:
In [1032]:
stats.ttest_ind(a=good_set,b=bad_set,equal_var=False)
Out[1032]:
In [1033]:
#normality test
out=stats.normaltest(good_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
In [1034]:
#Q-Q plot
stats.probplot(good_set, dist="norm", plot=pylab)
pylab.show()
In [1035]:
data=[good_set,bad_set]
bp=plt.boxplot(data,labels=['known','novel'])
In [1036]:
plt.hist(good_set, bins=20, histtype='step', normed=True, color='b', label='known')
plt.hist(bad_set, bins=20, histtype='step', normed=True, color='r', alpha=0.5, label='novel')
plt.title("GT_DP Histogram")
plt.xlabel("GT_DP")
plt.ylabel("Probability")
plt.legend()
plt.show()
In [1037]:
cutoff=np.percentile(good_set,99)
print(cutoff)
In [1038]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import matplotlib.mlab as mlab
%matplotlib inline
indels_folder="/Users/ernesto/projects/IGSR/11_04_17/LCOVERAGE/INDELS/"
Then, we read-in the data:
In [1039]:
good_set=None
bad_set=None
good_set=np.fromfile(indels_folder+"/quals.good.corr.txt",dtype=float,sep="\n")
bad_set=np.fromfile(indels_folder+"/quals.bad.corr.txt",dtype=float,sep="\n")
And we calulate the AVG values for the good set (in GIAB) and the bad set (not in GIAB)
In [1040]:
print "Avg Good set: {0}".format(np.mean(good_set))
print "Avg Bad set: {0}".format(np.mean(bad_set))
In [1041]:
print(np.var(good_set))
print(np.var(bad_set))
And we perform the homogeinity of variances test:
In [1042]:
out=stats.levene(good_set,bad_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
Variances are not equal, we set the t-test accordingly:
In [1043]:
stats.ttest_ind(a=good_set,b=bad_set,equal_var=False)
Out[1043]:
In [1044]:
#normality test
out=stats.normaltest(good_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
In [1045]:
#Q-Q plot
stats.probplot(good_set, dist="norm", plot=pylab)
pylab.show()
In [1046]:
data=[good_set,bad_set]
bp=plt.boxplot(data,labels=['known','novel'])
In [1047]:
plt.hist(good_set, bins=20, histtype='step', normed=True, color='b', label='known')
plt.hist(bad_set, bins=20, histtype='step', normed=True, color='r', alpha=0.5, label='novel')
plt.title("QUAL Histogram")
plt.xlabel("QUAL")
plt.ylabel("Probability")
plt.legend()
plt.show()
We see a huge peak of INDELs having a quality of 999. Let's adjust the range for not displaying this big peak
In [1048]:
plt.hist(good_set, bins=100, histtype='step', normed=True, color='b', label='known',range=[0,990])
plt.hist(bad_set, bins=100, histtype='step', normed=True, color='r', alpha=0.5, label='novel',range=[0,990])
plt.title("QUAL Histogram")
plt.xlabel("QUAL")
plt.ylabel("Probability")
plt.legend()
plt.show()
In [1049]:
good_set=None
bad_set=None
good_set=np.fromfile(indels_folder+"/DP.good.corr.txt",dtype=float,sep="\n")
bad_set=np.fromfile(indels_folder+"/DP.bad.corr.txt",dtype=float,sep="\n")
And we calulate the AVG values for the good set (in GIAB) and the bad set (not in GIAB)
In [1050]:
print "Avg Good set: {0}".format(np.mean(good_set))
print "Avg Bad set: {0}".format(np.mean(bad_set))
In [1051]:
print(np.var(good_set))
print(np.var(bad_set))
And we perform the homogeinity of variances test:
In [1052]:
out=stats.levene(good_set,bad_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
Variances are not equal, we set the t-test accordingly:
In [1053]:
stats.ttest_ind(a=good_set,b=bad_set,equal_var=False)
Out[1053]:
In [1054]:
#normality test
out=stats.normaltest(good_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
In [1055]:
#Q-Q plot
stats.probplot(good_set, dist="norm", plot=pylab)
pylab.show()
In [1056]:
data=[good_set,bad_set]
bp=plt.boxplot(data,labels=['known','novel'])
In [1057]:
plt.hist(good_set, bins=20, histtype='step', normed=True, color='b', label='known')
plt.hist(bad_set, bins=20, histtype='step', normed=True, color='r', alpha=0.5, label='novel')
plt.title("DP Histogram")
plt.xlabel("DP")
plt.ylabel("Probability")
plt.legend()
plt.show()
In [1058]:
cutoff=np.percentile(good_set,99)
print(cutoff)
In [1059]:
good_set=None
bad_set=None
good_set=np.fromfile(indels_folder+"/MQ.good.corr.txt",dtype=float,sep="\n")
bad_set=np.fromfile(indels_folder+"/MQ.bad.corr.txt",dtype=float,sep="\n")
And we calulate the AVG values for the good set (in GIAB) and the bad set (not in GIAB)
In [1060]:
print "Avg Good set: {0}".format(np.mean(good_set))
print "Avg Bad set: {0}".format(np.mean(bad_set))
In [1061]:
print(np.var(good_set))
print(np.var(bad_set))
And we perform the homogeinity of variances test:
In [1062]:
out=stats.levene(good_set,bad_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
Variances are not equal, we set the t-test accordingly:
In [1063]:
stats.ttest_ind(a=good_set,b=bad_set,equal_var=False)
Out[1063]:
In [1064]:
#normality test
out=stats.normaltest(good_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
In [1065]:
#Q-Q plot
stats.probplot(good_set, dist="norm", plot=pylab)
pylab.show()
In [1066]:
data=[good_set,bad_set]
bp=plt.boxplot(data,labels=['known','novel'])
In [1067]:
plt.hist(good_set, bins=20, histtype='step', normed=True, color='b', label='known')
plt.hist(bad_set, bins=20, histtype='step', normed=True, color='r', alpha=0.5, label='novel')
plt.title("MQ Histogram")
plt.xlabel("MQ")
plt.ylabel("Probability")
plt.legend()
plt.show()
In [1068]:
cutoff=np.percentile(good_set,1)
print(cutoff)
In [1069]:
good_set=None
bad_set=None
good_set=np.fromfile(indels_folder+"/MQ0F.good.corr.txt",dtype=float,sep="\n")
bad_set=np.fromfile(indels_folder+"/MQ0F.bad.corr.txt",dtype=float,sep="\n")
And we calulate the AVG values for the good set (in GIAB) and the bad set (not in GIAB)
In [1070]:
print "Avg Good set: {0}".format(np.mean(good_set))
print "Avg Bad set: {0}".format(np.mean(bad_set))
In [1071]:
print(np.var(good_set))
print(np.var(bad_set))
And we perform the homogeinity of variances test:
In [1072]:
out=stats.levene(good_set,bad_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
Variances are not equal, we set the t-test accordingly:
In [1073]:
stats.ttest_ind(a=good_set,b=bad_set,equal_var=False)
Out[1073]:
In [1074]:
#normality test
out=stats.normaltest(good_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
In [1075]:
#Q-Q plot
stats.probplot(good_set, dist="norm", plot=pylab)
pylab.show()
In [1076]:
data=[good_set,bad_set]
bp=plt.boxplot(data,labels=['known','novel'])
In [1077]:
plt.hist(good_set, bins=20, histtype='step', normed=True, color='b', label='known')
plt.hist(bad_set, bins=20, histtype='step', normed=True, color='r', alpha=0.5, label='novel')
plt.title("MQ0F Histogram")
plt.xlabel("MQ0F")
plt.ylabel("Probability")
plt.legend()
plt.show()
In [1078]:
cutoff=np.percentile(good_set,99)
print(cutoff)
In [1079]:
good_set=None
bad_set=None
good_set=np.fromfile(indels_folder+"/HOB.good.corr.txt",dtype=float,sep="\n")
bad_set=np.fromfile(indels_folder+"/HOB.bad.corr.txt",dtype=float,sep="\n")
And we calulate the AVG values for the good set (in GIAB) and the bad set (not in GIAB)
In [1080]:
print "Avg Good set: {0}".format(np.mean(good_set))
print "Avg Bad set: {0}".format(np.mean(bad_set))
In [1081]:
print(np.var(good_set))
print(np.var(bad_set))
And we perform the homogeinity of variances test:
In [1082]:
out=stats.levene(good_set,bad_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
Variances are not equal, we set the t-test accordingly:
In [1083]:
stats.ttest_ind(a=good_set,b=bad_set,equal_var=False)
Out[1083]:
In [1084]:
#normality test
out=stats.normaltest(good_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
In [1085]:
#Q-Q plot
stats.probplot(good_set, dist="norm", plot=pylab)
pylab.show()
In [1086]:
data=[good_set,bad_set]
bp=plt.boxplot(data,labels=['known','novel'])
In [1087]:
plt.hist(good_set, bins=20, histtype='step', normed=True, color='b', label='known')
plt.hist(bad_set, bins=20, histtype='step', normed=True, color='r', alpha=0.5, label='novel')
plt.title("HOB Histogram")
plt.xlabel("HOB")
plt.ylabel("Probability")
plt.legend()
plt.show()
In [1088]:
cutoff=np.percentile(good_set,99)
print(cutoff)
In [1089]:
good_set=None
bad_set=None
good_set=np.fromfile(indels_folder+"/ICB.good.corr.txt",dtype=float,sep="\n")
bad_set=np.fromfile(indels_folder+"/ICB.bad.corr.txt",dtype=float,sep="\n")
And we calulate the AVG values for the good set (in GIAB) and the bad set (not in GIAB)
In [1090]:
print "Avg Good set: {0}".format(np.mean(good_set))
print "Avg Bad set: {0}".format(np.mean(bad_set))
In [1091]:
print(np.var(good_set))
print(np.var(bad_set))
And we perform the homogeinity of variances test:
In [1092]:
out=stats.levene(good_set,bad_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
Variances are not equal, we set the t-test accordingly:
In [1093]:
stats.ttest_ind(a=good_set,b=bad_set,equal_var=False)
Out[1093]:
In [1094]:
#normality test
out=stats.normaltest(good_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
In [1095]:
#Q-Q plot
stats.probplot(good_set, dist="norm", plot=pylab)
pylab.show()
In [1096]:
data=[good_set,bad_set]
bp=plt.boxplot(data,labels=['known','novel'])
In [1097]:
plt.hist(good_set, bins=20, histtype='step', normed=True, color='b', label='known')
plt.hist(bad_set, bins=20, histtype='step', normed=True, color='r', alpha=0.5, label='novel')
plt.title("ICB Histogram")
plt.xlabel("ICB")
plt.ylabel("Probability")
plt.legend()
plt.show()
In [1098]:
good_set=None
bad_set=None
good_set=np.fromfile(indels_folder+"/MQSB.good.corr.txt",dtype=float,sep="\n")
bad_set=np.fromfile(indels_folder+"/MQSB.bad.corr.txt",dtype=float,sep="\n")
And we calulate the AVG values for the good set (in GIAB) and the bad set (not in GIAB)
In [1099]:
print "Avg Good set: {0}".format(np.mean(good_set))
print "Avg Bad set: {0}".format(np.mean(bad_set))
In [1100]:
print(np.var(good_set))
print(np.var(bad_set))
And we perform the homogeinity of variances test:
In [1101]:
out=stats.levene(good_set,bad_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
Variances are not equal, we set the t-test accordingly:
In [1102]:
stats.ttest_ind(a=good_set,b=bad_set,equal_var=False)
Out[1102]:
In [1103]:
#normality test
out=stats.normaltest(good_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
In [1104]:
#Q-Q plot
stats.probplot(good_set, dist="norm", plot=pylab)
pylab.show()
In [1105]:
data=[good_set,bad_set]
bp=plt.boxplot(data,labels=['known','novel'])
In [1106]:
plt.hist(good_set, bins=20, histtype='step', normed=True, color='b', label='known')
plt.hist(bad_set, bins=20, histtype='step', normed=True, color='r', alpha=0.5, label='novel')
plt.title("MQSB Histogram")
plt.xlabel("MQSB")
plt.ylabel("Probability")
plt.legend()
plt.show()
In [1107]:
good_set=None
bad_set=None
good_set=np.fromfile(indels_folder+"/SGB.good.corr.txt",dtype=float,sep="\n")
bad_set=np.fromfile(indels_folder+"/SGB.bad.corr.txt",dtype=float,sep="\n")
And we calulate the AVG values for the good set (in GIAB) and the bad set (not in GIAB)
In [1108]:
print "Avg Good set: {0}".format(np.mean(good_set))
print "Avg Bad set: {0}".format(np.mean(bad_set))
In [1109]:
print(np.var(good_set))
print(np.var(bad_set))
And we perform the homogeinity of variances test:
In [1110]:
out=stats.levene(good_set,bad_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
Variances are equal, we set the t-test accordingly:
In [1111]:
stats.ttest_ind(a=good_set,b=bad_set,equal_var=True)
Out[1111]:
In [1112]:
#normality test
out=stats.normaltest(good_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
In [1113]:
#Q-Q plot
stats.probplot(good_set, dist="norm", plot=pylab)
pylab.show()
In [1114]:
data=[good_set,bad_set]
bp=plt.boxplot(data,labels=['known','novel'])
In [1115]:
plt.hist(good_set, bins=20, histtype='step', normed=True, color='b', label='known')
plt.hist(bad_set, bins=20, histtype='step', normed=True, color='r', alpha=0.5, label='novel')
plt.title("SGB Histogram")
plt.xlabel("SGB")
plt.ylabel("Probability")
plt.legend()
plt.show()
In [1116]:
cutoff_upper=np.percentile(good_set,99)
cutoff_lower=np.percentile(bad_set,1)
print(cutoff_upper)
print(cutoff_lower)
In [1117]:
good_set=None
bad_set=None
good_set=np.fromfile(indels_folder+"/GT_DP.good.corr.txt",dtype=float,sep="\n")
bad_set=np.fromfile(indels_folder+"/GT_DP.bad.corr.txt",dtype=float,sep="\n")
And we calulate the AVG values for the good set (in GIAB) and the bad set (not in GIAB)
In [1118]:
print "Avg Good set: {0}".format(np.mean(good_set))
print "Avg Bad set: {0}".format(np.mean(bad_set))
In [1119]:
print(np.var(good_set))
print(np.var(bad_set))
And we perform the homogeinity of variances test:
In [1120]:
out=stats.levene(good_set,bad_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
Variances are not equal, we set the t-test accordingly:
In [1121]:
stats.ttest_ind(a=good_set,b=bad_set,equal_var=False)
Out[1121]:
In [1122]:
#normality test
out=stats.normaltest(good_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
In [1123]:
#Q-Q plot
stats.probplot(good_set, dist="norm", plot=pylab)
pylab.show()
In [1124]:
data=[good_set,bad_set]
bp=plt.boxplot(data,labels=['known','novel'])
In [1125]:
plt.hist(good_set, bins=20, histtype='step', normed=True, color='b', label='known')
plt.hist(bad_set, bins=20, histtype='step', normed=True, color='r', alpha=0.5, label='novel')
plt.title("GT_DP Histogram")
plt.xlabel("GT_DP")
plt.ylabel("Probability")
plt.legend()
plt.show()
In [1126]:
cutoff=np.percentile(good_set,99)
print(cutoff)
In [1127]:
good_set=None
bad_set=None
good_set=np.fromfile(indels_folder+"/IDV.good.corr.txt",dtype=float,sep="\n")
bad_set=np.fromfile(indels_folder+"/IDV.bad.corr.txt",dtype=float,sep="\n")
And we calulate the AVG values for the good set (in GIAB) and the bad set (not in GIAB)
In [1128]:
print "Avg Good set: {0}".format(np.mean(good_set))
print "Avg Bad set: {0}".format(np.mean(bad_set))
In [1129]:
print(np.var(good_set))
print(np.var(bad_set))
And we perform the homogeinity of variances test:
In [1130]:
out=stats.levene(good_set,bad_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
Variances are not equal, we set the t-test accordingly:
In [1131]:
stats.ttest_ind(a=good_set,b=bad_set,equal_var=False)
Out[1131]:
In [1132]:
#normality test
out=stats.normaltest(good_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
In [1133]:
#Q-Q plot
stats.probplot(good_set, dist="norm", plot=pylab)
pylab.show()
In [1134]:
data=[good_set,bad_set]
bp=plt.boxplot(data,labels=['known','novel'])
In [1135]:
plt.hist(good_set, bins=20, histtype='step', normed=True, color='b', label='known')
plt.hist(bad_set, bins=20, histtype='step', normed=True, color='r', alpha=0.5, label='novel')
plt.title("IDV Histogram")
plt.xlabel("IDV")
plt.ylabel("Probability")
plt.legend()
plt.show()
In [1136]:
cutoff=np.percentile(good_set,99)
print(cutoff)
In [1137]:
good_set=np.fromfile(indels_folder+"/IMF.good.corr.txt",dtype=float,sep="\n")
bad_set=np.fromfile(indels_folder+"/IMF.bad.corr.txt",dtype=float,sep="\n")
And we calulate the AVG values for the good set (in GIAB) and the bad set (not in GIAB)
In [1138]:
print "Avg Good set: {0}".format(np.mean(good_set))
print "Avg Bad set: {0}".format(np.mean(bad_set))
In [1139]:
print(np.var(good_set))
print(np.var(bad_set))
And we perform the homogeinity of variances test:
In [1140]:
out=stats.levene(good_set,bad_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
Variances are not equal, we set the t-test accordingly:
In [1141]:
stats.ttest_ind(a=good_set,b=bad_set,equal_var=False)
Out[1141]:
In [1142]:
#normality test
out=stats.normaltest(good_set)
print("Z-score: {0}".format(out[0]))
print("P-value: {0}".format(out[1]))
In [1143]:
#Q-Q plot
stats.probplot(good_set, dist="norm", plot=pylab)
pylab.show()
In [1144]:
data=[good_set,bad_set]
bp=plt.boxplot(data,labels=['known','novel'])
In [1145]:
plt.hist(good_set, bins=20, histtype='step', normed=True, color='b', label='known')
plt.hist(bad_set, bins=20, histtype='step', normed=True, color='r', alpha=0.5, label='novel')
plt.title("IMF Histogram")
plt.xlabel("IMF")
plt.ylabel("Probability")
plt.legend()
plt.show()
In [1146]:
cutoff=np.percentile(good_set,1)
print(cutoff)