In [1]:
using MethylClust
using PyPlot


INFO: Loading help data...

In [2]:
# Simple fit to show that the code is working
A,P,c = generate_reads(N=100,W=30);
X,Y,ch = fit_soft(A,2)
Xd,Yd,Xr = MethylClust.eval_synthetic_results(A,X,Y,c,P;make_plots=true);



In [14]:
X_real_hard = zeros(100,2)
for i = 1:100
    X_real_hard[i,indmax(Xr[i,:])] = 1.0
end

figure()
imshow(X_real_hard',cmap=ColorMap("Greys"),interpolation="none")
xlabel("true X (hard clustering)")
yticks([]),xticks([])


Out[14]:
(({},{}),({},{}))

Questions

How does performance scale with:

  • number of reads
  • read length
  • number of cell types
  • proportions of cell types
  • distance between clusters

In [4]:
Nlevs = [10]#[10:20:90]
Wlevs = [10]#[10:5:30]
nr = 3 # number of replicates
L = 100 # genome length

# Storage for X and Y errors/distance
Xd = zeros(length(Nlevs),length(Wlevs),nr)
Yd = zeros(length(Nlevs),length(Wlevs),nr)

for n = 1:length(Nlevs)
    N = Nlevs[n]
    for w = 1:length(Wlevs)
        W = Wlevs[w]
        println("Fitting with N = ",N," reads of length W = ",W,".")
        for r = 1:nr
            print("*")
            A,P,c = generate_reads(N=N, W=W, L=L, Cprob=[0.5,0.5]);
            X,Y,ch = fit_soft(A,2)
            Xd[n,w,r],Yd[n,w,r] = MethylClust.eval_synthetic_results(X,Y,c,P;make_plots=false)
        end
        println()
    end
end


Fitting with N = 10 reads of length W = 10.
***
Fitting with N = 10 reads of length W = 15.
***
Fitting with N = 10 reads of length W = 20.
***
Fitting with N = 10 reads of length W = 25.
***
Fitting with N = 10 reads of length W = 30.
***
Fitting with N = 30 reads of length W = 10.
***
Fitting with N = 30 reads of length W = 15.
***
Fitting with N = 30 reads of length W = 20.
***
Fitting with N = 30 reads of length W = 25.
***
Fitting with N = 30 reads of length W = 30.
***
Fitting with N = 50 reads of length W = 10.
***
Fitting with N = 50 reads of length W = 15.
***
Fitting with N = 50 reads of length W = 20.
***
Fitting with N = 50 reads of length W = 25.
***
Fitting with N = 50 reads of length W = 30.
***
Fitting with N = 70 reads of length W = 10.
***
Fitting with N = 70 reads of length W = 15.
***
Fitting with N = 70 reads of length W = 20.
***
Fitting with N = 70 reads of length W = 25.
***
Fitting with N = 70 reads of length W = 30.
***
Fitting with N = 90 reads of length W = 10.
***
Fitting with N = 90 reads of length W = 15.
***
Fitting with N = 90 reads of length W = 20.
***
Fitting with N = 90 reads of length W = 25.
***
Fitting with N = 90 reads of length W = 30.
***

In [5]:
Xdm = squeeze(mean(Xd,3),3)

imshow(Xdm,cmap=ColorMap("Greys"),interpolation="None")
xticks(0:length(Wlevs)-1,Wlevs),yticks(0:length(Nlevs)-1,Nlevs)
xlabel("Read Length"),ylabel("Number of Reads"),title("Xdist, Num Sites = "*string(L))
colorbar(label="proportion misclassified")
show()



In [6]:
N = repmat(Nlevs,1,length(Wlevs))
W = repmat(Wlevs',length(Nlevs),1)

figure()
scatter(vec(N).*vec(W)./L,vec(Xdm),c=vec(W),s=40,cmap=ColorMap("Reds_r"))
colorbar(ticks=[10,30],label="sites per read")
ylabel("proportion misclassified"), xlabel("coverage")
show()



In [7]:
Ydm = squeeze(mean(Yd,3),3)

imshow(Ydm,cmap=ColorMap("Greys"),interpolation="None")
xticks(0:length(Wlevs)-1,Wlevs),yticks(0:length(Nlevs)-1,Nlevs)
xlabel("Read Length"),ylabel("Number of Reads"),title("Ydist, Num Sites = "*string(L))
show()



In [8]:
figure()
scatter(vec(N).*vec(W)./L,vec(Ydm),c=vec(W),s=40,cmap=ColorMap("Reds_r"))
colorbar(ticks=[10,30],label="sites per read")
ylabel("reconstruction error"), xlabel("coverage")
show()


If methylation probabilities are low, then clustering is harder


In [14]:
d = (Float64)[]
Xd = (Float64)[]
Yd = (Float64)[]
nr = 20

for dist = 0.5:0.5:5.0
    println("Fitting with methylation profile approx dist = ",dist)
    for _ = 1:nr
        A,P,c = generate_reads(N=100, W=30, L=100, Cprob=[0.5,0.5], targ_dist=dist);
        X,Y,ch = fit_soft(A,2)
        xd,yd = MethylClust.eval_synthetic_results(X,Y,c,P;make_plots=false)
        push!(d,MethylClust.d_js_multivar_bern(vec(P[1,:]),vec(P[2,:])))
        push!(Xd,xd)
        push!(Yd,yd)
        print("*")
    end
    println()
end


Fitting with methylation profile approx dist = 0.5
********************
Fitting with methylation profile approx dist = 1.0
********************
Fitting with methylation profile approx dist = 1.5
********************
Fitting with methylation profile approx dist = 2.0
********************
Fitting with methylation profile approx dist = 2.5
********************
Fitting with methylation profile approx dist = 3.0
********************

In [17]:
figure()
plot(d,Xd,".k")
xlabel("Jensen-Shannon Distance Between Cell Types")
ylabel("Proportion Misclassified")
savefig("./figs/02_05.svg")



In [18]:
figure()
plot(d,Yd,".k")
xlabel("Jensen-Shannon Distance Between Cell Types")
ylabel("Reconstruction Error")
savefig("./figs/02_06.svg")



In [ ]: