In [3]:
source( "/anas/gh/splitIntoDataFrame.R" )
suppressMessages(library(bioUtilsr))
suppressMessages(library(Biostrings))

In [4]:
# Local Fucntions

# Parse the fatsa files
myfun = function(x)
{
        name=gsub("./", "", gsub(".fasta","" ,x ))
        length( readDNAStringSet(x, format="fasta"))
}

# Counts of sequences in each file
to_table =  function(f){
        tok = splitIntoDataFrame( f, header=F, sep="[._]" )
        samples = paste( tok[,1], tok[,2], sep="." )
        table( samples )
}

In [5]:
# File Paths

fp = "/anas/roshan-current/i2mc/burcelin/frm/runs/qiime_test4/misc_files/CH1"

# Results from Chimera check with ref and denovo
ch1_c = readLines( "/anas/roshan-current/i2mc/burcelin/frm/runs/qiime_test4/misc_files/CH1/chimeras.txt" )
ch1_nc = readLines( "/anas/roshan-current/i2mc/burcelin/frm/runs/qiime_test4/misc_files/CH1/non_chimeras.txt" )

# Results from Chimera check with ref alone
ch2_c = readLines( "/anas/roshan-current/i2mc/burcelin/frm/runs/qiime_test4/misc_files/CH2/chimeras.txt" )
ch2_nc = readLines( "/anas/roshan-current/i2mc/burcelin/frm/runs/qiime_test4/misc_files/CH2/non_chimeras.txt" )

# Results from Chimera check with denovo alone
ch3_c = readLines( "/anas/roshan-current/i2mc/burcelin/frm/runs/qiime_test4/misc_files/CH4/chimeras.txt" )
ch3_nc = readLines( "/anas/roshan-current/i2mc/burcelin/frm/runs/qiime_test4/misc_files/CH4/non_chimeras.txt" )


# A quick test on the number of chimeras and non chimeras in each analysis

# roshan@r128 /anas/roshan-current/i2mc/burcelin/frm/runs/qiime_test4/misc_files $ wc -l  CH{1,2,4}/chimeras.txt
#  208468 CH1/chimeras.txt
#  256366 CH2/chimeras.txt
#  187858 CH4/chimeras.txt

# roshan@r128 /anas/roshan-current/i2mc/burcelin/frm/runs/qiime_test4/misc_files $ wc -l  CH{1,2,4}/non_chimeras.txt
#   952506 CH1/non_chimeras.txt
#   904608 CH2/non_chimeras.txt
#   973116 CH4/non_chimeras.txt

In [6]:
# Data
files  = file_path_list(fp, "*.[0-9].fasta")
L = sapply( files, myfun )
df = cbind(CH1ch=to_table(ch1_c), CH1nch=to_table(ch1_nc),
           CH2ch=to_table(ch2_c), CH2nch=to_table(ch2_nc), 
           CH3ch=to_table(ch3_c), CH3nch=to_table(ch3_nc))

In [7]:
head(L)
head(df)


Out[7]:
/anas/roshan-current/i2mc/burcelin/frm/runs/qiime_test4/misc_files/CH1/HFD.1.fasta
36723
/anas/roshan-current/i2mc/burcelin/frm/runs/qiime_test4/misc_files/CH1/HFD.2.fasta
85006
/anas/roshan-current/i2mc/burcelin/frm/runs/qiime_test4/misc_files/CH1/HFD.3.fasta
64719
/anas/roshan-current/i2mc/burcelin/frm/runs/qiime_test4/misc_files/CH1/HFD.4.fasta
82869
/anas/roshan-current/i2mc/burcelin/frm/runs/qiime_test4/misc_files/CH1/HFD.5.fasta
76259
/anas/roshan-current/i2mc/burcelin/frm/runs/qiime_test4/misc_files/CH1/HFS.1.fasta
77142
Out[7]:
CH1chCH1nchCH2chCH2nchCH3chCH3nch
HFD.1113402538313203235201062326100
HFD.2132437176317750672561268572321
HFD.3 553759182 632958390 506759652
HFD.4133336953615453674161160071269
HFD.5 9979662801282763432 850867751
HFS.1 872068422 984467298 672970413

In [8]:
names(L) =  rownames(df)
newdf = cbind(df,totalseqs= c(L))
newdf = as.data.frame(newdf)

newdf[, "CH1ch_pct"] = as.numeric(newdf[, "CH1ch"]) / as.numeric(newdf[,"totalseqs"]) *100
newdf[, "CH1nch_pct"] = as.numeric(newdf[, "CH1nch"]) / as.numeric(newdf[,"totalseqs"]) * 100

newdf[, "CH2ch_pct"] = as.numeric(newdf[, "CH2ch"]) / as.numeric(newdf[,"totalseqs"]) *100
newdf[, "CH2nch_pct"] = as.numeric(newdf[, "CH2nch"]) / as.numeric(newdf[,"totalseqs"]) * 100

newdf[, "CH3ch_pct"] = as.numeric(newdf[, "CH3ch"]) / as.numeric(newdf[,"totalseqs"]) *100
newdf[, "CH3nch_pct"] = as.numeric(newdf[, "CH3nch"]) / as.numeric(newdf[,"totalseqs"]) * 100

In [44]:
head(newdf)


Out[44]:
CH1chCH1nchCH2chCH2nchCH3chCH3nchtotalseqsCH1ch_pctCH1nch_pctCH2ch_pctCH2nch_pctCH3ch_pctCH3nch_pct
HFD.11134025383132032352010623261003672330.8798369.1201735.9529564.0470528.9273871.07262
HFD.21324371763177506725612685723218500615.578984.421120.8808879.1191214.9224885.07752
HFD.3553759182632958390506759652647198.55544791.444559.77919990.22087.82923192.17077
HFD.41333369536154536741611600712698286916.0892583.9107518.647581.352513.99886.002
HFD.599796628012827634328508677517625913.0856786.9143316.8203183.1796911.1567288.84328
HFS.18720684229844672986729704137714211.3038388.6961712.7608887.239128.72287591.27713

In [10]:
pal=c(rep("red", 5), rep("blue", 5) ,rep("green", 5))
names(pal) = rownames(newdf)

In [11]:
# Plot prep for Chimera

x = newdf;
xCols = colnames( x )[ grep( "[0-9]ch_pct$", colnames( x ) ) ];
N = dim(x)[1];
xpts = 1:N;
xlim=c(1,N);
ymin =min( x[, xCols] )
ymax =max( x[, xCols] )
ylim =c( ymin, ymax) ;

n.color = length( xCols );
cmap = rainbow( n.color );
names( cmap ) =  xCols;

In [12]:
plot( NA, xlim=xlim, ylim=ylim, main="Chimera from three diff methods", xlab="Sample", ylab="Read Count" )
for( s in xCols ) {
    points( xpts, x[, s], col=cmap[s], type="b", pch=16 );
    
}
legend( "topright", legend=xCols, col=cmap, pch=16 );
axis(1, 1:N, rownames(x))



In [13]:
# Plot prep for non-Chimera

x = newdf;
xCols = colnames( x )[ grep( "[0-9]nch_pct$", colnames( x ) ) ];
N = dim(x)[1];
xpts = 1:N;
xlim=c(1,N);
ymin =min( x[, xCols] )
ymax =max( x[, xCols] )
ylim =c( ymin, ymax) ;

n.color = length( xCols );
cmap = rainbow( n.color );
names( cmap ) =  xCols;

In [24]:
plot( NA, xlim=xlim, ylim=ylim, main="Non Chimera from three diff methods", xlab="Sample", ylab="Read Count" )
for( s in xCols ){
    points( xpts, x[, s], col=cmap[s], type="b", pch=16 );
}
axis(1, 1:N, rownames(x),las=3)
legend( "topright", legend=xCols, col=cmap, pch=16 );


VennDiagram : Test 1


In [48]:
# Subset the data for vennCounts and vennDiagram
#CH1ch	CH1nch	CH2ch	CH2nch	CH3ch	CH3nch
# V = newdf[,c('CH1ch','CH2ch','CH3ch')]
# head(V)

In [49]:
#as.matrix(V/newdf$totalseqs *100)

In [50]:
# Used this piece of code to clean up the CH4 
# 
# with open("ofile.txt", "w") as od:
#      with open("chimeras.txt" ) as dat:
#          d = dat.readlines()
#          od.writelines([i[::-1].replace('.', '_', 1)[::-1] for i in d])