In [3]:
source( "/anas/gh/splitIntoDataFrame.R" )
suppressMessages(library(bioUtilsr))
suppressMessages(library(Biostrings))
In [4]:
# Local Fucntions
# Parse the fatsa files
myfun = function(x)
{
name=gsub("./", "", gsub(".fasta","" ,x ))
length( readDNAStringSet(x, format="fasta"))
}
# Counts of sequences in each file
to_table = function(f){
tok = splitIntoDataFrame( f, header=F, sep="[._]" )
samples = paste( tok[,1], tok[,2], sep="." )
table( samples )
}
In [5]:
# File Paths
fp = "/anas/roshan-current/i2mc/burcelin/frm/runs/qiime_test4/misc_files/CH1"
# Results from Chimera check with ref and denovo
ch1_c = readLines( "/anas/roshan-current/i2mc/burcelin/frm/runs/qiime_test4/misc_files/CH1/chimeras.txt" )
ch1_nc = readLines( "/anas/roshan-current/i2mc/burcelin/frm/runs/qiime_test4/misc_files/CH1/non_chimeras.txt" )
# Results from Chimera check with ref alone
ch2_c = readLines( "/anas/roshan-current/i2mc/burcelin/frm/runs/qiime_test4/misc_files/CH2/chimeras.txt" )
ch2_nc = readLines( "/anas/roshan-current/i2mc/burcelin/frm/runs/qiime_test4/misc_files/CH2/non_chimeras.txt" )
# Results from Chimera check with denovo alone
ch3_c = readLines( "/anas/roshan-current/i2mc/burcelin/frm/runs/qiime_test4/misc_files/CH4/chimeras.txt" )
ch3_nc = readLines( "/anas/roshan-current/i2mc/burcelin/frm/runs/qiime_test4/misc_files/CH4/non_chimeras.txt" )
# A quick test on the number of chimeras and non chimeras in each analysis
# roshan@r128 /anas/roshan-current/i2mc/burcelin/frm/runs/qiime_test4/misc_files $ wc -l CH{1,2,4}/chimeras.txt
# 208468 CH1/chimeras.txt
# 256366 CH2/chimeras.txt
# 187858 CH4/chimeras.txt
# roshan@r128 /anas/roshan-current/i2mc/burcelin/frm/runs/qiime_test4/misc_files $ wc -l CH{1,2,4}/non_chimeras.txt
# 952506 CH1/non_chimeras.txt
# 904608 CH2/non_chimeras.txt
# 973116 CH4/non_chimeras.txt
In [6]:
# Data
files = file_path_list(fp, "*.[0-9].fasta")
L = sapply( files, myfun )
df = cbind(CH1ch=to_table(ch1_c), CH1nch=to_table(ch1_nc),
CH2ch=to_table(ch2_c), CH2nch=to_table(ch2_nc),
CH3ch=to_table(ch3_c), CH3nch=to_table(ch3_nc))
In [7]:
head(L)
head(df)
Out[7]:
Out[7]:
In [8]:
names(L) = rownames(df)
newdf = cbind(df,totalseqs= c(L))
newdf = as.data.frame(newdf)
newdf[, "CH1ch_pct"] = as.numeric(newdf[, "CH1ch"]) / as.numeric(newdf[,"totalseqs"]) *100
newdf[, "CH1nch_pct"] = as.numeric(newdf[, "CH1nch"]) / as.numeric(newdf[,"totalseqs"]) * 100
newdf[, "CH2ch_pct"] = as.numeric(newdf[, "CH2ch"]) / as.numeric(newdf[,"totalseqs"]) *100
newdf[, "CH2nch_pct"] = as.numeric(newdf[, "CH2nch"]) / as.numeric(newdf[,"totalseqs"]) * 100
newdf[, "CH3ch_pct"] = as.numeric(newdf[, "CH3ch"]) / as.numeric(newdf[,"totalseqs"]) *100
newdf[, "CH3nch_pct"] = as.numeric(newdf[, "CH3nch"]) / as.numeric(newdf[,"totalseqs"]) * 100
In [44]:
head(newdf)
Out[44]:
In [10]:
pal=c(rep("red", 5), rep("blue", 5) ,rep("green", 5))
names(pal) = rownames(newdf)
In [11]:
# Plot prep for Chimera
x = newdf;
xCols = colnames( x )[ grep( "[0-9]ch_pct$", colnames( x ) ) ];
N = dim(x)[1];
xpts = 1:N;
xlim=c(1,N);
ymin =min( x[, xCols] )
ymax =max( x[, xCols] )
ylim =c( ymin, ymax) ;
n.color = length( xCols );
cmap = rainbow( n.color );
names( cmap ) = xCols;
In [12]:
plot( NA, xlim=xlim, ylim=ylim, main="Chimera from three diff methods", xlab="Sample", ylab="Read Count" )
for( s in xCols ) {
points( xpts, x[, s], col=cmap[s], type="b", pch=16 );
}
legend( "topright", legend=xCols, col=cmap, pch=16 );
axis(1, 1:N, rownames(x))
In [13]:
# Plot prep for non-Chimera
x = newdf;
xCols = colnames( x )[ grep( "[0-9]nch_pct$", colnames( x ) ) ];
N = dim(x)[1];
xpts = 1:N;
xlim=c(1,N);
ymin =min( x[, xCols] )
ymax =max( x[, xCols] )
ylim =c( ymin, ymax) ;
n.color = length( xCols );
cmap = rainbow( n.color );
names( cmap ) = xCols;
In [24]:
plot( NA, xlim=xlim, ylim=ylim, main="Non Chimera from three diff methods", xlab="Sample", ylab="Read Count" )
for( s in xCols ){
points( xpts, x[, s], col=cmap[s], type="b", pch=16 );
}
axis(1, 1:N, rownames(x),las=3)
legend( "topright", legend=xCols, col=cmap, pch=16 );
In [48]:
# Subset the data for vennCounts and vennDiagram
#CH1ch CH1nch CH2ch CH2nch CH3ch CH3nch
# V = newdf[,c('CH1ch','CH2ch','CH3ch')]
# head(V)
In [49]:
#as.matrix(V/newdf$totalseqs *100)
In [50]:
# Used this piece of code to clean up the CH4
#
# with open("ofile.txt", "w") as od:
# with open("chimeras.txt" ) as dat:
# d = dat.readlines()
# od.writelines([i[::-1].replace('.', '_', 1)[::-1] for i in d])