The complete Shakya et al. dataset and subsamples of this dataset (10%, 25%, and 50%) were trimmed at quality scores of 2 and 30 using trimmomatic and classified with sourmash gather. All sourmash signatures were calculated with --scaled values of 10000, k values of 21, 31 and 51, and abundance tracking. Jaccard indices were calculated with sourmash compare.
To reproduce this notebook you need the following
You can retrieve data frames used in the note book with the following command:
for i in $(cat comparison_file_names.txt); do osf -p dm938 fetch osfstorage/taxonomic_classification/sourmash/${i} ${PWD}/${i}; echo ${i}; done
or by visiting the following link https://osf.io/dm938/ and downloading the contents of the '~/comparisons/signatures' directory.
(comparison_file_names.txt can be found here)
First, Set the backend of matplotlib to the 'inline' backend. With this backend, the output of plotting commands is displayed inline within frontends like the Jupyter notebook, directly below the code cell that produced it. The resulting plots will then also be stored in the notebook document.
In [1]:
%matplotlib inline
Then, import seaborn, pandas, and matplotlib data manipulation and plotting.
In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy as sp
#import plotly.figure_factory as ff
#import plotly.plotly as py
In [3]:
import plotly
In [4]:
#plotly.tools.set_credentials_file(username='brooksph', api_key='ztCs2sa6rYNmFdTaYqjY')
Next, compare sourmash signatures representing quality trimmed reads from the complete and subsampled data at three k-mer sizes (21, 31, and 51). These data were quality trimmed (trim) at 2 and 30.
In [5]:
import compare
from compare import load_sourmash_csv
In [6]:
load_sourmash_csv('SRR606249.pe.trim2and30_comparison.k51.csv')
Out[6]:
In [7]:
import os.path
In [8]:
from compare import create_cluster_map
create_cluster_map("SRR606249.pe.trim2and30_comparison.k51.csv", "Yep.png", 'Shakya Complete and Subsampled with Variable Quality Trimming and K = 51')
In [9]:
from compare import sort_by_similarity
sort_by_similarity("SRR606249.pe.trim2and30_reads_and_contigs_comparison.k51.csv", "yo_yo_yo.csv")
Out[9]:
In [10]:
#from compare import create_tsne
#create_tsne("SRR606249.pe.trim2and30_reads_and_contigs_comparison.k51.csv")
In [11]:
#import plotly.plotly as py
#import plotly.graph_objs as go
#from plotly import tools
#from time import time
#import numpy as np
#import matplotlib.pyplot as plt
#from sklearn import manifold
#from sklearn.utils import check_random_state
import numpy
from matplotlib import pyplot
#df_csv = pd.read_csv("SRR606249.pe.trim2and30_reads_and_contigs_comparison.k51.csv")
def create_mds_plot(filename):
m = numpy.loadtxt(open(filename),delimiter=",", skiprows=1)
from sklearn.manifold import mds
from sklearn.preprocessing import StandardScaler
data_std = StandardScaler().fit_transform(m)
from sklearn.decomposition import PCA
pca = PCA(n_components=24, svd_solver='full')
data_pca = pca.fit_transform(data_std)
mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9,
dissimilarity="euclidean", n_jobs=1).fit_transform(m)
df = pd.DataFrame(mds)
df.columns=['t1','t2']
#df['labels'] = df_csv.columns
return pyplot.scatter(df.t1, df.t2)
create_mds_plot("SRR606249.pe.trim2and30_reads_and_contigs_comparison.k51.csv")
In [ ]:
from compare import create_mds_plot
create_mds_plot("SRR606249.pe.trim2and30_reads_and_contigs_comparison.k51.csv")
In [12]:
# k-mer size = 51
df = pd.read_csv("SRR606249.pe.trim2and30_comparison.k51.csv")
dfnew = df.rename(index=str, columns={'/data/SRR606249.pe.trim2.fq.gz': 'SRR606249.pe.trim2.fq.gz',
'/data/SRR606249.pe.trim30.fq.gz': 'SRR606249.pe.trim30.fq.gz',
'/data/SRR606249_subset10.pe.trim2.fq.gz': 'SRR606249_subset10.pe.trim2.fq.gz',
'/data/SRR606249_subset10.pe.trim30.fq.gz': 'SRR606249_subset10.pe.trim30.fq.gz',
'/data/SRR606249_subset25.pe.trim2.fq.gz': 'SRR606249_subset25.pe.trim2.fq.gz',
'/data/SRR606249_subset25.pe.trim30.fq.gz': 'SRR606249_subset25.pe.trim30.fq.gz',
'/data/SRR606249_subset50.pe.trim2.fq.gz': 'SRR606249_subset50.pe.trim2.fq.gz',
'/data/SRR606249_subset50.pe.trim30.fq.gz': 'SRR606249_subset50.pe.trim30.fq.gz'})
dfnew [''] = ('SRR606249.pe.trim2.fq.gz', 'SRR606249.pe.trim30.fq.gz', "SRR606249_subset10.pe.trim2.fq.gz", "SRR606249_subset10.pe.trim30.fq.gz", "SRR606249_subset25.pe.trim2.fq.gz", "SRR606249_subset25.pe.trim30.fq.gz", "SRR606249_subset50.pe.trim2.fq.gz", "SRR606249_subset50.pe.trim30.fq.gz")
output = dfnew.set_index('')
cmap = sns.cubehelix_palette(8, start=2, rot=0, dark=0, light=.95, reverse=True, as_cmap=True)
o = sns.clustermap(output, col_cluster=True, row_cluster=True, linewidths=.5, figsize=(10, 10), cmap=cmap)
o.ax_heatmap.set_yticklabels(o.ax_heatmap.get_yticklabels(), rotation=0)
sns.set_context('paper')
o.savefig('clustermap_compare_reads_k51.pdf')
o.fig.suptitle('Shakya Complete and Subsampled with Variable Quality Trimming and K = 51')
Out[12]:
In [ ]:
import os.path
In [ ]:
In [ ]:
# k-mer size = 51
import os.path
df = pd.read_csv("SRR606249.pe.trim2and30_comparison.k51.csv")
#x = {}
#for i in df.columns:
# p = os.path.basename(i)
# x[i] = p
#List comprehension
x = dict([(i,os.path.basename(i)) for i in df.columns])
dfnew = df.rename(index=str, columns=x)
#Reset index to columns
dfnew ['']= dfnew.columns
output = dfnew.set_index('')
#Create cluster map
cmap = sns.cubehelix_palette(8, start=2, rot=0, dark=0, light=.95, reverse=True, as_cmap=True)
o = sns.clustermap(output, col_cluster=True, row_cluster=True, linewidths=.5, figsize=(10, 10), cmap=cmap)
o.ax_heatmap.set_yticklabels(o.ax_heatmap.get_yticklabels(), rotation=0)
sns.set_context('paper')
o.savefig('clustermap_compare_reads_k51.pdf')
o.fig.suptitle('Shakya Complete and Subsampled with Variable Quality Trimming and K = 51')
In [ ]:
df = pd.read_csv("SRR606249.pe.trim2and30_megahitandspades_comparison.k51.csv")
dfnew = df.rename(index=str, columns={'/data/osfstorage/assembly/SRR606249_1.trim2.fq.gz_megahit_output/final.contigs.fa':'SRR606249_1.trim2.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_1.trim2.fq.gz_spades_output/contigs.fasta': 'SRR606249_1.trim2.fq.gz_spades',
'/data/osfstorage/assembly/SRR606249_1.trim30.fq.gz_megahit_output/final.contigs.fa': 'SRR606249_1.trim30.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_1.trim30.fq.gz_spades_output/contigs.fasta': 'SRR606249_1.trim30.fq.gz_spades',
'/data/osfstorage/assembly/SRR606249_subset10_1.trim2.fq.gz_megahit_output/final.contigs.fa': 'SRR606249_subset10_1.trim2.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_subset10_1.trim2.fq.gz_spades_output/contigs.fasta': 'SRR606249_subset10_1.trim2.fq.gz_spades',
'/data/osfstorage/assembly/SRR606249_subset10_1.trim30.fq.gz_megahit_output/final.contigs.fa': 'SRR606249_subset10_1.trim30.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_subset25_1.trim2.fq.gz_spades_output/contigs.fasta': 'SRR606249_subset25_1.trim2.fq.gz_spades',
'/data/osfstorage/assembly/SRR606249_subset25_1.trim30.fq.gz_megahit_output/final.contigs.fa': 'SRR606249_subset25_1.trim30.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_subset25_1.trim30.fq.gz_spades_output/contigs.fasta': 'SRR606249_subset25_1.trim30.fq.gz_spades',
'/data/osfstorage/assembly/SRR606249_subset50_1.trim2.fq.gz_megahit_output/final.contigs.fa': 'SRR606249_subset50_1.trim2.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_subset50_1.trim2.fq.gz_spades_output/contigs.fasta': 'SRR606249_subset50_1.trim2.fq.gz_spades',
'/data/osfstorage/assembly/SRR606249_subset50_1.trim30.fq.gz_megahit_output/final.contigs.fa': 'SRR606249_subset50_1.trim30.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_subset50_1.trim30.fq.gz_spades_output/contigs.fasta': 'SRR606249_subset50_1.trim30.fq.gz_spades'})
dfnew [''] = ('SRR606249_1.trim2.fq.gz_megahit',
'SRR606249_1.trim2.fq.gz_spades',
'SRR606249_1.trim30.fq.gz_megahit',
'SRR606249_1.trim30.fq.gz_spades',
'SRR606249_subset10_1.trim2.fq.gz_megahit',
'SRR606249_subset10_1.trim2.fq.gz_spades',
'SRR606249_subset10_1.trim30.fq.gz_megahit',
'SRR606249_subset25_1.trim2.fq.gz_spades',
'SRR606249_subset25_1.trim30.fq.gz_megahit',
'SRR606249_subset25_1.trim30.fq.gz_spades',
'SRR606249_subset50_1.trim2.fq.gz_megahit',
'SRR606249_subset50_1.trim2.fq.gz_spades',
'SRR606249_subset50_1.trim30.fq.gz_megahit',
'SRR606249_subset50_1.trim30.fq.gz_spades')
output = dfnew.set_index('')
cmap = sns.cubehelix_palette(8, start=2, rot=0, dark=0, light=.95, reverse=True, as_cmap=True)
o = sns.clustermap(output, col_cluster=True, row_cluster=True, linewidths=.5, figsize=(10, 10), cmap=cmap)
o.ax_heatmap.set_yticklabels(o.ax_heatmap.get_yticklabels(), rotation=0)
sns.set_context('paper')
o.savefig("clustermap_compare_assemblies_k51.pdf")
o.fig.suptitle('Shakya Complete and Subsampled with Variable Quality Trimming and K = 51')
In [ ]:
# Reads and contigs k = 51
df = pd.read_csv("SRR606249.pe.trim2and30_reads_and_contigs_comparison.k51.csv")
dfnew = df.rename(index=str, columns={'/data/osfstorage/assembly/SRR606249_subset50_1.trim2.fq.gz_megahit_output/final.contigs.fa':'SRR606249_subset50_1.trim2.fq.gz_megahit_output.final.contigs.fa',
'/data/SRR606249_subset10.pe.trim30.fq.gz':'SRR606249_subset10.pe.trim30',
'/data/osfstorage/assembly/SRR606249_1.trim30.fq.gz_spades_output/contigs.fasta':'SRR606249_1.trim30.fq.gz_spades_output.contigs.fasta',
'/data/SRR606249_subset25.pe.trim2.fq.gz':'SRR606249_subset25.pe.trim2',
'/data/SRR606249.pe.trim30.fq.gz':'SRR606249.pe.trim30',
'/data/SRR606249.pe.trim2.fq.gz':'SRR606249.pe.trim2',
'/data/SRR606249_subset50.pe.trim2.fq.gz':'SRR606249_subset50.pe.trim2',
'/data/osfstorage/assembly/SRR606249_subset25_1.trim2.fq.gz_megahit_output/final.contigs.fa':'SRR606249_subset25_1.trim2.fq.gz_megahit_output.final.contigs.fa',
'/data/osfstorage/assembly/SRR606249_subset10_1.trim30.fq.gz_spades_output/contigs.fasta':'SRR606249_subset10_1.trim30.fq.gz_spades_output.contigs.fasta',
'/data/osfstorage/assembly/SRR606249_subset10_1.trim30.fq.gz_megahit_output/final.contigs.fa':'SRR606249_subset10_1.trim30.fq.gz_megahit_output.final.contigs.fa',
'/data/SRR606249_subset25.pe.trim30.fq.gz':'SRR606249_subset25.pe.trim30',
'/data/osfstorage/assembly/SRR606249_subset10_1.trim2.fq.gz_spades_output/contigs.fasta':'SRR606249_subset10_1.trim2.fq.gz_spades_output.contigs.fasta',
'/data/osfstorage/assembly/SRR606249_1.trim2.fq.gz_spades_output/contigs.fasta':'SRR606249_1.trim2.fq.gz_spades_output.contigs.fasta',
'/data/SRR606249_subset10.pe.trim2.fq.gz':'SRR606249_subset10.pe.trim2',
'/data/osfstorage/assembly/SRR606249_1.trim30.fq.gz_megahit_output/final.contigs.fa':'SRR606249_1.trim30.fq.gz_megahit_output.final.contigs.fa',
'/data/osfstorage/assembly/SRR606249_subset25_1.trim30.fq.gz_spades_output/contigs.fasta':'SRR606249_subset25_1.trim30.fq.gz_spades_output.contigs.fasta',
'/data/osfstorage/assembly/SRR606249_subset50_1.trim30.fq.gz_megahit_output/final.contigs.fa':'SRR606249_subset50_1.trim30.fq.gz_megahit_output.final.contigs.fa',
'/data/SRR606249_subset50.pe.trim30.fq.gz':'SRR606249_subset50.pe.trim30.fq.gz',
'/data/osfstorage/assembly/SRR606249_subset25_1.trim2.fq.gz_spades_output/contigs.fasta':'SRR606249_subset25_1.trim2.fq.gz_spades_output.contigs.fasta',
'/data/osfstorage/assembly/SRR606249_subset50_1.trim2.fq.gz_spades_output/contigs.fasta':'SRR606249_subset50_1.trim2.fq.gz_spades_output.contigs.fasta',
'/data/osfstorage/assembly/SRR606249_subset10_1.trim2.fq.gz_megahit_output/final.contigs.fa':'SRR606249_subset10_1.trim2.fq.gz_megahit_output.final.contigs.fa',
'/data/osfstorage/assembly/SRR606249_subset50_1.trim30.fq.gz_spades_output/contigs.fasta':'SRR606249_subset50_1.trim30.fq.gz_spades_output.contigs.fasta',
'/data/osfstorage/assembly/SRR606249_1.trim2.fq.gz_megahit_output/final.contigs.fa':'SRR606249_1.trim2.fq.gz_megahit_output.final.contigs.fa',
'/data/osfstorage/assembly/SRR606249_subset25_1.trim30.fq.gz_megahit_output/final.contigs.fa':'SRR606249_subset25_1.trim30.fq.gz_megahit_output.final.contigs.fa'})
dfnew [''] = ('SRR606249_subset50_1.trim2.fq.gz_megahit_output.final.contigs.fa',
'SRR606249_subset10.pe.trim30',
'SRR606249_1.trim30.fq.gz_spades_output.contigs.fasta',
'SRR606249_subset25.pe.trim2',
'SRR606249.pe.trim30',
'SRR606249.pe.trim2',
'SRR606249_subset50.pe.trim2',
'SRR606249_subset25_1.trim2.fq.gz_megahit_output.final.contigs.fa',
'SRR606249_subset10_1.trim30.fq.gz_spades_output.contigs.fasta',
'SRR606249_subset10_1.trim30.fq.gz_megahit_output.final.contigs.fa',
'SRR606249_subset25.pe.trim30',
'SRR606249_subset10_1.trim2.fq.gz_spades_output.contigs.fasta',
'SRR606249_1.trim2.fq.gz_spades_output.contigs.fasta',
'SRR606249_subset10.pe.trim2',
'SRR606249_1.trim30.fq.gz_megahit_output.final.contigs.fa',
'SRR606249_subset25_1.trim30.fq.gz_spades_output.contigs.fasta',
'SRR606249_subset50_1.trim30.fq.gz_megahit_output.final.contigs.fa',
'SRR606249_subset50.pe.trim30.fq.gz',
'SRR606249_subset25_1.trim2.fq.gz_spades_output.contigs.fasta',
'SRR606249_subset50_1.trim2.fq.gz_spades_output.contigs.fasta',
'SRR606249_subset10_1.trim2.fq.gz_megahit_output.final.contigs.fa',
'SRR606249_subset50_1.trim30.fq.gz_spades_output.contigs.fasta',
'SRR606249_1.trim2.fq.gz_megahit_output.final.contigs.fa',
'SRR606249_subset25_1.trim30.fq.gz_megahit_output.final.contigs.fa')
output = dfnew.set_index('')
cmap = sns.cubehelix_palette(8, start=2, rot=0, dark=0, light=.95, reverse=True, as_cmap=True)
o = sns.clustermap(output, col_cluster=True, row_cluster=True, linewidths=.5, figsize=(10, 10), cmap=cmap)
o.ax_heatmap.set_yticklabels(o.ax_heatmap.get_yticklabels(), rotation=0)
sns.set_context('paper')
o.savefig("clustermap_compare_assemblies_k51.pdf")
o.fig.suptitle('Shakya Complete and Subsampled with Variable Quality Trimming and K = 51')
In [ ]:
#def sort_by_similarity(filename, saved_file):
df = pd.read_csv('SRR606249.pe.trim2and30_reads_and_contigs_comparison.k51.csv')
x = dict([(i,os.path.basename(i)) for i in df.columns])
print(x)
dfnew = df.rename(index=str, columns= x)
dfnew [''] = df.columns
dfnew
#stacked_output = dfnew.stack()
#stacked_output
#stacked_sorted_output = pd.DataFrame(stacked_output, columns=['Jaccard_similarity']).sort_values('Jaccard_similarity', ascending=False)
#stacked_sorted_output.to_csv('x')
#stacked_sorted_output
#return stacked_sorted_output
# Determine which samples are most similar to each other
#from compare import sort_by_similarity
# Input names, output name for csv
#sort_by_similarity('SRR606249.pe.trim2and30_reads_and_contigs_comparison.k51.csv', 'stacked.csv')
In [ ]:
In [ ]:
from compare import create_tsne
create_tsne("SRR606249.pe.trim2and30_reads_and_contigs_comparison.k51.csv")
In [24]:
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools
from time import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn import manifold
from sklearn.utils import check_random_state
import numpy
from matplotlib import pyplot
df_csv = pd.read_csv("SRR606249.pe.trim2and30_reads_and_contigs_comparison.k51.csv")
m = numpy.loadtxt(open("SRR606249.pe.trim2and30_reads_and_contigs_comparison.k51.csv"),delimiter=",", skiprows=1)
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
data_std = StandardScaler().fit_transform(m)
from sklearn.decomposition import PCA
pca = PCA(n_components=24, svd_solver='full')
data_pca = pca.fit_transform(data_std)
t = TSNE(n_components=2, perplexity=5).fit_transform(data_pca)
df = pd.DataFrame(t)
df.columns=['t1','t2']
df['labels'] = df_csv.columns
fig=(df, df.t1, df.t2)
#pyplot.scatter(fig)
pyplot.scatter(df.t1, df.t2)
Out[24]:
In [ ]:
# Determine which samples are most similar to each other
stacked_output = output.stack()
df = pd.DataFrame(stacked_output, columns=['Jaccard_similarity'])
stacked_sorted_output = df.sort_values('Jaccard_similarity', ascending=False)
#df.to_csv('stacked_sorted_output.csv')
#stacked_sorted_output
In [1]:
# install.packages("plotly")
library(plotly, warn.conflicts = FALSE)
sourmash_comp_matrix <- read.csv("SRR606249.pe.trim2and30_reads_and_contigs_comparison.k51.csv")
# Label the rows
rownames(sourmash_comp_matrix) <- colnames(sourmash_comp_matrix)
# Transform for plotting
sourmash_comp_matrix <- as.matrix(sourmash_comp_matrix)
fit <- dist(sourmash_comp_matrix)
fit <- cmdscale(fit)
fit <- as.data.frame(fit)
# Clean up names
row.names(fit) <- gsub(".fq.gz", "", row.names(fit))
row.names(fit) <- gsub("X.data.osfstorage.assembly.", "", row.names(fit))
row.names(fit) <- gsub("X.data.", "", row.names(fit))
row.names(fit) <- gsub("_output.final.contigs.fa", "", row.names(fit))
row.names(fit)
# Fit data and plot
x <- fit[, 1]
y <- fit[, 2]
p <- plot_ly(fit, x = ~fit[ , 1], y = ~fit[ , 2], text=~row.names(fit)) %>%
add_markers()
p
text(fit[,1:2], labels = row.names(fit), pos = 1)
In [ ]:
# Create tsne
#install.packages("Rtsne")
library(Rtsne)
tsne_model <- Rtsne(sourmash_comp_matrix, check_duplicates=FALSE, pca=TRUE, perplexity=5, theta=0.5, dims=2)
d_tsne <- as.data.frame(tsne_model$Y)
row.names(sourmash_comp_matrix) <- gsub(".fq.gz", "", row.names(fit))
row.names(sourmash_comp_matrix) <- gsub("X.data.osfstorage.assembly.", "", row.names(fit))
row.names(sourmash_comp_matrix) <- gsub("X.data.", "", row.names(fit))
row.names(sourmash_comp_matrix) <- gsub("_output.final.contigs.fa", "", row.names(fit))
row.names(sourmash_comp_matrix)
p <- plot_ly(d_tsne, x = ~V1, y = ~V2, text=~row.names(sourmash_comp_matrix)) %>%
add_markers()
p
In [ ]:
# k-mer size = 21
df = pd.read_csv("SRR606249.pe.trim2and30_comparison.k21.csv")
# Clean up files names by removing path
dfnew = df.rename(index=str, columns={'/data/SRR606249.pe.trim2.fq.gz': 'SRR606249.pe.trim2.fq.gz',
'/data/SRR606249.pe.trim30.fq.gz': 'SRR606249.pe.trim30.fq.gz',
'/data/SRR606249_subset10.pe.trim2.fq.gz': 'SRR606249_subset10.pe.trim2.fq.gz',
'/data/SRR606249_subset10.pe.trim30.fq.gz': 'SRR606249_subset10.pe.trim30.fq.gz',
'/data/SRR606249_subset25.pe.trim2.fq.gz': 'SRR606249_subset25.pe.trim2.fq.gz',
'/data/SRR606249_subset25.pe.trim30.fq.gz':'SRR606249_subset25.pe.trim30.fq.gz',
'/data/SRR606249_subset50.pe.trim2.fq.gz': 'SRR606249_subset50.pe.trim2.fq.gz',
'/data/SRR606249_subset50.pe.trim30.fq.gz': 'SRR606249_subset50.pe.trim30.fq.gz'})
# Rename index to change numbers to files names
dfnew [''] = ("SRR606249.pe.trim2.fq.gz", "SRR606249.pe.trim30.fq.gz", "SRR606249_subset10.pe.trim2.fq.gz", "SRR606249_subset10.pe.trim30.fq.gz", "SRR606249_subset25.pe.trim2.fq.gz", "SRR606249_subset25.pe.trim30.fq.gz", "SRR606249_subset50.pe.trim2.fq.gz", "SRR606249_subset50.pe.trim30.fq.gz")
output = dfnew.set_index('')
cmap = sns.cubehelix_palette(8, start=2, rot=0, dark=0, light=.95, reverse=True, as_cmap=True)
o = sns.clustermap(output, col_cluster=True, row_cluster=True, linewidths=.5, figsize=(10, 10), cmap=cmap)
o.ax_heatmap.set_yticklabels(o.ax_heatmap.get_yticklabels(), rotation=0)
# Set context, save, and create title
sns.set_context('paper')
o.savefig("clustermap_compare_reads_k21.pdf")
o.fig.suptitle('Shakya Complete and Subsampled with Variable Quality Trimming and K = 21')
In [ ]:
# k-mer size = 31
df = pd.read_csv("SRR606249.pe.trim2and30_comparison.k31.csv")
# Clean up files names by removing path
dfnew = df.rename(index=str, columns={'/data/SRR606249.pe.trim2.fq.gz': 'SRR606249.pe.trim2.fq.gz',
'/data/SRR606249.pe.trim30.fq.gz': 'SRR606249.pe.trim30.fq.gz',
'/data/SRR606249_subset10.pe.trim2.fq.gz': 'SRR606249_subset10.pe.trim2.fq.gz',
'/data/SRR606249_subset10.pe.trim30.fq.gz': 'SRR606249_subset10.pe.trim30.fq.gz',
'/data/SRR606249_subset25.pe.trim2.fq.gz': 'SRR606249_subset25.pe.trim2.fq.gz',
'/data/SRR606249_subset25.pe.trim30.fq.gz': 'SRR606249_subset25.pe.trim30.fq.gz',
'/data/SRR606249_subset50.pe.trim2.fq.gz': 'SRR606249_subset50.pe.trim2.fq.gz',
'/data/SRR606249_subset50.pe.trim30.fq.gz': 'SRR606249_subset50.pe.trim30.fq.gz'})
# Rename index to change numbers to files names
dfnew [''] = ("SRR606249.pe.trim2.fq.gz", "SRR606249.pe.trim30.fq.gz", "SRR606249_subset10.pe.trim2.fq.gz", "SRR606249_subset10.pe.trim30.fq.gz", "SRR606249_subset25.pe.trim2.fq.gz", "SRR606249_subset25.pe.trim30.fq.gz", "SRR606249_subset50.pe.trim2.fq.gz", "SRR606249_subset50.pe.trim30.fq.gz")
output = dfnew.set_index('')
cmap = sns.cubehelix_palette(8, start=2, rot=0, dark=0, light=.95, reverse=True, as_cmap=True)
o = sns.clustermap(output, col_cluster=True, row_cluster=True, linewidths=.5, figsize=(10, 10), cmap=cmap)
o.ax_heatmap.set_yticklabels(o.ax_heatmap.get_yticklabels(), rotation=0)
# Set context, save, and create title
sns.set_context('paper')
o.savefig("clustermap_compare_reads_k31.pdf")
o.fig.suptitle('Shakya Complete and Subsampled with Variable Quality Trimming and K = 31')
In [ ]:
# k-mer size = 51
df = pd.read_csv("SRR606249.pe.trim2and30_comparison.k51.csv")
dfnew = df.rename(index=str, columns={'/data/SRR606249.pe.trim2.fq.gz': 'SRR606249.pe.trim2.fq.gz',
'/data/SRR606249.pe.trim30.fq.gz': 'SRR606249.pe.trim30.fq.gz',
'/data/SRR606249_subset10.pe.trim2.fq.gz': 'SRR606249_subset10.pe.trim2.fq.gz',
'/data/SRR606249_subset10.pe.trim30.fq.gz': 'SRR606249_subset10.pe.trim30.fq.gz',
'/data/SRR606249_subset25.pe.trim2.fq.gz': 'SRR606249_subset25.pe.trim2.fq.gz',
'/data/SRR606249_subset25.pe.trim30.fq.gz': 'SRR606249_subset25.pe.trim30.fq.gz',
'/data/SRR606249_subset50.pe.trim2.fq.gz': 'SRR606249_subset50.pe.trim2.fq.gz',
'/data/SRR606249_subset50.pe.trim30.fq.gz': 'SRR606249_subset50.pe.trim30.fq.gz'})
dfnew [''] = ("SRR606249.pe.trim2.fq.gz", "SRR606249.pe.trim30.fq.gz", "SRR606249_subset10.pe.trim2.fq.gz", "SRR606249_subset10.pe.trim30.fq.gz", "SRR606249_subset25.pe.trim2.fq.gz", "SRR606249_subset25.pe.trim30.fq.gz", "SRR606249_subset50.pe.trim2.fq.gz", "SRR606249_subset50.pe.trim30.fq.gz")
output = dfnew.set_index('')
cmap = sns.cubehelix_palette(8, start=2, rot=0, dark=0, light=.95, reverse=True, as_cmap=True)
o = sns.clustermap(output, col_cluster=True, row_cluster=True, linewidths=.5, figsize=(10, 10), cmap=cmap)
o.ax_heatmap.set_yticklabels(o.ax_heatmap.get_yticklabels(), rotation=0)
sns.set_context('paper')
o.savefig("clustermap_compare_reads_k51.pdf")
o.fig.suptitle('Shakya Complete and Subsampled with Variable Quality Trimming and K = 51')
In [ ]:
df = pd.read_csv("SRR606249.pe.trim2and30_megahitandspades_comparison.k21.csv")
dfnew = df.rename(index=str, columns={'/data/osfstorage/assembly/SRR606249_1.trim2.fq.gz_megahit_output/final.contigs.fa':'SRR606249_1.trim2.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_1.trim2.fq.gz_spades_output/contigs.fasta': 'SRR606249_1.trim2.fq.gz_spades',
'/data/osfstorage/assembly/SRR606249_1.trim30.fq.gz_megahit_output/final.contigs.fa': 'SRR606249_1.trim30.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_1.trim30.fq.gz_spades_output/contigs.fasta': 'SRR606249_1.trim30.fq.gz_spades',
'/data/osfstorage/assembly/SRR606249_subset10_1.trim2.fq.gz_megahit_output/final.contigs.fa': 'SRR606249_subset10_1.trim2.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_subset10_1.trim2.fq.gz_spades_output/contigs.fasta': 'SRR606249_subset10_1.trim2.fq.gz_spades',
'/data/osfstorage/assembly/SRR606249_subset10_1.trim30.fq.gz_megahit_output/final.contigs.fa': 'SRR606249_subset10_1.trim30.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_subset25_1.trim2.fq.gz_spades_output/contigs.fasta': 'SRR606249_subset25_1.trim2.fq.gz_spades',
'/data/osfstorage/assembly/SRR606249_subset25_1.trim30.fq.gz_megahit_output/final.contigs.fa': 'SRR606249_subset25_1.trim30.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_subset25_1.trim30.fq.gz_spades_output/contigs.fasta': 'SRR606249_subset25_1.trim30.fq.gz_spades',
'/data/osfstorage/assembly/SRR606249_subset50_1.trim2.fq.gz_megahit_output/final.contigs.fa': 'SRR606249_subset50_1.trim2.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_subset50_1.trim2.fq.gz_spades_output/contigs.fasta': 'SRR606249_subset50_1.trim2.fq.gz_spades',
'/data/osfstorage/assembly/SRR606249_subset50_1.trim30.fq.gz_megahit_output/final.contigs.fa': 'SRR606249_subset50_1.trim30.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_subset50_1.trim30.fq.gz_spades_output/contigs.fasta': 'SRR606249_subset50_1.trim30.fq.gz_spades'})
dfnew [''] = ('SRR606249_1.trim2.fq.gz_megahit',
'SRR606249_1.trim2.fq.gz_spades',
'SRR606249_1.trim30.fq.gz_megahit',
'SRR606249_1.trim30.fq.gz_spades',
'SRR606249_subset10_1.trim2.fq.gz_megahit',
'SRR606249_subset10_1.trim2.fq.gz_spades',
'SRR606249_subset10_1.trim30.fq.gz_megahit',
'SRR606249_subset25_1.trim2.fq.gz_spades',
'SRR606249_subset25_1.trim30.fq.gz_megahit',
'SRR606249_subset25_1.trim30.fq.gz_spades',
'SRR606249_subset50_1.trim2.fq.gz_megahit',
'SRR606249_subset50_1.trim2.fq.gz_spades',
'SRR606249_subset50_1.trim30.fq.gz_megahit',
'SRR606249_subset50_1.trim30.fq.gz_spades')
output = dfnew.set_index('')
cmap = sns.cubehelix_palette(8, start=2, rot=0, dark=0, light=.95, reverse=True, as_cmap=True)
o = sns.clustermap(output, col_cluster=True, row_cluster=True, linewidths=.5, figsize=(10, 10), cmap=cmap)
o.ax_heatmap.set_yticklabels(o.ax_heatmap.get_yticklabels(), rotation=0)
sns.set_context('paper')
o.savefig("clustermap_compare_assemblies_k21.pdf")
o.fig.suptitle('Shakya Complete and Subsampled data Assembled with MEGAHIT or SPAdes with Variable Quality Trimming and K = 21')
In [ ]:
df = pd.read_csv("SRR606249.pe.trim2and30_megahitandspades_comparison.k31.csv")
dfnew = df.rename(index=str, columns={'/data/osfstorage/assembly/SRR606249_1.trim2.fq.gz_megahit_output/final.contigs.fa':'SRR606249_1.trim2.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_1.trim2.fq.gz_spades_output/contigs.fasta': 'SRR606249_1.trim2.fq.gz_spades',
'/data/osfstorage/assembly/SRR606249_1.trim30.fq.gz_megahit_output/final.contigs.fa': 'SRR606249_1.trim30.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_1.trim30.fq.gz_spades_output/contigs.fasta': 'SRR606249_1.trim30.fq.gz_spades',
'/data/osfstorage/assembly/SRR606249_subset10_1.trim2.fq.gz_megahit_output/final.contigs.fa': 'SRR606249_subset10_1.trim2.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_subset10_1.trim2.fq.gz_spades_output/contigs.fasta': 'SRR606249_subset10_1.trim2.fq.gz_spades',
'/data/osfstorage/assembly/SRR606249_subset10_1.trim30.fq.gz_megahit_output/final.contigs.fa': 'SRR606249_subset10_1.trim30.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_subset25_1.trim2.fq.gz_spades_output/contigs.fasta': 'SRR606249_subset25_1.trim2.fq.gz_spades',
'/data/osfstorage/assembly/SRR606249_subset25_1.trim30.fq.gz_megahit_output/final.contigs.fa': 'SRR606249_subset25_1.trim30.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_subset25_1.trim30.fq.gz_spades_output/contigs.fasta': 'SRR606249_subset25_1.trim30.fq.gz_spades',
'/data/osfstorage/assembly/SRR606249_subset50_1.trim2.fq.gz_megahit_output/final.contigs.fa': 'SRR606249_subset50_1.trim2.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_subset50_1.trim2.fq.gz_spades_output/contigs.fasta': 'SRR606249_subset50_1.trim2.fq.gz_spades',
'/data/osfstorage/assembly/SRR606249_subset50_1.trim30.fq.gz_megahit_output/final.contigs.fa': 'SRR606249_subset50_1.trim30.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_subset50_1.trim30.fq.gz_spades_output/contigs.fasta': 'SRR606249_subset50_1.trim30.fq.gz_spades'})
dfnew [''] = ('SRR606249_1.trim2.fq.gz_megahit',
'SRR606249_1.trim2.fq.gz_spades',
'SRR606249_1.trim30.fq.gz_megahit',
'SRR606249_1.trim30.fq.gz_spades',
'SRR606249_subset10_1.trim2.fq.gz_megahit',
'SRR606249_subset10_1.trim2.fq.gz_spades',
'SRR606249_subset10_1.trim30.fq.gz_megahit',
'SRR606249_subset25_1.trim2.fq.gz_spades',
'SRR606249_subset25_1.trim30.fq.gz_megahit',
'SRR606249_subset25_1.trim30.fq.gz_spades',
'SRR606249_subset50_1.trim2.fq.gz_megahit',
'SRR606249_subset50_1.trim2.fq.gz_spades',
'SRR606249_subset50_1.trim30.fq.gz_megahit',
'SRR606249_subset50_1.trim30.fq.gz_spades')
output = dfnew.set_index('')
cmap = sns.cubehelix_palette(8, start=2, rot=0, dark=0, light=.95, reverse=True, as_cmap=True)
o = sns.clustermap(output, col_cluster=True, row_cluster=True, linewidths=.5, figsize=(10, 10), cmap=cmap)
o.ax_heatmap.set_yticklabels(o.ax_heatmap.get_yticklabels(), rotation=0)
sns.set_context('paper')
o.savefig("clustermap_compare_assemblies_k31.pdf")
o.fig.suptitle('Shakya Complete and Subsampled with Variable Quality Trimming and K = 31')
In [ ]:
In [ ]:
df = pd.read_csv("SRR606249.pe.trim2and30_megahitandspades_comparison.k51.csv")
dfnew = df.rename(index=str, columns={'/data/osfstorage/assembly/SRR606249_1.trim2.fq.gz_megahit_output/final.contigs.fa':'SRR606249_1.trim2.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_1.trim2.fq.gz_spades_output/contigs.fasta': 'SRR606249_1.trim2.fq.gz_spades',
'/data/osfstorage/assembly/SRR606249_1.trim30.fq.gz_megahit_output/final.contigs.fa': 'SRR606249_1.trim30.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_1.trim30.fq.gz_spades_output/contigs.fasta': 'SRR606249_1.trim30.fq.gz_spades',
'/data/osfstorage/assembly/SRR606249_subset10_1.trim2.fq.gz_megahit_output/final.contigs.fa': 'SRR606249_subset10_1.trim2.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_subset10_1.trim2.fq.gz_spades_output/contigs.fasta': 'SRR606249_subset10_1.trim2.fq.gz_spades',
'/data/osfstorage/assembly/SRR606249_subset10_1.trim30.fq.gz_megahit_output/final.contigs.fa': 'SRR606249_subset10_1.trim30.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_subset25_1.trim2.fq.gz_spades_output/contigs.fasta': 'SRR606249_subset25_1.trim2.fq.gz_spades',
'/data/osfstorage/assembly/SRR606249_subset25_1.trim30.fq.gz_megahit_output/final.contigs.fa': 'SRR606249_subset25_1.trim30.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_subset25_1.trim30.fq.gz_spades_output/contigs.fasta': 'SRR606249_subset25_1.trim30.fq.gz_spades',
'/data/osfstorage/assembly/SRR606249_subset50_1.trim2.fq.gz_megahit_output/final.contigs.fa': 'SRR606249_subset50_1.trim2.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_subset50_1.trim2.fq.gz_spades_output/contigs.fasta': 'SRR606249_subset50_1.trim2.fq.gz_spades',
'/data/osfstorage/assembly/SRR606249_subset50_1.trim30.fq.gz_megahit_output/final.contigs.fa': 'SRR606249_subset50_1.trim30.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_subset50_1.trim30.fq.gz_spades_output/contigs.fasta': 'SRR606249_subset50_1.trim30.fq.gz_spades'})
dfnew [''] = ('SRR606249_1.trim2.fq.gz_megahit',
'SRR606249_1.trim2.fq.gz_spades',
'SRR606249_1.trim30.fq.gz_megahit',
'SRR606249_1.trim30.fq.gz_spades',
'SRR606249_subset10_1.trim2.fq.gz_megahit',
'SRR606249_subset10_1.trim2.fq.gz_spades',
'SRR606249_subset10_1.trim30.fq.gz_megahit',
'SRR606249_subset25_1.trim2.fq.gz_spades',
'SRR606249_subset25_1.trim30.fq.gz_megahit',
'SRR606249_subset25_1.trim30.fq.gz_spades',
'SRR606249_subset50_1.trim2.fq.gz_megahit',
'SRR606249_subset50_1.trim2.fq.gz_spades',
'SRR606249_subset50_1.trim30.fq.gz_megahit',
'SRR606249_subset50_1.trim30.fq.gz_spades')
output = dfnew.set_index('')
cmap = sns.cubehelix_palette(8, start=2, rot=0, dark=0, light=.95, reverse=True, as_cmap=True)
o = sns.clustermap(output, col_cluster=True, row_cluster=True, linewidths=.5, figsize=(10, 10), cmap=cmap)
o.ax_heatmap.set_yticklabels(o.ax_heatmap.get_yticklabels(), rotation=0)
sns.set_context('paper')
o.savefig("clustermap_compare_assemblies_k51.pdf")
o.fig.suptitle('Shakya Complete and Subsampled with Variable Quality Trimming and K = 51')
In [ ]:
df = pd.read_csv("SRR606249.pe.trim2and30_readstoassemblies_comparison.k21.csv")
dfnew = df.rename(index=str, columns={'/data/osfstorage/assembly/SRR606249_1.trim2.fq.gz_megahit_output/final.contigs.fa': 'SRR606249_1.trim2.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_1.trim2.fq.gz_spades_output/contigs.fasta': 'SRR606249_1.trim2.fq.gz_spades',
'/data/SRR606249.pe.trim2.fq.gz': 'SRR606249.pe.trim2.fq.gz',
'/data/SRR606249.pe.trim30.fq.gz': 'SRR606249.pe.trim30.fq.gz',
'/data/osfstorage/assembly/SRR606249_1.trim30.fq.gz_megahit_output/final.contigs.fa': 'SRR606249_1.trim30.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_1.trim30.fq.gz_spades_output/contigs.fasta': 'SRR606249_1.trim30.fq.gz_spades'})
dfnew [''] = ('SRR606249_1.trim2.fq.gz_megahit', 'SRR606249_1.trim2.fq.gz_spades', 'SRR606249.pe.trim2.fq.gz', 'SRR606249.pe.trim30.fq.gz', 'SRR606249_1.trim30.fq.gz_megahit', 'SRR606249_1.trim30.fq.gz_spades')
output = dfnew.set_index('')
cmap = sns.cubehelix_palette(8, start=2, rot=0, dark=0, light=.95, reverse=True, as_cmap=True)
o = sns.clustermap(output, col_cluster=True, row_cluster=True, linewidths=.5, figsize=(10, 10), cmap=cmap)
o.ax_heatmap.set_yticklabels(o.ax_heatmap.get_yticklabels(), rotation=0)
sns.set_context('paper')
o.savefig("clustermap_compare_readstoassemblies_k21.pdf")
o.fig.suptitle('Shakya Complete and Subsampled, Assemblies and Reads, with Variable Quality Trimming and K = 21')
In [ ]:
df = pd.read_csv("SRR606249.pe.trim2and30_readstoassemblies_comparison.k31.csv")
dfnew = df.rename(index=str, columns={'/data/osfstorage/assembly/SRR606249_1.trim2.fq.gz_megahit_output/final.contigs.fa': 'SRR606249_1.trim2.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_1.trim2.fq.gz_spades_output/contigs.fasta': 'SRR606249_1.trim2.fq.gz_spades',
'/data/SRR606249.pe.trim2.fq.gz': 'SRR606249.pe.trim2.fq.gz',
'/data/SRR606249.pe.trim30.fq.gz': 'SRR606249.pe.trim30.fq.gz',
'/data/osfstorage/assembly/SRR606249_1.trim30.fq.gz_megahit_output/final.contigs.fa': 'SRR606249_1.trim30.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_1.trim30.fq.gz_spades_output/contigs.fasta': 'SRR606249_1.trim30.fq.gz_spades'})
dfnew [''] = ('SRR606249_1.trim2.fq.gz_megahit', 'SRR606249_1.trim2.fq.gz_spades', 'SRR606249.pe.trim2.fq.gz', 'SRR606249.pe.trim30.fq.gz', 'SRR606249_1.trim30.fq.gz_megahit', 'SRR606249_1.trim30.fq.gz_spades')
output = dfnew.set_index('')
cmap = sns.cubehelix_palette(8, start=2, rot=0, dark=0, light=.95, reverse=True, as_cmap=True)
o = sns.clustermap(output, col_cluster=True, row_cluster=True, linewidths=.5, figsize=(10, 10), cmap=cmap)
o.ax_heatmap.set_yticklabels(o.ax_heatmap.get_yticklabels(), rotation=0)
sns.set_context('paper')
o.savefig("clustermap_compare_readstoassemblies_k31.pdf")
o.fig.suptitle('Shakya Complete and Subsampled, Assemblies and Reads, with Variable Quality Trimming and K = 31')
In [ ]:
df
In [ ]:
df = pd.read_csv("SRR606249.pe.trim2and30_readstoassemblies_comparison.k51.csv")
dfnew = df.rename(index=str, columns={'/data/osfstorage/assembly/SRR606249_1.trim2.fq.gz_megahit_output/final.contigs.fa': 'SRR606249_1.trim2.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_1.trim2.fq.gz_spades_output/contigs.fasta': 'SRR606249_1.trim2.fq.gz_spades',
'/data/SRR606249.pe.trim2.fq.gz': 'SRR606249.pe.trim2.fq.gz',
'/data/SRR606249.pe.trim30.fq.gz': 'SRR606249.pe.trim30.fq.gz',
'/data/osfstorage/assembly/SRR606249_1.trim30.fq.gz_megahit_output/final.contigs.fa': 'SRR606249_1.trim30.fq.gz_megahit',
'/data/osfstorage/assembly/SRR606249_1.trim30.fq.gz_spades_output/contigs.fasta': 'SRR606249_1.trim30.fq.gz_spades'})
dfnew [''] = ('SRR606249_1.trim2.fq.gz_megahit', 'SRR606249_1.trim2.fq.gz_spades', 'SRR606249.pe.trim2.fq.gz', 'SRR606249.pe.trim30.fq.gz', 'SRR606249_1.trim30.fq.gz_megahit', 'SRR606249_1.trim30.fq.gz_spades')
output = dfnew.set_index('')
cmap = sns.cubehelix_palette(8, start=2, rot=0, dark=0, light=.95, reverse=True, as_cmap=True)
o = sns.clustermap(output, col_cluster=True, row_cluster=True, linewidths=.5, figsize=(10, 10), cmap=cmap)
o.ax_heatmap.set_yticklabels(o.ax_heatmap.get_yticklabels(), rotation=0)
sns.set_context('paper')
o.savefig("clustermap_compare_readstoassemblies_k51.pdf")
o.fig.suptitle('Shakya Complete and Subsampled, Assemblies and Reads, with Variable Quality Trimming and K = 31')
[ ] Extract unassigned reads
[ ] Compare these results to clustering with commet
[ ] Use plotly to create interactive plot for MDS and tsne