In [33]:
from os.path import join as pjoin
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import pandas as pd
import skbio
import skbio.diversity
import seaborn as sns
import random
In [34]:
# Global plotting configuration: seaborn look for all matplotlib figures,
# rendered inline in the notebook.
# NOTE(review): the 'seaborn' style name was deprecated in matplotlib 3.6 and
# removed in 3.8 (renamed to 'seaborn-v0_8') — confirm the pinned matplotlib
# version still accepts it.
plt.style.use('seaborn')
%matplotlib inline
CONFIGURATION (edit the following cell)
In [35]:
# CONFIGURATION — the only cell a user should need to edit per sample.
# TODO(review): absolute, user-specific paths; consider deriving from a
# configurable DATA_DIR so the notebook runs on other machines.
raw_reads_path = '/Users/laserson/Downloads/BaseSpace/Metagenomiclibraries-39688650/2-48080086/mms500_S2_L001_R1_001.fastq.gz'  # gzipped FASTQ of raw reads
analysis_dir_path = '/Users/laserson/tmp/phip_analysis/phip-7/phip-7-meta'  # root of the pipeline's output tree
sample_prefix = 'mms500_S2_L001_R1_001'  # basename shared by all per-sample outputs
In [36]:
def _sample_file(subdir, suffix):
    """Path to this sample's file under a pipeline output subdirectory."""
    return pjoin(analysis_dir_path, subdir, sample_prefix + suffix)

# Per-sample outputs of the upstream pipeline stages.
vector_depleted_reads_path = _sample_file('vector_depleted', '.fastq')
centrifuge_aln_path = _sample_file('centrifuge', '.centrifuge_aln.tsv')
kraken_report_path = _sample_file('centrifuge', '.centrifuge_kreport.tsv')
clustered_read_counts_path = _sample_file('clustered_reads', '.clustered.counts.tsv')
aligned_clustered_read_counts_path = _sample_file('clustered_reads', '.aligned_clustered.counts.tsv')
Number of raw reads
In [37]:
# Count raw reads: FASTQ stores one read per 4 lines, so line count // 4 = reads.
# (`_ = !cmd` is IPython syntax that captures the shell command's stdout lines.)
_ = !cat {raw_reads_path} | gunzip | wc -l
raw_read_count = int(_[0]) // 4
raw_read_count
Out[37]:
Number of non-vector reads
In [38]:
# Count reads remaining after vector depletion (plain FASTQ: 4 lines per read).
_ = !cat {vector_depleted_reads_path} | wc -l
non_vector_read_count = int(_[0]) // 4
non_vector_read_count
Out[38]:
Fraction of non-vector reads
In [39]:
# Fraction of raw reads that survived vector depletion.
non_vector_read_count / raw_read_count
Out[39]:
Number of non-vector reads successfully aligned by centrifuge
In [40]:
# Count centrifuge alignment records.
_ = !cat {centrifuge_aln_path} | wc -l
# NOTE(review): assumes exactly two TSV lines per aligned read; confirm against
# the centrifuge output format — a header line or multiple hits per read would
# skew this count.
aligned_read_count = int(_[0]) // 2
aligned_read_count
Out[40]:
Fraction of non-vector reads aligned by centrifuge
In [41]:
# Fraction of non-vector reads that centrifuge could align.
aligned_read_count / non_vector_read_count
Out[41]:
In [42]:
def _load_counts(path):
    """Read the first tab-separated column of a counts TSV as a flat numpy array."""
    return pd.read_csv(path, sep='\t', header=None, usecols=[0]).values.ravel()

clustered_read_counts = _load_counts(clustered_read_counts_path)
aligned_clustered_read_counts = _load_counts(aligned_clustered_read_counts_path)
Observed richness (number of distinct clones)
In [43]:
# Observed richness: number of distinct clustered "clones".
len(clustered_read_counts)
Out[43]:
In [44]:
# Observed richness among clones that aligned.
len(aligned_clustered_read_counts)
Out[44]:
Chao1
In [45]:
# Chao1 richness estimate; alpha_diversity returns a one-element result, take the scalar.
skbio.diversity.alpha_diversity('chao1', clustered_read_counts)[0]
Out[45]:
In [46]:
# Chao1 richness estimate for the aligned clones only.
skbio.diversity.alpha_diversity('chao1', aligned_clustered_read_counts)[0]
Out[46]:
Shannon entropy effective population size
In [47]:
# Effective population size: 2**H is the Hill number of order 1 when H is in
# bits — NOTE(review): confirm this skbio version computes shannon in log base 2.
2 ** skbio.diversity.alpha_diversity('shannon', clustered_read_counts)[0]
Out[47]:
In [48]:
# Effective population size of the aligned clones (same base-2 assumption as above).
2 ** skbio.diversity.alpha_diversity('shannon', aligned_clustered_read_counts)[0]
Out[48]:
Distribution of "clone" counts
In [49]:
# Linear-scale histogram of reads per clustered clone.
fig, ax = plt.subplots()
_ = ax.hist(clustered_read_counts, bins=50)
# Label the figure so it stands alone when the notebook is skimmed.
_ = ax.set(title='clustered read counts', xlabel='reads per clone', ylabel='num clones')
In [50]:
# Same histogram with a log-scaled y-axis to expose the heavy tail.
fig, ax = plt.subplots()
_ = ax.hist(clustered_read_counts, bins=50, log=True)
_ = ax.set(title='clustered read counts (log y)', xlabel='reads per clone', ylabel='num clones')
In [51]:
# Empirical CDF of per-clone read counts on log-spaced bins
# (100 bins spanning 1 to the max observed count).
hist, bin_edges = np.histogram(clustered_read_counts, bins=np.logspace(0, np.log10(clustered_read_counts.max()), 100))
cdf = np.cumsum(hist) / hist.sum()
fig, ax = plt.subplots()
# Draw the CDF as horizontal steps, one per bin.
ax.hlines(cdf, bin_edges[:-1], bin_edges[1:])
ax.set(title='clustered read count distrib', xlabel='count', xlim=[bin_edges[0], bin_edges[-1]], ylim=[0, 1], xscale='log')
Out[51]:
In [52]:
# Linear-scale histogram of reads per aligned clustered clone.
fig, ax = plt.subplots()
_ = ax.hist(aligned_clustered_read_counts, bins=50)
_ = ax.set(title='aligned clustered read counts', xlabel='reads per clone', ylabel='num clones')
In [53]:
# Log-y histogram of reads per aligned clustered clone.
fig, ax = plt.subplots()
_ = ax.hist(aligned_clustered_read_counts, bins=50, log=True)
_ = ax.set(title='aligned clustered read counts (log y)', xlabel='reads per clone', ylabel='num clones')
In [54]:
# Empirical CDF of aligned clustered read counts on log-spaced bins.
# FIX: the bin range previously ran to clustered_read_counts.max() — an apparent
# copy-paste from the cell above — so the bins were sized to the wrong dataset.
# Use this dataset's own max. (If the shared x-range was intentional for a
# side-by-side comparison, revert to clustered_read_counts.max().)
hist, bin_edges = np.histogram(aligned_clustered_read_counts, bins=np.logspace(0, np.log10(aligned_clustered_read_counts.max()), 100))
cdf = np.cumsum(hist) / hist.sum()
fig, ax = plt.subplots()
# Horizontal steps, one per bin.
ax.hlines(cdf, bin_edges[:-1], bin_edges[1:])
ax.set(title='clustered ALIGNED read count distrib', xlabel='count', xlim=[bin_edges[0], bin_edges[-1]], ylim=[0, 1], xscale='log')
Out[54]:
In [55]:
# Column layout of the kraken-style report emitted by centrifuge-kreport.
names = ['pct_reads_clade', 'num_reads_clade', 'num_reads_taxon', 'rank', 'tax_id', 'tax_name']
# Taxon names are whitespace-indented to show hierarchy; strip that off.
converters = {'tax_name': lambda x: x.strip()}
td = pd.read_csv(kraken_report_path, sep='\t', header=None, names=names, converters=converters)
In [56]:
# A large palette of distinct hues, shuffled deterministically so adjacent taxa
# don't get near-identical colors. A private PRNG seeded with 0 produces the
# same permutation as seed(0) + shuffle on the global generator, without ever
# touching global random state (which the original saved and restored).
all_colors = sns.husl_palette(1000)
random.Random(0).shuffle(all_colors)
def top_hits(df):
    """Return the 20 rows of `df` with the largest pct_reads_clade."""
    ranked = df.sort_values('pct_reads_clade', ascending=False)
    return ranked.head(20)
def plot_distribution(df):
    """Draw one stacked vertical bar whose segments are the rows' pct_reads_clade values."""
    fig, ax = plt.subplots()
    x = 0.2
    heights = df['pct_reads_clade'].values
    # Each segment starts where the previous one ended.
    offsets = np.concatenate([[0], np.cumsum(heights)[:-1]])
    labels = df['tax_name'].values
    # zip truncates to the shortest sequence, so all_colors needs no slicing.
    for height, offset, color, label in zip(heights, offsets, all_colors, labels):
        # Skip empty segments so they don't clutter the legend.
        if height > 0:
            ax.bar(x, height, width=0.4, bottom=offset, color=color, label=label)
    ax.set(xlim=(0, 1))
    ax.legend()
Species
In [57]:
# Species-level (rank 'S') rows of the report; show the 20 most abundant.
species = td[td['rank'] == 'S']
top_hits(species)
Out[57]:
In [58]:
# Stacked-bar composition at species level.
plot_distribution(species)
Genus
In [59]:
# Genus-level (rank 'G') rows; top 20 by abundance.
genuses = td[td['rank'] == 'G']
top_hits(genuses)
Out[59]:
In [60]:
# Stacked-bar composition at genus level.
plot_distribution(genuses)
Family
In [61]:
# Family-level (rank 'F') rows; top 20 by abundance.
families = td[td['rank'] == 'F']
top_hits(families)
Out[61]:
In [62]:
# Stacked-bar composition at family level.
plot_distribution(families)
Order
In [63]:
# Order-level (rank 'O') rows; top 20 by abundance.
orders = td[td['rank'] == 'O']
top_hits(orders)
Out[63]:
In [64]:
# Stacked-bar composition at order level.
plot_distribution(orders)
Class
In [65]:
# Class-level (rank 'C') rows; top 20 by abundance.
classes = td[td['rank'] == 'C']
top_hits(classes)
Out[65]:
In [66]:
# Stacked-bar composition at class level.
plot_distribution(classes)
Phylum
In [67]:
# Phylum-level (rank 'P') rows; top 20 by abundance.
phyla = td[td['rank'] == 'P']
top_hits(phyla)
Out[67]:
In [68]:
# Stacked-bar composition at phylum level.
plot_distribution(phyla)