Single species single end library


In [110]:
from os.path import join as pjoin
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import pandas as pd
import skbio
import skbio.diversity

In [2]:
plt.style.use('seaborn')
%matplotlib inline

Dataset

CONFIGURATION (edit the following cell)


In [3]:
raw_reads_path = '/Users/laserson/Downloads/BaseSpace/Metagenomiclibraries-39688650/3-48085070/bovatus500wopselect_S3_L001_R1_001.fastq.gz'
analysis_dir_path = '/Users/laserson/tmp/phip_analysis/phip-7/phip-7-single'
sample_prefix = 'bovatus500wopselect_S3_L001_R1_001'

In [97]:
vector_depleted_reads_path = pjoin(analysis_dir_path, 'vector_depleted', sample_prefix + '.fastq')
species_aligned_bam_path = pjoin(analysis_dir_path, 'species_aligned', sample_prefix + '.bam')
genome_cov_path = pjoin(analysis_dir_path, 'coverage', sample_prefix + '.cov_genome.tsv')
gene_cov_path = pjoin(analysis_dir_path, 'coverage', sample_prefix + '.cov_stranded.tsv')
inframe_cov_path = pjoin(analysis_dir_path, 'coverage', sample_prefix + '.cov_inframe.tsv')
clustered_read_counts_path = pjoin(analysis_dir_path, 'clustered_reads', sample_prefix + '.clustered.counts.tsv')
inframe_insert_counts_path = pjoin(analysis_dir_path, 'annot', sample_prefix + '.inframe_inserts.counts.tsv')
recon_allreads_path = pjoin(analysis_dir_path, 'diversity', sample_prefix + '.recon_allreads.txt')
recon_inframe_path = pjoin(analysis_dir_path, 'diversity', sample_prefix + '.recon_inframe.txt')

Statistics

Number of raw reads


In [5]:
_ = !cat {raw_reads_path} | gunzip | wc -l
raw_read_count = int(_[0]) // 4
raw_read_count


Out[5]:
7967079

Number of non-vector reads


In [6]:
_ = !cat {vector_depleted_reads_path} | wc -l
non_vector_read_count = int(_[0]) // 4
non_vector_read_count


Out[6]:
5374985

Fraction non-vector reads


In [7]:
non_vector_read_count / raw_read_count


Out[7]:
0.6746493915775155

Number of species-aligned reads


In [8]:
_ = !samtools view -c -F 4 {species_aligned_bam_path}
aligned_read_count = int(_[0])
aligned_read_count


Out[8]:
2423071

Fraction of non-vector reads that map to species


In [9]:
aligned_read_count / non_vector_read_count


Out[9]:
0.4508051650376699

Fraction of all reads that map to species


In [10]:
aligned_read_count / raw_read_count


Out[10]:
0.30413543031266543

Coverage

Genome Coverage


In [12]:
genome_cov = pd.read_csv(genome_cov_path, sep='\t', header=None, names=['chr', 'pos', 'cov'])

Number of bases of genome covered


In [13]:
(genome_cov['cov'] > 0).sum()


Out[13]:
2515185

Fraction of genome covered


In [14]:
(genome_cov['cov'] > 0).sum() / len(genome_cov)


Out[14]:
0.38859625717401758

In [15]:
fig, ax = plt.subplots()
ax.plot(genome_cov['pos'], genome_cov['cov'])
ax.set(title='genome coverage', xlabel='pos', ylabel='cov')


Out[15]:
[<matplotlib.text.Text at 0x11bd52c18>,
 <matplotlib.text.Text at 0x11bd521d0>,
 <matplotlib.text.Text at 0x1105b35f8>]

In [16]:
fig, ax = plt.subplots()
ax.plot(genome_cov['pos'], genome_cov['cov'])
ax.set(title='genome coverage', xlabel='pos', ylabel='cov', yscale='log')


Out[16]:
[None,
 <matplotlib.text.Text at 0x110be3780>,
 <matplotlib.text.Text at 0x1106459b0>,
 <matplotlib.text.Text at 0x110c0b5c0>]

In [17]:
fig, ax = plt.subplots()
ax.hist(genome_cov['cov'], bins=50)
ax.set(title='genome coverage distribution', xlabel='cov')


Out[17]:
[<matplotlib.text.Text at 0x116de6278>, <matplotlib.text.Text at 0x11e6b03c8>]

In [18]:
fig, ax = plt.subplots()
ax.hist(genome_cov['cov'], bins=50, log=True)
ax.set(title='genome coverage distribution', xlabel='cov')


Out[18]:
[<matplotlib.text.Text at 0x11e9f17b8>, <matplotlib.text.Text at 0x11ea9cb70>]

In [81]:
nonzero = genome_cov['cov'] > 0
hist, bin_edges = np.histogram(genome_cov['cov'][nonzero], bins='auto')
cdf = np.cumsum(hist) / hist.sum()
fig, ax = plt.subplots()
ax.hlines(cdf, bin_edges[:-1], bin_edges[1:])
ax.set(title='genome coverage CDF (nonzero positions only)', xlabel='cov', xlim=[bin_edges[0], bin_edges[-1]], ylim=[0, 1], xscale='log')


Out[81]:
[(0, 1),
 None,
 (1.0, 22371.0),
 <matplotlib.text.Text at 0x16a7617b8>,
 <matplotlib.text.Text at 0x16a9fb828>]

In [42]:
pct90 = sp.stats.scoreatpercentile(genome_cov['cov'][nonzero], 90)
pct80 = sp.stats.scoreatpercentile(genome_cov['cov'][nonzero], 80)
pct20 = sp.stats.scoreatpercentile(genome_cov['cov'][nonzero], 20)
pct10 = sp.stats.scoreatpercentile(genome_cov['cov'][nonzero], 10)

Percentiles


In [43]:
pct10, pct20, pct80, pct90


Out[43]:
(1.0, 2.0, 103.0, 686.0)

Cover middle 80% of unique clones with how many logs of frequency?


In [45]:
np.log10(pct90) - np.log10(pct10)


Out[45]:
2.8363241157067516

Cover middle 60% of unique clones with how many logs of frequency?


In [46]:
np.log10(pct80) - np.log10(pct20)


Out[46]:
1.7118072290411908

ORF Coverage


In [20]:
names=['chr', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute', 'num_reads', 'num_cov_bases', 'len', 'frac_bases_cov']
gene_cov = pd.read_csv(gene_cov_path, sep='\t', header=None, names=names)

Number of genes


In [21]:
len(gene_cov)


Out[21]:
4996

Number of covered genes


In [22]:
np.sum(gene_cov['num_cov_bases'] > 0)


Out[22]:
2805

Fraction genes covered


In [23]:
np.sum(gene_cov['num_cov_bases'] > 0) / len(gene_cov)


Out[23]:
0.56144915932746198

Fraction coding bases covered


In [24]:
gene_cov['num_cov_bases'].sum() / gene_cov['len'].sum()


Out[24]:
0.24869734503296492

Distribution of gene coverage fractions

For each gene, compute histogram of its coverage fraction. Only compute over genes that have non-zero coverage.


In [25]:
fig, ax = plt.subplots()
ax.hist(gene_cov.loc[gene_cov['frac_bases_cov'] > 0]['frac_bases_cov'], bins=50)
ax.set(title='fraction gene covered distribution', xlabel='fraction covered')


Out[25]:
[<matplotlib.text.Text at 0x11eceff98>, <matplotlib.text.Text at 0x11c684080>]

Distribution of number of reads that overlaps each gene


In [64]:
fig, ax = plt.subplots()
ax.hist(gene_cov['num_reads'], bins=50)
ax.set(title='num reads overlapping gene distribution', xlabel='fraction covered')


Out[64]:
[<matplotlib.text.Text at 0x122ddc0b8>, <matplotlib.text.Text at 0x122fba780>]

In [65]:
fig, ax = plt.subplots()
ax.hist(gene_cov['num_reads'], bins=50, log=True)
ax.set(title='num reads overlapping gene distribution', xlabel='fraction covered')


Out[65]:
[<matplotlib.text.Text at 0x12308ca20>, <matplotlib.text.Text at 0x122dc4a90>]

In [85]:
nonzero = gene_cov['num_reads'] > 0
hist, bin_edges = np.histogram(gene_cov['num_reads'][nonzero], bins='auto')
cdf = np.cumsum(hist) / hist.sum()
fig, ax = plt.subplots()
ax.hlines(cdf, bin_edges[:-1], bin_edges[1:])
ax.set(title='gene (ORF) coverage CDF (nonzero positions only)', xlabel='cov', xlim=[bin_edges[0], bin_edges[-1]], ylim=[0, 1], xscale='log')


Out[85]:
[(0, 1),
 None,
 (1.0, 22382.0),
 <matplotlib.text.Text at 0x11f38d048>,
 <matplotlib.text.Text at 0x11fff8438>]

Inframe Coverage

Fraction of ORFs that have inframe clones


In [98]:
names=['chr', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute', 'num_reads', 'num_cov_bases', 'len', 'frac_bases_cov']
inframe_cov = pd.read_csv(inframe_cov_path, sep='\t', header=None, names=names)

Number of covered genes


In [100]:
np.sum(inframe_cov['num_cov_bases'] > 0)


Out[100]:
1628

Fraction genes covered


In [101]:
np.sum(inframe_cov['num_cov_bases'] > 0) / len(inframe_cov)


Out[101]:
0.32586068855084066

Fraction coding bases covered


In [102]:
inframe_cov['num_cov_bases'].sum() / inframe_cov['len'].sum()


Out[102]:
0.1095057228036649

Distribution of gene coverage fractions

For each gene, compute histogram of its coverage fraction. Only compute over genes that have non-zero coverage.


In [104]:
fig, ax = plt.subplots()
ax.hist(inframe_cov.loc[inframe_cov['frac_bases_cov'] > 0]['frac_bases_cov'], bins=50)
ax.set(title='fraction gene covered by inframe distribution', xlabel='fraction covered')


Out[104]:
[<matplotlib.text.Text at 0x121f3e8d0>, <matplotlib.text.Text at 0x120383fd0>]

Distribution of number of reads that overlaps each gene


In [105]:
fig, ax = plt.subplots()
ax.hist(inframe_cov['num_reads'], bins=50)
ax.set(title='num reads overlapping gene distribution', xlabel='fraction covered')


Out[105]:
[<matplotlib.text.Text at 0x1203da8d0>, <matplotlib.text.Text at 0x1220dc518>]

In [109]:
fig, ax = plt.subplots()
ax.hist(inframe_cov['num_reads'], bins=50, log=True)
ax.set(title='num reads overlapping gene distribution', xlabel='num reads')


Out[109]:
[<matplotlib.text.Text at 0x1272f8160>, <matplotlib.text.Text at 0x1274ab198>]

In [108]:
nonzero = inframe_cov['num_reads'] > 0
hist, bin_edges = np.histogram(inframe_cov['num_reads'][nonzero], bins='auto')
cdf = np.cumsum(hist) / hist.sum()
fig, ax = plt.subplots()
ax.hlines(cdf, bin_edges[:-1], bin_edges[1:])
ax.set(title='gene (ORF) coverage by inframe reads CDF (nonzero positions only)', xlabel='cov', xlim=[bin_edges[0], bin_edges[-1]], ylim=[0, 1], xscale='log')


Out[108]:
[(0, 1),
 None,
 (1.0, 19013.0),
 <matplotlib.text.Text at 0x126f3e7f0>,
 <matplotlib.text.Text at 0x1270d0390>]

Diversity


In [26]:
clustered_read_counts = pd.read_csv(clustered_read_counts_path, sep='\t', header=None, usecols=[0]).values.ravel()
inframe_insert_counts = pd.read_csv(inframe_insert_counts_path, sep='\t', header=None, usecols=[1]).values.ravel()

Observed


In [27]:
len(clustered_read_counts)


Out[27]:
42364

In [28]:
len(inframe_insert_counts)


Out[28]:
1838

Chao1


In [29]:
skbio.diversity.alpha_diversity('chao1', clustered_read_counts)[0]


Out[29]:
179895.49657534246

In [30]:
skbio.diversity.alpha_diversity('chao1', inframe_insert_counts)[0]


Out[30]:
2387.3886255924172

Shannon entropy effective population size


In [59]:
2 ** skbio.diversity.alpha_diversity('shannon', clustered_read_counts)[0]


Out[59]:
1255.7034826896977

In [58]:
2 ** skbio.diversity.alpha_diversity('shannon', inframe_insert_counts)[0]


Out[58]:
102.81563707872104

Recon


In [31]:
def parse_recon_fitfile(path):
    with open(path, 'r') as ip:
        lines = [line.strip() for line in ip.readlines()]
    lines.reverse()
    line = lines.pop()
    while not line.startswith('observed_clone_size_distribution'):
        line = lines.pop()
    observed_clone_size_distribution = eval(line.split('=')[-1])
    start = lines.index('===========================================')
    n0 = float(lines[start + 4].split('=')[-1].strip())
    parameters = eval(lines[start + 5].split('=')[-1])
    weights = parameters[:len(parameters) // 2]
    means = parameters[len(parameters) // 2:]
    return {'observed_clone_size_distribution': observed_clone_size_distribution,
            'n0': n0,
            'weights': weights,
            'means': means}

In [1]:
# recon_allreads_results = parse_recon_fitfile(recon_allreads_path)

In [2]:
# sum(recon_allreads_results['observed_clone_size_distribution'].values())

Distribution of inframe clones


In [83]:
fig, ax = plt.subplots()
ax.hist(inframe_insert_counts, bins=50, log=True)
ax.set(title='inframe clone coverage distribution', xlabel='cov')


Out[83]:
[<matplotlib.text.Text at 0x11f94edd8>, <matplotlib.text.Text at 0x11f4be208>]

In [84]:
hist, bin_edges = np.histogram(inframe_insert_counts, bins='auto')
cdf = np.cumsum(hist) / hist.sum()
fig, ax = plt.subplots()
ax.hlines(cdf, bin_edges[:-1], bin_edges[1:])
ax.set(title='inframe clone coverage CDF (nonzero positions only)', xlabel='cov', xlim=[bin_edges[0], bin_edges[-1]], ylim=[0, 1], xscale='log')


Out[84]:
[(0, 1),
 None,
 (1.0, 17768.0),
 <matplotlib.text.Text at 0x11edc9978>,
 <matplotlib.text.Text at 0x11f38d6d8>]

In [54]:
pct90 = sp.stats.scoreatpercentile(inframe_insert_counts, 90)
pct80 = sp.stats.scoreatpercentile(inframe_insert_counts, 80)
pct20 = sp.stats.scoreatpercentile(inframe_insert_counts, 20)
pct10 = sp.stats.scoreatpercentile(inframe_insert_counts, 10)

In [55]:
pct10, pct20, pct80, pct90


Out[55]:
(1.0, 1.0, 24.0, 100.0)

In [ ]:


In [ ]:


In [ ]:

"Useful" particles/clones


In [87]:
num_inframe_clones = len(inframe_insert_counts)
num_inframe_clones


Out[87]:
1838

In [89]:
num_inframe_particles = inframe_insert_counts.sum()
num_inframe_particles


Out[89]:
416152

In [90]:
num_nonvector_clones = len(clustered_read_counts)
num_nonvector_clones


Out[90]:
42364

In [91]:
num_nonvector_particles = clustered_read_counts.sum()
num_nonvector_particles


Out[91]:
5374985

In [92]:
num_total_particles = raw_read_count
num_total_particles


Out[92]:
7967079

Fraction clones "useful" (of non-vector clones)


In [93]:
num_inframe_clones / num_nonvector_clones


Out[93]:
0.043385893683316024

Fraction particles "useful" (of non-vector particles)


In [94]:
num_inframe_particles / num_nonvector_particles


Out[94]:
0.077423843973518064

Fraction particles "useful" (of all particles, including vector-coding)


In [95]:
num_inframe_particles / num_total_particles


Out[95]:
0.052233949230326447