Single species single end library


In [93]:
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy as sp
import pandas as pd
import skbio
import skbio.diversity
import pybedtools

In [3]:
plt.style.use('seaborn')
%matplotlib inline

Dataset

CONFIGURATION (edit the following cell)


In [92]:
analysis_dir = Path('/Users/laserson/tmp/phip_analysis/phip-28/sc5314')
genome_gff = Path('/Users/laserson/refdata/genomes/cal/GCF_000182965.3_ASM18296v3_genomic.gff')

In [355]:
bam_file = analysis_dir / 'genome_aligned_reads.unsorted.bam'

fragment_file = analysis_dir / 'fragments.tsv'

cov_path = analysis_dir / 'coverage'

cov_genome_weighted_file = cov_path / 'genome.weighted.tsv'
cov_genome_dedup_file = cov_path / 'genome.dedup.tsv'

cov_unstranded_weighted_file = cov_path / 'unstranded.weighted.tsv'
cov_unstranded_dedup_file = cov_path / 'unstranded.dedup.tsv'

cov_stranded_weighted_file = cov_path / 'stranded.weighted.tsv'
cov_stranded_dedup_file = cov_path / 'stranded.dedup.tsv'

cov_inframe_weighted_file = cov_path / 'inframe.weighted.tsv'
cov_inframe_dedup_file = cov_path / 'inframe.dedup.tsv'

Alignment

Number of raw reads


In [8]:
_ = !samtools view -c {bam_file}
raw_read_count = int(_[0])
raw_read_count


Out[8]:
8752941

Number of genome-aligned reads


In [9]:
_ = !samtools view -c -F 4 {bam_file}
aligned_read_count = int(_[0])
aligned_read_count


Out[9]:
7759014

Fraction of reads that map to species


In [10]:
aligned_read_count / raw_read_count


Out[10]:
0.8864465098073893

Genome Coverage


In [12]:
cov_genome_dedup = pd.read_csv(cov_genome_dedup_file, sep='\t', header=None, names=['chr', 'pos', 'cov'])
cov_genome_weighted = pd.read_csv(cov_genome_weighted_file, sep='\t', header=None, names=['chr', 'pos', 'cov'])

Number of bases of genome covered


In [13]:
(cov_genome_dedup['cov'] > 0).sum()


Out[13]:
11730181

Number of bases NOT covered


In [156]:
(cov_genome_dedup['cov'] == 0).sum()


Out[156]:
2552485

Fraction of genome covered


In [15]:
(cov_genome_dedup['cov'] > 0).sum() / len(cov_genome_dedup)


Out[15]:
0.82128791641560472

Deduplicated genome coverage

These coverage figures are for "deduplicated" coverage values. They count each unique clone a single time.


In [251]:
cov_genome = cov_genome_dedup

Coverage along genome


In [30]:
g = sns.FacetGrid(cov_genome, col='chr', col_wrap=3, sharex=False)
_ = g.map(plt.plot, 'pos', 'cov', marker='o', ms=2, ls='None')


Out[30]:
<seaborn.axisgrid.FacetGrid at 0x11d3f7d68>

In [29]:
g = sns.FacetGrid(cov_genome, col='chr', col_wrap=3, sharex=False)
_ = g.map(plt.plot, 'pos', 'cov', marker='o', ms=2, ls='None')
_ = g.set(yscale='log')


Out[29]:
<seaborn.axisgrid.FacetGrid at 0x119d309e8>

Distribution of coverage values


In [190]:
hist, bin_edges = np.histogram(cov_genome['cov'], bins=150)
fig, axs = plt.subplots(1, 2, figsize=(12, 4))
_ = axs[0].hlines(hist, bin_edges[:-1], bin_edges[1:])
_ = axs[0].set(title='genome coverage hist', xlabel='cov', xlim=[bin_edges[0], 20], ylim=[0, hist.max()*1.1])
_ = axs[1].hlines(hist, bin_edges[:-1], bin_edges[1:])
_ = axs[1].set(title='genome coverage hist (log)', xlabel='cov', xlim=[bin_edges[0], bin_edges[-1]], ylim=[1, hist.max()*2], yscale='log')



In [197]:
hist, bin_edges = np.histogram(cov_genome['cov'], bins=range(200))
cdf = np.cumsum(hist) / hist.sum()
fig, ax = plt.subplots()
_ = ax.hlines(cdf, bin_edges[:-1], bin_edges[1:])
_ = ax.set(title='genome coverage CDF', xlabel='cov', xlim=[bin_edges[0], 20], ylim=[0, 1])


Percentiles


In [464]:
nonzero = cov_genome['cov'] > 0
pct90 = sp.stats.scoreatpercentile(cov_genome['cov'][nonzero], 90)
pct80 = sp.stats.scoreatpercentile(cov_genome['cov'][nonzero], 80)
pct20 = sp.stats.scoreatpercentile(cov_genome['cov'][nonzero], 20)
pct10 = sp.stats.scoreatpercentile(cov_genome['cov'][nonzero], 10)

In [465]:
pct10, pct20, pct80, pct90


Out[465]:
(1.0, 1.0, 3.0, 4.0)

Cover middle 80% of unique clones with how many logs of frequency?


In [466]:
np.log10(pct90) - np.log10(pct10)


Out[466]:
0.6020599913279624

Cover middle 60% of unique clones with how many logs of frequency?


In [467]:
np.log10(pct80) - np.log10(pct20)


Out[467]:
0.47712125471966244

Read-weighted genome coverage

These coverage figures are for read-weighted coverage values.


In [209]:
cov_genome = cov_genome_weighted

Coverage along genome


In [210]:
g = sns.FacetGrid(cov_genome, col='chr', col_wrap=3, sharex=False)
_ = g.map(plt.plot, 'pos', 'cov', marker='o', ms=2, ls='None')



In [211]:
g = sns.FacetGrid(cov_genome, col='chr', col_wrap=3, sharex=False)
_ = g.map(plt.plot, 'pos', 'cov', marker='o', ms=2, ls='None')
_ = g.set(yscale='log')


Distribution of coverage values


In [242]:
hist, bin_edges = np.histogram(cov_genome['cov'], bins=np.insert(np.logspace(0, 4.5), 0, 0))
fig, axs = plt.subplots(1, 2, figsize=(12, 4))
_ = axs[0].hlines(hist, bin_edges[:-1], bin_edges[1:])
_ = axs[0].set(title='genome coverage hist', xlabel='cov', xlim=[bin_edges[0], 3000], ylim=[0, hist.max()*1.1])
_ = axs[1].hlines(hist, bin_edges[:-1], bin_edges[1:])
_ = axs[1].set(title='genome coverage hist (log)', xlabel='cov', xlim=[bin_edges[0], bin_edges[-1]], ylim=[1, hist.max()*2], yscale='log')



In [246]:
hist, bin_edges = np.histogram(cov_genome['cov'], bins='auto')
cdf = np.cumsum(hist) / hist.sum()
fig, ax = plt.subplots()
_ = ax.hlines(cdf, bin_edges[:-1], bin_edges[1:])
_ = ax.set(title='genome coverage CDF', xlabel='cov', xlim=[bin_edges[0], 3000], ylim=[0, 1])


Percentiles


In [247]:
pct90 = sp.stats.scoreatpercentile(cov_genome['cov'][nonzero], 90)
pct80 = sp.stats.scoreatpercentile(cov_genome['cov'][nonzero], 80)
pct20 = sp.stats.scoreatpercentile(cov_genome['cov'][nonzero], 20)
pct10 = sp.stats.scoreatpercentile(cov_genome['cov'][nonzero], 10)

In [248]:
pct10, pct20, pct80, pct90


Out[248]:
(9.0, 51.0, 681.0, 951.0)

Cover middle 80% of unique clones with how many logs of frequency?


In [249]:
np.log10(pct90) - np.log10(pct10)


Out[249]:
2.0239380074980891

Cover middle 60% of unique clones with how many logs of frequency?


In [250]:
np.log10(pct80) - np.log10(pct20)


Out[250]:
1.1255769358148489

CDS coverage

Load the data


In [267]:
names=['chr', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute', 'num_reads', 'num_cov_bases', 'len', 'frac_bases_cov']

cov_unstranded_weighted = pd.read_csv(cov_unstranded_weighted_file, sep='\t', header=None, names=names)
cov_unstranded_dedup = pd.read_csv(cov_unstranded_dedup_file, sep='\t', header=None, names=names)

cov_stranded_weighted = pd.read_csv(cov_stranded_weighted_file, sep='\t', header=None, names=names)
cov_stranded_dedup = pd.read_csv(cov_stranded_dedup_file, sep='\t', header=None, names=names)

cov_inframe_weighted = pd.read_csv(cov_inframe_weighted_file, sep='\t', header=None, names=names)
cov_inframe_dedup = pd.read_csv(cov_inframe_dedup_file, sep='\t', header=None, names=names)

Total number of coding bases


In [258]:
cov_inframe_dedup['len'].sum()


Out[258]:
8955545

Fraction of genome that is coding


In [260]:
cov_inframe_dedup['len'].sum() / len(cov_genome_dedup)


Out[260]:
0.62702194394239841

Number of CDSs


In [262]:
len(cov_inframe_dedup)


Out[262]:
6432

Number of CDSs with inframe coverage


In [263]:
np.sum(cov_inframe_dedup['num_cov_bases'] > 0)


Out[263]:
3813

Fraction of CDSs with inframe coverage


In [265]:
np.sum(cov_inframe_dedup['num_cov_bases'] > 0) / len(cov_inframe_dedup)


Out[265]:
0.59281716417910446

Fraction coding bases covered


In [266]:
cov_inframe_dedup['num_cov_bases'].sum() / cov_inframe_dedup['len'].sum()


Out[266]:
0.25495455608787626

Unstranded vs stranded vs inframe clone coverage (dedup'd)

We expect that half the unstranded clones will be stranded, and one third the stranded clones will be inframe. The black lines represent the expected relationships.


In [323]:
joined1 = pd.merge(cov_unstranded_dedup, cov_stranded_dedup, on=['chr', 'start', 'end'], suffixes=['_unstranded', '_stranded'])
x = np.linspace(0, joined1['num_reads_unstranded'].max())
y = x / 2
joined2 = pd.merge(cov_stranded_dedup, cov_inframe_dedup, on=['chr', 'start', 'end'], suffixes=['_stranded', '_inframe'])
z = np.linspace(0, joined2['num_reads_inframe'].max())
w = z * 3

fig, ax = plt.subplots(1, 2, figsize=(12, 4), sharey=True)
_ = sns.regplot('num_reads_unstranded', 'num_reads_stranded', data=joined1, ax=ax[0], scatter_kws={'s': 3}, line_kws={'lw': 1})
_ = ax[0].plot(x, y, color='k', lw=1)
_ = ax[0].set(xlabel='num clones match CDS unstranded', ylabel='num clones match CDS stranded')
_ = sns.regplot('num_reads_inframe', 'num_reads_stranded', data=joined2, ax=ax[1], scatter_kws={'s': 3}, line_kws={'lw': 1})
_ = ax[1].plot(z, w, color='k', lw=1)
_ = ax[1].set(xlabel='num clones match CDS inframe', ylabel='num clones match CDS stranded')


Read-weighted vs dedup'd CDS coverage


In [324]:
joined = pd.merge(cov_inframe_dedup, cov_inframe_weighted, on=['chr', 'start', 'end'], suffixes=['_dedup', '_weighted'])
fig, ax = plt.subplots()
_ = sns.regplot('num_reads_dedup', 'num_reads_weighted', data=joined, ax=ax)
_ = ax.set(xlabel='num clones', ylabel='num reads')


Distribution of gene coverage fractions

For each gene, compute histogram of its coverage fraction. Only compute over genes that have non-zero coverage.


In [309]:
hist, bin_edges = np.histogram(cov_inframe_dedup['frac_bases_cov'], bins=40)
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
_ = ax[0].hlines(hist, bin_edges[:-1], bin_edges[1:])
_ = ax[0].set(title='fraction gene covered distribution', xlabel='fraction covered', xlim=[0, 1])
_ = ax[1].hlines(hist[1:], bin_edges[1:-1], bin_edges[2:])
_ = ax[1].set(title='fraction gene covered distribution (nonzero)', xlabel='fraction covered', xlim=[0, 1])


Distribution of number of clones that overlaps each gene inframe


In [333]:
hist, bin_edges = np.histogram(cov_inframe_dedup['num_reads'], bins=25)
fig, ax = plt.subplots()
_ = ax.hlines(hist, bin_edges[:-1], bin_edges[1:])
_ = ax.set(title='num clones per CDS hist', xlabel='num clones')



In [348]:
hist, bin_edges = np.histogram(cov_inframe_weighted['num_reads'], bins=100)
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
_ = ax[0].hlines(hist, bin_edges[:-1], bin_edges[1:])
_ = ax[0].set(title='num reads per CDS hist', xlabel='num reads')
_ = ax[1].hlines(hist, bin_edges[:-1], bin_edges[1:])
_ = ax[1].set(title='num reads per CDS hist (log)', xlabel='num reads', yscale='log', ylim=[8e-1, hist.max()*2])



In [354]:
hist, bin_edges = np.histogram(cov_inframe_weighted['num_reads'], bins='auto')
cdf = np.cumsum(hist) / hist.sum()
fig, ax = plt.subplots()
ax.hlines(cdf, bin_edges[:-1], bin_edges[1:])
ax.set(title='CDS read-weighted coverage CDF', xlabel='num reads (coverage)', xlim=[bin_edges[0], bin_edges[-1]], ylim=[0, 1])


Out[354]:
[(0, 1),
 (0.0, 8658.0),
 Text(0.5,0,'num reads (coverage)'),
 Text(0.5,1,'CDS read-weighted coverage CDF')]

In [ ]:

Clone analysis


In [356]:
fragments = pd.read_csv(fragment_file, sep='\t', header=0)

Number of observed clones


In [358]:
len(fragments)


Out[358]:
47260

Number of observed clones with overlap


In [387]:
(fragments.has_overlap == 1).sum()


Out[387]:
19322

Number of observed clones with inframe overlap


In [388]:
(fragments.has_inframe_overlap == 1).sum()


Out[388]:
4226

Rough fraction of clones expected to be coding (lower bound)


In [360]:
cov_inframe_dedup['len'].sum() / len(cov_genome_dedup) / 6


Out[360]:
0.10450365732373307

Observed fraction of clones that code for an annotated CDS


In [362]:
(fragments.has_inframe_overlap == 1).sum() / len(fragments)


Out[362]:
0.089420228523063902

Observed fraction of "particles" (reads) that code for an annotated CDS


In [460]:
fragments[fragments.has_inframe_overlap == 1]['count'].sum() / fragments['count'].sum()


Out[460]:
0.083350461296344991

Distribution of fragment lengths


In [385]:
hist, bin_edges = np.histogram(fragments.fragment_aa.str.len().fillna(0), bins=211)
fig, ax = plt.subplots()
_ = ax.hlines(hist, bin_edges[:-1], bin_edges[1:])
_ = ax.set(title='insert len hist', xlabel='len (aa)', xlim=[bin_edges[0], bin_edges[-1] * 1.1])


Distribution of clone sizes


In [400]:
hist, bin_edges = np.histogram(fragments['count'], bins='auto')
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
_ = ax[0].hlines(hist, bin_edges[:-1], bin_edges[1:])
_ = ax[0].set(title='clone size dist', xlabel='num reads', xlim=[0, 1000], ylim=[0, hist.max() * 1.1])
_ = ax[1].hlines(hist, bin_edges[:-1], bin_edges[1:])
_ = ax[1].set(title='clone size dist', xlabel='num reads', yscale='log', ylim=[1, hist.max() * 2])



In [404]:
hist, bin_edges = np.histogram(fragments['count'], bins='auto')
cdf = np.cumsum(hist) / hist.sum()

fig, ax = plt.subplots()
ax.hlines(cdf, bin_edges[:-1], bin_edges[1:])
ax.set(title='clone size CDF', xlabel='num reads', xlim=[bin_edges[0], bin_edges[-1]], ylim=[0, 1])


Out[404]:
[(0, 1), (1.0, 6811.0), Text(0.5,0,'num reads'), Text(0.5,1,'clone size CDF')]

Distribution of INFRAME clone sizes


In [407]:
hist, bin_edges = np.histogram(fragments[fragments.has_inframe_overlap == 1]['count'], bins='auto')
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
_ = ax[0].hlines(hist, bin_edges[:-1], bin_edges[1:])
_ = ax[0].set(title='inframe clone size dist', xlabel='num reads', xlim=[0, 1000], ylim=[0, hist.max() * 1.1])
_ = ax[1].hlines(hist, bin_edges[:-1], bin_edges[1:])
_ = ax[1].set(title='inframe clone size dist', xlabel='num reads', yscale='log', ylim=[1, hist.max() * 2])



In [408]:
hist, bin_edges = np.histogram(fragments[fragments.has_inframe_overlap == 1]['count'], bins='auto')
cdf = np.cumsum(hist) / hist.sum()

fig, ax = plt.subplots()
ax.hlines(cdf, bin_edges[:-1], bin_edges[1:])
ax.set(title='inframe clone size CDF', xlabel='num reads', xlim=[bin_edges[0], bin_edges[-1]], ylim=[0, 1])


Out[408]:
[(0, 1),
 (1.0, 6811.0),
 Text(0.5,0,'num reads'),
 Text(0.5,1,'inframe clone size CDF')]

Is there a relationship between the observed count and the amount of protein that's coded for?


In [373]:
fig, ax = plt.subplots()
sns.regplot(fragments.fragment_aa.str.len(), fragments['count'], scatter_kws={'s': 3})
_ = ax.set(xlabel='len(fragment_aa)', ylabel='num reads')


Does GC content affect the count?


In [414]:
def gc_content(seq):
    s = seq.upper()
    gs = s.count('G')
    cs = s.count('C')
    as_ = s.count('A')
    ts = s.count('T')
    return (gs + cs) / (gs + cs + as_ + ts)

In [417]:
fig, ax = plt.subplots()
_ = sns.regplot(fragments.fragment_nt.apply(gc_content), fragments['count'])
_ = ax.set(xlabel='GC content', ylabel='num reads')


Most common clones


In [439]:
fragments.sort_values('count', ascending=False).head()


Out[439]:
chr position strand count has_overlap has_inframe_overlap bases_till_overlap fragment_nt fragment_aa annot
4543 NC_032089.1 2293142 - 6811 1 1 0.0 ACCTATTTGATGTTAATGTTGAATGCAAAAGTACCAAAAGATTTAC... TYLMLMLNAKVPKDLLRPSRPFTELVVSFSLLCGGLIFMELTDPVI... ID=cds961;Parent=rna997;Dbxref=CGD:CAL00001890...
3241 NC_032089.1 1918943 + 5413 0 0 NaN GTCGTATGATCTTTCGGTTTGcctattattattattattataccTG... VV .
19897 NC_032091.1 27661 + 5089 1 0 616.0 AATATTTCTGGGGTACTACGTCATTGGACAATAACCACGGAATTTA... NISGVLRHWTITTEFRVYDIIANPVCIE ID=cds2367;Parent=rna2458;Dbxref=CGD:CAL000019...
25695 NC_032092.1 425724 + 4541 0 0 NaN GCTTTGAGGATCCTTTAGATTCAGTGTCAAATACAGCTGCTGACAG... ALRIL .
42621 NC_032096.1 1893151 - 3963 0 0 NaN GTAATAATCTGGTGTGACAAgttgataaaaaattggttgTAAGTTT... VIIWCDKLIKNWL .

Most common INFRAME clones


In [441]:
fragments[fragments.has_inframe_overlap == 1].sort_values('count', ascending=False).head()


Out[441]:
chr position strand count has_overlap has_inframe_overlap bases_till_overlap fragment_nt fragment_aa annot
4543 NC_032089.1 2293142 - 6811 1 1 0.0 ACCTATTTGATGTTAATGTTGAATGCAAAAGTACCAAAAGATTTAC... TYLMLMLNAKVPKDLLRPSRPFTELVVSFSLLCGGLIFMELTDPVI... ID=cds961;Parent=rna997;Dbxref=CGD:CAL00001890...
38112 NC_032095.1 942192 - 3111 1 1 0.0 CTAAAATTGGACGACAAAGTACATGCCACCACAGACAACAACTTGG... LKLDDKVHATTDNNLDMNNILENDELILDGLNMTLLDNGDHANEEF... ID=cds5066;Parent=rna5241;Dbxref=CGD:CAL000018...
40399 NC_032096.1 1613176 + 2138 1 1 0.0 CCAGTACAAGCTGAACAACcgcaaccacaaccacaaccacatcaac... PVQAEQPQPQPQPHQQQHQQQQHQPELQNTYQQIKHIQQQQQMLQH... ID=cds5732;Parent=rna5946;Dbxref=CGD:CAL000019...
10013 NC_032090.1 1030542 - 1922 1 1 0.0 aaagttggtagaagaagaattttatTGGTAGGAATGCCATGCATGT... KVGRRRILLVGMPCMCISLIVCAVAFHYLNVDFSTGTVVSRGINGW... ID=cds1808;Parent=rna1875;Dbxref=CGD:CAL000020...
37087 NC_032095.1 655887 + 1723 1 1 0.0 CATTTCTCCAATTATATGTCACAGCAATTGACTAACCCATCAACTA... HFSNYMSQQLTNPSTTPNHEHEDELETSSEDDEDEEEYIDPNDDGR... ID=cds4942;Parent=rna5116;Dbxref=CGD:CAL000018...

Genes from most common clones


In [452]:
for s in fragments[fragments.has_inframe_overlap == 1].sort_values('count', ascending=False).head()['annot']:
    for f in s.split(';'):
        print(f)
    print()


ID=cds961
Parent=rna997
Dbxref=CGD:CAL0000189087,GeneID:3634836,Genbank:XP_723589.2
Name=XP_723589.2
Note=Putative protein of unknown function%3B Hap43p-repressed gene%3B increased transcription is observed upon fluphenazine treatment%3B possibly transcriptionally regulated by Tac1p%3B induced by nitric oxide%3B fungal-specific (no human/murine homolog
gbkey=CDS
orig_transcript_id=gnl|WGS:AACQ|mrna_CAALFM_C110360CA
product=hypothetical protein
protein_id=XP_723589.2
transl_table=12

ID=cds5066
Parent=rna5241
Dbxref=CGD:CAL0000186660,GeneID:30515361,Genbank:XP_019331044.1
Name=XP_019331044.1
Note=Putative protein of unknown function%2C transcript is upregulated in an RHE model of oral candidiasis
gbkey=CDS
orig_transcript_id=gnl|WGS:AACQ|mrna_CAALFM_C704340CA
product=hypothetical protein
protein_id=XP_019331044.1
transl_table=12

ID=cds5732
Parent=rna5946
Dbxref=CGD:CAL0000191878,GeneID:3646034,Genbank:XP_712331.2
Name=XP_712331.2
Note=Transcription factor%3B similar to S. cerevisiae Ace2 and Swi5%3B regulates morphogenesis%2C cell separation%2C adherence%2C virulence in a mice%3B mutant is hyperfilamentous%3B rat catheter and Spider biofilm induced
gbkey=CDS
gene=ACE2
orig_transcript_id=gnl|WGS:AACQ|mrna_CAALFM_CR07440WA
product=DNA-binding transcription factor
protein_id=XP_712331.2
transl_table=12

ID=cds1808
Parent=rna1875
Dbxref=CGD:CAL0000201160,GeneID:3643440,Genbank:XP_714885.1
Name=XP_714885.1
Note=MFS inositol transporter%3B uptake of exogenous inositol%3B 12 transmembrane motifs%3B expressed in rich medium%3B fluconazole%2C caspofungin repressed%3B possibly essential (UAU1 method)%3B flow model%2C rat catheter and Spider biofilm induced
gbkey=CDS
gene=ITR1
orig_transcript_id=gnl|WGS:AACQ|mrna_CAALFM_C204940CA
product=myo-inositol transporter
protein_id=XP_714885.1
transl_table=12

ID=cds4942
Parent=rna5116
Dbxref=CGD:CAL0000186095,GeneID:3641382,Genbank:XP_716982.2
Name=XP_716982.2
Note=Ortholog(s) have role in G1/S transition of mitotic cell cycle%2C TOR signaling%2C dephosphorylation%2C regulation of mitotic sister chromatid segregation%2C tRNA wobble uridine modification
gbkey=CDS
orig_transcript_id=gnl|WGS:AACQ|mrna_CAALFM_C703010WA
product=hypothetical protein
protein_id=XP_716982.2
transl_table=12

Diversity

Observed clones


In [418]:
len(fragments)


Out[418]:
47260

Observed inframe clones


In [419]:
(fragments.has_inframe_overlap == 1).sum()


Out[419]:
4226

Chao1


In [426]:
skbio.diversity.alpha_diversity('chao1', fragments['count'])[0]


Out[426]:
54189.258676538637

Chao1 inframe


In [437]:
skbio.diversity.alpha_diversity('chao1', fragments[fragments.has_inframe_overlap == 1]['count'].values)[0]


Out[437]:
4815.1676300578038

Shannon entropy effective population size


In [455]:
2 ** skbio.diversity.alpha_diversity('shannon', fragments['count'])[0]


Out[455]:
23396.596407037236

Shannon entropy INFRAME effective population size


In [457]:
2 ** skbio.diversity.alpha_diversity('shannon', fragments[fragments.has_inframe_overlap == 1]['count'].values)[0]


Out[457]:
2097.3718374149526

In [ ]:


In [ ]:


In [ ]:


In [47]:
def parse_recon_fitfile(path):
    with open(path, 'r') as ip:
        lines = [line.strip() for line in ip.readlines()]
    lines.reverse()
    line = lines.pop()
    while not line.startswith('observed_clone_size_distribution'):
        line = lines.pop()
    observed_clone_size_distribution = eval(line.split('=')[-1])
    start = lines.index('===========================================')
    n0 = float(lines[start + 4].split('=')[-1].strip())
    parameters = eval(lines[start + 5].split('=')[-1])
    weights = parameters[:len(parameters) // 2]
    means = parameters[len(parameters) // 2:]
    return {'observed_clone_size_distribution': observed_clone_size_distribution,
            'n0': n0,
            'weights': weights,
            'means': means}

In [48]:
# recon_allreads_results = parse_recon_fitfile(recon_allreads_path)

In [49]:
# sum(recon_allreads_results['observed_clone_size_distribution'].values())

In [ ]: