Statistical Modeling of PhIP-seq Data

The alignments were manually generated with this command:

bowtie2 -L 30 -N 3 -p 3 --norc --no-hd \
    -x ~/repos/phage_libraries_private/human90/indexes/bowtie2/human90 \
    -U Sjogrens.serum.Sjogrens.FS10-01971.20A20G.1_R1.fq \
    > Sjogrens.bowtie2.aln

Counts were generated like so

phip compute-counts -i . -o counts -r ~/repos/phage_libraries_private/human90/inputs/human90-larman1-input.tsv
mv counts/Sjogrens.bowtie2.tsv .
rmdir counts

In [1]:
%matplotlib inline

In [24]:
import numpy as np
import scipy as sp
import scipy.stats
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [28]:
results = pd.read_csv('../examples/Sjogrens.bowtie2.tsv', sep='\t', header=0)
results['pep_id'] = results['id'].str.split('|').str[0]
results.rename(columns={'Sjogrens.bowtie2': 'output'}, inplace=True)
beadsonly = pd.read_csv('../examples/PhIP6_90mer_BeadsOnly.txt', sep='\t', header=0)
beadsonly.rename(columns={'INPUT_RC': 'beadsonly'}, inplace=True)
merged = pd.merge(results, beadsonly, how='left', left_on='pep_id', right_on='row_name')
counts = merged[['id', 'input', 'beadsonly', 'output']]

In [29]:
counts.head()


Out[29]:
id input beadsonly output
0 hblOligo131399|NM_000014|2B 24 3 0
1 hblOligo131400|NM_000014|4B 137 21 5
2 hblOligo131401|NM_000014|6B 21 1 3
3 hblOligo131402|NM_000014|8B 253 33 9
4 hblOligo131403|NM_000014|10B 0 0 6

In [20]:
sns.distplot(counts['output'], kde=False, hist_kws={'log': True})


Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x11777ebe0>

In [70]:
sns.distplot(counts['output'], bins=np.linspace(0, 600, 100), kde=False, hist_kws={'log': True})


Out[70]:
<matplotlib.axes._subplots.AxesSubplot at 0x120c1c908>

In [22]:
sns.distplot(counts['output'], bins=range(50), kde=False, hist_kws={'log': False})


Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x1159ed198>

In [55]:
sp.stats.scoreatpercentile(counts['output'], 90)


Out[55]:
19.0

In [60]:
sns.distplot(counts['output'][counts['output'] > 100])


Out[60]:
<matplotlib.axes._subplots.AxesSubplot at 0x111d95c88>

Determine bins for input and beads_only


In [71]:
input_upper_bound = sp.stats.scoreatpercentile(counts['input'], 90)
beadsonly_upper_bound = sp.stats.scoreatpercentile(counts['beadsonly'], 90)

In [72]:
counts['input_bin'] = pd.cut(counts['input'], np.linspace(0, input_upper_bound, 20), include_lowest=True)
counts['beadsonly_bin'] = pd.cut(counts['beadsonly'], np.linspace(0, beadsonly_upper_bound, 20), include_lowest=True)

In [73]:
grouped = counts.groupby(by=('input_bin', 'beadsonly_bin'), as_index=False)['output'].agg({'output_median': np.median, 'bin_count': len})

In [77]:
grouped[grouped['bin_count']>100].head()


Out[77]:
input_bin beadsonly_bin bin_count output_median
0 [0, 34.789] [0, 4] 44664 1.0
1 [0, 34.789] (4, 8] 3727 2.0
2 [0, 34.789] (8, 12] 590 2.0
9 (34.789, 69.579] [0, 4] 9882 2.0
10 (34.789, 69.579] (4, 8] 9874 2.0

In [91]:
output_medians = pd.crosstab(counts['input_bin'], counts['beadsonly_bin'], values=counts['output'], aggfunc=np.median)
bin_counts = pd.crosstab(counts['input_bin'], counts['beadsonly_bin'])

In [92]:
sns.heatmap(output_medians[bin_counts > 100])


Out[92]:
<matplotlib.axes._subplots.AxesSubplot at 0x11994eda0>

In [90]:
sns.lmplot('input', 'beadsonly', counts)


Out[90]:
<seaborn.axisgrid.FacetGrid at 0x111c7c668>

In [93]:
counts[counts['beadsonly']>600]


Out[93]:
id input beadsonly output input_bin beadsonly_bin
33732 hblOligo82803|NM_001098623|17A 918 1269 32 NaN NaN

In [ ]: