Libraries


In [1]:
import pandas as pd
import os, sys, time, random
import numpy as np
from scipy import stats
sys.path.append('../')
from RASLseqTools import *
sys.path.append('../RASLseqTools')
import RASLseqAnalysis_STAR

In [2]:
import seaborn
%pylab inline
%matplotlib inline
%config InlineBackend.figure_format = 'retina'


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['random']
`%matplotlib` prevents importing * from pylab and numpy

NOTE: THIS SAMPLE NOTEBOOK REQUIRES THE FOLLOWING DEPENDENCIES:

STAR BINARY: https://github.com/alexdobin/STAR/tree/master/bin

python-Levenshtein: https://pypi.python.org/pypi/python-Levenshtein/

pandas: https://github.com/pydata/pandas

seaborn: https://github.com/mwaskom/seaborn

NumPy: http://www.numpy.org

SciPy: http://www.scipy.org

Data


In [3]:
fastq_path = '../data/sample.fastq.gz'

probes = '../data/sample.probes'

well_annot = '../data/sample.bc'

aligner_path = '../STAR_bin/'

Analysis

RASLseqAnalysis_STAR


In [4]:
#Initiate RASLseqAnalysis
seqRun1 = RASLseqAnalysis_STAR.RASLseqAnalysis_STAR(fastq_path, probes, aligner_path, well_annot, '../data/temp/', write_file='../data/temp/test_alignment.txt' ,print_on=False, \
                 n_jobs=1, offset_5p=25, offset_3p=21, wellbc_start=0, wellbc_end=8, write_alignments=False )

In [5]:
#Well Annotation DataFrame
seqRun1.RASLseqBCannot_obj.well_annot_df


Out[5]:
Concentration
PlateBarcode WellBarcode
CATATTG GACTGACT 0.1
GAGATTC ATCGATCG 0.1
GAGCATG CTAGCTAG 0.1
TGATAGT AGCTCAGA 0.1

In [6]:
#Probe DataFrame
seqRun1.RASLseqProbes_obj.on_off_target_probes_df.head()


Out[6]:
acceptor_probe_id donor_probe_id probe_seq AcceptorAdaptorSequence DonorAdaptorSequence
NM_000569.6_FCGR3A_2014///NM_000569.6_FCGR3A_2014 NM_000569.6_FCGR3A_2014 NM_000569.6_FCGR3A_2014 CAATGAACAAAGCTACACAGGAATTAGATATTGAAGCAGA GGAAGCCTTGGCTTTTG AGATCGGAAGAGCACAC
NM_000569.6_FCGR3A_2014///NM_000569.6_FCGR3A_2119 NM_000569.6_FCGR3A_2014 NM_000569.6_FCGR3A_2119 CAATGAACAAAGCTACACAGGAAGGATTTGAAAGTTTCAT GGAAGCCTTGGCTTTTG AGATCGGAAGAGCACAC
NM_000569.6_FCGR3A_2014///NM_000572.2_IL10_1007 NM_000569.6_FCGR3A_2014 NM_000572.2_IL10_1007 CAATGAACAAAGCTACACAGGAGGGAGGTCAGGGAAAACA GGAAGCCTTGGCTTTTG AGATCGGAAGAGCACAC
NM_000569.6_FCGR3A_2014///NM_000572.2_IL10_787 NM_000569.6_FCGR3A_2014 NM_000572.2_IL10_787 CAATGAACAAAGCTACACAGTAAATCGTTCACAGAGAAGC GGAAGCCTTGGCTTTTG AGATCGGAAGAGCACAC
NM_000569.6_FCGR3A_2014///NM_000594.3_TNF_1327 NM_000569.6_FCGR3A_2014 NM_000594.3_TNF_1327 CAATGAACAAAGCTACACAGACATAAATAGAGGGAGCTGG GGAAGCCTTGGCTTTTG AGATCGGAAGAGCACAC

Demultiplexing and Aligning FASTQ Reads


In [7]:
#FASTQ Analysis
%time seqRun1.get_target_counts_df()


Starting Asynchronous Alignment Process
Demultiplexing Reads
Demultiplexing Complete
Alignment Complete
250000 Reads Processed
CPU times: user 2.14 s, sys: 411 ms, total: 2.55 s
Wall time: 7.91 s

In [8]:
#RESULTS
#Test data with approximately balanced read counts for each on-target probe
seqRun1.RASLseqAnalysis_df


Out[8]:
Concentration NM_000569.6_FCGR3A_2014///NM_000569.6_FCGR3A_2014 NM_000569.6_FCGR3A_2119///NM_000569.6_FCGR3A_2119 NM_000572.2_IL10_1007///NM_000572.2_IL10_1007 NM_000572.2_IL10_787///NM_000572.2_IL10_787 NM_000594.3_TNF_1327///NM_000594.3_TNF_1327 NM_000594.3_TNF_1389///NM_000594.3_TNF_1389 NM_001002_RPLP0_647///NM_001002_RPLP0_647 NM_001002_RPLP0_909///NM_001002_RPLP0_909 NM_001081455.1_P2RY14_2328///NM_001081455.1_P2RY14_2328 NM_001081455.1_P2RY14_2526///NM_001081455.1_P2RY14_2526
PlateBarcode WellBarcode
CATATTG AGCTCAGA NaN 633 619 596 608 621 607 610 606 629 626
ATCGATCG NaN 600 589 606 631 656 623 638 624 611 608
CTAGCTAG NaN 635 606 599 577 613 631 616 648 621 627
GACTGACT 0.1 595 569 600 611 601 624 613 627 648 596
GAGATTC AGCTCAGA NaN 622 640 590 601 568 622 626 632 598 667
ATCGATCG 0.1 638 630 658 655 596 647 602 613 641 630
CTAGCTAG NaN 615 629 612 621 607 634 597 620 643 626
GACTGACT NaN 616 633 652 599 661 645 650 616 593 639
GAGCATG AGCTCAGA NaN 657 602 611 632 625 607 611 642 647 647
ATCGATCG NaN 617 635 615 606 644 640 662 619 620 594
CTAGCTAG 0.1 618 587 631 643 622 604 645 635 687 680
GACTGACT NaN 664 632 655 627 646 621 591 627 620 604
TGATAGT AGCTCAGA 0.1 616 650 630 612 614 622 643 622 648 661
ATCGATCG NaN 605 624 598 611 637 645 643 614 619 640
CTAGCTAG NaN 631 601 615 624 634 634 641 575 630 590
GACTGACT NaN 608 612 582 659 625 647 596 601 634 661

SUMMARY REPORT


In [9]:
#Pandas DataFrame of Probe Counts
seqRun1_counts = seqRun1.RASLseqAnalysis_df[seqRun1.probe_columns].fillna(value=0)
seqRun1_counts['well_count'] = seqRun1_counts.apply(np.sum, axis=1)  #total read counts by well

In [10]:
#READS BY WELL
seqRun1_counts.well_count.hist(bins=10)


Out[10]:
<matplotlib.axes.AxesSubplot at 0x10473c190>

In [11]:
#READS BY PLATE
seqRun1_counts.groupby(level=0)['well_count'].aggregate(np.sum).plot(kind='bar')


Out[11]:
<matplotlib.axes.AxesSubplot at 0x11735c3d0>

In [12]:
#ADDING ROW AND COL ANNOTATIONS
seqRun1_counts.insert(0,'Row', [1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4])
seqRun1_counts.insert(1,'Col', [1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4])

In [13]:
#MEDIAN READ COUNT BY WELL

median_well_count = seqRun1_counts.pivot_table(index='Row', columns='Col', values='well_count', aggfunc=np.median)
seaborn.heatmap(median_well_count.values, \
                cmap='spectral', square=False, annot=True)   
plt.show()
plt.close()
median_well_count.stack().hist(bins=10)


Out[13]:
<matplotlib.axes.AxesSubplot at 0x1160c0290>

In [14]:
#MEAN WELL READ COUNT

mean_well_count = seqRun1_counts.pivot_table(index='Row', columns='Col', values='well_count', aggfunc=np.mean)
seaborn.heatmap(mean_well_count.values, \
                cmap='spectral', square=False, annot=True)   
plt.show()
plt.close()
mean_well_count.stack().hist(bins=10)


Out[14]:
<matplotlib.axes.AxesSubplot at 0x116107410>

In [16]:
#PCA FOR EACH WELL BY PLATE

from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.lda import LDA
from sklearn.preprocessing import StandardScaler



X = seqRun1_counts[seqRun1.probe_columns]
target_names = seqRun1_counts.index.get_level_values(1).tolist()


X = StandardScaler().fit_transform(X)
y = target_names
pca = PCA(n_components=2)
X_r = pca.fit(X).transform(X)


# Percentage of variance explained for each components
print('explained variance ratio (first two components): %s'
      % str(pca.explained_variance_ratio_))



fig, ax = plt.subplots(figsize=(15,8))

#colored according to plate
colors = ['r', 'r', 'r', 'r', 'g', 'g', 'g', 'g', 'b', 'b', 'b', 'b', 'gray', 'gray', 'gray', 'gray', ]

ax.scatter(X_r[0:,0], X_r[0:,1], s=800, alpha=0.7, c=colors, cmap=cm.spectral)
for i, txt, c in zip(X_r, target_names, colors):
    
    ax.annotate(txt, xy=(i[0],i[1]), textcoords='offset points', va='bottom', ha='center', xytext=(0, -5), fontsize=20, color=c)

plt.title('PCA Wells From Each Plate', fontsize=20)

plt.grid(color='gray')

plt.show()


explained variance ratio (first two components): [ 0.23542144  0.17425137]

In [ ]: