In [1]:
import pandas as pd
import os, sys, time, random
import numpy as np
from scipy import stats
sys.path.append('../')  # make the RASLseqTools package directory importable
from RASLseqTools import *  # NOTE(review): star import pollutes the namespace; prefer explicit imports
sys.path.append('../RASLseqTools')  # also expose the package's internal modules directly
import RASLseqAnalysis_STAR
In [2]:
import seaborn
%pylab inline
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
Dependencies (install these before running this notebook):

- STAR binary: https://github.com/alexdobin/STAR/tree/master/bin
- python-Levenshtein: https://pypi.python.org/pypi/python-Levenshtein/
- pandas: https://github.com/pydata/pandas
- seaborn: https://github.com/mwaskom/seaborn
- NumPy: http://www.numpy.org
- SciPy: http://www.scipy.org
In [3]:
# Input configuration — all paths are relative to this notebook's directory.
fastq_path = '../data/sample.fastq.gz'  # gzipped FASTQ of sequencing reads
probes = '../data/sample.probes'        # probe definition file
well_annot = '../data/sample.bc'        # well barcode annotation file
aligner_path = '../STAR_bin/'           # directory containing the STAR binary
In [4]:
# Initiate the RASLseqAnalysis run object (STAR-based aligner backend).
# One keyword per line so the many parameters are individually readable.
seqRun1 = RASLseqAnalysis_STAR.RASLseqAnalysis_STAR(
    fastq_path,
    probes,
    aligner_path,
    well_annot,
    '../data/temp/',
    write_file='../data/temp/test_alignment.txt',
    print_on=False,
    n_jobs=1,
    offset_5p=25,
    offset_3p=21,
    wellbc_start=0,
    wellbc_end=8,
    write_alignments=False,
)
In [5]:
#Well Annotation DataFrame
# Bare last expression -> rich HTML display of the well annotation table
seqRun1.RASLseqBCannot_obj.well_annot_df
Out[5]:
In [6]:
#Probe DataFrame
# First few rows of the on/off-target probe table held by the probes object
seqRun1.RASLseqProbes_obj.on_off_target_probes_df.head()
Out[6]:
In [7]:
#FASTQ Analysis
# %time reports wall/CPU cost of the full counting run so readers know how
# expensive a Restart-&-Run-All is.
%time seqRun1.get_target_counts_df()
In [8]:
#RESULTS
#Test data with approximately balanced read counts for each on-target probe
# Bare last expression -> rich display of the per-well probe count table
seqRun1.RASLseqAnalysis_df
Out[8]:
In [9]:
#Pandas DataFrame of Probe Counts
# Missing probe counts are filled with 0 so row sums are well defined.
seqRun1_counts = seqRun1.RASLseqAnalysis_df[seqRun1.probe_columns].fillna(value=0)
# Vectorized row sum — equivalent to .apply(np.sum, axis=1) but far faster,
# since .apply calls np.sum once per row in Python.
seqRun1_counts['well_count'] = seqRun1_counts.sum(axis=1)  # total read counts by well
In [10]:
# READS BY WELL: distribution of total read counts across the 16 wells
seqRun1_counts['well_count'].hist(bins=10)
Out[10]:
In [11]:
# READS BY PLATE: sum well totals within each plate (first index level)
plate_totals = seqRun1_counts['well_count'].groupby(level=0).sum()
plate_totals.plot(kind='bar')
Out[11]:
In [12]:
#ADDING ROW AND COL ANNOTATIONS
# 4x4 plate layout: Row cycles 1..4 within each plate, Col is constant per
# block of four wells. Computed instead of hand-typed, and guarded so the
# cell is idempotent — DataFrame.insert raises ValueError if the column
# already exists, which broke re-running this cell.
if 'Row' not in seqRun1_counts.columns:
    seqRun1_counts.insert(0, 'Row', list(range(1, 5)) * 4)
if 'Col' not in seqRun1_counts.columns:
    seqRun1_counts.insert(1, 'Col', [c for c in range(1, 5) for _ in range(4)])
In [13]:
# MEDIAN READ COUNT BY WELL, laid out on the 4x4 Row x Col plate grid
median_well_count = seqRun1_counts.pivot_table(
    index='Row', columns='Col', values='well_count', aggfunc=np.median)
seaborn.heatmap(median_well_count.values,
                cmap='spectral', square=False, annot=True)
plt.show()
plt.close()
# Distribution of the per-position medians
median_well_count.stack().hist(bins=10)
Out[13]:
In [14]:
# MEAN WELL READ COUNT on the same 4x4 Row x Col plate grid
mean_well_count = seqRun1_counts.pivot_table(index='Row',
                                             columns='Col',
                                             values='well_count',
                                             aggfunc=np.mean)
grid_values = mean_well_count.values
seaborn.heatmap(grid_values, cmap='spectral', square=False, annot=True)
plt.show()
plt.close()
# Distribution of the per-position means
mean_well_count.stack().hist(bins=10)
Out[14]:
In [16]:
#PCA FOR EACH WELL BY PLATE
# NOTE(review): dropped `from sklearn.lda import LDA` — it was never used and
# sklearn.lda has been removed from modern scikit-learn, so the cell crashed
# on re-run. `from sklearn import datasets` was also unused and removed.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

X = seqRun1_counts[seqRun1.probe_columns]
target_names = seqRun1_counts.index.get_level_values(1).tolist()

# Standardize each probe column so high-count probes do not dominate the PCA.
X_std = StandardScaler().fit_transform(X)

pca = PCA(n_components=2)
X_r = pca.fit_transform(X_std)  # same as fit(X).transform(X), one pass

# Percentage of variance explained for each component
print('explained variance ratio (first two components): %s'
      % str(pca.explained_variance_ratio_))

fig, ax = plt.subplots(figsize=(15, 8))
# colored according to plate: four wells per plate, four plates
colors = ['r', 'r', 'r', 'r', 'g', 'g', 'g', 'g', 'b', 'b', 'b', 'b', 'gray', 'gray', 'gray', 'gray', ]
# `c` is an explicit per-point color list, so no cmap is needed (the old
# cmap=cm.spectral was ignored anyway, and cm.spectral is deprecated).
ax.scatter(X_r[:, 0], X_r[:, 1], s=800, alpha=0.7, c=colors)
# Label each point with its well name, nudged just below the marker.
for point, txt, c in zip(X_r, target_names, colors):
    ax.annotate(txt, xy=(point[0], point[1]), textcoords='offset points',
                va='bottom', ha='center', xytext=(0, -5), fontsize=20, color=c)
plt.title('PCA Wells From Each Plate', fontsize=20)
plt.grid(color='gray')
plt.show()
In [ ]: