In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy.matlib
import numpy as np
import time
import matplotlib.pyplot as plt
import glob
import seaborn as sns
import collections
sns.set_context('talk')
sns.set_style('white')
import os
import sys
import csv
import re


/Users/atray/anaconda/lib/python2.7/site-packages/pandas/computation/__init__.py:19: UserWarning: The installed version of numexpr 2.4.4 is not supported in pandas and will be not be used

  UserWarning)

In [3]:
path2data='CHANGE_THIS'

In [4]:
R1=pd.read_csv(path2data+'r1.fastq.gz',sep='\t',header=None)
R2=pd.read_csv(path2data+'r2.fastq.gz',sep='\t',header=None)

In [5]:
R1_seq=R1[1::4]

In [ ]:
R2_seq=R2.ix[1::4]

In [14]:
cbcs=[re.sub('\+.*','',re.sub('.*2:N:0:','',x)) for x in R2.ix[0::4][0]]

In [15]:
R_combined=pd.concat([R1_seq,R2_seq],axis=1)
R_combined['CBC']=cbcs

In [16]:
R_combined.columns=['R1-RNA','R2-UMI','CBC']

In [17]:
#check for primer sequence
checksubstr=['GGCACAAGCTTAATTAAGAATT' in x for x in R_combined['R1-RNA']]

In [18]:
R_matching=R_combined[checksubstr]

In [ ]:
R_matching['RNA-CBC-UMI']=[x+'-'+y+'-'+z for x,y,z in zip(R_matching['R1-RNA'],R_matching['CBC'],R_matching['R2-UMI'])]

In [25]:
R_matching.head()


Out[25]:
R1-RNA R2-UMI CBC RNA-CBC RNA-CBC-UMI
189 GGCAAACTGGGGCACAAGCTTAATTAAGAATTTTCNCTACNNGCTA... AGAAACTTAT AGATCGTGTGCTCC GGCAAACTGGGGCACAAGCTTAATTAAGAATTTTCNCTACNNGCTA... GGCAAACTGGGGCACAAGCTTAATTAAGAATTTTCNCTACNNGCTA...
193 AGCAAACTGGGGCACAAGCTTAATTAAGAATTGCCAGGATNNACAA... ATGCTGTCAC GGACAACTACGCAT AGCAAACTGGGGCACAAGCTTAATTAAGAATTGCCAGGATNNACAA... AGCAAACTGGGGCACAAGCTTAATTAAGAATTGCCAGGATNNACAA...
245 AGCAAACTGGGGCACAAGCTTAATTAAGAATTCTAGACTCNNACAC... ATGCTACCCA AACTATCCTCTAGC AGCAAACTGGGGCACAAGCTTAATTAAGAATTCTAGACTCNNACAC... AGCAAACTGGGGCACAAGCTTAATTAAGAATTCTAGACTCNNACAC...
273 AGCAAACTGGGGCACAAGCTTAATTAAGAATTGTTGACCTNNTCAG... TAGCATGAGA CATGGCCTTGAGAA AGCAAACTGGGGCACAAGCTTAATTAAGAATTGTTGACCTNNTCAG... AGCAAACTGGGGCACAAGCTTAATTAAGAATTGTTGACCTNNTCAG...
285 AGCAAACTGGGGCACAAGCTTAATTAAGAATTAGGGCTTGNAGTGC... TAACTACCCT TTTCAGTGAGTTCG AGCAAACTGGGGCACAAGCTTAATTAAGAATTAGGGCTTGNAGTGC... AGCAAACTGGGGCACAAGCTTAATTAAGAATTAGGGCTTGNAGTGC...

In [ ]:
R_matching['counts']=[1]*len(R_matching)

In [27]:
R_grouped=R_matching.groupby('RNA-CBC-UMI').sum()

In [28]:
R_grouped.sort_values('counts',ascending=False).head(20)


Out[28]:
counts
RNA-CBC-UMI
AGCAAACTGGGGCACAAGCTTAATTAAGAATTTGGCGCGCAGATTTGCGGCCTAG-GGGGGGGGGGGGGG-GGGGGGGGGG 2
AGCAAACTGGGGCACAAGCTTAATTAAGAATTGTATAACTGATCAGCAGACCTAG-TGCAACGATGAGAA-TGGTGCCGGA 2
AGCAAACTGGGGCACAAGCTTAATTAAGAATTTGGCGCGCAGATTTGCGGCCTAG-CGCTAAGATGGATC-CCGTTCAACC 2
AGCAAACTGGGGCACAAGCTTAATTAAGAATTGCCAGGATGTACAAACTGCCTAG-ACCCAGCTAAGATG-CGCGTACCAT 2
AGCAAACTGGGGCACAAGCTTAATTAAGAATTGGTGTCGTTCCGAGAGCGCCTAG-GGGGGGGGGGGGGG-GGGGGGGGGG 2
AGCAAACTGGGGCACAAGCTTAATTAAGAATTGGCTGAAGACCTTAAACGCCTAG-AGCCGTCTATTTCC-ACCCCAGGCA 2
AGCAAACTGGGGCACAAGCTTAATTAAGAATTGTATAACTGATCAGCAGACCTAG-GAGCTCCTCCCTAC-ACAACGGGCA 2
AGCAAACTGGGGCACAAGCTTAATTAAGAATTGGCTGAAGACCTTAAACGCCTAG-AGGACTTGCCCTCA-ATGGCTCTAT 2
AGCAAACTGGGGCACAAGCTTAATTAAGAATTTTGAACATTAACGGAACGCCTAG-GCCATGCTTCTGGA-GATGTTCTAA 2
AGCAAACTGGGGCACAAGCTTAATTAAGAATTCTAACTCAGCGACTGGAGCCTAG-AACCTACTCGTACA-GACCGGCTAG 2
AGCAAACTGGGGCACAAGCTTAATTAAGAATTGAAACTCTCCCGCCTGACCCTAG-ATTCGGGATGGTCA-GGTTATCCCT 2
AGCAAACTGGGGCACAAGCTTAATTAAGAATTTTCGCTACCTGCTACCCGCCTAG-TAACCGGAACACTG-ACCCCATCCA 2
AGCAAACTGGGGCACAAGCTTAATTAAGAATTGGCTGAAGACCTTAAACGCCTAG-TTGGTACTACTCTT-CATGCCAACA 2
AGCAAACTGGGGCACAAGCTTAATTAAGAATTCGAGCGGTCGGCACTCGGCCTAG-AAGATGGAAAGTAG-ATCCCGTCAT 2
AGCAAACTGGGGCACAAGCTTAATTAAGAATTGGCTGAAGACCTTAAACGCCTAG-CAGTCAGATCTATC-TCCCGTCACT 2
AGCAAACTGGGGCACAAGCTTAATTAAGAATTTAAACACATGCATTTCACCCTAG-CAAGGACTCTGGTA-CACTTCGCTC 2
AGCAAACTGGGGCACAAGCTTAATTAAGAATTGGTGTCGTTCCGAGAGCGCCTAG-TACTACACAGCACT-TCACAATCCC 2
AGCAAACTGGGGCACAAGCTTAATTAAGAATTCTACCAAAATCCTTTCCGCCTAG-ATCTGTTGGCTGTA-TGCACCCAGG 2
AGCAAACTGGGGCACAAGCTTAATTAAGAATTGCCAGGATGTACAAACTGCCTAG-AGATCTCTAGTCTG-GACAAGCCCG 2
AGCAAACTGGGGCACAAGCTTAATTAAGAATTACACGCAAGCTTTGCTGCCTAGA-AGCGGGCTGGTTCA-GCCATTAACG 2

Note we typically do observe a polyG in the cell barcode in some cases when we sequence on the NextSeq (often the most abundant as seen here). However they are typically a small fraction of the entire run.


In [29]:
R_grouped.to_csv(path2data+'munched_counts.txt',sep='\t')