In [1]:
%matplotlib inline
In [2]:
import pandas as pd
import numpy.matlib
import numpy as np
import time
import matplotlib.pyplot as plt
import glob
import seaborn as sns
import collections
sns.set_context('talk')
sns.set_style('white')
import os
import sys
import csv
import re
In [3]:
path2data='CHANGE_THIS'
In [4]:
R1=pd.read_csv(path2data+'r1.fastq.gz',sep='\t',header=None)
R2=pd.read_csv(path2data+'r2.fastq.gz',sep='\t',header=None)
In [5]:
R1_seq=R1[1::4]
In [ ]:
R2_seq=R2.ix[1::4]
In [14]:
cbcs=[re.sub('\+.*','',re.sub('.*2:N:0:','',x)) for x in R2.ix[0::4][0]]
In [15]:
R_combined=pd.concat([R1_seq,R2_seq],axis=1)
R_combined['CBC']=cbcs
In [16]:
R_combined.columns=['R1-RNA','R2-UMI','CBC']
In [17]:
#check for primer sequence
checksubstr=['GGCACAAGCTTAATTAAGAATT' in x for x in R_combined['R1-RNA']]
In [18]:
R_matching=R_combined[checksubstr]
In [ ]:
R_matching['RNA-CBC-UMI']=[x+'-'+y+'-'+z for x,y,z in zip(R_matching['R1-RNA'],R_matching['CBC'],R_matching['R2-UMI'])]
In [25]:
R_matching.head()
Out[25]:
In [ ]:
R_matching['counts']=[1]*len(R_matching)
In [27]:
R_grouped=R_matching.groupby('RNA-CBC-UMI').sum()
In [28]:
R_grouped.sort_values('counts',ascending=False).head(20)
Out[28]:
Note we typically do observe a polyG in the cell barcode in some cases when we sequence on the NextSeq (often the most abundant as seen here). However they are typically a small fraction of the entire run.
In [29]:
R_grouped.to_csv(path2data+'munched_counts.txt',sep='\t')