Set the matplotlib magic to notebook enable inline plots
In [1]:
%pylab inline
SAM format example:
SRR585264.8766235 0 1 4 15 35M * 0 0 CTTAAACAATTATTCCCCCTGCAAACATTTTCAAT GGGGGGGGGGGGGGGGGGGGGGFGGGGGGGGGGGG XT:A:U NM:i:1 X0:i:1 X1:i:6 XM:i:1 XO:i:0 XG:i:0 MD:Z:8T26
Import the required modules
In [2]:
import subprocess
import matplotlib.pyplot as plt
import random
import numpy as np
Make figures prettier and biger
In [3]:
plt.style.use('ggplot')
figsize(10,5)
First store the file name in the variable
In [4]:
file = "/ngschool/chip_seq/bwa/input.sorted.bam"
Next we read the file using samtools. From each read we need to store the flag, chromosome name and start coordinate.
In [5]:
p = subprocess.Popen(["samtools", "view", "-q10", "-F260", file],
stdout=subprocess.PIPE)
coords = []
for line in p.stdout:
flag, ref, start = line.decode('utf-8').split()[1:4]
coords.append([flag, ref, start])
In [6]:
coords[:3]
Out[6]:
What is the total number of our unique reads?
In [7]:
len(coords)
Out[7]:
Randomly sample the coordinates to get 1M for NRF calculations
In [8]:
random.seed(1234)
sample = random.sample(coords, 1000000)
In [9]:
len(sample)
Out[9]:
How many of those coordinates are unique? (We will use the set python object which only the unique items.)
In [10]:
uniqueStarts = {'watson': set(), 'crick': set()}
for coord in sample:
flag, ref, start = coord
if int(flag) & 16:
uniqueStarts['crick'].add((ref, start))
else:
uniqueStarts['watson'].add((ref, start))
How many on the Watson strand?
In [11]:
len(uniqueStarts['watson'])
Out[11]:
And on the Crick?
In [12]:
len(uniqueStarts['crick'])
Out[12]:
Calculate the NRF
In [13]:
NRF_input = (len(uniqueStarts['watson']) + len(uniqueStarts['crick']))*1.0/len(sample)
print(NRF_input)
Lets create a function from what we did above and apply it to all of our files!
To use our function on the real sequencing datasets (not only on a small subset) we need to optimize our method a bit- we will use python module called numpy.
In [ ]:
def calculateNRF(filePath, pickSample=True, sampleSize=10000000, seed=1234):
p = subprocess.Popen(['samtools', 'view', '-q10', '-F260', filePath],
stdout=subprocess.PIPE)
coordType = np.dtype({'names': ['flag', 'ref', 'start'],
'formats': ['uint16', 'U10', 'uint32']})
coordArray = np.empty(10000000, dtype=coordType)
i = 0
for line in p.stdout:
if i >= len(coordArray):
coordArray = np.append(coordArray, np.empty(1000000, dtype=coordType), axis=0)
fg, rf, st = line.decode('utf-8').split()[1:4]
coordArray[i] = np.array((fg, rf, st), dtype=coordType)
i += 1
coordArray = coordArray[:i]
sample = coordArray
if pickSample and len(coordArray) > sampleSize:
np.random.seed(seed)
sample = np.random.choice(coordArray, sampleSize, replace=False)
uniqueStarts = {'watson': set(), 'crick': set()}
for read in sample:
flag, ref, start = read
if flag & 16:
uniqueStarts['crick'].add((ref, start))
else:
uniqueStarts['watson'].add((ref, start))
NRF = (len(uniqueStarts['watson']) + len(uniqueStarts['crick']))*1.0/len(sample)
return NRF
Calculate the NRF for the chip-seq sample
In [15]:
NRF_chip = calculateNRF("/ngschool/chip_seq/bwa/sox2_chip.sorted.bam", sampleSize=1000000)
print(NRF_chip)
Plot the NRF!
In [16]:
plt.bar([0,2],[NRF_input, NRF_chip], width=1)
plt.xlim([-0.5,3.5]), plt.xticks([0.5, 2.5], ['Input', 'ChIP'])
plt.xlabel('Sample')
plt.ylabel('NRF')
plt.ylim([0, 1.25]), plt.yticks(np.arange(0, 1.2, 0.2))
plt.plot((-0.5,3.5), (0.8,0.8), 'red', linestyle='dashed')
plt.show()
Load the results from the coverage calculations
In [17]:
countList = []
with open('/ngschool/chip_seq/bedtools/input_coverage.bed', 'r') as covFile:
for line in covFile:
countList.append(int(line.strip('\n').split('\t')[3]))
countList[0:6]
Out[17]:
In [18]:
countList[-15:]
Out[18]:
Lets see where do our reads align to the genome. Plot the distribution of tags along the genome.
In [19]:
plt.plot(range(len(countList)), countList)
plt.xlabel('Bin number')
plt.ylabel('Bin coverage')
plt.xlim([0, len(countList)])
plt.show()
Now sort the list- order the windows based on the tag count
In [20]:
countList.sort()
In [21]:
countList[0:6]
Out[21]:
Sum all the aligned tags
In [22]:
countSum = sum(countList)
countSum
Out[22]:
Calculate the summaric fraction of tags along the ordered windows.
In [23]:
countFraction = []
for i, count in enumerate(countList):
if i == 0:
countFraction.append(count*1.0 / countSum)
else:
countFraction.append((count*1.0 / countSum) + countFraction[i-1])
Look at the last five items of the list:
In [24]:
countFraction[-5:]
Out[24]:
Calculate the number of windows.
In [25]:
winNumber = len(countFraction)
winNumber
Out[25]:
Calculate what fraction of a whole is the position of each window.
In [26]:
winFraction = []
for i in range(winNumber):
winFraction.append(i*1.0 / winNumber)
Look at the last five items of our new list:
In [27]:
winFraction[-5:]
Out[27]:
Now prepare the function!
In [28]:
def calculateSES(filePath):
countList = []
with open(filePath, 'r') as covFile:
for line in covFile:
countList.append(int(line.strip('\n').split('\t')[3]))
plt.plot(range(len(countList)), countList)
plt.xlabel('Bin number')
plt.ylabel('Bin coverage')
plt.xlim([0, len(countList)])
plt.show()
countList.sort()
countSum = sum(countList)
countFraction = []
for i, count in enumerate(countList):
if i == 0:
countFraction.append(count*1.0 / countSum)
else:
countFraction.append((count*1.0 / countSum) + countFraction[i-1])
winNumber = len(countFraction)
winFraction = []
for i in range(winNumber):
winFraction.append(i*1.0 / winNumber)
return [winFraction, countFraction]
Use our function to calculate the signal extraction scaling for the Sox2 ChIP sample:
In [29]:
chipSes = calculateSES("/ngschool/chip_seq/bedtools/sox2_chip_coverage.bed")
Now we can plot the calculated fractions for both the input and ChIP sample:
In [30]:
plt.plot(winFraction, countFraction, label='input')
plt.plot(chipSes[0], chipSes[1], label='Sox2 ChIP')
plt.ylim([0,1])
plt.xlabel('Ordered window franction')
plt.ylabel('Genome coverage fraction')
plt.legend(loc='best')
plt.show()