# Comparison of Acevedo's passage 8 data when 3 repeats or only 2 repeats are used

In both cases, we consider a quality threshold of 60, i.e. a quality of 20 with 3 repeats, or 30 with two repeats. This means that we expect to have fewer remaining bases when only 2 repeats are used, because many will not pass the quality threshold. The analyses have been performed using Acevedo's scripts, modified in the case with 2 repeats. When 2 repeats are analysed, only the first and the third repeats are used. The third repeat should have lower quality than the second, so this is a conservative way to look at the data.

## Reading in the data

``````

In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pylab import rcParams
import matplotlib

colNames = ["position","referenceBase","A","C", "G", "T"]
counts = pd.read_csv ("outputPassage8/Q20threshold.txt", sep="\t", header=None, names=colNames)

counts2Repeats = pd.read_csv ("output2RepeatsPassage8/Q30threshold.txt", sep="\t", header=None, names=colNames)

#Utilitary variables
axisFontSize = 20
titleFontSize = 25
axisTickFontSize = 12

matplotlib.rc('xtick', labelsize=axisTickFontSize)
matplotlib.rc('ytick', labelsize=axisTickFontSize)
rcParams["axes.labelsize"]=axisFontSize

``````

## Computing statistics of interest

``````

In [2]:

def coverage(a,c,g,t):
cov = a+c+g+t
return cov

def majorBaseRatio(a,c,g,t,coverage):
l=[a,c,g,t]
l.sort()
return l[3]/coverage

def secondMajorBaseRatio(a,c,g,t,coverage):
l=[a,c,g,t]
l.sort()
return l[2]/coverage

def thirdMajorBaseRatio(a,c,g,t,coverage):
l=[a,c,g,t]
l.sort()
return l[1]/coverage

def fourthMajorBaseRatio(a,c,g,t,coverage):
l=[a,c,g,t]
l.sort()
return l[0]/coverage

def ARatio(a,coverage):
return a/coverage

def CRatio(c,coverage):
return c/coverage

def GRatio(g,coverage):
return g/coverage

def TRatio(t,coverage):
return t/coverage

def computeCoverageAndRatios (counts):
counts["coverage"]= list ( map(coverage, counts["A"], counts["C"], counts["G"], counts["T"]) )
counts["majorBaseRatio"]  = list ( map(majorBaseRatio, counts["A"], counts["C"], counts["G"], counts["T"], counts["coverage"]) )
counts["secondMajorBaseRatio"]  = list ( map(secondMajorBaseRatio, counts["A"], counts["C"], counts["G"], counts["T"], counts["coverage"]) )
counts["thirdMajorBaseRatio"] = list ( map(thirdMajorBaseRatio, counts["A"], counts["C"], counts["G"], counts["T"], counts["coverage"]) )
counts["fourthMajorBaseRatio"] = list ( map(fourthMajorBaseRatio, counts["A"], counts["C"], counts["G"], counts["T"], counts["coverage"]) )
counts["ARatio"] = list ( map(ARatio, counts["A"], counts["coverage"]) )
counts["CRatio"] = list ( map(CRatio, counts["C"], counts["coverage"]) )
counts["GRatio"] = list ( map(GRatio, counts["G"], counts["coverage"]) )
counts["TRatio"] = list ( map(TRatio, counts["T"], counts["coverage"]) )
return

computeCoverageAndRatios (counts)
counts.describe()

computeCoverageAndRatios (counts2Repeats)
counts2Repeats.describe()

``````
``````

Out[2]:

position
A
C
G
T
coverage
majorBaseRatio
secondMajorBaseRatio
thirdMajorBaseRatio
fourthMajorBaseRatio
ARatio
CRatio
GRatio
TRatio

count
7440.000000
7440.000000
7440.000000
7440.000000
7440.000000
7440.000000
7440.000000
7440.000000
7440.000000
7440.000000
7440.000000
7440.000000
7440.000000
7440.000000

mean
3720.500000
6378.266263
4450.078495
4476.822984
4855.572581
20160.740323
0.999802
0.000182
0.000015
0.000001
0.296490
0.233578
0.229992
0.239940

std
2147.887334
16949.345539
13589.430771
13364.229586
14537.080609
23661.456526
0.000905
0.000901
0.000043
0.000009
0.456635
0.423011
0.420754
0.426948

min
1.000000
0.000000
0.000000
0.000000
0.000000
59.000000
0.931071
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000

25%
1860.750000
0.000000
0.000000
0.000000
0.000000
6950.000000
0.999751
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000

50%
3720.500000
1.000000
0.000000
0.000000
1.000000
12909.000000
0.999883
0.000104
0.000000
0.000000
0.000044
0.000000
0.000000
0.000022

75%
5580.250000
5357.000000
10.000000
8.000000
21.250000
24012.500000
1.000000
0.000225
0.000000
0.000000
0.999728
0.000491
0.000368
0.000700

max
7440.000000
203304.000000
191022.000000
182796.000000
179859.000000
203317.000000
1.000000
0.068892
0.000647
0.000235
1.000000
1.000000
1.000000
1.000000

``````

## Impact of the number of repeats on coverage.

``````

In [3]:

%matplotlib inline

def plotCoverage(subplot, counts, title):
rcParams['figure.figsize'] = 18, 8
subplot.set_title(title, fontsize=15, fontweight='bold')
subplot.set_ylabel ('Number of reads')
subplot.plot (counts ['position'], counts ['coverage'], 'b-') #(position, coverage, 'r-')
fig_size = rcParams["figure.figsize"]
subplot.legend (['Sequencing coverage'], loc = 'upper left')
yaxis_lower_limit = counts ['coverage'].min()
yaxis_upper_limit = counts ['coverage'].max()
xaxis_lower_limit = counts ['position'].min()
xaxis_upper_limit = counts ['position'].max()
subplot.axis ([xaxis_lower_limit, xaxis_upper_limit, yaxis_lower_limit, yaxis_upper_limit])

f, axarr = plt.subplots(2, sharex=True,figsize=(15,15))
plotCoverage(axarr[0], counts,'Genome coverage by position, 3 repeats')
plotCoverage(axarr[1], counts2Repeats,'Genome coverage by position, 2 repeats')
plt.xlabel ('Genome position')
plt.tight_layout()

``````
``````

``````
``````

In [13]:

print("Average coverage for 3 repeats, quality threshold at 20: "+str(counts["coverage"].mean()))
print("Average coverage for 2 repeats, quality threshold at 30: "+str(counts2Repeats["coverage"].mean()))

``````
``````

Average coverage for 3 repeats, quality threshold at 20: 169749.96707
Average coverage for 2 repeats, quality threshold at 30: 20160.7403226

``````

## Impact of the number of repeats on allele frequency.

``````

In [4]:

%matplotlib inline

def plotFrequencyOfMostFrequentNucleotide (subplot, counts, title):
rcParams['figure.figsize'] = 18, 8
subplot.set_title(title, fontsize=15, fontweight='bold')
subplot.set_ylabel ('Frequency')
subplot.plot (counts ['position'], counts ['majorBaseRatio'], 'r.') #(position, coverage, 'r-')
fig_size = rcParams["figure.figsize"]
subplot.legend (['Frequency of the most frequent nucleotide'], loc = 'upper right')
yaxis_lower_limit = counts ['majorBaseRatio'].min()
yaxis_upper_limit = counts ['majorBaseRatio'].max()
xaxis_lower_limit = counts ['position'].min()
xaxis_upper_limit = counts ['position'].max()
subplot.axis ([xaxis_lower_limit, xaxis_upper_limit, yaxis_lower_limit, yaxis_upper_limit])

f, axarr = plt.subplots(2, sharex=True,figsize=(15,15))
plotFrequencyOfMostFrequentNucleotide(axarr[0], counts, 'Frequency of the most frequent nucleotide by genome position, 3 repeats')
plotFrequencyOfMostFrequentNucleotide(axarr[1], counts2Repeats, 'Frequency of the most frequent nucleotide by genome position, 2 repeats')
plt.xlabel ('Genome position')
plt.tight_layout()

``````
``````

``````

### Conclusion: most outlier sites in the 3-repeat analysis also appear to be outliers in the 2-repeat analysis, except in low-coverage regions.

``````

In [16]:

%matplotlib inline

def runningMeanFast(x, N):
return np.convolve(x, np.ones((N,))/N)[(N-1):]

def plotRMFrequencyOfMostFrequentNucleotide (subplot, counts, title):
majorBaseRatioMovingAverage10 = (runningMeanFast (counts ['majorBaseRatio'], 10))
secondMajorBaseRatioMovingAverage10 = (runningMeanFast (counts ['secondMajorBaseRatio'], 10))
rcParams['figure.figsize'] = 18, 8
subplot.set_title(title, fontsize=15, fontweight='bold')
subplot.plot (counts ['position'], majorBaseRatioMovingAverage10, 'r-')
subplot.plot (counts ['position'], 1-secondMajorBaseRatioMovingAverage10, 'b-', alpha=0.7)
subplot.set_ylabel ('Frequency')
fig_size = rcParams["figure.figsize"]
subplot.legend (['Frequency of the most frequent nucleotide', 'Frequency of the second most frequent nucleotide'], loc = 'lower left')
yaxis_lower_limit = 0.991
yaxis_upper_limit = majorBaseRatioMovingAverage10.max()
xaxis_lower_limit = counts ['position'].min()
xaxis_upper_limit = counts ['position'].max()
subplot.axis ([xaxis_lower_limit, xaxis_upper_limit, yaxis_lower_limit, yaxis_upper_limit])

f, axarr = plt.subplots(2, sharex=True,figsize=(15,15))
plotRMFrequencyOfMostFrequentNucleotide(axarr[0], counts, 'Frequency of the most frequent nucleotide by genome position, 3 repeats')
plotRMFrequencyOfMostFrequentNucleotide(axarr[1], counts2Repeats, 'Frequency of the most frequent nucleotide by genome position, 2 repeats')
plt.xlabel ('Genome position')
plt.tight_layout()

``````
``````

``````

### Now we plot the 3 least frequent bases per position.

``````

In [6]:

%matplotlib inline

def plotFrequenciesOf3LeastFrequentNucleotides (subplot, counts, title):
rcParams['figure.figsize'] = 18, 8
subplot.set_title(title, fontsize=15, fontweight='bold')
subplot.plot (counts ['position'], counts ['secondMajorBaseRatio'], 'r.') #(position, coverage, 'r-')
subplot.plot (counts ['position'], counts ['thirdMajorBaseRatio'], 'b.') #(position, coverage, 'r-')
subplot.plot (counts ['position'], counts ['fourthMajorBaseRatio'], 'g.') #(position, coverage, 'r-')
subplot.set_ylabel ('Frequency')
fig_size = rcParams["figure.figsize"]
subplot.legend (['Frequency of the second most frequent nucleotide', 'Frequency of the third most frequent nucleotide', 'Frequency of the fourth most frequent nucleotide'], loc = 'upper left')
yaxis_lower_limit = counts ['fourthMajorBaseRatio'].min()
yaxis_upper_limit = 1
xaxis_lower_limit = counts ['position'].min()
xaxis_upper_limit = counts ['position'].max()
subplot.set_yscale('log')
subplot.axhline(y = 0.01, linewidth=1, linestyle='dashed', color='0.75')
subplot.axis ([xaxis_lower_limit, xaxis_upper_limit, yaxis_lower_limit, yaxis_upper_limit])

f, axarr = plt.subplots(2, sharex=True,figsize=(15,15))
plotFrequenciesOf3LeastFrequentNucleotides(axarr[0], counts, 'Frequency of the 3 least frequent nucleotide by genome position, 3 repeats')
plotFrequenciesOf3LeastFrequentNucleotides(axarr[1], counts2Repeats, 'Frequency of the 3 least frequent nucleotide by genome position, 2 repeats')
plt.xlabel ('Genome position')
plt.tight_layout()

``````
``````

``````

## Impact of sequence coverage on frequency estimation

Now we are going to investigate the impact of sequence coverage on our ability to recover nucleotide frequencies. To do that we compute the Jensen-Shannon divergence (a symmetrised version of the Kullback-Leibler divergence).

``````

In [7]:

import scipy
import math

def KullbackLeibler (p, q):
if len(p) != len(q):
print("Error in KL: lists with different lengths")
exit(-1)
kl = 0
for i in range (len(p)):
ratio = 1
if p[i] == 0 and q[i] == 0:
ratio = 1
elif p[i] == 0:
ratio = 0.0000000001/q[i]
elif q[i] == 0:
ratio = p[i]/0.0000000001
else:
ratio = p[i]/q[i]
kl = kl + p[i]*math.log(ratio)
return kl

def JensenShannon (p, q):
meanpq = [sum(x)/2 for x in zip(p, q)]
js = KullbackLeibler (p, meanpq) + KullbackLeibler (q, meanpq)
return js

def computeDivergences(count1, count2):
ratios1 = list(zip(count1['ARatio'], count1['CRatio'], count1['GRatio'], count1['TRatio']))
ratios2 = list(zip(count2['ARatio'], count2['CRatio'], count2['GRatio'], count2['TRatio']))
result=list()
for i in range(len(ratios1)):
result.append(JensenShannon (ratios1[i], ratios2[i]))
return result

counts2Repeats['Divergence'] = computeDivergences(counts2Repeats, counts)

``````
``````

In [8]:

counts2Repeats.describe()

``````
``````

Out[8]:

position
A
C
G
T
coverage
majorBaseRatio
secondMajorBaseRatio
thirdMajorBaseRatio
fourthMajorBaseRatio
ARatio
CRatio
GRatio
TRatio
Divergence

count
7440.000000
7440.000000
7440.000000
7440.000000
7440.000000
7440.000000
7440.000000
7440.000000
7440.000000
7440.000000
7440.000000
7440.000000
7440.000000
7440.000000
7440.000000

mean
3720.500000
6378.266263
4450.078495
4476.822984
4855.572581
20160.740323
0.999802
0.000182
0.000015
0.000001
0.296490
0.233578
0.229992
0.239940
0.000132

std
2147.887334
16949.345539
13589.430771
13364.229586
14537.080609
23661.456526
0.000905
0.000901
0.000043
0.000009
0.456635
0.423011
0.420754
0.426948
0.000211

min
1.000000
0.000000
0.000000
0.000000
0.000000
59.000000
0.931071
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000

25%
1860.750000
0.000000
0.000000
0.000000
0.000000
6950.000000
0.999751
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000000
0.000027

50%
3720.500000
1.000000
0.000000
0.000000
1.000000
12909.000000
0.999883
0.000104
0.000000
0.000000
0.000044
0.000000
0.000000
0.000022
0.000067

75%
5580.250000
5357.000000
10.000000
8.000000
21.250000
24012.500000
1.000000
0.000225
0.000000
0.000000
0.999728
0.000491
0.000368
0.000700
0.000157

max
7440.000000
203304.000000
191022.000000
182796.000000
179859.000000
203317.000000
1.000000
0.068892
0.000647
0.000235
1.000000
1.000000
1.000000
1.000000
0.004305

``````
``````

In [9]:

%matplotlib inline

def scatterPlotDivergenceVsCoverage (subplot, counts, title):
rcParams['figure.figsize'] = 18, 8
subplot.suptitle(title, fontsize=15, fontweight='bold')
orderDivergenceByCoverage = [x for (y,x) in sorted(zip(counts ['coverage'],counts ['Divergence']))]
smoothedCurve = (runningMeanFast (orderDivergenceByCoverage, 200))
subplot.plot (counts ['coverage'], counts ['Divergence'], 'r.', alpha=0.3)
subplot.plot (sorted(counts ['coverage']), smoothedCurve, '-', alpha=1, linewidth=2)
subplot.ylabel ('Divergence')
fig_size = rcParams["figure.figsize"]
subplot.legend (['Divergence'], loc = 'upper left')
yaxis_lower_limit = counts ['Divergence'].min()
yaxis_upper_limit = counts ['Divergence'].max()
xaxis_lower_limit = counts ['coverage'].min()
xaxis_upper_limit = counts ['coverage'].max()
#subplot.yscale('log')
#subplot.axhline(y = 0.01, linewidth=1, linestyle='dashed', color='0.75')
subplot.axis ([xaxis_lower_limit, xaxis_upper_limit, yaxis_lower_limit, yaxis_upper_limit])

#f, axarr = plt.subplots(1, sharex=True,figsize=(15,15))
scatterPlotDivergenceVsCoverage(plt, counts2Repeats, 'Divergence in nucleotide frequencies between experiences against coverage')
plt.xlabel ('Coverage')
plt.tight_layout()

``````
``````

``````

### Now let's look at the differences in the major and various minor allele frequencies.

``````

In [10]:

%matplotlib inline

def scatterPlotRatioDifferenceVsCoverage (subplot, ratio1, ratio2,  title):
rcParams['figure.figsize'] = 18, 8
subplot.set_title(title, fontsize=15, fontweight='bold')
diffRatio = pd.Series([(x-y)/(x+y) for (x,y) in zip(ratio1,ratio2)])
yaxis_lower_limit = diffRatio.min(skipna=True)
yaxis_upper_limit = diffRatio.max(skipna=True)
xaxis_lower_limit = counts ['coverage'].min()
xaxis_upper_limit = counts ['coverage'].max()
#diffRatio = abs((ratio1 - ratio2)/(ratio1)
orderdiffRatioByCoverage = [x for (y,x) in sorted(zip(counts ['coverage'],diffRatio))]
smoothedCurve = (runningMeanFast (orderdiffRatioByCoverage, 50))
subplot.plot (counts ['coverage'], diffRatio, 'r.', alpha=0.3)
subplot.plot (sorted(counts ['coverage']), smoothedCurve, '-', alpha=1, linewidth=2)
subplot.set_ylabel ('Frequency difference')
fig_size = rcParams["figure.figsize"]
subplot.legend (['Relative frequency difference'], loc = 'upper left')
#subplot.set_yscale('log')
subplot.locator_params(axis='y',nbins=6)
subplot.axhline(y = 0.00, linewidth=2, linestyle='dashed', color='0.1')
subplot.axis ([xaxis_lower_limit, xaxis_upper_limit, yaxis_lower_limit, yaxis_upper_limit])

f, axarr = plt.subplots(4, sharex=True,figsize=(15,15))
scatterPlotRatioDifferenceVsCoverage(axarr[0], counts['majorBaseRatio'], counts2Repeats['majorBaseRatio'], 'Relative difference in nucleotide frequencies between experiences against coverage, major allele')
scatterPlotRatioDifferenceVsCoverage(axarr[1], counts['secondMajorBaseRatio'], counts2Repeats['secondMajorBaseRatio'], 'Relative difference in nucleotide frequencies between experiences against coverage, second major allele')
scatterPlotRatioDifferenceVsCoverage(axarr[2], counts['thirdMajorBaseRatio'], counts2Repeats['thirdMajorBaseRatio'], 'Relative difference in nucleotide frequencies between experiences against coverage, third major allele')
scatterPlotRatioDifferenceVsCoverage(axarr[3], counts['fourthMajorBaseRatio'], counts2Repeats['fourthMajorBaseRatio'], 'Relative difference in nucleotide frequencies between experiences against coverage, fourth major allele')

plt.xlabel ('Coverage')
plt.tight_layout()

``````
``````

``````