In [80]:
def naive(p, t):
occurrences = []
for i in range(len(t) - len(p) + 1): # loop over alignments
match = True
for j in range(len(p)): # loop over characters
if t[i+j] != p[j]: # compare characters
match = False
break
if match:
occurrences.append(i) # all chars matched; record
return occurrences
In [81]:
def reverseComplement(s):
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
t = []
for base in s:
t.append(complement[base])
return ''.join(t)
In [82]:
def naive_with_rc(p, t):
c = reverseComplement(p)
locations = naive(p, t)
if p != c:
locations.extend(naive(c, t))
return locations
In [83]:
# Example 1
p = 'CCC'
ten_as = 'AAAAAAAAAA'
t = ten_as + 'CCC' + ten_as + 'GGG' + ten_as
occurrences = naive_with_rc(p, t)
print(occurrences)
In [84]:
# Example 2
p = 'CGCG'
t = ten_as + 'CGCG' + ten_as + 'CGCG' + ten_as
occurrences = naive_with_rc(p, t)
print(occurrences)
In [5]:
def readGenome(filename):
genome = ''
with open(filename, 'r') as f:
for line in f:
# ignore header line with genome information
if not line[0] == '>':
genome += line.rstrip()
return genome
In [6]:
# Lambda Virus Genome
!rm lambda_virus.fa
!wget https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/lambda_virus.fa
In [7]:
lambda_va = readGenome('lambda_virus.fa')
In [8]:
# Question 1
# How many times does AGGT or its reverse complement (ACCT)
# occur in the lambda virus genome? E.g. if AGGT occurs 10 times and
# ACCT occurs 12 times, you should report 22.
occurrences = naive_with_rc('AGGT', lambda_va)
print ("Answer 1: %d" % len(occurrences))
In [102]:
# Question 2
# How many times does TTAA or its reverse complement occur in the
# lambda virus genome? Hint: TTAA and its reverse complement are equal,
# so remember not to double count.
occurrences = naive_with_rc('TTAA', lambda_va)
print ("Answer 2: %d" % len(occurrences))
In [117]:
def leftmost(p, t):
occurrences = naive_with_rc(p, t)
#complement = reverseComplement(p)
#occurrences.extend(naive(complement, lambda_va))
leftmost = min(occurrences)
return leftmost
In [118]:
# Question 3
# What is the offset of the leftmost occurrence of ACTAAGT or
# its reverse complement in the Lambda virus genome? E.g. if the
# leftmost occurrence of ACTAAGT is at offset 40 (0-based) and the
# leftmost occurrence of its reverse complement ACTTAGT is at offset 29,
# then report 29.
print ("Answer 3: ", leftmost('ACTAAGT', lambda_va))
In [116]:
# Question 4
# What is the offset of the leftmost occurrence of AGTCGA
# or its reverse complement in the Lambda virus genome?
print ("Answer 4: ", leftmost('AGTCGA', lambda_va))
In [1]:
def naive_2mm(p, t, mm = 2):
occurrences = []
for i in range(len(t) - len(p) + 1): # loop over alignments
match = True
mismatchCounter = 0
for j in range(len(p)): # loop over characters
if t[i+j] != p[j]: # compare characters
mismatchCounter += 1
if mismatchCounter > mm:
break
if mismatchCounter <= mm:
occurrences.append(i) # all chars matched; record
return occurrences
In [2]:
maxMismatch = 2
occurrences = naive_2mm('ACTTTA', 'ACTTACTTGATAAAGT', maxMismatch)
print ("Naive Match with %d mismatches: %s" % (maxMismatch, occurrences))
In [95]:
# Example 1 - naive_2mm
p = 'CTGT'
ten_as = 'AAAAAAAAAA'
t = ten_as + 'CTGT' + ten_as + 'CTTT' + ten_as + 'CGGG' + ten_as
occurrences = naive_2mm(p, t)
print(occurrences)
In [9]:
# Question 5
# How many times does TTCAAGCC occur in the Lambda virus genome when allowing up to 2 mismatches?
occurrences = naive_2mm('TTCAAGCC', lambda_va)
print('# Answer 5: %d' % len(occurrences))
In [10]:
# Question 6
# What is the offset of the leftmost occurrence of AGGAGGTT
# in the Lambda virus genome when allowing up to 2 mismatches?
occurrences = naive_2mm('AGGAGGTT', lambda_va)
print('Answer 6: %d' % min(occurrences))
In [98]:
!rm ERR037900_1.first1000.fastq
!wget https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ERR037900_1.first1000.fastq
In [99]:
def readFastq(filename):
sequences = []
qualities = []
with open(filename) as fh:
while True:
fh.readline() # skip name line
seq = fh.readline().rstrip() # read base sequence
fh.readline() # skip placeholder line
qual = fh.readline().rstrip() # base quality line
if len(seq) == 0:
break
sequences.append(seq)
qualities.append(qual)
return sequences, qualities
In [100]:
human = readGenome('ERR037900_1.first1000.fastq')
In [ ]: