In [80]:
def naive(p, t):
    occurrences = []
    for i in range(len(t) - len(p) + 1):  # loop over alignments
        match = True
        for j in range(len(p)):  # loop over characters
            if t[i+j] != p[j]:  # compare characters
                match = False
                break
        if match:
            occurrences.append(i)  # all chars matched; record
    return occurrences

In [81]:
def reverseComplement(s):
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
    t = []
    for base in s:
        t.append(complement[base])
    return ''.join(t)

In [82]:
def naive_with_rc(p, t):
    c = reverseComplement(p)
    locations = naive(p, t)
    if p != c:
        locations.extend(naive(c, t))
    return locations

In [83]:
# Example 1
p = 'CCC'
ten_as = 'AAAAAAAAAA'
t = ten_as + 'CCC' + ten_as + 'GGG' + ten_as
occurrences = naive_with_rc(p, t)
print(occurrences)


[10, 23]

In [84]:
# Example 2
p = 'CGCG'
t = ten_as + 'CGCG' + ten_as + 'CGCG' + ten_as
occurrences = naive_with_rc(p, t)
print(occurrences)


[10, 24]

In [5]:
def readGenome(filename):
    genome = ''
    with open(filename, 'r') as f:
        for line in f:
            # ignore header line with genome information
            if not line[0] == '>':
                genome += line.rstrip()
    return genome

In [6]:
# Lambda Virus Genome
!rm lambda_virus.fa
!wget https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/lambda_virus.fa


--2015-12-20 22:22:11--  https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/lambda_virus.fa
Resolving d28rh4a8wq0iu5.cloudfront.net... 54.240.168.253, 54.240.168.66, 54.240.168.40, ...
Connecting to d28rh4a8wq0iu5.cloudfront.net|54.240.168.253|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 49270 (48K) [application/octet-stream]
Saving to: 'lambda_virus.fa'

lambda_virus.fa     100%[===================>]  48.12K   307KB/s    in 0.2s    

2015-12-20 22:22:13 (307 KB/s) - 'lambda_virus.fa' saved [49270/49270]


In [7]:
lambda_va = readGenome('lambda_virus.fa')

In [8]:
# Question 1
# How many times does AGGT or its reverse complement (ACCT) 
# occur in the lambda virus genome? E.g. if AGGT occurs 10 times and 
# ACCT occurs 12 times, you should report 22.
occurrences = naive_with_rc('AGGT', lambda_va)
print ("Answer 1: %d" % len(occurrences))


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-8-3127764bca26> in <module>()
      3 # occur in the lambda virus genome? E.g. if AGGT occurs 10 times and
      4 # ACCT occurs 12 times, you should report 22.
----> 5 occurrences = naive_with_rc('AGGT', lambda_va)
      6 print ("Answer 1: %d" % len(occurrences))

NameError: name 'naive_with_rc' is not defined

In [102]:
# Question 2
# How many times does TTAA or its reverse complement occur in the 
# lambda virus genome? Hint: TTAA and its reverse complement are equal, 
# so remember not to double count.
occurrences = naive_with_rc('TTAA', lambda_va)
print ("Answer 2: %d" % len(occurrences))


Answer 2: 384

In [117]:
def leftmost(p, t):
    occurrences = naive_with_rc(p, t)
    #complement = reverseComplement(p)
    #occurrences.extend(naive(complement, lambda_va))
    leftmost = min(occurrences)
    return leftmost

In [118]:
# Question 3
# What is the offset of the leftmost occurrence of ACTAAGT or 
# its reverse complement in the Lambda virus genome? E.g. if the 
# leftmost occurrence of ACTAAGT is at offset 40 (0-based) and the 
# leftmost occurrence of its reverse complement ACTTAGT is at offset 29, 
# then report 29.

print ("Answer 3: ", leftmost('ACTAAGT', lambda_va))


Answer 3:  7092

In [116]:
# Question 4
# What is the offset of the leftmost occurrence of AGTCGA 
# or its reverse complement in the Lambda virus genome?

print ("Answer 4: ", leftmost('AGTCGA', lambda_va))


Answer 4:  7756

In [1]:
def naive_2mm(p, t, mm = 2):
    occurrences = []
    for i in range(len(t) - len(p) + 1):  # loop over alignments
        match = True
        mismatchCounter = 0
        for j in range(len(p)):  # loop over characters
            if t[i+j] != p[j]:  # compare characters
                mismatchCounter += 1
                if mismatchCounter > mm:
                    break
        if mismatchCounter <= mm:
            occurrences.append(i)  # all chars matched; record
    return occurrences

In [2]:
maxMismatch = 2
occurrences = naive_2mm('ACTTTA', 'ACTTACTTGATAAAGT', maxMismatch)
print ("Naive Match with %d mismatches: %s" % (maxMismatch, occurrences))


Naive Match with 2 mismatches: [0, 4]

In [95]:
# Example 1 - naive_2mm
p = 'CTGT'
ten_as = 'AAAAAAAAAA'
t = ten_as + 'CTGT' + ten_as + 'CTTT' + ten_as + 'CGGG' + ten_as
occurrences = naive_2mm(p, t)
print(occurrences)


[10, 24, 38]

In [9]:
# Question 5
# How many times does TTCAAGCC occur in the Lambda virus genome when allowing up to 2 mismatches?

occurrences = naive_2mm('TTCAAGCC', lambda_va)
print('# Answer 5: %d' % len(occurrences))


# Answer 5: 191

In [10]:
# Question 6
# What is the offset of the leftmost occurrence of AGGAGGTT 
# in the Lambda virus genome when allowing up to 2 mismatches?

occurrences = naive_2mm('AGGAGGTT', lambda_va)
print('Answer 6: %d' % min(occurrences))


Answer 6: 49

In [98]:
!rm ERR037900_1.first1000.fastq
!wget https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ERR037900_1.first1000.fastq


--2015-12-18 23:00:45--  https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ERR037900_1.first1000.fastq
Resolving d28rh4a8wq0iu5.cloudfront.net... 54.240.168.238, 54.240.168.40, 54.240.168.81, ...
Connecting to d28rh4a8wq0iu5.cloudfront.net|54.240.168.238|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 241626 (236K) [application/octet-stream]
Saving to: 'ERR037900_1.first1000.fastq'

ERR037900_1.first10 100%[=====================>] 235.96K  --.-KB/s   in 0.04s  

2015-12-18 23:00:46 (5.33 MB/s) - 'ERR037900_1.first1000.fastq' saved [241626/241626]


In [99]:
def readFastq(filename):
    sequences = []
    qualities = []
    with open(filename) as fh:
        while True:
            fh.readline()  # skip name line
            seq = fh.readline().rstrip()  # read base sequence
            fh.readline()  # skip placeholder line
            qual = fh.readline().rstrip() # base quality line
            if len(seq) == 0:
                break
            sequences.append(seq)
            qualities.append(qual)
    return sequences, qualities

In [100]:
human = readGenome('ERR037900_1.first1000.fastq')

In [ ]: