notebook.community

Edit and run



In [80]:

    
def naive(p, t):
    occurrences = []
    for i in range(len(t) - len(p) + 1):  # loop over alignments
        match = True
        for j in range(len(p)):  # loop over characters
            if t[i+j] != p[j]:  # compare characters
                match = False
                break
        if match:
            occurrences.append(i)  # all chars matched; record
    return occurrences



In [81]:

    
def reverseComplement(s):
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
    t = []
    for base in s:
        t.append(complement[base])
    return ''.join(t)



In [82]:

    
def naive_with_rc(p, t):
    c = reverseComplement(p)
    locations = naive(p, t)
    if p != c:
        locations.extend(naive(c, t))
    return locations



In [83]:

    
# Example 1
p = 'CCC'
ten_as = 'AAAAAAAAAA'
t = ten_as + 'CCC' + ten_as + 'GGG' + ten_as
occurrences = naive_with_rc(p, t)
print(occurrences)



In [84]:

    
# Example 2
p = 'CGCG'
t = ten_as + 'CGCG' + ten_as + 'CGCG' + ten_as
occurrences = naive_with_rc(p, t)
print(occurrences)



In [5]:

    
def readGenome(filename):
    genome = ''
    with open(filename, 'r') as f:
        for line in f:
            # ignore header line with genome information
            if not line[0] == '>':
                genome += line.rstrip()
    return genome



In [6]:

    
# Lambda Virus Genome
!rm lambda_virus.fa
!wget https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/lambda_virus.fa









    



--2015-12-20 22:22:11--  https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/lambda_virus.fa
Resolving d28rh4a8wq0iu5.cloudfront.net... 54.240.168.253, 54.240.168.66, 54.240.168.40, ...
Connecting to d28rh4a8wq0iu5.cloudfront.net|54.240.168.253|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 49270 (48K) [application/octet-stream]
Saving to: 'lambda_virus.fa'

lambda_virus.fa     100%[===================>]  48.12K   307KB/s    in 0.2s    

2015-12-20 22:22:13 (307 KB/s) - 'lambda_virus.fa' saved [49270/49270]



In [7]:

    
lambda_va = readGenome('lambda_virus.fa')



In [8]:

    
# Question 1
# How many times does AGGT or its reverse complement (ACCT) 
# occur in the lambda virus genome? E.g. if AGGT occurs 10 times and 
# ACCT occurs 12 times, you should report 22.
occurrences = naive_with_rc('AGGT', lambda_va)
print ("Answer 1: %d" % len(occurrences))









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-8-3127764bca26> in <module>()
      3 # occur in the lambda virus genome? E.g. if AGGT occurs 10 times and
      4 # ACCT occurs 12 times, you should report 22.
----> 5 occurrences = naive_with_rc('AGGT', lambda_va)
      6 print ("Answer 1: %d" % len(occurrences))

NameError: name 'naive_with_rc' is not defined



In [102]:

    
# Question 2
# How many times does TTAA or its reverse complement occur in the 
# lambda virus genome? Hint: TTAA and its reverse complement are equal, 
# so remember not to double count.
occurrences = naive_with_rc('TTAA', lambda_va)
print ("Answer 2: %d" % len(occurrences))









    



Answer 2: 384



In [117]:

    
def leftmost(p, t):
    occurrences = naive_with_rc(p, t)
    #complement = reverseComplement(p)
    #occurrences.extend(naive(complement, lambda_va))
    leftmost = min(occurrences)
    return leftmost



In [118]:

    
# Question 3
# What is the offset of the leftmost occurrence of ACTAAGT or 
# its reverse complement in the Lambda virus genome? E.g. if the 
# leftmost occurrence of ACTAAGT is at offset 40 (0-based) and the 
# leftmost occurrence of its reverse complement ACTTAGT is at offset 29, 
# then report 29.

print ("Answer 3: ", leftmost('ACTAAGT', lambda_va))









    



Answer 3:  7092



In [116]:

    
# Question 4
# What is the offset of the leftmost occurrence of AGTCGA 
# or its reverse complement in the Lambda virus genome?

print ("Answer 4: ", leftmost('AGTCGA', lambda_va))









    



Answer 4:  7756



In [1]:

    
def naive_2mm(p, t, mm = 2):
    occurrences = []
    for i in range(len(t) - len(p) + 1):  # loop over alignments
        match = True
        mismatchCounter = 0
        for j in range(len(p)):  # loop over characters
            if t[i+j] != p[j]:  # compare characters
                mismatchCounter += 1
                if mismatchCounter > mm:
                    break
        if mismatchCounter <= mm:
            occurrences.append(i)  # all chars matched; record
    return occurrences



In [2]:

    
maxMismatch = 2
occurrences = naive_2mm('ACTTTA', 'ACTTACTTGATAAAGT', maxMismatch)
print ("Naive Match with %d mismatches: %s" % (maxMismatch, occurrences))









    



Naive Match with 2 mismatches: [0, 4]



In [95]:

    
# Example 1 - naive_2mm
p = 'CTGT'
ten_as = 'AAAAAAAAAA'
t = ten_as + 'CTGT' + ten_as + 'CTTT' + ten_as + 'CGGG' + ten_as
occurrences = naive_2mm(p, t)
print(occurrences)









    



[10, 24, 38]



In [9]:

    
# Question 5
# How many times does TTCAAGCC occur in the Lambda virus genome when allowing up to 2 mismatches?

occurrences = naive_2mm('TTCAAGCC', lambda_va)
print('# Answer 5: %d' % len(occurrences))









    



# Answer 5: 191



In [10]:

    
# Question 6
# What is the offset of the leftmost occurrence of AGGAGGTT 
# in the Lambda virus genome when allowing up to 2 mismatches?

occurrences = naive_2mm('AGGAGGTT', lambda_va)
print('Answer 6: %d' % min(occurrences))









    



Answer 6: 49



In [98]:

    
!rm ERR037900_1.first1000.fastq
!wget https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ERR037900_1.first1000.fastq









    



--2015-12-18 23:00:45--  https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ERR037900_1.first1000.fastq
Resolving d28rh4a8wq0iu5.cloudfront.net... 54.240.168.238, 54.240.168.40, 54.240.168.81, ...
Connecting to d28rh4a8wq0iu5.cloudfront.net|54.240.168.238|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 241626 (236K) [application/octet-stream]
Saving to: 'ERR037900_1.first1000.fastq'

ERR037900_1.first10 100%[=====================>] 235.96K  --.-KB/s   in 0.04s  

2015-12-18 23:00:46 (5.33 MB/s) - 'ERR037900_1.first1000.fastq' saved [241626/241626]



In [99]:

    
def readFastq(filename):
    sequences = []
    qualities = []
    with open(filename) as fh:
        while True:
            fh.readline()  # skip name line
            seq = fh.readline().rstrip()  # read base sequence
            fh.readline()  # skip placeholder line
            qual = fh.readline().rstrip() # base quality line
            if len(seq) == 0:
                break
            sequences.append(seq)
            qualities.append(qual)
    return sequences, qualities



In [100]:

    
human = readGenome('ERR037900_1.first1000.fastq')



In [ ]: