In [1]:
import random
random.choice('ACGT')


Out[1]:
'G'

In [2]:
seq = ''
for _ in range(10):
    seq += random.choice('ACGT')
print(seq)


GTCTTGGGAC

In [3]:
seq = ''.join([random.choice('ACGT') for _ in range(10)])
print(seq)


GACCGGCGTT

In [4]:
def longestCommonPrefix(s1, s2):
    i = 0
    while i < len(s1) and i < len(s2) and s1[i] == s2[i]:
        i += 1
    return s1[:i]
longestCommonPrefix('ACCAGTA', 'ACCATGT')


Out[4]:
'ACCA'

In [5]:
def match(s1, s2):
    if not len(s1) == len(s2):
        return False
    for i in range(len(s1)):
        if not s1[i] == s2[i]:
            return False
    return True
match('ATGCT', 'ATGCT')


Out[5]:
True

In [6]:
match('ATGCT', 'ATCGT')


Out[6]:
False

In [7]:
def reverseComplement(s):
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    t = ''
    for base in s:
        t = complement[base] + t
    return t
reverseComplement('ACCGTCG')


Out[7]:
'CGACGGT'

In [8]:
!wget --no-check https://d396qusza40orc.cloudfront.net/ads1/data/lambda_virus.fa


--2015-12-13 15:58:13--  https://d396qusza40orc.cloudfront.net/ads1/data/lambda_virus.fa
Resolving d396qusza40orc.cloudfront.net... 54.240.168.193, 54.240.168.237, 54.240.168.163, ...
Connecting to d396qusza40orc.cloudfront.net|54.240.168.193|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 49270 (48K) [application/octet-stream]
Saving to: 'lambda_virus.fa.1'

lambda_virus.fa.1   100%[=====================>]  48.12K  --.-KB/s   in 0.04s  

2015-12-13 15:58:14 (1.32 MB/s) - 'lambda_virus.fa.1' saved [49270/49270]


In [9]:
def readGenome(filename):
    genome = ''
    with open(filename, 'r') as f:
        for line in f:
            if not line[0] == '>':
                genome += line.rstrip()
    return genome
genome = readGenome('lambda_virus.fa')
genome[:100]


Out[9]:
'GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACC'

In [10]:
len(genome)


Out[10]:
48502

In [11]:
counts = {'A': 0, 'C': 0, 'G': 0, 'T': 0}
for base in genome:
    counts[base] += 1
print(counts)


{'T': 11986, 'G': 12820, 'C': 11362, 'A': 12334}

In [12]:
import collections
collections.Counter(genome)


Out[12]:
Counter({'A': 12334, 'C': 11362, 'G': 12820, 'T': 11986})

In [13]:
def QtoPhred33(Q):
    ''' Turn Q into Phred+33 ASCII-encoded quality '''
    return chr(Q + 33)     # converts integer to character according to ASCII table

In [14]:
def phred33ToQ(qual):
    '''Turn Phred+33 ASCII-encoded quality into Q'''
    return ord(qual) - 33  # converts character to integer according to ASCII table

In [15]:
!wget https://d396qusza40orc.cloudfront.net/ads1/data/SRR835775_1.first1000.fastq


--2015-12-13 15:58:14--  https://d396qusza40orc.cloudfront.net/ads1/data/SRR835775_1.first1000.fastq
Resolving d396qusza40orc.cloudfront.net... 54.240.168.163, 54.240.168.237, 54.230.159.236, ...
Connecting to d396qusza40orc.cloudfront.net|54.240.168.163|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 224786 (220K) [application/octet-stream]
Saving to: 'SRR835775_1.first1000.fastq.1'

SRR835775_1.first10 100%[=====================>] 219.52K  1.26MB/s   in 0.2s   

2015-12-13 15:58:14 (1.26 MB/s) - 'SRR835775_1.first1000.fastq.1' saved [224786/224786]


In [16]:
!head -n 4 SRR835775_1.first1000.fastq


@SRR835775.1 1/1
TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTCACCCTAACCCTAACCCTAACCGTATCCGTCACCCTAACCCTAAC
+
???B1ADDD8??BB+C?B+:AA883CEE8?C3@DDD3)?D2;DC?8?=BAD=@C@(.6.6=A?=?@##################################

In [21]:
def readFastq(filename):
    sequences = []
    qualities = []
    with open(filename) as fh:
        while True:
            fh.readline()                 # identifier line
            seq = fh.readline().rstrip()  # sequence line
            fh.readline()                 # '+' line
            qual = fh.readline().rstrip() # quality line
            if len(seq) == 0:
                break
            sequences.append(seq)
            qualities.append(qual)
    return sequences, qualities
seqs, quals = readFastq('SRR835775_1.first1000.fastq')

In [22]:
print(seqs[:5])


['TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTCACCCTAACCCTAACCCTAACCGTATCCGTCACCCTAACCCTAAC', 'TAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACC', 'TAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGG', 'TAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAAGGGTTGGGGGTTAGGGGTAGGGGTAGGGTTA', 'CTAACCCTAACCCTAACCCTAACCCTACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCTAACCCTAACCCTCACC']

In [23]:
print(quals[:5])


['???B1ADDD8??BB+C?B+:AA883CEE8?C3@DDD3)?D2;DC?8?=BAD=@C@(.6.6=A?=?@##################################', 'CCCFFFFFGHHGHJJJJJIJGIIJJJJJJJIJIJJJJJFJJFGIIIIH=CBFCF=CCEG)=>EHB2@@DEC>;;?=;(=?BBD?59?BA###########', '@@<DD?DDHHH<CBHII:CFGIGAGHIIG?CCGGE0BDHIIHIGICH8=FHGI=@EHGGGEEHH>);?CC@.;(=;?59,5<A599?CB>ABBCB><88A', '@CCFFDDFHHHDFHIJJCGGIJJHIIHJC?DHHIJ0?DGHI9BBFHICGGIGI=CDEGI=?AAEF7@?################################', '@<@FDFDDBBFHH@EBGGGEH@FFHE;EHIEGGBHG?)9?8BDGGBGGBCDGI=93=C6==C;CCD(?@>@#############################']

In [26]:
def createHist(qualities):
    hist = [0] * 50
    for qual in qualities:
        for phred in qual:
            q = phred33ToQ(phred)
            hist[q] += 1
    return hist
h = createHist(quals)
print(h)


[0, 0, 6178, 0, 0, 54, 108, 574, 345, 83, 193, 124, 79, 165, 49, 236, 184, 327, 514, 238, 531, 254, 313, 798, 992, 888, 1396, 1488, 993, 1752, 3387, 4487, 3248, 5476, 8375, 11814, 4243, 7827, 6579, 8179, 9349, 8180, 0, 0, 0, 0, 0, 0, 0, 0]

In [28]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.bar(range(len(h)), h)
plt.show()



In [30]:
def findGCByPos(reads):
    gc = [0] * 100
    totals = [0] * 100
    for read in reads:
        for i in range(len(read)):
            if read[i] == 'C' or read[i] == 'G':
                gc[i] += 1
            totals[i] += 1
    for i in range(len(gc)):
        if totals[i] > 0:
            gc[i] /= float(totals[i])
    return gc

gc = findGCByPos(seqs)
plt.plot(range(len(gc)), gc)
plt.show()



In [32]:
import collections
count = collections.Counter()
for seq in seqs:
    count.update(seq)
print(count)


Counter({'G': 28742, 'C': 28272, 'T': 21836, 'A': 21132, 'N': 18})

Exact Matching


In [1]:
t = 'There would have been a time for such a word'
print(t.find('word'))


40

In [5]:
def naive(p, t):
    occurrences = []
    mismatchCount = 0
    matchCount = 0
    for i in range(len(t) - len(p) + 1):  # loop over alignments
        match = True
        for j in range(len(p)):  # loop over characters
            if t[i + j] != p[j]: # compare characters
                match = False    # mismatch; reject alignment
                mismatchCount += 1
                break
            matchCount += 1
        if match:
            occurrences.append(i) # all characters matched; record
    return matchCount, mismatchCount, occurrences

match, mismatch, indexes = naive('word', t)
print("Mismatch Count: %d | Match Count: %d | Pattern Occurrence at: %s" % (mismatch, match, indexes))


Mismatch Count: 40 | Match Count: 6 | Pattern Occurrence at: [40]

In [ ]: