In [1]:
import random
random.choice('ACGT')
Out[1]:
In [2]:
seq = ''
for _ in range(10):
seq += random.choice('ACGT')
print(seq)
In [3]:
seq = ''.join([random.choice('ACGT') for _ in range(10)])
print(seq)
In [4]:
def longestCommonPrefix(s1, s2):
i = 0
while i < len(s1) and i < len(s2) and s1[i] == s2[i]:
i += 1
return s1[:i]
longestCommonPrefix('ACCAGTA', 'ACCATGT')
Out[4]:
In [5]:
def match(s1, s2):
if not len(s1) == len(s2):
return False
for i in range(len(s1)):
if not s1[i] == s2[i]:
return False
return True
match('ATGCT', 'ATGCT')
Out[5]:
In [6]:
match('ATGCT', 'ATCGT')
Out[6]:
In [7]:
def reverseComplement(s):
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
t = ''
for base in s:
t = complement[base] + t
return t
reverseComplement('ACCGTCG')
Out[7]:
In [8]:
!wget --no-check https://d396qusza40orc.cloudfront.net/ads1/data/lambda_virus.fa
In [9]:
def readGenome(filename):
genome = ''
with open(filename, 'r') as f:
for line in f:
if not line[0] == '>':
genome += line.rstrip()
return genome
genome = readGenome('lambda_virus.fa')
genome[:100]
Out[9]:
In [10]:
len(genome)
Out[10]:
In [11]:
counts = {'A': 0, 'C': 0, 'G': 0, 'T': 0}
for base in genome:
counts[base] += 1
print(counts)
In [12]:
import collections
collections.Counter(genome)
Out[12]:
In [13]:
def QtoPhred33(Q):
''' Turn Q into Phred+33 ASCII-encoded quality '''
return chr(Q + 33) # converts integer to character according to ASCII table
In [14]:
def phred33ToQ(qual):
'''Turn Phred+33 ASCII-encoded quality into Q'''
return ord(qual) - 33 # converts character to integer according to ASCII table
In [15]:
!wget https://d396qusza40orc.cloudfront.net/ads1/data/SRR835775_1.first1000.fastq
In [16]:
!head -n 4 SRR835775_1.first1000.fastq
In [21]:
def readFastq(filename):
sequences = []
qualities = []
with open(filename) as fh:
while True:
fh.readline() # identifier line
seq = fh.readline().rstrip() # sequence line
fh.readline() # '+' line
qual = fh.readline().rstrip() # quality line
if len(seq) == 0:
break
sequences.append(seq)
qualities.append(qual)
return sequences, qualities
seqs, quals = readFastq('SRR835775_1.first1000.fastq')
In [22]:
print(seqs[:5])
In [23]:
print(quals[:5])
In [26]:
def createHist(qualities):
hist = [0] * 50
for qual in qualities:
for phred in qual:
q = phred33ToQ(phred)
hist[q] += 1
return hist
h = createHist(quals)
print(h)
In [28]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.bar(range(len(h)), h)
plt.show()
In [30]:
def findGCByPos(reads):
gc = [0] * 100
totals = [0] * 100
for read in reads:
for i in range(len(read)):
if read[i] == 'C' or read[i] == 'G':
gc[i] += 1
totals[i] += 1
for i in range(len(gc)):
if totals[i] > 0:
gc[i] /= float(totals[i])
return gc
gc = findGCByPos(seqs)
plt.plot(range(len(gc)), gc)
plt.show()
In [32]:
import collections
count = collections.Counter()
for seq in seqs:
count.update(seq)
print(count)
In [1]:
t = 'There would have been a time for such a word'
print(t.find('word'))
In [5]:
def naive(p, t):
occurrences = []
mismatchCount = 0
matchCount = 0
for i in range(len(t) - len(p) + 1): # loop over alignments
match = True
for j in range(len(p)): # loop over characters
if t[i + j] != p[j]: # compare characters
match = False # mismatch; reject alignment
mismatchCount += 1
break
matchCount += 1
if match:
occurrences.append(i) # all characters matched; record
return matchCount, mismatchCount, occurrences
match, mismatch, indexes = naive('word', t)
print("Mismatch Count: %d | Match Count: %d | Pattern Occurrence at: %s" % (mismatch, match, indexes))
In [ ]: