In [1]:
text = "There would have been a time for such a word"
pattern = "word"
In [39]:
def naive(text, pattern):
matches = []
comparisons = 0
for i in range(len(text) - len(pattern) + 1):
for j in range(len(pattern)):
comparisons += 1
if text[i+j] != pattern[j]:
break
else:
matches.append(i)
return (matches, comparisons)
In [23]:
def reverseComplement(dna):
complements = dict(A='T', C='G', T='A', G='C', N='N')
revComp = ''.join([complements.get(base, 'N') for base in dna[::-1]])
return(revComp)
In [24]:
reverseComplement('GATACCA')
Out[24]:
In [21]:
naive(text, pattern)
Out[21]:
In [5]:
print(text[40:(40+len(pattern))])
print(len(text))
print(len(pattern))
In [30]:
from Bio import SeqIO
def readGenome(filename):
sequence = str(SeqIO.read(filename, "fasta").seq).upper()
return(sequence)
In [31]:
fasta_filename = '/home/pvh/PycharmProjects/galaxy-tooltest/test-data/1.fasta'
dna = readGenome(fasta_filename)
print dna[:20]
In [33]:
def readFastq(filename):
sequences = []
qualities = []
input_file = open(filename)
with input_file:
while True:
input_file.readline()
seq = input_file.readline().rstrip()
input_file.readline()
qual = input_file.readline()
if len(seq) == 0:
break
sequences.append(seq)
qualities.append(seq)
return(sequences, qualities)
In [34]:
genome = readGenome('/tmp/lambda_virus.fa')
In [49]:
pattern = 'AGGT'
rc_pattern = reverseComplement(pattern)
matches = naive(genome, pattern)[0]
rc_matches = naive(genome, rc_pattern)[0]
print len(matches), len(rc_matches), len(rc_matches) + len(matches)
In [52]:
pattern = 'TTAA'
rc_pattern = reverseComplement(pattern)
matches = naive(genome, pattern)[0]
rc_matches = naive(genome, rc_pattern)[0]
all_matches = matches[:]
all_matches.extend(rc_matches)
all_matches = set(all_matches)
print len(matches), len(rc_matches), len(rc_matches) + len(matches)
print len(all_matches)
In [55]:
pattern = 'ACTAAGT'
rc_pattern = reverseComplement(pattern)
print rc_pattern
matches = naive(genome, pattern)
print matches[0][0]
rc_matches = naive(genome, rc_pattern)
print rc_matches[0][0]
In [56]:
print genome[26028:26028+len(pattern)]
In [57]:
pattern = 'AGTCGA'
rc_pattern = reverseComplement(pattern)
print rc_pattern
matches = naive(genome, pattern)
print matches[0][0]
rc_matches = naive(genome, rc_pattern)
print rc_matches[0][0]
In [60]:
def naive_2mm(text, pattern):
matches = []
comparisons = 0
for i in range(len(text) - len(pattern) + 1):
mismatches = 0
for j in range(len(pattern)):
comparisons += 1
if text[i+j] != pattern[j]:
mismatches += 1
if mismatches > 2:
break
else:
matches.append(i)
return (matches, comparisons)
In [61]:
naive_2mm('ACTTACTTGATAAAGT', 'ACTTTA')
Out[61]:
In [63]:
len(naive_2mm(genome, 'TTCAAGCC')[0])
Out[63]:
In [64]:
naive_2mm(genome, 'AGGAGGTT')[0][0]
Out[64]:
In [66]:
from Bio import SeqIO
seq_records = []
for seq_record in SeqIO.parse("/tmp/ERR037900_1.first1000.fastq", "fastq"):
seq_records.append(seq_record)
In [72]:
%matplotlib inline
In [73]:
from matplotlib import pyplot as plt
In [74]:
plt.plot(seq_records[0].letter_annotations['phred_quality'])
Out[74]:
In [75]:
plt.plot(seq_records[1].letter_annotations['phred_quality'])
Out[75]:
In [76]:
py_matrix = [sr.letter_annotations['phred_quality'] for sr in seq_records]
In [77]:
import numpy as np
In [78]:
matrix = np.array(py_matrix, dtype=np.int32)
In [79]:
matrix
Out[79]:
In [86]:
plt.plot(np.mean(matrix, axis=0))
Out[86]:
In [88]:
np.argmin(np.mean(matrix, axis=0))
Out[88]:
In [ ]: