In [1]:
text = "There would have been a time for such a word"
pattern = "word"

In [39]:
def naive(text, pattern):
    matches = []
    comparisons = 0
    for i in range(len(text) - len(pattern) + 1):
        for j in range(len(pattern)):
            comparisons += 1
            if text[i+j] != pattern[j]:
                break
        else:
            matches.append(i)
    return (matches, comparisons)

In [23]:
def reverseComplement(dna):
    complements = dict(A='T', C='G', T='A', G='C', N='N')
    revComp = ''.join([complements.get(base, 'N') for base in dna[::-1]])
    return(revComp)

In [24]:
reverseComplement('GATACCA')


Out[24]:
'TGGTATC'

In [21]:
naive(text, pattern)


('working at:', 6)
('working at:', 6)
('working at:', 40)
('working at:', 40)
('working at:', 40)
Out[21]:
([40], 46)

In [5]:
print(text[40:(40+len(pattern))])
print(len(text))
print(len(pattern))


word
44
4

In [30]:
from Bio import SeqIO

def readGenome(filename):
    sequence = str(SeqIO.read(filename, "fasta").seq).upper()
    return(sequence)

In [31]:
fasta_filename = '/home/pvh/PycharmProjects/galaxy-tooltest/test-data/1.fasta'
dna = readGenome(fasta_filename)
print dna[:20]


GTTTGCCATCTTTTGCTGCT

In [33]:
def readFastq(filename):
    sequences = []
    qualities = []
    input_file = open(filename)
    with input_file:
        while True:
            input_file.readline()
            seq = input_file.readline().rstrip()
            input_file.readline()
            qual = input_file.readline()
            if len(seq) == 0:
                break
            sequences.append(seq)
            qualities.append(seq)
    return(sequences, qualities)

In [34]:
genome = readGenome('/tmp/lambda_virus.fa')

In [49]:
pattern = 'AGGT'
rc_pattern = reverseComplement(pattern)
matches = naive(genome, pattern)[0]
rc_matches = naive(genome, rc_pattern)[0]
print len(matches), len(rc_matches), len(rc_matches) + len(matches)


150 156 306

In [52]:
pattern = 'TTAA'
rc_pattern = reverseComplement(pattern)
matches = naive(genome, pattern)[0]
rc_matches = naive(genome, rc_pattern)[0]
all_matches = matches[:]
all_matches.extend(rc_matches)
all_matches = set(all_matches)
print len(matches), len(rc_matches), len(rc_matches) + len(matches)
print len(all_matches)


195 195 390
195

In [55]:
pattern = 'ACTAAGT'
rc_pattern = reverseComplement(pattern)
print rc_pattern
matches = naive(genome, pattern)
print matches[0][0]
rc_matches = naive(genome, rc_pattern)
print rc_matches[0][0]


ACTTAGT
27733
26028

In [56]:
print genome[26028:26028+len(pattern)]


ACTTAGT

In [57]:
pattern = 'AGTCGA'
rc_pattern = reverseComplement(pattern)
print rc_pattern
matches = naive(genome, pattern)
print matches[0][0]
rc_matches = naive(genome, rc_pattern)
print rc_matches[0][0]


TCGACT
18005
450

In [60]:
def naive_2mm(text, pattern):
    matches = []
    comparisons = 0
    for i in range(len(text) - len(pattern) + 1):
        mismatches = 0
        for j in range(len(pattern)):
            comparisons += 1
            if text[i+j] != pattern[j]:
                mismatches += 1
                if mismatches > 2:
                    break
        else:
            matches.append(i)
    return (matches, comparisons)

In [61]:
naive_2mm('ACTTACTTGATAAAGT', 'ACTTTA')


Out[61]:
([0, 4], 43)

In [63]:
len(naive_2mm(genome, 'TTCAAGCC')[0])


Out[63]:
191

In [64]:
naive_2mm(genome, 'AGGAGGTT')[0][0]


Out[64]:
49

In [66]:
from Bio import SeqIO

seq_records = []
for seq_record in SeqIO.parse("/tmp/ERR037900_1.first1000.fastq", "fastq"):
    seq_records.append(seq_record)

In [72]:
%matplotlib inline

In [73]:
from matplotlib import pyplot as plt

In [74]:
plt.plot(seq_records[0].letter_annotations['phred_quality'])


Out[74]:
[<matplotlib.lines.Line2D at 0x7f7e2b48d750>]

In [75]:
plt.plot(seq_records[1].letter_annotations['phred_quality'])


Out[75]:
[<matplotlib.lines.Line2D at 0x7f7e2b33f3d0>]

In [76]:
py_matrix = [sr.letter_annotations['phred_quality'] for sr in seq_records]

In [77]:
import numpy as np

In [78]:
matrix = np.array(py_matrix, dtype=np.int32)

In [79]:
matrix


Out[79]:
array([[39, 39, 39, ..., 34, 27, 36],
       [39, 39, 39, ...,  2,  2,  2],
       [39, 39, 39, ...,  2,  2,  2],
       ..., 
       [31, 30, 31, ...,  2,  2,  2],
       [39, 39, 39, ..., 17, 15, 22],
       [38, 38, 38, ...,  2,  2,  2]], dtype=int32)

In [86]:
plt.plot(np.mean(matrix, axis=0))


Out[86]:
[<matplotlib.lines.Line2D at 0x7f7e2b27d690>]

In [88]:
np.argmin(np.mean(matrix, axis=0))


Out[88]:
66

In [ ]: