Recap

  • functions

In [ ]:
# built-in functions
seq = 'ATCCTGCTAAA'
print(len(seq))

In [ ]:
# your own function
def gc_content(seq):
    gc = 0
    for base in seq:
        if (base == 'C') or (base == 'G'):
            gc += 1
    return gc

print(gc_content('ATCCTGCTAAA'))
print(gc_content('GGGCCCCTTTA'))

Session 3: modules

  • math, os.path, csv and pandas
  • create own module

In [ ]:
import math
print(math.pi)

In [ ]:
import os.path
seq_filename = os.path.join('data', 'seq.txt')
print(os.path.exists(seq_filename))
print(os.path.dirname(seq_filename))
print(os.path.basename(seq_filename))

Ex 3.1

  • read a tab delimited file data/genes.txt
  • check if the file exists
  • calculate the lenght of each gene
  • write the results into a tab separated file

In [ ]:
data_filename = os.path.join('data', 'genes.txt')
if os.path.exists(data_filename):
    with open(data_filename) as data:
        header = data.readline()
        with open('results.txt', 'w') as out:
            for line in data:
                #gene, chrom, start, end = line.strip().split()
                row = line.strip().split()
                print(int(row[3])-int(row[2]))
                #out.write('{}\t{}\n'.format(gene, int(end)-int(start)+1))
else:
    print('{} file does not exist'.format(data_filename))

In [ ]:
def gc_content(seq):
    gc = 0
    for base in seq:
        if (base == 'C') or (base == 'G'):
            gc += 1
    return gc

seq_filename = os.path.join('data', 'seq.txt')
if os.path.exists(seq_filename):
    with open (seq_filename) as data:
        with open('gc_content_data.csv', 'w') as out:           
            for line in data:
                seq = line.strip()
                out.write('{},{}\n'.format(seq, gc_content(seq)))

cvs module


In [ ]:
import csv
data_filename = 'gc_content_data.csv'
if os.path.exists(data_filename):
    with open(data_filename) as data:
        reader = csv.reader(data, delimiter=',')
        for row in reader:
            print(row)

In [ ]:
import csv
# add column header to data file called seq,gc
data_filename = 'gc_content_data.csv'
results = []
if os.path.exists(data_filename):
    with open(data_filename) as data:
        reader = csv.DictReader(data, delimiter=',')
        for row in reader:
            results.append(row)

for r in results:
    print('{}\t{}'.format(r['seq'], r['gc']))

In [ ]:
import csv
with open('output.txt', 'w') as out:
    writer = csv.DictWriter(out, fieldnames=['seq', 'gc'], delimiter='\t')
    for r in results:
        writer.writerow(r)

Ex 3.2

  • change the script you wrote for ex 3.1 to make use of the csv module

In [ ]:
data_filename = os.path.join('data', 'genes.txt')
results = []
if os.path.exists(data_filename):
    with open(data_filename) as data:
        reader = csv.DictReader(data, delimiter='\t')
        for row in reader:
            results.append({'gene': row['gene'], 'len': int(row['end'])-int(row['start'])+1})
else:
    print('{} file does not exist'.format(data_filename))
    
with open('results_with_csv.txt', 'w') as out:
    writer = csv.DictWriter(out, fieldnames=['gene', 'len'], delimiter='\t')
    for r in results:
        writer.writerow(r)

#print(results)

In [ ]:
import pandas
data = pandas.read_csv('results_with_csv.txt', sep='\t')
print(data)

Writing your own module


In [ ]:
def gc_content(seq):
    gc = 0
    for base in seq:
        if (base == 'C') or (base == 'G'):
            gc += 1
    return gc

In [ ]:
import tools
print(tools.gc_content('CCCTTCGCTT'))

In [ ]:
from tools import gc_content
print(gc_content('AAAAA'))

Ex 3.3

  • write a function that extract a list of overlapping sub-sequences of a given sequence for a given window size

In [ ]:
def extract_seq(seq, window_size):
    results = []
    nb_windows = len(seq) - window_size + 1
    for i in range(nb_windows):
        results.append(seq[i:i+window_size])
    return results

seq = 'ATTCCGGGCCTTAAAA'
print(extract_seq(seq, 5))

Ex 3.4

  • calculate the gc content along the DNA sequence by combining the two functiona writen using the tools module

In [ ]:
import tools
seq = 'ATTCCGGGCCTTAAAA'
for sub_seq in tools.extract_seq(seq, 5):
    print(tools.gc_content(sub_seq))