Recap

  • basic python

Python basics


In [ ]:
# list: an ordered, indexable collection that may mix value types
my_name = 'Anne'
my_list = [2, 4, 6, 8]
my_list.append(my_name)
print(my_list)
# indexing starts at 0, so position 1 holds the second element
second_item = my_list[1]
print(second_item)

In [ ]:
# dictionary: maps each key (a base letter) to a value (the base name)
my_dict = dict(A='Adenine', C='Cytosine')
print(my_dict)
# look a value up by its key
adenine_name = my_dict['A']
print(adenine_name)

In [ ]:
# string: split() cuts on whitespace, join() glues the pieces back with a separator
seq = 'ATC CTG TAC TTT'
codons = seq.split()
new_seq = ','.join(codons)
print(codons)
print(new_seq)

In [ ]:
# loop: iterating over a string visits its characters one at a time
seq = 'ATCCTGTACTT'
for base in seq:
    print(base)

In [ ]:
# condition: the indented line runs only when the comparison is true
base = 'A'
if base == 'A':
    print('found base A')

In [ ]:
# loop and condition combined: count the G and C bases, echoing each one found
seq = 'ATCCTGTACTT'
gc = 0
for base in seq:
    if base not in ('G', 'C'):
        continue  # neither G nor C: move on to the next base
    gc += 1
    print(base)
print('total number of GCs in the sequence', seq, 'is', gc)

In [ ]:
# file: count the G and C bases, then save the sequence and its count to a text file
seq = 'ATCCTGTACTT'
gc = 0
for base in seq:
    if base in ('G', 'C'):
        gc += 1

# 'w' creates the file, or overwrites it if it already exists
with open('my_file.txt', 'w') as out:
    # a header line followed by one data line (no newline after the data line)
    out.writelines(['seq,gc_content\n', '{},{}'.format(seq, gc)])

Recap

  • functions

Python documentation

  • link to python.org

Functions


In [ ]:
# built-in ones: len() returns the number of characters in the string
seq = 'ATCCTGTACTT'
print(len(seq))

In [ ]:
# your own one
def gc_content(seq):
    """Return how many items of seq are 'G' or 'C'.

    The comparison is case-sensitive, so lower-case 'g' and 'c' are not
    counted. The result is a count, not a fraction of the sequence length.
    """
    return sum(1 for base in seq if base in ('G', 'C'))

seq = 'ATCCTGTACTT'
print(gc_content(seq))
print(gc_content('AAATCGATTTAAGGGG'))  # the same function is reused on another sequence

# Write one "sequence,GC count" line for every sequence found in seq.txt.
with open('gc_content_data.csv', 'w') as out, open('seq.txt') as data:
    # Header row: the csv.DictReader and pandas cells that read this file
    # look columns up by the names 'seq' and 'gc'.
    out.write('seq,gc\n')
    for line in data:
        seq = line.strip()
        if not seq:
            continue  # skip blank lines instead of writing an empty ",0" record
        out.write('{},{}\n'.format(seq, gc_content(seq)))

Session 3: Modules

  • use existing modules: math, os.path and csv (built-in), and pandas (third-party)
  • create your own

math module


In [ ]:
# import math
import math
# dir() lists the names (functions and constants) that the math module provides
dir(math)

os.path module


In [ ]:
# os.path module: exists() reports whether the given path is present
import os.path
missing_file = 'my_file_that_does_not_exist.txt'
print(os.path.exists(missing_file))

In [ ]:
import os.path

# join() builds the path with the separator of the current operating system
seq_filename = os.path.join('data', 'seq.txt')
if not os.path.exists(seq_filename):
    print('file {} not found'.format(seq_filename))
else:
    # print every line of the file without its surrounding whitespace
    with open(seq_filename) as data:
        for line in data:
            print(line.strip())

    # split() returns the directory part and the file-name part of the path
    directory, filename = os.path.split(seq_filename)
    print(directory)
    print(filename)

Ex 3.1

  • Read a tab delimited file data/genes.txt
  • Check the file exists
  • Calculate the length of each gene
  • Write the results into a new tab separated file

csv module


In [ ]:
# csv module - reader
import csv
import os.path  # os.path.exists is used below, so this cell imports it itself

gc_content_filename = 'gc_content_data.csv'
if os.path.exists(gc_content_filename):
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation asks for when passing a file object
    with open(gc_content_filename, newline='') as data:
        # iterating over `data` directly would give raw text lines;
        # csv.reader instead yields each line as a list of column strings
        reader = csv.reader(data, delimiter=",")
        for row in reader:
            print(row)

In [ ]:
import csv
import os.path  # os.path.exists is used below, so this cell imports it itself

gc_content_filename = 'gc_content_data.csv'
results = []
if os.path.exists(gc_content_filename):
    # newline='' leaves line-ending handling to the csv module
    with open(gc_content_filename, newline='') as data:
        # DictReader takes the column names from the first row of the file
        reader = csv.DictReader(data, delimiter=",")
        results.extend(reader)

# each row is a dictionary keyed by column name (an OrderedDict on Python 3.6/3.7);
# only show the second row when the file really provided one
if len(results) > 1:
    print(results[1])

for r in results:
    print('{}\t{}'.format(r['seq'], r['gc']))

In [ ]:
# csv module - writer
import csv

# newline='' stops the file object from translating the line endings that the
# csv writer emits (otherwise blank lines appear between rows on Windows)
with open('output.txt', 'w', newline='') as out:
    writer = csv.DictWriter(out, fieldnames=['seq', 'gc'], delimiter='\t')
    # uncomment the next line to write the column names as the first row
    #writer.writeheader()
    # `results` is the list of row dictionaries built in the csv.DictReader cell
    writer.writerows(results)

Ex 3.2

  • change the script you wrote for Ex 3.1 to make use of the csv module

In [ ]:
# pandas module: read_csv() loads the whole file into a DataFrame
import pandas

data = pandas.read_csv('gc_content_data.csv')
print(data)
# iterrows() yields (index, row) pairs; only the row is used here
for index, row in data.iterrows():
    print(row['seq'], row['gc'])

In [ ]:
# save the DataFrame as comma-separated text; index=False leaves out the row-index column
data.to_csv('new_gc_content_data.csv', sep=',', index=False)

Writing your own module


In [ ]:
# use this function and save it into a file called tools.py
def gc_content(seq):
    """Return how many items of seq are 'G' or 'C'.

    The comparison is case-sensitive, so lower-case 'g' and 'c' are not
    counted. The result is a count, not a fraction of the sequence length.
    """
    return sum(1 for base in seq if base in ('G', 'C'))

In [ ]:
# import the whole module; its functions are then reached through the module name
import tools
print(tools.gc_content('AAATTTCCGG'))

In [ ]:
# import only the function, so it can be called without the module prefix
from tools import gc_content
print(gc_content('AAATTTCCGG'))

Ex 3.3

  • Write a function that extracts a list of overlapping sub-sequences of a given window size from any given sequence

Ex 3.4

  • Calculate GC content along the DNA sequence by combining the two functions written, using the tools module