Recap

basic python

Python basics



In [ ]:

    
# list: an ordered collection that can mix types; positions are counted from 0
my_name = 'Anne'
my_list = [2, 4, 6, 8]
my_list.append(my_name)
print(my_list)
print(my_list[1])  # index 1 is the second element



In [ ]:

    
# dictionary: each key (a one-letter code) is mapped to a value (the base name)
my_dict = {
    'A': 'Adenine',
    'C': 'Cytosine',
}
print(my_dict)
print(my_dict['A'])  # look a value up by its key



In [ ]:

    
# string: split on whitespace into a list, then join the pieces with commas
seq = 'ATC CTG TAC TTT'
codons = seq.split()
new_seq = ','.join(codons)
print(codons)
print(new_seq)



In [ ]:

    
# loop: iterating over a string visits it one character at a time
seq = 'ATCCTGTACTT'
for base in seq:
    print(base)  # each base is printed on its own line



In [ ]:

    
# condition: the indented line runs only when the comparison is true
base = 'A'
if base == 'A':
    print('found base A')



In [ ]:

    
# loop and condition combined: print and count every G or C in the sequence
seq = 'ATCCTGTACTT'
gc = 0
for base in seq:
    if base in ('G', 'C'):
        print(base)
        gc = gc + 1
print('total number of GCs in the sequence', seq, 'is', gc)



In [ ]:

    
# file: count the G/C bases, then save the sequence and its count to disk
seq = 'ATCCTGTACTT'
gc = 0
for base in seq:
    if base in ('G', 'C'):
        gc = gc + 1

# 'w' creates the file (or overwrites an existing one); the `with` block
# closes it automatically
with open('my_file.txt', 'w') as out:
    out.writelines([
        'seq,gc_content\n',          # header line
        '{},{}'.format(seq, gc),     # data line
    ])

Recap

functions

Python documentation

link to python.org

Functions



In [ ]:

    
# built-in ones: functions such as len() can be called without importing anything
seq = 'ATCCTGTACTT'
print(len(seq))  # number of characters in the string



In [ ]:

    
# your own one
def gc_content(seq):
    """Return how many items of ``seq`` are exactly 'G' or 'C'.

    The comparison is case-sensitive, so lower-case 'g' and 'c' are not
    counted. Despite the name, the result is a count, not a fraction.
    """
    return sum(1 for base in seq if base in ('G', 'C'))

seq = 'ATCCTGTACTT'
print(gc_content(seq))
print(gc_content('AAATCGATTTAAGGGG'))  # the same function can be reused many times

# Compute the GC count of every sequence in seq.txt (one sequence per line)
# and save the results as a CSV file.
# NOTE(review): a later cell reads the sequences from data/seq.txt -- confirm
# which of the two locations is the intended one.
# The input is opened first so that a missing seq.txt does not leave an empty
# gc_content_data.csv behind.
with open('seq.txt') as data, open('gc_content_data.csv', 'w') as out:
    # Header row: the cells that read this file back look the columns up by
    # the names 'seq' and 'gc'
    out.write('seq,gc\n')
    for line in data:
        seq = line.strip()
        if not seq:
            continue  # skip blank lines instead of writing an empty record
        out.write('{},{}\n'.format(seq, gc_content(seq)))

Session 3: Modules

use existing modules: math, os.path and csv (standard library) and pandas (third-party, installed separately)
create your own

math module



In [ ]:

    
# import math: makes the names defined in the math module available as math.<name>
import math
dir(math)  # list the names defined in the module; shown as the cell's output

os.path module



In [ ]:

    
# os.path module: helpers for working with file paths
import os.path
# exists() returns True or False depending on whether the path is present
print(os.path.exists('my_file_that_does_not_exist.txt'))



In [ ]:

    
import os.path

# join() builds the path with the separator of the current operating system
seq_filename = os.path.join('data', 'seq.txt')
if not os.path.exists(seq_filename):
    print('file {} not found'.format(seq_filename))
else:
    # print every line of the file without its surrounding whitespace
    with open(seq_filename) as handle:
        for line in handle:
            print(line.strip())

    # split the path into its directory part and its file-name part
    print(os.path.dirname(seq_filename))
    print(os.path.basename(seq_filename))

Ex 3.1

Read a tab delimited file data/genes.txt
Check the file exists
Calculate the length of each gene
Write the results into a new tab separated file

csv module



In [ ]:

    
# csv module - reader
import csv
import os.path  # imported here as well so the cell does not rely on an earlier one

gc_content_filename = 'gc_content_data.csv'
if os.path.exists(gc_content_filename):
    # newline='' lets the csv module handle line endings itself, as its
    # documentation recommends for files passed to csv.reader
    with open(gc_content_filename, newline='') as data:
        reader = csv.reader(data, delimiter=",")
        # every row comes back as a list of strings, one item per column
        for row in reader:
            print(row)



In [ ]:

    
import csv
import os.path  # imported here as well so the cell does not rely on an earlier one

gc_content_filename = 'gc_content_data.csv'
results = []
if os.path.exists(gc_content_filename):
    # newline='' lets the csv module handle line endings itself
    with open(gc_content_filename, newline='') as data:
        # DictReader takes the column names from the first row of the file, so
        # the file has to start with a 'seq,gc' header line for the lookups below
        reader = csv.DictReader(data, delimiter=",")
        results.extend(reader)

# each row is a dictionary keyed by column name (a plain dict on Python 3.8+,
# an OrderedDict on 3.6/3.7); show the second one when there is one
if len(results) > 1:
    print(results[1])

for r in results:
    print('{}\t{}'.format(r['seq'], r['gc']))



In [ ]:

    
# csv module - writer
# Expects `csv` to be imported and `results` (a list of dictionaries with the
# keys 'seq' and 'gc') to have been built by the cells above.
# newline='' stops the file object from translating the '\r\n' line endings the
# csv writer emits, which would otherwise show up as blank lines between rows
# on Windows.
with open('output.txt', 'w', newline='') as out:
    writer = csv.DictWriter(out, fieldnames=['seq', 'gc'], delimiter='\t')
    # writer.writeheader()  # uncomment to write the column names as a first line
    writer.writerows(results)

Ex 3.2

change the script you wrote for Ex 3.1 to make use of the csv module



In [ ]:

    
# pandas module: load the whole CSV file into a DataFrame in a single call
import pandas

data = pandas.read_csv('gc_content_data.csv')
print(data)
# iterrows() yields an (index label, row) pair for every line of the table
for row_label, record in data.iterrows():
    print(record['seq'], record['gc'])



In [ ]:

    
# write the DataFrame back out as comma-separated text (the default separator),
# leaving out the index column
data.to_csv('new_gc_content_data.csv', index=False)

Writing your own module



In [ ]:

    
# use this function and save it into a file called tools.py
def gc_content(seq):
    """Return how many items of ``seq`` are exactly 'G' or 'C'.

    The comparison is case-sensitive, so lower-case 'g' and 'c' are not
    counted. Despite the name, the result is a count, not a fraction.
    """
    return sum(1 for base in seq if base in ('G', 'C'))



In [ ]:

    
# import the whole module: its functions are then called as tools.<name>
# (requires a tools.py file that Python can find, e.g. next to this notebook)
import tools
print(tools.gc_content('AAATTTCCGG'))



In [ ]:

    
# import a single name from the module: it can then be called without the
# tools. prefix
from tools import gc_content
print(gc_content('AAATTTCCGG'))

Ex 3.3

Write a function that extracts a list of overlapping sub-sequences of a given window size from any given sequence

Ex 3.4

Calculate the GC content along a DNA sequence by combining the two functions you have written, importing them from the tools module