In [ ]:
# built-in functions
seq = 'ATCCTGCTAAA'
print(len(seq))
In [ ]:
# your own function
def gc_content(seq):
gc = 0
for base in seq:
if (base == 'C') or (base == 'G'):
gc += 1
return gc
print(gc_content('ATCCTGCTAAA'))
print(gc_content('GGGCCCCTTTA'))
In [ ]:
import math
print(math.pi)
In [ ]:
import os.path
seq_filename = os.path.join('data', 'seq.txt')
print(os.path.exists(seq_filename))
print(os.path.dirname(seq_filename))
print(os.path.basename(seq_filename))
In [ ]:
data_filename = os.path.join('data', 'genes.txt')
if os.path.exists(data_filename):
with open(data_filename) as data:
header = data.readline()
with open('results.txt', 'w') as out:
for line in data:
#gene, chrom, start, end = line.strip().split()
row = line.strip().split()
print(int(row[3])-int(row[2]))
#out.write('{}\t{}\n'.format(gene, int(end)-int(start)+1))
else:
print('{} file does not exist'.format(data_filename))
In [ ]:
def gc_content(seq):
gc = 0
for base in seq:
if (base == 'C') or (base == 'G'):
gc += 1
return gc
seq_filename = os.path.join('data', 'seq.txt')
if os.path.exists(seq_filename):
with open (seq_filename) as data:
with open('gc_content_data.csv', 'w') as out:
for line in data:
seq = line.strip()
out.write('{},{}\n'.format(seq, gc_content(seq)))
In [ ]:
import csv
data_filename = 'gc_content_data.csv'
if os.path.exists(data_filename):
with open(data_filename) as data:
reader = csv.reader(data, delimiter=',')
for row in reader:
print(row)
In [ ]:
import csv
# add column header to data file called seq,gc
data_filename = 'gc_content_data.csv'
results = []
if os.path.exists(data_filename):
with open(data_filename) as data:
reader = csv.DictReader(data, delimiter=',')
for row in reader:
results.append(row)
for r in results:
print('{}\t{}'.format(r['seq'], r['gc']))
In [ ]:
import csv
with open('output.txt', 'w') as out:
writer = csv.DictWriter(out, fieldnames=['seq', 'gc'], delimiter='\t')
for r in results:
writer.writerow(r)
In [ ]:
data_filename = os.path.join('data', 'genes.txt')
results = []
if os.path.exists(data_filename):
with open(data_filename) as data:
reader = csv.DictReader(data, delimiter='\t')
for row in reader:
results.append({'gene': row['gene'], 'len': int(row['end'])-int(row['start'])+1})
else:
print('{} file does not exist'.format(data_filename))
with open('results_with_csv.txt', 'w') as out:
writer = csv.DictWriter(out, fieldnames=['gene', 'len'], delimiter='\t')
for r in results:
writer.writerow(r)
#print(results)
In [ ]:
import pandas
data = pandas.read_csv('results_with_csv.txt', sep='\t')
print(data)
In [ ]:
def gc_content(seq):
gc = 0
for base in seq:
if (base == 'C') or (base == 'G'):
gc += 1
return gc
In [ ]:
import tools
print(tools.gc_content('CCCTTCGCTT'))
In [ ]:
from tools import gc_content
print(gc_content('AAAAA'))
In [ ]:
def extract_seq(seq, window_size):
results = []
nb_windows = len(seq) - window_size + 1
for i in range(nb_windows):
results.append(seq[i:i+window_size])
return results
seq = 'ATTCCGGGCCTTAAAA'
print(extract_seq(seq, 5))
In [ ]:
import tools
seq = 'ATTCCGGGCCTTAAAA'
for sub_seq in tools.extract_seq(seq, 5):
print(tools.gc_content(sub_seq))