1) Given a file of FASTA sequences (you can use 'python.fasta' as an example), split it in (approximately) half, creating two files ('seq1.fasta' and 'seq2.fasta').
a) Use seq_utils.count_seqs() as part of the solution.
b) Do it without counting the sequences first.
In [10]:
import sys
sys.path.append('/home/pvh/Documents/counting_sequences/lib')
import seq_utils
import os
def split_fasta(filename):
try:
input_file = open(filename)
except IOError as e:
print >>sys.stderr, "Failed to open {}: {}".format(filename, e.strerror)
return False
else:
with input_file:
seq_count = seq_utils.count_seqs(input_file)
input_file.seek(0)
half = seq_count // 2
count = 0
filename = 'seq1.fasta'
output_file = open(filename, 'w')
for line in input_file:
if line.lstrip().startswith('>'):
count += 1
if count == half+1:
# change over to seq2.fasta at halfway point
filename = 'seq2.fasta'
output_file.close()
output_file = open(filename, 'w')
output_file.write(line)
output_file.close()
return True
print os.getcwd()
split_fasta('python.fasta')
Out[10]:
In [13]:
import sys
def split_fasta2(filename):
try:
input_file = open(filename)
except IOError as e:
print >>sys.stderr, "Failed to open {}: {}".format(filename, e.strerror)
return False
else:
filename1 = 'seq1.fasta'
filename2 = 'seq2.fasta'
output_file1 = open(filename1, 'w')
output_file2 = open(filename2, 'w')
filenum = 1
with input_file:
for line in input_file:
if line.startswith('>'):
if filenum == 1:
filenum = 2
output_file = output_file2
elif filenum == 2:
filenum = 1
output_file = output_file1
output_file.write(line)
output_file1.close()
output_file2.close()
return True
split_fasta2('python.fasta')
Out[13]:
In [35]:
import sys
def split_fasta3(filename):
try:
input_file = open(filename)
except IOError as e:
print >>sys.stderr, "Failed to open {}: {}".format(filename, e.strerror)
return False
else:
output_files = []
num_parts = 2
for partnum in range(1, num_parts+1):
filename = 'seq{}.fasta'.format(partnum)
output_file = open(filename, 'w')
output_files.append(output_file)
count = 0
with input_file:
for line in input_file:
if line.startswith('>'):
count += 1
filenum = count % num_parts
output_file = output_files[filenum]
output_file.write(line)
for output_file in output_files:
output_file.close()
return True
split_fasta3('python.fasta')
Out[35]:
2) Expand the solution from 1) to split the file into any number of pieces - i.e. the input will now be a filename and the number of parts. You can experiment in iPython first but the final version should be a script split_fasta.py that takes two command line arguments.
In [26]:
import sys
sys.path.append('/home/pvh/Documents/python/lib')
import seq_utils
import os
def get_output_file(part_num):
filename = 'seq{}.fasta'.format(part_num)
output_file = open(filename, 'w')
return output_file
# by default num_parts is equal to two
def split_fasta4(filename, num_parts=2):
try:
input_file = open(filename)
except IOError as e:
print >>sys.stderr, "Failed to open {}: {}".format(filename, e.strerror)
return False
else:
with input_file:
seq_count = seq_utils.count_seqs(input_file)
if seq_count < num_parts:
print >>sys.stderr, "Can't split, number of parts ({}) is greater than number of sequences in file ({})".format(num_parts, seq_count)
return False
input_file.seek(0)
chunk_size = seq_count // num_parts
count = 0
part_num = 1
output_file = get_output_file(part_num)
for line in input_file:
if line.lstrip().startswith('>'):
count += 1
if part_num < num_parts and count == (part_num*chunk_size)+1:
# change over to writing the next part
part_num += 1
output_file = get_output_file(part_num)
output_file.write(line)
output_file.close()
return True
print os.getcwd()
split_fasta('small.fasta', 6)
Out[26]:
In [29]:
import sys
def split_fasta5(filename, num_parts=2):
try:
input_file = open(filename)
except IOError as e:
print >>sys.stderr, "Failed to open {}: {}".format(filename, e.strerror)
return False
else:
output_files = []
for partnum in range(1, num_parts+1):
filename = 'seq{}.fasta'.format(partnum)
output_file = open(filename, 'w')
output_files.append(output_file)
count = 0
with input_file:
for line in input_file:
if line.startswith('>'):
count += 1
filenum = (count - 1) % num_parts
output_file = output_files[filenum]
output_file.write(line)
for output_file in output_files:
output_file.close()
return True
split_fasta5('python.fasta', 3)
Out[29]:
In [34]:
import sys
def split_fasta6(filename, num_parts=2):
try:
input_file = open(filename)
except IOError as e:
print >>sys.stderr, "Failed to open {}: {}".format(filename, e.strerror)
return False
else:
output_files = []
count = 0
with input_file:
for line in input_file:
if line.startswith('>'):
count += 1
filenum = (count - 1) % num_parts
if len(output_files)-1 < filenum:
filename = 'seq{}.fasta'.format(filenum+1)
output_file = open(filename, 'w')
output_files.append(output_file)
else:
output_file = output_files[filenum]
output_file.write(line)
for output_file in output_files:
output_file.close()
if count < num_parts:
print >>sys.stderr, "Warning: number of parts ({}) is greater than number of sequences in file ({})".format(num_parts, count)
return False
return True
split_fasta6('small.fasta', 6)
Out[34]:
In [ ]: