In [1]:
class Contig:
    """A named nucleotide sequence, as read from one FASTA record."""

    def __init__(self, name, seq):
        # FASTA header text (without the leading '>').
        self.name = name
        # The nucleotide string for this record.
        self.seq = seq

    def __repr__(self):
        # e.g. < "chr1" 1024 nucleotides>
        nt_count = len(self.seq)
        return '< "%s" %i nucleotides>' % (self.name, nt_count)
def read_contigs(input_file_path):
    """Parse a FASTA file into a list of Contig objects.

    Parameters
    ----------
    input_file_path : str
        Path to a FASTA-formatted text file.

    Returns
    -------
    list of Contig
        One Contig per '>' header: the header text (minus the '>') becomes
        the name, and the concatenated, upper-cased sequence lines become
        the seq. An empty file yields an empty list.
    """
    contigs = []
    current_name = ""
    # Collect sequence lines and join once at record boundaries:
    # O(n) total, versus O(n^2) for repeated string concatenation.
    seq_collection = []
    with open(input_file_path, 'r') as fasta_file:
        for line in fasta_file.read().splitlines():
            if line == "":
                continue  # skip blank lines anywhere in the file
            if line[0] == ">":
                # A new header closes out the previous record (if any).
                if len(seq_collection) > 0:
                    contigs.append(Contig(current_name, "".join(seq_collection)))
                    seq_collection = []
                current_name = line[1:]  # drop the leading '>'
            else:
                seq_collection.append(line.upper())
    # Flush the final record. Guarded so that an empty file does not
    # produce a spurious Contig("", "") (defect in the original).
    if current_name or seq_collection:
        contigs.append(Contig(current_name, "".join(seq_collection)))
    return contigs
In [5]:
from collections import Counter

# Load the aligned sequences and shorten each name to a 6-character label.
species = read_contigs('9927_alignment.fasta')
for s in species:
    s.name = s.name[:6]

# Walk the alignment column by column, building a majority-vote consensus
# and recording the variable ("informative") columns inside the
# well-aligned core of the alignment (columns 200 < col < 1500).
informative_columns = {}
consensus_sequence = []
for col in range(len(species[0].seq)):
    column_seq = ''.join(entry.seq[col] for entry in species)
    counts = Counter(column_seq)
    # Most frequent letter in this column wins (ties broken by count order).
    consensus_sequence.append(counts.most_common()[0][0])
    column_is_uniform = column_seq == column_seq[0] * len(species)
    if not column_is_uniform and 200 < col < 1500:
        informative_columns[col] = column_seq
        print(column_seq, col + 1)  # report 1-based position

# Append the consensus as a pseudo-species so downstream cells include it.
species.append(Contig('Consen', ''.join(consensus_sequence)))
In [49]:
# Write the informative alignment columns to CSV: one header row of 1-based
# positions, then one row per species with the letter at each position.
with open('9927_informative_positions.csv', 'w') as csv_out:
    positions = sorted(informative_columns.keys())
    csv_out.write('Positions,' + ','.join(str(col + 1) for col in positions))
    csv_out.write('\n')
    for entry in species:
        # Build each row with join (no trailing comma) so data rows have
        # exactly as many fields as the header — the original appended a
        # comma after every cell, giving rows one extra empty field.
        row = [entry.name[:6]] + [entry.seq[col] for col in positions]
        csv_out.write(','.join(row))
        csv_out.write('\n')
In [13]:
seq_length = len(species[0].seq)  # alignment length (shared by all entries)

# Count mismatched positions for every ordered pair of species over the
# well-aligned core (columns 250-1499). NOTE: despite the name, these are
# *difference* counts, so a smaller value means more similar sequences.
similarity_scores = {}
for target in species:
    for query in species:
        if target != query:  # identity comparison: only skips self-pairs
            pair = (target.name, query.name)
            similarity_scores[pair] = sum(
                target.seq[i] != query.seq[i] for i in range(250, 1500)
            )

# Smallest pairwise difference count. The original called
# min(similarity_scores), which minimizes over the *keys* (name tuples) —
# an alphabetical comparison, not a score comparison.
min(similarity_scores.values())
Out[13]:
In [14]:
with open('9927_differentiability.csv', 'w') as csv_out:
csv_out.write(',' + ','.join([s.name for s in species]))
for target in species: # rows
csv_out.write(target.name +',')
for query in species: # cells
if target != query:
name = (target.name, query.name)
csv_out.write(str(similarity_scores[name]) + ',')
else:
csv_out.write(',')
csv_out.write('\n')
In [18]:
# Smallest pairwise difference count across all species pairs
# (corrects the earlier min(similarity_scores), which compared keys).
min(similarity_scores.values())
Out[18]:
In [24]:
# Report the species pairs that differ at fewer than 4 of the compared
# positions — i.e. the pairs that are hardest to tell apart.
for pair, diff_count in similarity_scores.items():
    if diff_count < 4:
        print(','.join(pair))
In [11]:
# Command-line prefix for the CONTEXT alignment-statistics tool (Java);
# the input filename is appended per file in the next cell.
base_command = "java -cp CONTEXT-.jar uk.ac.qmul.sbcs.evolution.convergence.runners.BasicAlignmentStats "
# Directory containing the alignment files to process.
data_directory = './Data/'
In [12]:
from glob import glob

# Emit one ready-to-run command line per file in the data directory.
for alignment_path in glob(data_directory + '*'):
    print(base_command + alignment_path)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: