Sparse Alignment columns


In [1]:
class Contig:
    def __init__(self, name, seq):
        self.name = name
        self.seq = seq
        
    def __repr__(self):
        return '< "%s" %i nucleotides>' % (self.name, len(self.seq))

def read_contigs(input_file_path):
    contigs = []
    current_name = ""
    seq_collection = []

    # Pre-read generates an array of contigs with labels and sequences
    with open(input_file_path, 'r') as streamFASTAFile:
        for read in streamFASTAFile.read().splitlines():
            if read == "":
                continue
            if read[0] == ">":
                # If we have sequence gathered and we run into a second (or more) block
                if len(seq_collection) > 0:
                    sequence = "".join(seq_collection)
                    seq_collection = []  # clear
                    contigs.append(Contig(current_name, sequence))
                current_name = read[1:]  # remove >
            else:
                # collects the sequence to be stored in the contig, constant time performance don't concat strings!
                seq_collection.append(read.upper())

    # add the last contig to the list
    sequence = "".join(seq_collection)
    contigs.append(Contig(current_name, sequence))
    return contigs

In [5]:
from collections import Counter

species = read_contigs('9927_alignment.fasta')
for s in species:
    s.name = s.name[:6]
informative_columns = {}
consensus_sequence = []
for col in range(len(species[0].seq)):
    letters = []
    for entry in species:
        letters.append(entry.seq[col])
    column_seq = ''.join(letters)
    consensusing = Counter(column_seq)
    consensus_sequence.append(consensusing.most_common()[0][0])
    if column_seq != letters[0] * len(species) and col > 200 and col < 1500:
        informative_columns[col] = column_seq
        print(column_seq, col+1)
species.append(Contig('Consen', ''.join(consensus_sequence)))


CCCCCCCCCTTCCCCACCCAACCCCCCCC 210
GGGGGGGGGGGGGGGGGGGGGAGGGGGGG 211
CCCCCCCCCCCGCCCCCCCCCCCCCCCCC 212
AAATAAAAAAAAAAAAAAAAAAAAAAGGA 216
GGGGGGGGGGGAGGGGTTGGGGTGGGGGG 223
GGGGGAGGGGGGGGGGGGGGGGGGGGGGG 226
TTTTTTTTTTTAAAAAAATTTTTTTTTTT 229
TTTTTTTTTTTTTTTTTTT-TTTTTTTTT 232
TTTTTTTTTTTTTTTTTTT-TTTTTTTTT 233
CCCCCCCCCCCCCCCCCCC-CCCCCCCCC 234
TTTTTTTTTTTTTTTTTTT-TTTTTTTTT 235
AAAAAAAAAAAAAAAAAAA-AAAAAAAAA 236
GGGGGGGGGGGGGGGGGGC-GGGGGGGGG 237
TTTTTTTTTTTTTTTTTTT-TTTTTTTTT 238
AAAAAAAAAAAAAAAAAAA-AAAAAAAAA 239
CCCCCCCCCCCCCCCCCCC-CCCCCCCCC 240
CCCCCCCCCCCCCCCACCC-CCCCCCCCC 241
AAAAAAAAAAAAAAAAAAA-AAAAAAAAA 242
GGGGGGGGGGGGGGGGGGG-GGGGGGGGG 243
TTTTTTTTTTTTTTTTTTT-TTTTTTTTT 244
AAAAAAAAAAAAAAAAAAA-AAAAAAAAA 245
GGGGGGGGGGGGGGGGGGG-GGGGGGGGG 246
TTTTTTTTTTTTCTTTTTT-TTTCTTTTT 247
GGGGGGGGGGGGGGGGGGG-AGGGGGGGG 248
TTTTTTTTTTTTTTTTTTT-TTTTTTTTT 249
GGGGGGGGGGGGGGGGGGG-GGGGGGGGG 250
GGGGGGGGGGGGGGGGGGG-GGGGGGGGG 251
AAAAAAAAAAAAAAAAAAA-AAAAAAAAA 252
AAAGAAAAAAAAAAAAAAAAAAAAAAAAA 280
CCCCCCCCCCCCCCCCCCTTTACCCCCCC 283
TTTTTTTTTTTTTCTTTTAAATTTTTTTT 293
AAAAAAAAAAAAAAGGGGAAAAAAAAAAA 311
AAAAAAAAAAAAAAAAAAAAAGAAAAAAA 326
GGGGGGGGGGGGTTGGGGGGGGGGGGGGG 328
TTTTTTTTTATTTTTTTTTTTTTTTTTTT 329
AAAAAAAAAAAAAAAAAAAAAAAGAAAAA 342
CTTCCCCCCCCCCCCCCCTTTTTTTTTTT 349
GGGGGGGGGGGGGGGGGGTTTGGGGGGGG 350
GGGGGGGGGGGGGGGGGGGGGGGGGAGGG 372
GGGGGGGGGGGGGGGGGGAAAAAAAAAAA 373
CTTTTTTTTTTTTTTTTTTTTTTTTTTTT 397
CTTCCCCCCCCCCCCCCCCCCCCCCCCCC 442
CTTTTTTTTTTTTTTTTTTTTTTTTTTTT 445
A---------------------------- 452
A---------------------------- 453
G---------------------------- 454
AAAAAAAAAAAAAAAAAAAGAAAAAAAAA 459
ACCCCCCCCCCCCCCCCCCCCCCCCCCCC 464
CTTCCCCTCCCCCCCCCCCCCCCCCCCCC 465
AAAAAAAAAAAAAAAAAAAAAGAAAAAAA 467
TTTTCTTTTTTTTTTTTTTTTTTTTTTTT 475
CCCACCCCCCCCCCCCCCCCCCCCCCCCC 485
AAAAAAAAAAAAAAAAAAAAAAAGGGAAA 487
TCCTTTTTTTTTTTTTTTTTTTTTTTTTT 501
TTTTTTCTTTTCTTTTTTTTTTTTTTTTT 514
GGGAGGGGGGGGGGGGGGGGGGGGGGGGG 517
TCCTTTTTTTTTTTTTTTTTTTTTTTTTT 520
TTTCTTTTTTTTTTTTTTTTTGTTTTTTT 523
GGTGGGGGGGGGGGGGGGGGGGGGGGGGG 524
TTTCTTTTTTTTTTTTTTTTTTTTTTTTT 526
TTTTTTTTTTTTTTTTTTTTTTCTTTCCC 537
GGGGAGGGGGGGGGGGGGGGGGGGGGGGG 539
GTGGGGGGGGGGGGGGGGGGGGGGGGGGG 541
GGGGGGGGGGGGGGGGGGAAAGGGGGGGG 545
TTTTTTTTTTTTTTCCCCTTTTTTTTTTT 549
GGGGGGGGGGGGGGGGGGGGGGGTTGGGG 550
GGGGGAGGGGGGGGGGGGGGGGGGGGGGG 552
AAGAAAAAAAAAAAAAAAAAAAAAAAAAA 557
GGGGGGGAGGGGGGGGGGGGGGGGGGGGG 560
TTTTCCCCCCCTTTTTTTTTTCTTTTTTT 577
TTTTTTTCTTCTTTTTTTTTTTTTTTTTT 583
GGGTGGGGGGGGGGGGGGGGGGGGGGGGG 598
AGGAAAAAAAAAAAAAAAGGGGGGGGGGG 599
TAAAGAAAAAAAAAAAAAAAAAAACAAAA 602
GAAGGGGGGGGGGGGGGGAAAAAAAAAAA 610
GGGGGGGGGGGGGGGGGGGGGGGGGGGGA 619
AAAAAAAAAAAATAAAAAAAAAAAAAAAA 644
AGGAAAAAAAAAAAAAAAGGGGGGGGGGG 646
GGGTGGGGGGGGGGGGGGGGGGGGCCGGG 679
TGGGGGGGGGGGGGGGGGGGGGGGGGGGG 681
TTTTTTTTTTTTTTGTTTTTTTTTTTTTT 686
CCCCCCCCCCCCCCCCCCTCCCCCCCCCC 692
TTTTATTTTTTTTTTTTTTTTTTTTTTTT 706
TTTTTTTTTTTTTTTTTTTTTTTTTTCCC 710
GGGGGGGGGGTGGGGGGGGGGGGGGGGGG 719
TTTCTTTTTTTTTTTTTTTTTTTTTTTTT 727
GGAGGGGGGGGGGGGGGGGGGGGGGGGGG 731
GAAGGGGGGGGGGGGGGGGGGGGGGGGGG 733
CCCCTTTTTTTTTTTTTTTCCTCCCCCCC 742
GAAGGGGGGGGGGGGGGGGGGGGGGGGGG 757
GGGGGGGGGGGGAGGGGGGGGGGGGGGGG 764
CCCACCCCCCCCCCCCCCCCCCCCCCCCC 775
GGGGGGGGGGGGGGGGGGGGGGAAAAAAA 793
TTTCCCTTTTTTCCCCCCTTTTTTTTTTT 808
GGGGGGGGGGGAGGGGGGGGGGGGGGGGG 832
GGGGGGGGGGGGGGGGGGGGGGAAAAAAA 838
GGGGGGGGGGGGGTGGGGGGGGGGGGGGG 848
-------------G--------------- 849
-------------C--------------- 850
AAAAAAAAAAAAAAAAAAAAAATAAAAAA 866
CTTTTTTTTTTTTTTTTTTTTTTTTTTTT 869
AAAAAAAAAAAAAAAGAAAAAAAAAAAAA 870
CCCCCCCCCCCCCCCCCCTCCCCCCCCCC 884
AAAAGAAAAAAAAAAAAAGGAAAAAAAAA 885
GGGGGGGGGGGAGGGGGGGGGGGGGGGGG 888
TAATTTTTTTTTTTTTTTTTTTTTTTTTT 919
CCCCCCCCCCCCTCCCCCCCCCCCCCCCC 921
GGGGGGGGGGGAAAAAAAGGGGGGGGGGG 933
GGGAGGGGGGGGAGGGGGGGGGGGGGGGG 939
CCCTCTCCCCCCCCCCCCCCCCCCCCCCC 944
GGGGGGGGGGGGGAGGGGGGGGGGGGGGG 973
CCCCCCCCCCCCCCCCCCCCCCCCCCCTT 981
AAAAAAAAAAAAAAAAAAAAAAAACAAAA 982
AAAAAAAAAAAAAAAAGGAAAAAAAAAAA 993
TTTTTTTTTTTTTTTTTTTTTTCTTTTTT 1008
TTTTTTTTTTTTTTTTTTTTTTCCCCCCC 1025
TCTCCCCCCCCCCCCCCCCCCCCCCCCCC 1044
TTTTTTATATTTTTTTTTTTTTTTTTTTT 1057
AGGGGGGGGGGGGGGGGGGGGGGGGGGGG 1069
CGGCTCCCCCCCCCCCCCCCCCCCCCCCC 1071
GGGGGGGGGGGGGGGGAAGGGGGGGGGGG 1072
CCCCCCCCCCCCCCCCCCCCCCCCCCGCC 1077
TTCTTTTTTTTTTTTTTTTTTTTTTTTTT 1081
TAAATTTTTTTTTTTTTTAAAAAAAAAAA 1104
GGGGGGGGGGGGGGGGGGGGGGTTTTTTT 1125
GGGAGGAGAGGGGGGGGGGGGGGGGGGGG 1137
TTTTTTTTTTTTCTTTTTTTTTTTTTTTT 1146
ATAAAAAAAAAAAAAAAAAAAAAAAAAAA 1149
AAAAAAAAAAAAAAAAAAAAAAAAAAGGA 1152
GAGGGGGGGGGGGGGGGGGGGGGGGGGGG 1171
TAATTTTTTTTTTTTTTTTTTTTTTTTTT 1181
AAAAAAAAATAAAAAAAAAAAAAAAAAAA 1183
AAAAAAAAAAAAAAAAAAAAAAGAAAGGG 1197
TTTTTCTTTTTTTTTTTTTTTTTTTTTTT 1203
AAAAAAAAAAAAAAAAAAAAAGGGGGGGG 1212
ATTAAAAAAAAAAAAAAAAAAAAAAAAAA 1235
TTTCTTTTTTTTTTTTTTTTTTTTTTTTT 1242
GGGGGGGGGGGGGGGGGGTTTGGGGGGGG 1257
TCTTTTTTTTTTTTTTTTTTTTTTTTTTT 1272
GGGGGGGGGGAGGGGGGGGGGGGGGGGGG 1281
GGGGGGGTGGGGGGGGGGGGGGGGGGGGG 1284
AGGGGGGGGGGGGGGGGGGGGGGGGGGGG 1304
TCCCCCCCCCCCCCCCCCCCCCCCCCCCC 1305
AAAAAAAAAAAAAAAAAAAAAGAAAAAAA 1306
TGGGGGGGGGGGGGGGGGGGGGGGGGGGG 1309
TTTTTTTTTCTTTTTTTTTTTTTTTTTTG 1310
CTTCCCTTCCTCTTTTTTTCCCCCCCCCC 1311
GGGGGAGGGGCGGGGGGGGGGGAAAAGAA 1312
GGGAGGGGGGGGGGGGGGGGGGGGGGGGG 1315
TAAAAAAAAAAAAAAAAAAAAAAAAAAAA 1317
TAATTTTTTTTTTTTTTTTTTTTTTTTTT 1326
AAAAAAAAAGAAAAAAAAAAAAAAAAAAA 1335
AAAAAACAAAAGAAAAAAAAAAAAAAAAA 1341
AAAAAAAAAATAAAAAAAAAAAAAAAAAA 1344
GGGGGGGGGGGGGGGGGGGGGAGGGGGGG 1345
GAAAAAAAAAAAAAAAAAAAAAAAAAAAA 1347
TGGGGGGGGGGGGGGGGGGGGGGGGGGGG 1348
GCGGGGGGGGGGGGGGGGGGGGGGGGGGG 1352
TTTCCCCCCCCCCCCCCCCCCCTTTTTTT 1356
GGGGGGGGGGGGGGGGGGGAGGGGGGGGG 1363
TTTTTTCTTTTTTTTTTTTTTTTTTTTTT 1365
TTTTCTCCTTTTTTTTTTTTTTTTTTTTT 1371
TTTTTTTTTTGTTTTTTTTTTTTTTTTTT 1381
AAAAAAAAAAAATAAAAAAAAAAAAAAAA 1383
GGGGGGGGGGGGGGCCCCGGGGGGGGGGG 1384
GGGGGGGGGGGGGGGCCCGGGGGGGGGGG 1385
CCCCCCCCCCCCCCCGGGCCCCCCCCCCC 1387
TTTTTTTTTTTTTTTTTTTCTTTTTTTTT 1392
GGGGGGGGGGGGGGGGGGGAGGGGGGGGG 1393
GGGGGGGGGGGGGGGGGGAGGGGGGGGGG 1401
TTTTTTTTTCTTTTTTTTTTTTTTTTTTT 1413
GGGGGGGGGGGGGGGTTTGGGGGGGGGGG 1419
TTTTTTTTTTTTTTTCCCTTTTTTTTTTT 1440
GGGGGGGGGGGAGGGGGGGGGGGGGGGGG 1452
TTTTGGGGGGGGGGGGGGTTTTTTTTTTT 1466
TTTTTTTTTTTCTTTTTTTTTTTTTTTTT 1479
TGGGGGGGGGGGAGGGGGGGGGGGGGAGG 1487
CCCTCCCCCCCCCCCCCCCCCCCCCCCCC 1497
  • Generate a fasta with informative columns
  • Majority vote consensus sequence, but it includes gaps
  • transpose?
  • CSV file write

In [49]:
with open('9927_informative_positions.csv', 'w') as csv_out:
    csv_out.write('Positions,' + ','.join([str(x+1) for x in sorted(informative_columns.keys())]))
    csv_out.write('\n')
    for entry in species:
        csv_out.write(entry.name[:6] + ",")
        for col in range(len(species[0].seq)):
            if col in informative_columns:
                csv_out.write(entry.seq[col] + ",")
        csv_out.write('\n')

Pair wise table

How well can you differentiate between every species?


In [13]:
seq_length = len(species[0].seq)
similarity_scores = {}
for target in species:
    for query in species:
        if target != query:
            name = (target.name, query.name)
            score = sum([target.seq[i] != query.seq[i] for i in range(250,1500)])
            similarity_scores[name] = score
min(similarity_scores)


Out[13]:
('Consen', 'FRAEX3')

In [14]:
with open('9927_differentiability.csv', 'w') as csv_out:
    csv_out.write(',' + ','.join([s.name for s in species]))
    for target in species: # rows
        csv_out.write(target.name +',')
        for query in species: # cells
            if target != query:
                name = (target.name, query.name)
                csv_out.write(str(similarity_scores[name]) + ',')
            else:
                csv_out.write(',')
        csv_out.write('\n')

In [18]:
min(similarity_scores.values())


Out[18]:
0

In [24]:
for k,v in similarity_scores.items():
    if v < 4:
        print(','.join(k))


FRAX09,FRAX10
FRAX15,FRAX16
FRAX15,FRAX01
FRAX16,FRAX15
FRAX01,FRAX16
FRAX16,FRAX01
FRAX01,FRAX15
FRAX10,FRAX09
  • Iterate over all the sequences at the same time
  • for each position, how many species can you differentiate
  • keep of list of species

Other Work


In [11]:
base_command = "java -cp CONTEXT-.jar uk.ac.qmul.sbcs.evolution.convergence.runners.BasicAlignmentStats "
data_directory = './Data/'

In [12]:
from glob import glob

for filename in glob(data_directory + '*'):
    print(base_command + filename)


java -cp CONTEXT-.jar uk.ac.qmul.sbcs.evolution.convergence.runners.BasicAlignmentStats ./Data\OG100_full_length_guidance_results_full_length_MUSCLE.MSA.MUSCLE.Without_low_SP_Col.With_Names.fasta
java -cp CONTEXT-.jar uk.ac.qmul.sbcs.evolution.convergence.runners.BasicAlignmentStats ./Data\OG133_full_length_guidance_results_full_length_MUSCLE.MSA.MUSCLE.Without_low_SP_Col.With_Names.fasta
java -cp CONTEXT-.jar uk.ac.qmul.sbcs.evolution.convergence.runners.BasicAlignmentStats ./Data\OG149_full_length_guidance_results_full_length_MUSCLE.MSA.MUSCLE.Without_low_SP_Col.With_Names.fasta
java -cp CONTEXT-.jar uk.ac.qmul.sbcs.evolution.convergence.runners.BasicAlignmentStats ./Data\OG237_full_length_guidance_results_full_length_MUSCLE.MSA.MUSCLE.Without_low_SP_Col.With_Names.fasta
java -cp CONTEXT-.jar uk.ac.qmul.sbcs.evolution.convergence.runners.BasicAlignmentStats ./Data\OG267_full_length_guidance_results_full_length_MUSCLE.MSA.MUSCLE.Without_low_SP_Col.With_Names.fasta
java -cp CONTEXT-.jar uk.ac.qmul.sbcs.evolution.convergence.runners.BasicAlignmentStats ./Data\OG293_full_length_guidance_results_full_length_MUSCLE.MSA.MUSCLE.Without_low_SP_Col.With_Names.fasta
java -cp CONTEXT-.jar uk.ac.qmul.sbcs.evolution.convergence.runners.BasicAlignmentStats ./Data\OG436_full_length_guidance_results_full_length_MUSCLE.MSA.MUSCLE.Without_low_SP_Col.With_Names.fasta
java -cp CONTEXT-.jar uk.ac.qmul.sbcs.evolution.convergence.runners.BasicAlignmentStats ./Data\OG439_full_length_guidance_results_full_length_MUSCLE.MSA.MUSCLE.Without_low_SP_Col.With_Names.fasta
java -cp CONTEXT-.jar uk.ac.qmul.sbcs.evolution.convergence.runners.BasicAlignmentStats ./Data\OG495_full_length_guidance_results_full_length_MUSCLE.MSA.MUSCLE.Without_low_SP_Col.With_Names.fasta
java -cp CONTEXT-.jar uk.ac.qmul.sbcs.evolution.convergence.runners.BasicAlignmentStats ./Data\OG555_full_length_guidance_results_full_length_MUSCLE.MSA.MUSCLE.Without_low_SP_Col.With_Names.fasta
java -cp CONTEXT-.jar uk.ac.qmul.sbcs.evolution.convergence.runners.BasicAlignmentStats ./Data\OG571_full_length_guidance_results_full_length_MUSCLE.MSA.MUSCLE.Without_low_SP_Col.With_Names.fasta
java -cp CONTEXT-.jar uk.ac.qmul.sbcs.evolution.convergence.runners.BasicAlignmentStats ./Data\OG70_full_length_guidance_results_full_length_MUSCLE.MSA.MUSCLE.Without_low_SP_Col.With_Names.fasta

Processing FinchTV FASTA outputs into and alignment

  • Example output FASTA file with all the N's and ugliness
  • Load up consensus MSA
  • Initial cleanup (lenient)
    • First 50bp are low accuracy => trim them
  • Force Alignment onto MSA
    • Do not allow Consensus to have indels introduced, so that we can continue to use the same coordinates

Notes from Jan Kim

  • Heterozygosity inside a single PCR product is a problem because any bias in amplification will exponentially become dominant.
    • PCR as few steps as possible
    • Don't expect perfect 50/50 spilts in the ABI trace
  • Geneous (or manual) base caller with ambiguity codes for all significant peaks, not just the tallest
  • Do much of this manually, there are only 40 specimens
  • Possibly use pairwise alignments that are ambiguity aware to check for best species match
  • You could also still use a frozen Multiple Sequence Alignment

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: