• Stitch together all tiles based on graph connections
  • Count how many of the stitched sequences are the desired originals and how many are new combinations

In [29]:
import networkx as nx
import itertools
from difflib import SequenceMatcher
from Bio.Seq import Seq
import generate_tiles_basic as gt

# Generate the tiles for every sequence in the FASTA file. THRESHOLD is the
# similarity cut-off handed to gen_graph further down.
tiles = gt.generate_tiles(gt.get_seqs("AllUniqueDNA.fasta"), gt.tile_length, gt.OVERLAP, gt.num_tiles)
THRESHOLD = 0.95

# Slice bounds: the first tile of a sequence loses its leading
# len(gt.start_overlap) characters, the last tile its trailing
# len(gt.end_overlap) characters.
# NOTE(review): if gt.end_overlap were empty, `end` would be 0 and tile[:end]
# would be an empty string — confirm the end overlap is never empty.
start = len(gt.start_overlap)
end = -len(gt.end_overlap)

seq_list = list(tiles.values())
tile_list_index = []   # flat list of [position, trimmed tile] over all processed sequences
master_tile_list = []  # one list of trimmed tiles per processed sequence
# Only the first five sequences are processed here.
for seq_tiles in seq_list[:5]:
    final_position = len(seq_tiles) - 1
    tile_list = []
    for position, raw_tile in enumerate(seq_tiles):
        if position == 0:
            trimmed = raw_tile[start:]
        elif position == final_position:
            trimmed = raw_tile[:end]
        else:
            trimmed = raw_tile
        tile_list_index.append([position, trimmed])
        tile_list.append(trimmed)
    master_tile_list.append(tile_list)

# Position-0 tiles are the path sources; tiles at position gt.num_tiles-1 are
# the path targets (a position-0 tile is never also recorded as a target).
# NOTE(review): trimming above uses len(seq)-1 while target selection uses
# gt.num_tiles-1 — presumably these agree for every sequence; verify.
sources = [entry[1] for entry in tile_list_index if entry[0] == 0]
targets = [
    entry[1]
    for entry in tile_list_index
    if entry[0] != 0 and entry[0] == gt.num_tiles - 1
]
master_tile_list


Out[29]:
[['ATGGAAAGAATAAAAGAACTACGGAATCTGATGTCGCAGTCTCGCACTCGCGAGATACTGACAAAAACCACAGTGGACCATATGGCCATAATTAAGAAGTACACATCGGGGAGACAGGAAAAGAACCCGTCACTTAGGATGAAATGGATGATGGCAATGAAATATCCAATCACTGCT',
  'ATGATGGCAATGAAATATCCAATCACTGCTGACAAAAGGGTAACAGAAATGGTTCCGGAGAGAAATGAACAAGGACAAACTCTATGGAGTAAAATGAGTGATGCTGGATCAGATAGAGTGATGGTATCACCTTTGGCTGTAACATGGTGGAATAGGAATGGACCCGTGACAAGTACGGTCCATTACCCAAAAGTGTACAAAACTTAT',
  'GTCCATTACCCAAAAGTGTACAAAACTTATTTCGACAAAGTCGAAAGGTTAAAACATGGAACCTTTGGCCCTGTCCATTTTAGAAATCAAGTCAAGATACGCAGAAGAGTAGACATAAACCCTGGTCATGCAGACCTCAGTGCCAAAGAGGCACAAGATGTAATTATGGAAGTTGTTTTTCCCAATGAAGTGGGAGCCAGAATACTA',
  'TTTCCCAATGAAGTGGGAGCCAGAATACTAACATCAGAATCACAACTAACAATAACTAAAGAGAAAAAAGAAGAACTCCGAGATTGCAAAATTTCTCCCTTGATGGTCGCATACATGTTAGAGAGAGAACTTGTGCGAAAAACAAGATTTCTCCCAGTTGCTGGCGGAACAAGCAGTATCTACATTGAAGTTTTACATTTGACTCAA',
  'ATCTACATTGAAGTTTTACATTTGACTCAAGGAACGTGTTGGGAACAAATGTACACTCCAGGTGGAGGAGTGAGGAATGACGATGTTGACCAAAGCCTAATTATTGCGGCCAGGAACATAGTAAGAAGAGCCGCAGTATCAGCAGATCCACTAGCATCTTTATTGGAGATGTGCCACAGCACGCAAATTGGCGGAACAAGGATGGTG',
  'AGCACGCAAATTGGCGGAACAAGGATGGTGGACATTCTTAGACAGAACCCGACTGAAGAACAAGCTGTGGATATATGCAAGGCTGCAATGGGATTGAGAATCAGCTCATCCTTCAGCTTTGGTGGCTTTACATTTAAAAGAACAAGCGGGTCATCAGTCAAAAAAGAAGAAGAGGTGCTTACAGGCAACCTCCAAACATTGAGAATA',
  'CTTACAGGCAACCTCCAAACATTGAGAATAAGAGTACATGAGGGGTATGAGGAGTTCACAATGGTGGGGAAAAGAGCAACAGCTATACTAAGAAAAGCAACCAGAAGATTGGTTCAACTCATAGTGAGTGGAAGAGACGAACAGTCAATAGCCGAAGCAATAATCGTGGCCATGGTGTTTTCACAAGAAGATTGCATGATAAAAGCA',
  'TTTTCACAAGAAGATTGCATGATAAAAGCAGTTAGAGGTGACCTGAATTTTGTCAACAGAGCAAATCAGCGGTTGAACCCCATGCATCAGCTTTTAAGGCATTTTCAGAAAGATGCGAAAGTGCTCTTTCAAAATTGGGGAGTTGAACACATCGACAATGTGATGGGAATGGTTGGAGTATTACCAGATATGACTCCAAGCACAGAG',
  'GTATTACCAGATATGACTCCAAGCACAGAGATGTCAATGAGAGGAATAAGAGTCAGCAAAATGGGTGTGGATGAATACTCCAGTACAGAGAGGGTGGTGGTTAGCATTGATCGGTTTTTGAGAGTTCGAGACCAACGTGGGAATGTATTATTATCTCCTGAGGAGGTCAGTGAAACACAGGGAACTGAGAGACTGACAATAACTTAT',
  'CAGGGAACTGAGAGACTGACAATAACTTATTCATCGTCGATGATGTGGGAGATTAACGGTCCTGAGTCGGTTTTGGTCAATACCTATCAATGGATCATCAGGAATTGGGAAGCTGTCAAAATTCAATGGTCTCAGAATCCTGCAATGTTGTACAACAAAATGGAATTTGAACCATTTCAATCTTTAGTCCCCAAGGCCATTAGAAGC',
  'CAATCTTTAGTCCCCAAGGCCATTAGAAGCCAATACAGTGGGTTTGTCAGAACTCTATTCCAACAAATGAGAGACGTACTTGGGACATTTGACACTGCCCAGATAATAAAGCTTCTCCCTTTTGCAGCTGCTCCACCGAAGCAAAGCAGAATGCAGTTCTCTTCACTGACTGTGAATGTGAGGGGATCAGGGATGAGAATACTTGTA',
  'GTGAGGGGATCAGGGATGAGAATACTTGTAAGGGGAAATTCTCCTGTATTCAACTACAACAAGACCACTAAACGGCTAACAATTCTCGGAAAAGATGCCGGCACTTTAATTGAAGATCCAGATGAAAGCACATCCGGGGTGGAGTCTGCCGTTTTGAGAGGGTTCCTCATTATAGGTAAGGAAGACAGAAGATACGGACCAGCATTA',
  'AAGGAAGACAGAAGATACGGACCAGCATTAAGCATCAATGAACTGAGTAACCTTGCAAAAGGGGAAAAGGCTAATGTGCTAATCGGGCAAGGAGACGTGGTGTTGGTAATGAAACGAAAACGGGACTCTAGCATACTTACTGACAGCCAGACAGCGACCAAAAGAATTCGGATGGCCATCAATTAA'],
 ['ATGGAAAGAATAAAAGAACTACGGAATCTAATGTCGCAGTCTCGCACTCGCGAGATACTGACAAAAACCACAGTGGACCATATGGCCATAATTAAGAAGTACACATCGGGGAGACAGGAAAAGAACCCGTCACTTAGGATGAAATGGATGATGGCAATGAAATATCCAATCACTGCT',
  'ATGATGGCAATGAAATATCCAATCACTGCTGACAAAAGGATAACAGAAATGGTTCCGGAGAGAAATGAACAAGGACAAACTCTATGGAGTAAAATGAGTGATGCTGGATCAGATAGAGTGATGGTATCACCTTTGGCTGTAACATGGTGGAATAGGAATGGACCCGTGACAAGTACGGTCCATTACCCAAAAGTATACAAAACTTAT',
  'GTCCATTACCCAAAAGTATACAAAACTTATTTCGACAAAGTCGAAAGGTTAAAACATGGAACCTTTGGCCCTGTCCATTTTAGAAATCAAGTCAAGATACGCAGAAGAGTAGATATAAACCCTGGTCATGCAGACCTCAGTGCCAAAGAGGCACAAGATGTAATTATGGAAGTTGTTTTTCCCAATGAAGTGGGAGCCAGAATACTA',
  'TTTCCCAATGAAGTGGGAGCCAGAATACTAACATCAGAATCACAGCTAACAATAACTAAAGAGAAAAAAGAAGAACTCCGAGATTGCAAAATTTCTCCCTTGATGGTTGCATACATGTTAGAGAGAGAACTTGTGCGGAAAACAAGATTTCTCCCAGTTGCTGGCGGAACAAGCAGTATATACATTGAAGTTTTACATTTGACTCAA',
  'ATATACATTGAAGTTTTACATTTGACTCAAGGAACGTGCTGGGAACAAATGTACACTCCAGGTGGAGGAGTGAGGAATGACGATGTTGACCAAAGCCTAATTATTGCGGCCAGAAACATAGTAAGAAGAGCCGCAGTGTCAGCAGATCCACTAGCATCTTTATTGGAGATGTGCCACAGCACGCAAATTGGCGGAACAAGGATGGTG',
  'AGCACGCAAATTGGCGGAACAAGGATGGTGGACATTCTTAGACAGAACCCGACTGAAGAACAAGCGGTGGATATATGCAAGGCTGCAATGGGATTGAGAATCAGCTCATCCTTCAGCTTTGGTGGCTTTACATTTAAAAGAACAAGCGGGTCGTCAGTCAAAAAAGAAGAAGAGGTTCTTACAGGCAATCTCCAAACATTGAGAATA',
  'CTTACAGGCAATCTCCAAACATTGAGAATAAGAGTACATGAGGGGTATGAGGAGTTCACAATGGTGGGGAAAAGAGCAACAGCTATACTAAGAAAAGCAACCAGAAGATTGGTTCAACTCATAGTGAGTGGAAGAGACGAACAGTCAATAGCCGAAGCAATAATCGTGGCCATGGTGTTTTCACAAGAAGATTGCATGATAAAAGCA',
  'TTTTCACAAGAAGATTGCATGATAAAAGCAGTTAGAGGTGACCTGAATTTTGTCAACAGAGCAAATCAGCGGTTGAACCCCATGCATCAGCTTTTAAGGCATTTTCAGAAAGATGCAAAAGTACTCTTTCAAAATTGGGGAGTTGAACACATCGACAGTGTGATGGGAATGGTTGGAGTATTACCAGATATGACTCCAAGCACAGAG',
  'GTATTACCAGATATGACTCCAAGCACAGAGATGTCAATGAGAGGAATAAGAGTCAGCAAAATGGGTGTGGATGAATACTCCAGTACAGAGAGGGTGGTGGTTAGCATTGATCGGTTTTTGAGAGTTCGAGACCAACGTGGGAATGTATTATTATCTCCTGAGGAGGTCAGTGAAACACAGGGAACTGAAAGACTGACAATAACTTAT',
  'CAGGGAACTGAAAGACTGACAATAACTTATTCATCGTCGATGATGTGGGAGATTAACGGTCCCGAGTCGGTTTTGGTCAATACCTATCAATGGATCATCAGGAATTGGGAAGCTGTCAAAATTCAATGGTCTCAGAATCCTGCAATGTTGTACAACAAAATGGAATTTGAACCATTTCAATCTTTAGTTCCCAAGGCCACTAGAAGC',
  'CAATCTTTAGTTCCCAAGGCCACTAGAAGCCAATACAGTGGGTTTGTCAGAACTCTATTCCAACAAATGAGAGACGTACTTGGGACATTTGACACTGCCCAGATAATAAAGCTTCTCCCTTTTGCAGCTGCTCCACCGAAGCAAAGCAGAATGCAGTTCTCTTCACTGACTGTGAATGTGAGGGGATCAGGGATGAGAATACTTGTA',
  'GTGAGGGGATCAGGGATGAGAATACTTGTAAGGGGCAATTCTCCTGTATTCAACTACAACAAGACCACTAAAAGGCTAACAATTCTTGGAAAAGATGCCGGCACTTTAATTGAAGACCCAGATGAAAGCACATCCGGAGTGGAGTCCGCCGTCTTGAGAGGGTTCCTCATTATAGGTAAAGAAGACAGAAGATACGGACCAGCATTA',
  'AAAGAAGACAGAAGATACGGACCAGCATTAAGCATCAATGAACTGAGTAACCTTGCAAAAGGGGAAAAGGCTAATGTGCTAATTGGGCAAGGAGACGTGGTGTTGGTAATGAAACGAAAACGGGACTCTAGTATACTTACTGACAGCCAGACAGCGACCAAAAGAATTCGGATGGCCATCAATTAA'],
 ['ATGGAAAGAATAAAAGAACTACGGAATCTAATGTCGCAGTCTCGCACTCGCGAGATACTGACAAAAACCACAGTGGACCATATGGCCATAATTAAGAAGTACACATCGGGGAGACAGGAAAAGAACCCGTCACTTAGGATGAAATGGATGATGGCAATGAAATATCCAATCACTGCT',
  'ATGATGGCAATGAAATATCCAATCACTGCTGACAAAAGGATAACAGAAATGGTTCCGGAGAGAAATGAACAAGGACAAACTCTATGGAGTAAAATGAGTGATGCTGGTTCAGATAGAGTGATGGTATCACCTTTGGCTGTAACATGGTGGAATAGGAATGGACCCGTGACAAGTACGGTCCATTACCCAAAAGTGTACAAAACTTAT',
  'GTCCATTACCCAAAAGTGTACAAAACTTATTTCGACAAAGTCGAAAGGTTAAAACATGGAACCTTTGGCCCTGTCCATTTTAGAAATCAAGTCAAGATACGCAGAAGAGTAGACATAAACCCTGGTCATGCAGACCTCAGTGCCAAAGAGGCACAAGATGTAATTATGGAAGTTGTTTTTCCCAATGAAGTGGGAGCCAGAATACTA',
  'TTTCCCAATGAAGTGGGAGCCAGAATACTAACATCAGAATCACAGCTAACAATAACTAAAGAGAAAAAAGAAGAACTCCGAGATTGCAAAATTTCTCCCTTGATGGTCGCATACATGCTAGAGAGAGAACTTGTGCGGAAAACAAGATTTCTCCCAGTTGCTGGCGGAACAAGCAGTATATACATTGAAGTTTTACATTTGACTCAA',
  'ATATACATTGAAGTTTTACATTTGACTCAAGGAACGTGTTGGGAACAAATGTACACTCCAGGTGGAGGAGTGAGGAATGACGATGTTGACCAAAGCCTAATTATTGCGGCCAGGAACATAGTAAGAAGAGCCGCAGTGTCAGCAGATCCACTAGCATCTTTATTGGAGATGTGCCACAGCACGCAAATTGGCGGAACAAGGATGGTG',
  'AGCACGCAAATTGGCGGAACAAGGATGGTGGACATTCTTAGACAGAACCCGACTGAAGAACAAGCTGTGGATATATGCAAGGCTGCAATGGGACTGAGAATCAGCTCATCCTTCAGCTTTGGTGGCTTTACATTTAAAAGAACAAGCGGGTCGTCAGTCAAAAAAGAAGAAGAGGTTCTTACAGGCAATCTCCAAACATTGAGAATA',
  'CTTACAGGCAATCTCCAAACATTGAGAATAAGAGTACATGAGGGGTATGAGGAGTTCACAATGGTGGGGAAAAGAGCAACAGCTATACTAAGAAAAGCAACCAGAAGATTGGTTCAACTCATAGTGAGTGGAAGAGACGAACAGTCAATAGCCGAAGCAATAATCGTGGCCATGGTGTTTTCACAAGAAGATTGCATGATAAAAGCA',
  'TTTTCACAAGAAGATTGCATGATAAAAGCAGTTAGAGGTGACCTGAATTTTGTCAACAGAGCAAATCAGCGGTTGAACCCCATGCATCAGCTTTTAAGGCATTTTCAGAAAGATGCGAAAGTACTCTTTCAAAATTGGGGAGTTGAACACATCGACAGTGTGATGGGAATGGTTGGAGTATTACCAGATATGACTCCAAGCACAGAG',
  'GTATTACCAGATATGACTCCAAGCACAGAGACGTCAATGAGAGGAATAAGAGTCAGCAAAATGGGTGTGGATGAATACTCCAGTACAGAGAGGGTGGTGGTTAGCATTGATCGGTTTTTGAGAGTTCGAGACCAACGTGGGAATGTATTATTATCTCCTGAGGAGGTCAGTGAAACACAGGGAACTGAAAGACTGACAATAACTTAT',
  'CAGGGAACTGAAAGACTGACAATAACTTATTCATCGTCGATGATGTGGGAGATTAACGGTCCTGAGTCGGTTTTGGTCAATACCTATCAATGGATCATCAGGAATTGGGAAGCTGTCAAAATTCAATGGTCTCAGAATCCTGCAATGTTGTACAACAAAATGGAATTTGAACCATTTCAATCTTTAGTCCCCAAGGCCACTAGAAGC',
  'CAATCTTTAGTCCCCAAGGCCACTAGAAGCCAATACAGTGGGTTTGTCAGAACTCTATTCCAACAAATGAGAGACGTACTTGGGACATTTGACACTGCCCAGATAATAAAGCTTCTCCCTTTTGCAGCTGCTCCACCGAAGCAAAGCAGAATGCAGTTCTCTTCACTGACTGTGAATGTGAGGGGATCAGGGATGAGAATACTTGTA',
  'GTGAGGGGATCAGGGATGAGAATACTTGTAAGGGGCAATTCTCCTGTATTCAACTACAACAAGACCACTAAAAGGCTAACAATTCTTGGAAAAGATGCCGGTACTTTAATTGAAGACCCAGATGAAAGCACATCCGGAGTGGAGTCCGCCGTCTTGAGAGGGTTCCTCATTATAGGTAAAGAAGACAGAAGATACGGACCAGCATTA',
  'AAAGAAGACAGAAGATACGGACCAGCATTAAGCATCAATGAACTGAGTAACCTTGCAAAAGGGGAAAAGGCTAATGTGCTAATTGGGCAAGGAGACGTGGTGTTGGTAATGAAACGAAAACGGGACTCTAGTATACTTACTGACAGCCAGACAGCGACCAAAAGAATTCGGATGGCCATCAATTAA'],
 ['ATGGAAAGAATAAAAGAACTACGGAATCTGATGTCGCAGTCTCGCACTCGCGAGATACTGACAAAAACCACAGTGGACCATATGGCCATAATTAAGAAGTACACATCAGGGAGACAGGAAAAGAACCCGTCACTTAGGATGAAATGGATGATGGCAATGAAATATCCAATCACTGCT',
  'ATGATGGCAATGAAATATCCAATCACTGCTGACAAAAGGGTCACAGAAATGGTTCCGGAGAGAAATGAACAAGGACAAACTCTATGGAGTAAAATGAGTGATGCTGGATCAGATAGAGTGATGGTATCACCTTTGGCTGTAACATGGTGGAATAGGAATGGACCCGTGACAAGTACTGTCCATTACCCAAAAGTGTACAAAACTTAT',
  'GTCCATTACCCAAAAGTGTACAAAACTTATTTCGATAAAGTCGAAAGGTTAAAACATGGAACCTTTGGCCCTGTCCATTTCAGAAATCAAGTCAAGATACGCAGAAGAGTAGACATAAACCCTGGTCATGCAGACCTCAGTGCCAAAGAGGCACAAGATGTAATTATGGAAGTTGTTTTTCCCAATGAAGTGGGAGCCAGAATACTA',
  'TTTCCCAATGAAGTGGGAGCCAGAATACTAACATCAGAATCACAACTAACAATAACTAAAGAGAAAAAAGAAGAACTCCGAGATTGCAAAATTTCTCCCTTGATGGTCGCATACATGTTAGAGAGAGAACTTGTGCGAAAAACAAGATTTCTCCCAGTTGCTGGCGGAACAAGCAGTATATACATTGAAGTTTTACATCTGACTCAA',
  'ATATACATTGAAGTTTTACATCTGACTCAAGGAACGTGTTGGGAACAAATGTACACTCCAGGTGGAGGAGTGAGGAATGACGATGTTGACCAAAGTTTAATTATTGCGGCCAGGAACATAGTAAGAAGAGCCGCAGTATCAGCAGATCCACTAGCATCTTTATTGGAGATGTGCCACAGCACGCAAATTGGCGGAACAAGGATGGTG',
  'AGCACGCAAATTGGCGGAACAAGGATGGTGGACATTCTTAGACAGAACCCGACTGAAGAACAAGCTGTGGATATATGCAAGGCTGCAATGGGATTGAGAATCAGCTCATCCTTCAGCTTTGGTGGCTTTACATTTAAAAGAACAAGCGGGTCGTCAGTCAAAAAAGAAGAAGAGGTGCTTACAGGCAATCTCCAAACATTGAGAATA',
  'CTTACAGGCAATCTCCAAACATTGAGAATAAGAGTACATGAGGGGTATGAGGAGTTCACAATGGTGGGGAAAAGAGCAACAGCTATACTAAGAAAAGCAACCAGAAGATTGGTCCAACTCATAGTGAGTGGAAAAGACGAACAGTCAATAGCCGAAGCAATAATCGTGGCCATGGTGTTTTCACAAGAAGATTGCATGATAAAAGCA',
  'TTTTCACAAGAAGATTGCATGATAAAAGCAGTTAGGGGTGACCTGAATTTTGTCAACAGAGCAAATCAGCGGTTGAACCCCATGCATCAGCTTTTAAGGCATTTTCAGAAAGATGCGAAAGTGCTCTTTCAAAATTGGGGAGTTGAACACATCGACAGTGTGATGGGAATGGTTGGGGTATTACCAGATATGACTCCAAGCACAGAG',
  'GTATTACCAGATATGACTCCAAGCACAGAGATGTCAATGAGAGGAATAAGAGTCAGCAAAATGGGTGTGGATGAATACTCCAGTACAGAGAGGGTGGTGGTTAGCATTGATCGGTTTTTGAGAGTTCGAGACCAACGTGGAAATGTATTATTATCTCCTGAGGAGGTCAGTGAAACACAGGGAACTGAGAGACTGACAATAACTTAT',
  'CAGGGAACTGAGAGACTGACAATAACTTATTCATCGTCGATGATGTGGGAGATTAATGGTCCTGAGTCGGTTTTGGTCAATACCTATCAATGGATCATCAGGAATTGGGAAGCTATCAAAATTCAATGGTCTCAGAATCCTGCAATGTTGTACAACAAAATGGAATTTGAACCATTTCAATCTTTAGTCCCCAAGGCCATTAGAAGC',
  'CAATCTTTAGTCCCCAAGGCCATTAGAAGCCAATATAGTGGGTTTGTCAGAACTCTATTCCAACAAATGAGAGACGTACTTGGGACATTTGACACTGCCCAGGTAATAAAGCTTCTCCCTTTTGCAGCTGCTCCACCGAAGCAAAGCAGAATGCAGTTCTCTTCACTGACTGTAAATGTGAGGGGATCAGGGATGAGAATACTTGTA',
  'GTGAGGGGATCAGGGATGAGAATACTTGTAAGGGGCAATTCTCCTGTATTCAACTACAACAAGACCACTAAAAGGCTAACAATTCTCGGGAAAGATGCCGGCACTTTAATTGAAGACCCAGATGAAAGCACATCCGGAGTGGAGTCCGCCGTCTTGAGAGGGTTCCTCATTATAGGTAAGGAAGACAGAAGGTACGGACCAGCATTA',
  'AAGGAAGACAGAAGGTACGGACCAGCATTAAGCATCAATGAATTGAGTAACCTCGCAAAAGGGGAAAAGGCTAATGTGCTAATCGGGCAAGGAGATGTGGTGTTGGTAATGAAACGAAAACGGGACTCTAGCATACTTACTGACAGCCAGACAGCGACCAAAAGAATTCGGATGGCCATCAATTAA'],
 ['ATGGAAAGAATAAAAGAACTGCGGAATCTGATGTCGCAGTCTCGCACTCGCGAGATACTGACAAAAACCACAGTGGACCATATGGCCATAATTAAGAAGTACACATCAGGGAGACAGGAAAAGAACCCATCACTTAGGATGAAATGGATGATGGCAATGAAATATCCAATCACTGCT',
  'ATGATGGCAATGAAATATCCAATCACTGCTGACAAAAGGGTCACAGAAATGGTTCCGGAGAGAAATGAACAAGGACAAACTCTATGGAGTAAAATGAGTGATGCTGGATCAGATAGAGTGATGGTATCACCTTTGGCTGTAACATGGTGGAATAGGAATGGACCCGTGACAAGTACGGTCCATTACCCAAAAGTGTACAAAACTTAT',
  'GTCCATTACCCAAAAGTGTACAAAACTTATTTCGACAAAGTCGAAAGGTTAAAACATGGAACCTTTGGCCCTGTCCATTTCAGAAATCAAGTCAAGATACGCAGGAGAGTAGACATAAACCCTGGTCATGCAGACCTCAGTGCCAAAGAGGCACAAGATGTAATTATGGAAGTTGTTTTTCCCAATGAAGTGGGAGCCAGAATACTA',
  'TTTCCCAATGAAGTGGGAGCCAGAATACTAACATCAGAATCACAACTAACAATAACTAAAGAGAAAAAAGAAGAACTCCGAGATTGCAAAATTTCTCCCTTGATGGTCGCATACATGTTAGAGAGAGAACTTGTGCGAAAAACAAGATTTCTCCCAGTTGCTGGCGGAACAAGCAGTATATACATTGAAGTTTTACATCTGACTCAA',
  'ATATACATTGAAGTTTTACATCTGACTCAAGGAACGTGTTGGGAACAAATGTACACTCCAGGTGGAGGAGTGAGGAATGACGATGTTGACCAAAGTTTAATTATTGCGGCCAGGAACATAGTAAGAAGAGCCGCAGTATCAGCAGATCCACTAGCATCTTTATTGGAGATGTGCCACAGCACGCAAATTGGCGGAACAAGGATGGTG',
  'AGCACGCAAATTGGCGGAACAAGGATGGTGGACATTCTTAGACAGAACCCGACTGAAGAACAAGCTGTGGATATATGCAAGGCTGCAATGGGATTGAGAATCAGCTCATCCTTCAGCTTTGGTGGCTTCACATTTAAAAGAACAAGCGGGTCGTCAGTCAAAAAAGAAGAAGAGGTGCTTACAGGCAATCTCCAAACATTGAGAATA',
  'CTTACAGGCAATCTCCAAACATTGAGAATAAGAGTACATGAGGGGTATGAGGAGTTCACAATGGTGGGGAAAAGAGCAACAGCTATACTAAGAAAAGCAACCAGAAGATTGGTCCAACTCATAGTGAGTGGAAAAGACGAACAGTCAATAGCCGAAGCAATAATCGTGGCCATGGTGTTTTCACAAGAAGATTGCATGATAAAAGCA',
  'TTTTCACAAGAAGATTGCATGATAAAAGCAGTTAGGGGTGACCTGAATTTTGTCAACAGAGCAAATCAGCGGTTGAACCCCATGCATCAGCTTTTAAGGCATTTTCAGAAAGATGCGAAAGTGCTCTTTCAAAATTGGGGAGTTGAACACATCGACAGTGTGATGGGAATGGTTGGGGTATTACCAGATATGACTCCAAGCACAGAG',
  'GTATTACCAGATATGACTCCAAGCACAGAGATGTCAATGAGAGGAATAAGAGTCAGCAAAATGGGTGTGGATGAATACTCCAGTACAGAGAGGGTGGTGGTTAGCATTGATCGGTTTTTGAGAGTTCGAGACCAACGTGGAAATGTATTATTATCTCCTGAGGAGGTCAGTGAAACACAGGGAACTGAGAGACTGACAATAACTTAT',
  'CAGGGAACTGAGAGACTGACAATAACTTATTCATCGTCGATGATGTGGGAGATTAATGGTCCTGAGTCGGTTTTGGTCAATACCTATCAATGGATCATCAGGAATTGGGAAGCTGTCAAAATTCAATGGTCTCAGAATCCTGCAATGTTGTACAACAAAATGGAATTTGAACCATTTCAATCTTTAGTCCCCAAGGCCATTAGAAGC',
  'CAATCTTTAGTCCCCAAGGCCATTAGAAGCCAATATAGTGGGTTTGTCAGAACTCTATTCCAACAAATGAGAGACGTACTTGGGACATTTGACACTGCCCAGATAATAAAGCTTCTCCCTTTTGCAGCTGCTCCACCGAAGCAAAGCAGAATGCAGTTCTCTTCACTGACTGTAAATGTGAGGGGATCAGGGATGAGAATACTTGTA',
  'GTGAGGGGATCAGGGATGAGAATACTTGTAAGGGGCAATTCTCCTGTATTCAACTACAACAAGACCACTAAAAGGCTAACAATTCTCGGAAAAGATGCCGGCACTTTAATTGAAGACCCAGATGAAAGCACATCCGGAGTGGAGTCCGCCGTCTTGAGAGGGTTCCTCATTATAGGTAAGGAGGACAGAAGGTACGGACCAGCATTA',
  'AAGGAGGACAGAAGGTACGGACCAGCATTAAGCATCAATGAACTGAGTAACCTTGCAAAAGGGGAAAAGGCTAATGTGCTAATCGGGCAAGGAGATGTGGTGTTGGTAATGAAACGAAAACGGGACTCTAGCATACTTACTGACAGCCAGACAGCGACCAAAAGAATTCGGATGGCCATCAATTAA']]

In [ ]:
def similar(overlap1, overlap2):
    """
    Return the difflib similarity ratio (0.0 to 1.0) between two sequences.

    overlap1: first string to compare
    overlap2: second string to compare

    The ratio is 2*M/T, where M is the number of matched characters and T is
    the combined length of both strings.
    """
    # autojunk must be off for DNA: with the default (True), any character that
    # makes up more than ~1% of a second string of 200+ characters is treated as
    # noise and ignored, which is every base in a four-letter alphabet.
    # For strings shorter than 200 characters the result is unchanged.
    return SequenceMatcher(None, overlap1, overlap2, autojunk=False).ratio()

def gen_graph(tile_list, overlap, threshold, sources, targets, master_tile_list, verbose=False):
    """
    Build a directed graph of tiles that can anneal end-to-start and count the
    source-to-target paths through it.

    tile_list: list of all tiles from all sequences
    overlap: length of overlaps
    threshold: fraction from 0 to 1 indicating the minimum similarity needed for two tiles to anneal
    sources: list of 5' end fragments
    targets: list of 3' end fragments
    master_tile_list: list of tile lists, one per original sequence, used to tell original paths from new ones
    verbose: if True, print the two overlap strings for every edge added

    Returns a tuple (original, new): the number of distinct paths that exactly
    reproduce an entry of master_tile_list, and the number that do not.
    """
    # Nodes are keyed by tile string, so identical tiles are one node anyway;
    # de-duplicating first avoids redundant comparisons (order is preserved).
    unique_tiles = list(dict.fromkeys(tile_list))

    T = nx.DiGraph()
    T.add_nodes_from(unique_tiles)

    # Edges are directional (end of tile1 -> start of tile2), so every ordered
    # pair has to be tested; combinations() would only test pairs in list order.
    for tile1, tile2 in itertools.permutations(unique_tiles, 2):
        tail = tile1[-overlap:]
        head = tile2[:overlap]
        # Similarity at or above the threshold counts; exact float equality
        # would almost never be satisfied.
        if similar(tail, head) >= threshold:
            if verbose:
                print(tail, head)
            T.add_edge(tile1, tile2)

    # Tuples are hashable, giving constant-time lookup of original paths.
    original_paths = {tuple(seq_tiles) for seq_tiles in master_tile_list}

    original = 0
    new = 0
    # De-duplicate the endpoints so that sequences sharing an identical first
    # or last tile do not make the same path count more than once.
    unique_sources = list(dict.fromkeys(sources))
    unique_targets = list(dict.fromkeys(targets))
    for source, target in itertools.product(unique_sources, unique_targets):
        for path in nx.all_simple_paths(T, source, target):
            if tuple(path) in original_paths:
                original += 1
            else:
                new += 1
    return (original, new)

# Count the original and new source-to-target paths, passing only the tile
# strings (second element of each [position, tile] pair) as graph nodes.
original, new = gen_graph(
    [tile_seq for _position, tile_seq in tile_list_index],
    gt.OVERLAP,
    THRESHOLD,
    sources,
    targets,
    master_tile_list,
)
original, new

In [ ]:


In [44]:


In [79]:


In [ ]:


In [ ]:
# Write the records to a FASTA file, then align that file with the command
# wrapper `cmd`.
# NOTE(review): in_file, records, SeqIO and cmd are not defined in any visible
# cell — presumably SeqIO is Bio.SeqIO and cmd is a Clustal Omega command-line
# wrapper; confirm and import/define them before running this cell.
# The `with` block closes (and flushes) the file before the aligner reads it.
with open(in_file, 'w') as handle:
    SeqIO.write(records, handle, "fasta")
# NOTE(review): the output is requested in FASTA format even though the file is
# named "aligned.clustal".
out_file = "aligned.clustal"
clustalomega_cline = cmd(infile=in_file, outfile=out_file, verbose=True, auto=True, outfmt="fasta", force=True, seqtype="DNA")
clustalomega_cline()

In [ ]:
#dictionary of lengths of overlaps attached to each tile number