In [30]:
from screed.fasta import fasta_iter

In [18]:
with open("../outputs/rna/match_ortho.rna.moleculo.txt") as rna_mol_match:
    rna_mol_set = set([line.split(' ')[0] for line in rna_mol_match if line.strip()])

with open("../outputs/rna/match_ortho.rna.ref.txt") as rna_ref_match:
    rna_ref_set = set([line.split(' ')[0] for line in rna_ref_match if line.strip()])
    
with open("../outputs/rna/ortho.fa") as ortho_file:
    E = set([line.split(' ')[0][1:] for line in ortho_file if line.startswith('>')])

In [20]:
B = rna_mol_set & rna_ref_set

In [21]:
C = rna_mol_set - B

In [22]:
A = rna_ref_set - B

In [24]:
D = E - A - C - B

In [53]:
def write_output_fasta(outfile, seqfile, keys):
    n_written = 0
    with open(outfile, 'w') as output:
        with open(seqfile, 'r') as sequences:
            for seq in fasta_iter(sequences):
                if seq['name'] in keys:
                    output.write(">{name} {description}\n{sequence}\n".format(**seq))
                    n_written += 1
    return n_written

In [54]:
write_output_fasta("../outputs/rna/cobb/A-ref_rna.fa", "../outputs/rna/ortho.fa", A)
write_output_fasta("../outputs/rna/cobb/B-ref_rna_mol.fa", "../outputs/rna/ortho.fa", B)
write_output_fasta("../outputs/rna/cobb/C-mol_rna.fa", "../outputs/rna/ortho.fa", C)
write_output_fasta("../outputs/rna/cobb/D-only_rna.fa", "../outputs/rna/ortho.fa", D)


Out[54]:
3682

In [ ]: