In [1]:
from os.path import expandvars
from skbio import Alignment, SequenceCollection
from skbio.alignment import global_pairwise_align_nucleotide
from skbio import DNA
core_set_fp = expandvars("$HOME/data/greengenes_core_sets/core_set_aligned.fasta.imputed")
core_set = Alignment.read(core_set_fp)
otus_85_ssu_fp = expandvars('$HOME/data/gg_13_8_otus/rep_set_aligned/85_otus.fasta')
otus_85_ssu = Alignment.read(otus_85_ssu_fp)
otus_85_pyn_fp = expandvars('$HOME/data/gg_13_8_otus/rep_set_aligned/85_otus.pynast.fasta')
otus_85_pyn = Alignment.read(otus_85_pyn_fp)
otus_85_unaligned_fp = expandvars('$HOME/data/gg_13_8_otus/rep_set/85_otus.fasta')
otus_85_unaligned = SequenceCollection.read(otus_85_unaligned_fp)
core_set = core_set.degap()
otus_85_ssu = otus_85_ssu.degap()
otus_85_pyn = otus_85_pyn.degap()
In [2]:
otus_85_unaligned
Out[2]:
In [3]:
otus_85_pyn
Out[3]:
In [4]:
otus_85_ssu
Out[4]:
In [ ]:
not_equal = []
for e in otus_85_pyn.ids():
if str(otus_85_pyn[e]) != str(otus_85_unaligned[e]):
try:
not_equal.append((e, global_pairwise_align_nucleotide(str(otus_85_pyn[e]), str(otus_85_unaligned[e])).distances()[0,1]))
except ValueError:
not_equal.append((e, "Sequence contains N's, skipping."))
print not_equal[-1]
print len(not_equal)
Misc. notes from here...
In [17]:
print global_pairwise_align_nucleotide(str(otus_85_pyn['674285']), str(otus_85_unaligned['674285'])).distances()[0,1]
In [14]:
core_set.distribution_stats()
Out[14]:
In [15]:
otus_85.distribution_stats()
Out[15]:
In [108]:
otus_85_pyn.distribution_stats()
Out[108]:
In [52]:
common_ids = list(set(otus_85.ids()) & (set(core_set.ids())))
len(common_ids)
print common_ids
In [23]:
otus_85['203503']
Out[23]:
In [39]:
core_set['203503']
Out[39]:
In [38]:
core_set['90040']
Out[38]:
In [29]:
denovo1009 = DNA("GACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGATGAAGCCTAGCTTGCTAGGTGGATTAGTGGCGAACGGGTGAGTAATACGTGAGTAACCTACCTTTAACTCTGGGATAAGCCCGGGAAACTGGGTCTAATACCGGATACGACCAATCTTCGCATGGGGGTTGGTGGAAAGGTTTGTTCTGGTGGGGGATGGGCTCGCGGCCTATCAGCTTGTTGGTGGGGTGATGGCCTACCAAGGCTTTGACGGGTAGCCGGCCTGAGAGGGTGACCGGTCACATTGGGACTGAGATACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGCGAAAGCCTGATGCAGCGACGCCGCGTGAGGGATGGAGGCCTTCGGGTTGTAAACCTCTTTCGCTCATGGTCAAGCCGCAACTGTGGGTTGTGGTGAGGGTAGTGGGGTAAAGAAGCGCCGGGCTAACTCCGTGCCAGCAGCCGCGGTAATGACTGCCAAGGG")
In [40]:
ac = global_pairwise_align_nucleotide(denovo1009, core_set['90040'])
In [41]:
print(ac.to_fasta())
In [58]:
ac = global_pairwise_align_nucleotide(otus_85['214270'], str(core_set['214270']).upper())
In [60]:
print(ac.to_fasta())
In [68]:
ac = global_pairwise_align_nucleotide(denovo1009, otus_85['214270'])
print ac.to_fasta()
In [61]:
In [64]:
In [67]:
print len(otus_85_unaligned['214270'])
print len(otus_85['214270'])
In [89]:
seqs_515_fp = "/Users/caporaso/code/short-read-tax-assignment/data/simulated-community/B1-iter0/rep_set.fna"
seqs_515 = SequenceCollection.read(seqs_515_fp)
In [97]:
print set(seqs_515.ids()) & set(otus_85.ids()) & set(core_set.ids())
In [103]:
print global_pairwise_align_nucleotide(str(otus_85['41280']), str(seqs_515['41280'])).to_fasta()
In [105]:
print global_pairwise_align_nucleotide(str(core_set['41280']).upper(), str(seqs_515['41280'])).to_fasta()
In [ ]: