In [1]:
from os.path import expandvars
from skbio import Alignment, SequenceCollection
from skbio.alignment import global_pairwise_align_nucleotide 
from skbio import DNA

# Input FASTA locations; $HOME is expanded from the environment.
core_set_fp = expandvars("$HOME/data/greengenes_core_sets/core_set_aligned.fasta.imputed")
otus_85_ssu_fp = expandvars('$HOME/data/gg_13_8_otus/rep_set_aligned/85_otus.fasta')
otus_85_pyn_fp = expandvars('$HOME/data/gg_13_8_otus/rep_set_aligned/85_otus.pynast.fasta')
otus_85_unaligned_fp = expandvars('$HOME/data/gg_13_8_otus/rep_set/85_otus.fasta')

# The three aligned files are read as alignments and degapped right away,
# so each of these names ends up bound to the gap-free sequences.
core_set = Alignment.read(core_set_fp).degap()
otus_85_ssu = Alignment.read(otus_85_ssu_fp).degap()
otus_85_pyn = Alignment.read(otus_85_pyn_fp).degap()

# The rep_set file is read directly as a sequence collection (no degap step).
otus_85_unaligned = SequenceCollection.read(otus_85_unaligned_fp)

In [2]:
# Show the summary repr of the collection read from rep_set/85_otus.fasta.
otus_85_unaligned


Out[2]:
<SequenceCollection: n=5088; mean +/- std length=1430.19 +/- 70.99>

In [3]:
# Show the summary repr of the degapped sequences from 85_otus.pynast.fasta.
otus_85_pyn


Out[3]:
<SequenceCollection: n=4797; mean +/- std length=1425.03 +/- 60.45>

In [4]:
# Show the summary repr of the degapped sequences from rep_set_aligned/85_otus.fasta.
otus_85_ssu


Out[4]:
<SequenceCollection: n=5088; mean +/- std length=1269.15 +/- 44.56>

In [ ]:
# For every ID in the degapped PyNAST collection, compare its sequence with
# the same-ID sequence from the unaligned rep set. Whenever the two strings
# differ, record (id, distance), where the distance is entry [0, 1] of the
# distance matrix of their global pairwise alignment.
not_equal = []
for seq_id in otus_85_pyn.ids():
    # Look up and stringify each sequence once per ID.
    pyn_seq = str(otus_85_pyn[seq_id])
    unaligned_seq = str(otus_85_unaligned[seq_id])
    if pyn_seq == unaligned_seq:
        continue
    try:
        pair_aln = global_pairwise_align_nucleotide(pyn_seq, unaligned_seq)
        outcome = pair_aln.distances()[0, 1]
    except ValueError:
        # NOTE(review): every ValueError is reported as an "N" problem; the
        # actual cause is not inspected here -- confirm this is the only
        # ValueError the aligner can raise for these inputs.
        outcome = "Sequence contains N's, skipping."
    not_equal.append((seq_id, outcome))
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print(not_equal[-1])
print(len(not_equal))


('1110814', 0.0013458950201884253)
('1109993', 0.0013605442176870747)
('1109948', 0.0012978585334198572)
('1107031', 0.0013395847287340924)
('851865', 0.0022865853658536584)
('851822', 0.0020891364902506965)
('847804', 0.00069444444444444447)
('819699', 0.00067842605156037987)
('813266', 0.00067159167226326397)
('810721', 0.0019815059445178335)
('769319', 0.0019442644199611147)
('674402', 0.0013183915622940012)
('674285', 0.00068540095956134343)
('660890', 0.0013360053440213762)
('566326', 0.0013395847287340924)
('566061', 0.0013395847287340924)
('556736', 0.00071530758226037196)
('550826', 0.002054794520547945)
('535799', 0.0014336917562724014)
('529491', 0.006405693950177936)
('528234', 0.00071530758226037196)
('522863', 0.0014440433212996389)
('517210', "Sequence contains N's, skipping.")
('514547', "Sequence contains N's, skipping.")
('512076', 0.00071684587813620072)
('511139', 0.0014825796886582653)
('510608', 0.0014285714285714286)
('354920', 0.0064148253741981472)
('353530', 0.0014367816091954023)
('349931', 0.0019933554817275745)
('347595', 0.00065746219592373442)
('346914', 0.0014357501794687725)
('344506', "Sequence contains N's, skipping.")
('343387', 0.0019880715705765406)
('343009', 0.001440922190201729)
('340937', 0.00066889632107023408)
('340665', 0.0014336917562724014)
('339598', 0.00067294751009421266)
('339005', 0.0019973368841544607)
('338374', 0.0064655172413793103)

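Miscellaneous notes from here on. The cells below are exploratory and were executed out of order (see the non-sequential `In [N]` counters).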


In [17]:
# Distance between the PyNAST-derived and unaligned versions of sequence
# '674285': entry [0, 1] of the distance matrix of their global alignment.
# print(...) with one argument behaves the same on Python 2 and Python 3.
pyn_674285 = str(otus_85_pyn['674285'])
unaligned_674285 = str(otus_85_unaligned['674285'])
print(global_pairwise_align_nucleotide(pyn_674285, unaligned_674285).distances()[0, 1])


0.000685400959561

In [14]:
# Length statistics for the degapped core set; the recorded result is a
# 3-tuple that reads as (sequence count, mean length, std of length).
core_set.distribution_stats()


Out[14]:
(4938, 1521.4262859457269, 24.17461148474332)

In [15]:
# Length statistics for the degapped SSU-aligned 85_otus sequences.
# This cell originally used the name `otus_85`, which no cell defines; the
# recorded result (5088 sequences, mean ~1269.15) is that of `otus_85_ssu`.
otus_85_ssu.distribution_stats()


Out[15]:
(5088, 1269.1540880503144, 44.555375046698863)

In [108]:
# Length statistics for the degapped PyNAST sequences; the recorded result is
# a 3-tuple that reads as (sequence count, mean length, std of length).
otus_85_pyn.distribution_stats()


Out[108]:
(4797, 1425.0273087346259, 60.454162283929563)

In [52]:
# IDs present in both the SSU-aligned 85_otus sequences and the core set.
# `otus_85_ssu` replaces the undefined name `otus_85` used originally.
ssu_ids = set(otus_85_ssu.ids())
core_ids = set(core_set.ids())
common_ids = list(ssu_ids & core_ids)
# A bare `len(common_ids)` in the middle of a cell is evaluated and thrown
# away, so it is left out; call len(common_ids) in its own cell for the count.
print(common_ids)


['203503', '103940', '161507', '74066', '35786', '88193', '101764', '144333', '227846', '203137', '201355', '4146', '158611', '107881', '109057', '177228', '217784', '138007', '220160', '193092', '202746', '2917', '179300', '151811', '3490', '207151', '98761', '99461', '131207', '193598', '138301', '93530', '1663', '140806', '159742', '52397', '92430', '161484', '139526', '17862', '223031', '114856', '174109', '217989', '144625', '43971', '104067', '111274', '206066', '140938', '211314', '137541', '172015', '198375', '134265', '102448', '153802', '170060', '79590', '155181', '182630', '180127', '109624', '214891', '179000', '184252', '224506', '8194', '100246', '3892', '88848', '180402', '33606', '38917', '131533', '209414', '148047', '184401', '106627', '112651', '145658', '51807', '153746', '184661', '110224', '16195', '209483', '114724', '222095', '191213', '166367', '162211', '159293', '218559', '51154', '233513', '79191', '66919', '136119', '208896', '19806', '141088', '227130', '202139', '170145', '182569', '32172', '234397', '218058', '28592', '2534', '67842', '109663', '21', '105940', '6321', '141010', '90217', '58625', '203454', '150102', '110393', '150264', '55626', '155396', '13725', '131162', '210997', '192222', '232590', '158624', '60618', '211918', '33001', '148676', '32823', '151965', '213210', '142736', '17507', '161403', '161407', '1711', '175358', '140122', '201138', '7', '56503', '176242', '82334', '209806', '153017', '153403', '150680', '150441', '106474', '148318', '95345', '28705', '27500', '149351', '193107', '74136', '220200', '159367', '201556', '177176', '136895', '1006', '193666', '178045', '1222', '1220', '191032', '35350', '183051', '80101', '143399', '186578', '52036', '216234', '216235', '106876', '170175', '231994', '158117', '114082', '107797', '111582', '217227', '180430', '164402', '209664', '173296', '184873', '90508', '185977', '162509', '883', '203525', '221562', '159945', '108724', '19323', '215574', '888', '15704', '203529', 
'43314', '159203', '93487', '217717', '22294', '110020', '200608', '165300', '110455', '223835', '206331', '44257', '198577', '44156', '183870', '771', '164293', '772', '222456', '109549', '230258', '141239', '214270', '192697', '144924', '155335', '51926', '142268', '18783', '208292', '21860', '114305', '224817', '130062', '160813', '26708', '28429', '111025', '141395', '114578', '57451', '144458', '48617', '215025', '132414', '64372', '209786', '221463', '143944', '234668', '103462', '111958', '159862', '182617', '207920', '110470', '203449', '218453', '232927', '232888', '37026', '231758', '838', '206038', '146428', '58627', '203188', '180467', '92321', '149095', '105480', '225943', '221738', '93199', '107637', '145173', '108673', '91130', '200948', '161220', '185012', '146556', '170463', '213358', '150571', '112707', '37803', '151616', '141197', '159172', '45967', '159170', '1429', '105025', '2563', '158207', '137398', '178547', '166591', '99346', '183580', '196844', '140910', '202302', '143387', '51677', '207211', '143099', '144777', '174018', '218700', '183107', '178438', '41280', '107789', '163874', '195743', '218850', '165063', '161018', '3538', '200762', '89297', '43095', '221823', '151438', '134564', '102556', '10485', '98560', '220089', '144490', '111520', '195985', '169822', '142674', '164913', '13031', '181596', '32511', '145732', '160498', '181986', '45082', '2843', '101480', '234971', '189190', '171242', '160113', '100774', '135186', '6010', '143031', '135632', '143037', '146921', '132390', '199942', '20204', '104684', '139265', '52', '129486', '191378', '209792', '136414', '93838', '201939', '40435', '217293', '91225', '230061', '161126', '123055', '191183', '80284', '136950', '50807', '18466', '145762', '158641', '111406', '125020', '174974', '152225', '164420', '220131', '18867', '114291', '176499', '195325', '112809', '19343', '203276', '207228', '153923', '150191', '220728', '100328', '213436', '37498', '198417', '206757', '150741', '111066', 
'104534', '72055', '142856', '204874', '33564', '165553', '100020', '111614', '115256', '45149', '25562', '26642', '176053', '150632', '195496', '49', '111100', '182857', '101465', '2508', '42311', '141661', '34789', '200020', '197460', '136293', '79153', '160555', '34546', '101908', '49231', '32849', '148516', '63784', '152353', '185950', '224797', '145549']

In [23]:
# Look up sequence '203503' in the degapped SSU-aligned 85_otus collection
# (`otus_85_ssu`; the originally used name `otus_85` is not defined anywhere).
otus_85_ssu['203503']


Out[23]:
<BiologicalSequence: GTTGAACGCT... (length: 1217)>

In [39]:
# Look up the degapped core-set sequence with ID '203503'.
core_set['203503']


Out[39]:
<BiologicalSequence: agagtttgat... (length: 1498)>

In [38]:
# Look up the degapped core-set sequence with ID '90040'.
core_set['90040']


Out[38]:
<BiologicalSequence: AGAGTTTGAT... (length: 1530)>

In [29]:
# Query DNA sequence used as the first input of later pairwise alignments.
# NOTE(review): its provenance is not recorded in this notebook; the name
# suggests a de novo OTU representative -- confirm and document the source.
denovo1009 = DNA("GACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGATGAAGCCTAGCTTGCTAGGTGGATTAGTGGCGAACGGGTGAGTAATACGTGAGTAACCTACCTTTAACTCTGGGATAAGCCCGGGAAACTGGGTCTAATACCGGATACGACCAATCTTCGCATGGGGGTTGGTGGAAAGGTTTGTTCTGGTGGGGGATGGGCTCGCGGCCTATCAGCTTGTTGGTGGGGTGATGGCCTACCAAGGCTTTGACGGGTAGCCGGCCTGAGAGGGTGACCGGTCACATTGGGACTGAGATACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGCGAAAGCCTGATGCAGCGACGCCGCGTGAGGGATGGAGGCCTTCGGGTTGTAAACCTCTTTCGCTCATGGTCAAGCCGCAACTGTGGGTTGTGGTGAGGGTAGTGGGGTAAAGAAGCGCCGGGCTAACTCCGTGCCAGCAGCCGCGGTAATGACTGCCAAGGG")

In [40]:
# Globally align the denovo1009 query against core-set sequence '90040';
# the resulting alignment is kept in `ac`.
ac = global_pairwise_align_nucleotide(denovo1009, core_set['90040'])

In [41]:
# Print the alignment currently held in `ac` as FASTA text.
print(ac.to_fasta())


>0
--------------------GACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAAC-GATGAAGCCTAGC---TTGCTAGGTGGATTAGTGGCGAACGGGTGAGTAATACGTGAGTAACCTACCTTTAACTCTGGGATAAGCCCGGGAAACTGGGTCTAATACCGGATACGACCAATCTTCGCATGGGGGTTGGTGGAAAGGTTTGTTCTGGTGGGGGATGGGCTCGCGGCCTATCAGCTTGTTGGTGGGGTGATGGCCTACCAAGGCTTTGACGGGTAGCCGGCCTGAGAGGGTGACCGGTCACATTGGGACTGAGATACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGCGAAAGCCTGATGCAGCGACGCCGCGTGAGGGATGGAGGCCTTCGGGTTGTAAACCTCTTTCGCTCATGGTCAAGCCGCAACTGTGGGTTGTGGTGAGGGTAGTGGGGTAAAGAAGCGCCGGGCTAACTCCGTGCCAGCAGCCGCGGTAATGACTGCCAAGGG--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
>90040
AGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGGACTGGCCCTGGCTTTTTTTGGGGTTGGTGAGTGGCGAACGGGTGAGTAACACGTGAGTAACCTGCCCTTGACTCTGGGATAAGCGCCGGAAACGGTGTCTAATACTGGATGTGACTTCTGCCTGCATGGGTGGGGGTGGAAAGATTTTTT--GGTTGGGGATGGGCTCGCGGCTTATCAGCTTGTTGGTGGGGTGATGGCTTACCAAGGCTTTGACGGGTAGCCGGCCTGAGAGGGTGACCGGTCACATTGGGACTGAGATACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGCGGAAGCCTGATGCAGCGACGCCGCGTGAGGGATGGAGGCCTTCGGGTTGTGAACCTCTTTCGCTCGTGGTCAAGCCGCCCCTGTGGGGGGTGGTGAGGGTAGCGGG-TAAAGAAGCTCCGG-CTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGGAGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGGGCTTGTAGGCGGTTTGTTGCGTCTGCCGTGAAATTCCCTGGCTTAACTGGGGGCGTGCGGTGGGTACGGGCAGGCTTGAGTGCGGTAGGGGAGACTGGAATTCCTGGTGTAGCGGTGGAATGCGCAGATATCAGGAGGAACACCGGTGGCGAAGGCGGGTCTCTGGGCGTTACTGACGCTGAGGAGCGAAACGTTGGGGAGCGAACAGGATTAAATACCCTGGTAGTCCATGCTGTAAACGTTGGGCACTAGGTGTTGGGGCCTCCCTTGGTTTCGCGCCGTAGCTAACGCTTTAATTGCCCCCCTGGGGAGTACGGCGCAAGCTAAAACTCCAAGGAATTGACGGGGGCCCGCAAAACGGGGGGAGCATGCGGATTAATTCGATGCAACCGCAAAGAACCTTACCAAGGCTTGACATGCGCCCTGAGCACCCTGAGTGGGTGCCATTGGTTGGGGGTGCCAGGGGTCGCATGGTTGTCGTCAGCTCGTGTCGTAGAGATGTTGGGTTAAGTCCCGCAACGAGCGCAACCACCCTTGCCCTATGTTGCCAGCACGTGATGGTGGGGACTCGTGGGGGACTGCCGGGGTTAACTCGGAGGAAGGTGGGGATGACGTCAAATCATCATGCCCCTTATGTCTTGGGCTCCACGCATGCTACAATGGCTGGTACAGAGGGTTGCGATGCCGTGAGGTGGAGCGAATCCCTTAAAGCTGGTCTCAGTTCGGATTGGGGTCTGCAACTCGACCCCATGAAGGTGGAGTCGCTAGTAATCGCAGATCAGCAGTGCTGCGGTGAATACGTTCTCGGGCCTTGTACACACCGCCCGTCACGTCATGAAAGTTGGTAACGCCCGAAGCCCATGGCCTAACCGGTTTGTCTGGGGGGAGTGGTCGAAGGTGGGACTGGCGATTGGGACGAAGTCGTAACAAGGTAGCCGTACCGGAAGGTGCGGCTGGATCACCTCCTT


In [58]:
# Globally align the SSU-derived and core-set versions of sequence '214270'.
# `otus_85_ssu` replaces the undefined name `otus_85` used originally.
# The core-set sequence is converted to an upper-cased string before aligning
# (core-set entries can be lower case, e.g. '203503' displays as
# 'agagtttgat...').
core_214270_upper = str(core_set['214270']).upper()
ac = global_pairwise_align_nucleotide(otus_85_ssu['214270'], core_214270_upper)

In [60]:
# Print the alignment currently held in `ac` as FASTA text.
print(ac.to_fasta())


>214270
-GAGTTTGATCCTGGCTCAGGACGAACGCTAGCGGCGTGTTTGAAACATGCAAGTCGAA-------------------CGGTGGCGAACGGGTGCGTAACACGTAAAGAACCTACCCTTTAGCGGGGGATAGCCTTGGGAAACCGGGGGTAATACCGCATAAG----------------------------------GAAA-GTCACTGAAGGAGGGCTTTGCTGCCTATCAGGTAGTTGGTGAGGTAAAGGCCCACCAAGCCTAAGACGGGTAGCTGGTCTGAGAGGATGGTCAGCCACACTGGGACTGAGATACGGCCCAGACTCCTACGGGGGGCAGCAGTTTAGAATATTGTGCAATGGGCGAAAGCCTGACACAGCGACGCCGCGTGGGTGATGAAGGCCTTCGGGTCGTAAAGCCCTGTTGGTAGGGATGA----GGACAGTACCTACCGAGGAAGCCCCGGCTAACTACGTGCCAGCAGCTGCGGTAATACGTAGGGGGCGAGCGTTGTCCGAAGTTACTGGGCGTAAAGCGCACGTAGGCGGCATTGTAAGTTGCATTTGAAATTTGACGGCTCAACCGTCAAAAGTGGTGTAAGACTGCAAAGCTTGAGGCAATCAAAAGAATGTGGAACTCCGAGTGTAGCGGTGGAATGCGTAGAGATTCGGAAGAACACCCATGGCGAAGGCAGCATTCTGGGATT--CCTGACGCTGAGGAGCGAAAGCCAGGGGAGCGAACGGGATTAGATACCCCGGTAGTCCTGGCCGTAAACGATGGATACTAGGTGTA-GGGGCCT---------CTGTGCCGAAGCTAACGCAATAAGTATCCCGCCTGGGGAGTACGGTCGCAAGGCTAAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGATGCAACGCGAAGAACCTTACCTGGGTTTGACAT--------------------------------------------CGACAGGTGCTGCATGGCTGTCGTCAGCTCGTATCGTGAGATGTCGGGTTAAGTCCCTTAACGAGCGCAACCCTTGTCCTTAGTTGCCATC------ATGGGCACTCTAAGGAGACCGCCGGTGTTAAACCGGAGGAAGGTGGGGATGACGTCAAGTCCTCATGGCCCTTATGTCCAGGGCTACACACGTGCTACAATGGCCGG--CAAAGGGATGCTAACCTGTAAGGGGGAGCTAACCTC----AACCGGTCTCAGTTCGGA-TGAAGTCTGCAATTCGACTTCA-GAAGCTGGAATCGCTAGTAATCGCGGATCAGAACGCCGCGGTGAATACGTTCCCGGGCCTTGCACACAC----------------------------------------------------------------------------------------------------------------------------------------------
>1
AGAGTTTGATCCTGGCTCAGGACGAACGCTAGCGGCGTGTTTGAAACATGCAAGTCGAACGGAGTGTAGTAATACATTCGGTGGCGAACGGGTGCGTAACACGTAAAGAACCTACCCTTTAGCGGGGGATAGCCTTGGGAAACCGGGGGTAATACCGCATAAGCAACTTTAGACGCATGTTTAGAGTTGGAAAGGCAGAAATGTCACTGAAGGAGGGCTTTGCTGCCTATCAGGTAGTTGGTGAGGTAAAGGCCCACCAAGCCTAAGACGGGTAGCTGGTCTGAGAGGATGGTCAGCCACACTGGGACTGAGATACGGCCCAGACTCCTACGGGGGGCAGCAGTTTAGAATATTGTGCAATGGGCGAAAGCCTGACACAGCGACGCCGCGTGGGTGATGAAGGCCTTCGGGTCGTAAAGCCCTGTTGGTAGGGATGAGCAAGGACAGTACCTACCGAGGAAGCCCCGGCTAACTACGTGCCAGCAGCTGCGGTAATACGTAGGGGGCGAGCGTTGTCCGAAGTTACTGGGCGTAAAGCGCACGTAGGCGGCATTGTAAGTTGCATTTGAAATTTGACGGCTCAACCGTCAAAAGTGGTGTAAGACTGCAAAGCTTGAGGCAATCAAAAGAATGTGGAACTCCGAGTGTAGCGGTGGAATGCGTAGAGATTCGGAAGAACACCCATGGCGAAGGCAGCATTCTGGGATTGACCTGACGCTGAGGAGCGAAAGCCAGGGGAGCGAACGGGATTAGATACCCCGGTAGTCCTGGCCGTAAACGATGGATACTAGGTGTAGGGGGTATCGACCCCCTCTGTGCCGAAGCTAACGCAATAAGTATCCCGCCTGGGGAGTACGGTCGCAAGGCTAAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGATGCAACGCGAAGAACCTTACCTGGGTTTGACATCCCCGGACAGCCCTAGAGATAGGGTTTCCTCTTCGGAGGCCGGGAGACAGGTGCTGCATGGCTGTCGTCAGCTCGTATCGTGAGATGTCGGGTTAAGTCCCTTAACGAGCGCAACCCTTGTCCTTAGTTGCCATCATTTAGTTGGGCACTCTAAGGAGACCGCCGGTGTTAAACCGGAGGAAGGTGGGGATGACGTCAAGTCCTCATGGCCCTTATGTCCAGGGCTACACACGTGCTACAATGGCCGGTACAAAGGGATGCTAACCTGTAAGGGGGAGCTAACCTCAAAAAGCCGGTCTCAGTTCGGATTGAAGTCTGCAATTCGACTTCATGAAGCTGGAATCGCTAGTAATCGCGGATCAGAACGCCGCGGTGAATACGTTCCCGGGCCTTGCACACACCGCCCGTCACACCATGGGAGTGGGTTGCAAAAGAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGCTGGATCACCTCCTT


In [68]:
# Globally align the denovo1009 query against the SSU-derived sequence
# '214270' and print the result as FASTA text.
# `otus_85_ssu` replaces the undefined name `otus_85` used originally, and
# print(...) replaces the Python 2-only print statement (same output).
ac = global_pairwise_align_nucleotide(denovo1009, otus_85_ssu['214270'])
print(ac.to_fasta())


>0
GACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGATGAAGCCTAGCTTGCTAGGTGGATTAGTGGCGAACGGGTGAGTAATACGTGAGTAACCTACCTTTAACTCTGGGATAAGCCCGGGAAACTGGGTCTAATACCGGATACGACCAATCTTCGCATGGGGGTTGGTGGAAAGGTTTGTTCTGGTGGGGGATGGGCTCGCGGCCTATCAGCTTGTTGGTGGGGTGATGGCCTACCAAGGCTTTGACGGGTAGCCGGCCTGAGAGGGTGACCGGTCACATTGGGACTGAGATACGGCCCAGACTCCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGCGAAAGCCTGATGCAGCGACGCCGCGTGAGGGATGGAGGCCTTCGGGTTGTAAACCTCTTTCGCTCATGGTCAAGCCGCAACTGTGGGTTGTGGTGAGGGTAGTGGGGTAAAGAAGCGCCGGGCTAACTCCGTGCCAGCAGCCGCGGTAATGACTGCCAAGGG-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
>214270
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------GAGTTTGATCCTGGCTCAGGACGAACGCTAGCGGCGTGTTTGAAACATGCAAGTCGAACGGTGGCGAACGGGTGCGTAACACGTAAAGAACCTACCCTTTAGCGGGGGATAGCCTTGGGAAACCGGGGGTAATACCGCATAAGGAAAGTCACTGAAGGAGGGCTTTGCTGCCTATCAGGTAGTTGGTGAGGTAAAGGCCCACCAAGCCTAAGACGGGTAGCTGGTCTGAGAGGATGGTCAGCCACACTGGGACTGAGATACGGCCCAGACTCCTACGGGGGGCAGCAGTTTAGAATATTGTGCAATGGGCGAAAGCCTGACACAGCGACGCCGCGTGGGTGATGAAGGCCTTCGGGTCGTAAAGCCCTGTTGGTAGGGATGAGGACAGTACCTACCGAGGAAGCCCCGGCTAACTACGTGCCAGCAGCTGCGGTAATACGTAGGGGGCGAGCGTTGTCCGAAGTTACTGGGCGTAAAGCGCACGTAGGCGGCATTGTAAGTTGCATTTGAAATTTGACGGCTCAACCGTCAAAAGTGGTGTAAGACTGCAAAGCTTGAGGCAATCAAAAGAATGTGGAACTCCGAGTGTAGCGGTGGAATGCGTAGAGATTCGGAAGAACACCCATGGCGAAGGCAGCATTCTGGGATTCCTGACGCTGAGGAGCGAAAGCCAGGGGAGCGAACGGGATTAGATACCCCGGTAGTCCTGGCCGTAAACGATGGATACTAGGTGTAGGGGCCTCTGTGCCGAAGCTAACGCAATAAGTATCCCGCCTGGGGAGTACGGTCGCAAGGCTAAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGATGCAACGCGAAGAACCTTACCTGGGTTTGACATCGACAGGTGCTGCATGGCTGTCGTCAGCTCGTATCGTGAGATGTCGGGTTAAGTCCCTTAACGAGCGCAACCCTTGTCCTTAGTTGCCATCATGGGCACTCTAAGGAGACCGCCGGTGTTAAACCGGAGGAAGGTGGGGATGACGTCAAGTCCTCATGGCCCTTATGTCCAGGGCTACACACGTGCTACAATGGCCGGCAAAGGGATGCTAACCTGTAAGGGGGAGCTAACCTCAACCGGTCTCAGTTCGGATGAAGTCTGCAATTCGACTTCAGAAGCTGGAATCGCTAGTAATCGCGGATCAGAACGCCGCGGTGAATACGTTCCCGGGCCTTGCACACAC


In [61]:


In [64]:


In [67]:
# Compare the length of sequence '214270' in the unaligned rep set with its
# length in the degapped SSU-aligned collection.
# `otus_85_ssu` replaces the undefined name `otus_85` used originally, and
# print(...) replaces the Python 2-only print statement (same output).
print(len(otus_85_unaligned['214270']))
print(len(otus_85_ssu['214270']))


1360
1232

In [89]:
# Representative sequences of the simulated community (B1-iter0).
# The path is built from $HOME, like the other inputs in this notebook,
# instead of hard-coding one user's home directory.
# NOTE(review): assumes the short-read-tax-assignment checkout lives at
# $HOME/code, as it did under /Users/caporaso -- adjust if it is elsewhere.
seqs_515_fp = expandvars("$HOME/code/short-read-tax-assignment/data/simulated-community/B1-iter0/rep_set.fna")
seqs_515 = SequenceCollection.read(seqs_515_fp)

In [97]:
# IDs shared by the simulated-community rep set, the SSU-aligned 85_otus
# sequences and the core set.
# `otus_85_ssu` replaces the undefined name `otus_85` used originally, and
# print(...) replaces the Python 2-only print statement.
print(set(seqs_515.ids()) & set(otus_85_ssu.ids()) & set(core_set.ids()))


set(['41280', '3490', '196844', '233513', '93487', '92430', '161484', '224506', '193107', '223031', '100246', '220728', '184873', '216235', '92321', '51154', '33606', '114578', '136119', '179300', '110393', '150680', '107637', '144458', '49231', '145173', '141088', '21', '32849', '220131', '142856', '220089', '223835', '158624', '159367', '225943', '138301', '142268', '132390', '32823', '145549'])

In [103]:
# Globally align the SSU-derived sequence '41280' against the same-ID
# sequence from seqs_515 and print the alignment as FASTA text.
# `otus_85_ssu` replaces the undefined name `otus_85` used originally, and
# print(...) replaces the Python 2-only print statement (same output).
ssu_41280 = str(otus_85_ssu['41280'])
short_41280 = str(seqs_515['41280'])
print(global_pairwise_align_nucleotide(ssu_41280, short_41280).to_fasta())


>0
ATCCTGGCTCAGGACGAACGCTGGCGGCGTGCCTAAACTATGCGAGTCGCGCGAGCGGCGAACGGGTGAGTAACGCGTGAGTAACCTGCCCCCCGGACCGGGATAACTCCGAGAAATCGGGGCTAATACCGGATATGGAAAGCCACCGAGGGAGGGGCTCGCGTCCTATCAGCTTGTTGGTGGGGTAAAGGCCTACCAAGGCTGCGACGGGTAGCCGGTCTGAGAGGATGGCCGGTCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGTAGCAAGTGGGAATCTTGCACAATGGACGAAAGTCTGATGCAGCGACGCCGCGTGAGGGATGAAGGCTCTCGGGTCGTAAACCTCTGTTGACAGGGAAGATGACGGTACCTGTCGAGGAAGCCTCGGCTAACTCCGTGCCAGCAGCCGCGGCAAGACGGGGGAGGCGAGCGTTGTCCGGAATGACTGGGCGTAAAGCGCGTGTAGGCGGGATTTTAAGTCAGCGGTGAAATCCCCTCGCTCAACGAGGGAACTGCCGCTGATACTGGGATTCTTGAGGGCAGGAGAGGAGAGCGGAATTGCCGGTGTAGCGGTAAAATGCGTAGAAATCGGCAAGAACACCCGTGGCGAAGGCGGCTCTCTGGACTG--CCTGACGCTGAGACGCGAAAGCTAGGGTAGCGAACGGGATTAGATACCCCGGTAGTCCTAGCCGTAAACGATGGTCACTAGGTGTCGGAACTTCGGCGCCGAAGTTAACGCATTAAGTGACCCGCCTGGGGAGTACGGCCGCAAGGCTGAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGCTTAATTCGACGCAACGCGAAGAACCTTACCAGGGCTTGACTGTAACAGGTGGTGCATGGCCGTCGTCAGCTCGTGTCGTGAGATGTTGGGTTGAGTCCCGCAACGAGCGCAACCCTCACTGTGTGTTGCCAGCGGCCGGGAACTCACACGGAACTGCCTCGGTAACGAGGAGGAAGGTGAGGATGACGTCAGGTCCGCATGCCCCTTACGCCCTGGGCTGCACACATGCTACAATGGAGAGCAATGGGTCGCAAAACCGCGAGGCCGAGCTAATCCCCCTCTCCCCAATTCGGATGTAGTCTGCAACTCGACTGCAGAAGTCGGAATCGCTAGTAACCGCAAATCAGCAACGTTGCGGTGAATACGTTCTCGGGCCTTGTACACACCGCCCGTCAAGCGATGAGAGTCAGGTGCACCTTAAGTCGCTGCCCAGGGAGGCGCCCACGGTGCGACTG
>1
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------GTGCCAGCAGCCGCGGCAAGACGGGGGAGGCGAGCGTTGTCCGGAATGACTGGGCGTAAAGCGCGTGTAGGCGGGATTTTAAGTCAGCGGTGAAATCCCCTCGCTCAACGAGGGAACTGCCGCTGATACTGGGATTCTTGAGGGCAGGAGAGGAGAGCGGAATTGCCGGTGTAGCGGTAAAATGCGTAGAAATCGGCAAGAACACCCGTGGCGAAGGCGGCTCTCTGGACTGCTCCTGACGCTGAGACGCGAAAGCTAGGGTAGCGAACGGGATTAGATACCCCGGTAGTC--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


In [105]:
# Globally align the core-set sequence '41280' (upper-cased first) against
# the same-ID sequence from seqs_515 and print the alignment as FASTA text.
# print(...) replaces the Python 2-only print statement (same output).
core_41280_upper = str(core_set['41280']).upper()
short_41280 = str(seqs_515['41280'])
print(global_pairwise_align_nucleotide(core_41280_upper, short_41280).to_fasta())


>0
AGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTGCCTAAACTATGCGAGTCGCGCGGGGTTTTATGACAAACCGCAAGGCGAGTTGTAAAACTTAGCGGCGAACGGGTGAGTAACGCGTGAGTAACCTGCCCCCCGGACCGGGATAACTCCGAGAAATCGGGGCTAATACCGGATATGAAGTCCGCCCTCCTGGGCGGGCTTGGAAAGGAGCAATCCACCGAGGGAGGGGCTCGCGTCCTATCAGCTTGTTGGTGGGGTAAAGGCCTACCAAGGCTGCGACGGGTAGCCGGTCTGAGAGGATGGCCGGTCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGTAGCAAGTGGGAATCTTGCACAATGGACGAAAGTCTGATGCAGCGACGCCGCGTGAGGGATGAAGGCTCTCGGGTCGTAAACCTCTGTTGACAGGGAAGAATAACTGACGGTACCTGTCGAGGAAGCCTCGGCTAACTCCGTGCCAGCAGCCGCGGCAAGACGGGGGAGGCGAGCGTTGTCCGGAATGACTGGGCGTAAAGCGCGTGTAGGCGGGATTTTAAGTCAGCGGTGAAATCCCCTCGCTCAACGAGGGAACTGCCGCTGATACTGGGATTCTTGAGGGCAGGAGAGGAGAGCGGAATTGCCGGTGTAGCGGTAAAATGCGTAGAAATCGGCAAGAACACCCGTGGCGAAGGCGGCTCTCTGGACTGCTCCTGACGCTGAGACGCGAAAGCTAGGGTAGCGAACGGGATTAGATACCCCGGTAGTCCTAGCCGTAAACGATGGTCACTAGGTGTCGGAAGATCGACCCCTTCGGCGCCGAAGTTAACGCATTAAGTGACCCGCCTGGGGAGTACGGCCGCAAGGCTGAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGCTTAATTCGACGCAACGCGAAGAACCTTACCAGGGCTTGACTGTTGCAGAATCCTGATGAAAGTCAGGAGTGCTCTTCGGAGAGCTGCAAAACAGGTGGTGCATGGCCGTCGTCAGCTCGTGTCGTGAGATGTTGGGTTGAGTCCCGCAACGAGCGCAACCCTCACTGTGTGTTGCCAGCGGTTCGGCCGGGAACTCACACGGAACTGCCTCGGTAACGAGGAGGAAGGTGAGGATGACGTCAGGTCCGCATGCCCCTTACGCCCTGGGCTGCACACATGCTACAATGGAGAGTACAATGGGTCGCAAAACCGCGAGGCCGAGCTAATCCCCAAAACTCTCCCCAATTCGGATTGTAGTCTGCAACTCGACTGCATGAAGTCGGAATCGCTAGTAACCGCAAATCAGCAACGTTGCGGTGAATACGTTCTCGGGCCTTGTACACACCGCCCGTCAAGCGATGAGAGTCAGGTGCACCTTAAGTCGCTGGCCCAACCCGCAAGGGAGGGAGGCGCCCACGGTGCGACTGATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGCTGGATCACCTCCTT
>1
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------GTGCCAGCAGCCGCGGCAAGACGGGGGAGGCGAGCGTTGTCCGGAATGACTGGGCGTAAAGCGCGTGTAGGCGGGATTTTAAGTCAGCGGTGAAATCCCCTCGCTCAACGAGGGAACTGCCGCTGATACTGGGATTCTTGAGGGCAGGAGAGGAGAGCGGAATTGCCGGTGTAGCGGTAAAATGCGTAGAAATCGGCAAGAACACCCGTGGCGAAGGCGGCTCTCTGGACTGCTCCTGACGCTGAGACGCGAAAGCTAGGGTAGCGAACGGGATTAGATACCCCGGTAGTC-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


In [ ]: