In [1]:
%pylab inline
%config InlineBackend.figure_format = 'retina'


Populating the interactive namespace from numpy and matplotlib

In [2]:
import pandas as pd
import seaborn as sns
from ficus import FigureManager
from IPython.display import FileLink
import glob
import os
from collections import OrderedDict
import matplotlib.pyplot as plt
from grave import plot_network

In [3]:
import networkx as nx

In [8]:
# Load the per-checkpoint summary table (columns such as read_n, n_full, n_tips).
meta = pd.read_csv(filepath_or_buffer='../sacPom.csv')

In [10]:
# Peek at the last five rows of the summary table.
meta.tail(n=5)


Out[10]:
read_n n_full n_tips n_islands n_unknown n_trivial n_dnodes n_unodes n_tags n_updates n_unique estimated_fp
20 4200 907 1341 3472 0 58 1081 5778 110146 318 8322366 0
21 4400 1140 1645 3563 0 59 1336 6407 131865 370 8719937 0
22 4600 1247 1928 3665 0 59 1501 6899 151851 394 9111504 0
23 4800 1352 2316 3741 0 60 1701 7470 176193 423 9551868 0
24 5000 1574 2740 3815 0 67 1994 8196 198549 491 9965479 0

In [12]:
# Read the fugu graph from its GraphML export into a networkx graph.
# NOTE(review): in the recorded run this call fails with
# "ParseError: junk after document element: line 11449690, column 0", so this
# cell does not (re)bind G. The GraphML file presumably needs to be regenerated
# or repaired before this cell can succeed -- TODO confirm.
G = nx.read_graphml('../fugu.cdbg.graphml')


Traceback (most recent call last):

  File "/home/camille/miniconda/envs/goetia/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)

  File "<ipython-input-12-14a3e3616ba7>", line 1, in <module>
    G = nx.read_graphml('../fugu.cdbg.graphml')

  File "<decorator-gen-566>", line 2, in read_graphml

  File "/home/camille/miniconda/envs/goetia/lib/python3.6/site-packages/networkx/utils/decorators.py", line 227, in _open_file
    result = func_to_be_decorated(*new_args, **kwargs)

  File "/home/camille/miniconda/envs/goetia/lib/python3.6/site-packages/networkx/readwrite/graphml.py", line 239, in read_graphml
    glist = list(reader(path=path))

  File "/home/camille/miniconda/envs/goetia/lib/python3.6/site-packages/networkx/readwrite/graphml.py", line 711, in __call__
    self.xml = ElementTree(file=path)

  File "/home/camille/miniconda/envs/goetia/lib/python3.6/xml/etree/ElementTree.py", line 557, in __init__
    self.parse(file)

  File "/home/camille/miniconda/envs/goetia/lib/python3.6/xml/etree/ElementTree.py", line 597, in parse
    self._root = parser._parse_whole(source)

  File "<string>", line unknown
ParseError: junk after document element: line 11449690, column 0

In [7]:
# Draw G on a single 16x16 figure using the Kamada-Kawai layout.
# NOTE(review): G is only bound if the read_graphml cell succeeded; its
# recorded run raised ParseError -- verify G exists on a fresh kernel.
with FigureManager(figsize=(16, 16), show=True) as (fig, ax):
    plot_network(G, ax=ax, layout='kamada_kawai')



In [34]:
from goetia.dbg import make_dBG
from khmer._oxli.parsing import FastxParser

In [36]:
# Create the dBG object that the following cells fill with sequences and then
# query with get_counts().
# NOTE(review): 41 is presumably the k-mer size (the same value is passed to
# S.kmers(41) further down); 2e9 and 4 look like table size / number of tables
# -- TODO confirm against goetia's make_dBG signature.
graph = make_dBG(41, 2e9, 4, storage='_ByteStorage')

In [37]:
# Insert the sequence of every record in the cDBG FASTA into the graph.
fasta_records = FastxParser('sacPom/sacPom.cdbg.fasta.5138.FASTA')
for record in fasta_records:
    graph.add_sequence(record.sequence)

In [59]:
# Index the parsed FASTA records by their header line (record.name).
# Values are the whole record objects, not bare sequence strings.
sequences = {
    record.name: record
    for record in FastxParser('sacPom/sacPom.cdbg.fasta.5138.FASTA')
}

In [41]:
# Look up the graph's counts along one node's sequence.
# `sequences` maps header -> parsed record, so hand the graph the nucleotide
# string (`.sequence`), the same kind of value add_sequence() was fed, rather
# than the record object itself.
graph.get_counts(sequences['ID=6823 L=391 type=FULL'].sequence)


Out[41]:
[4,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 4]

In [73]:
# Load the per-node count summary (tab-separated; the literal string "None"
# is read as a missing value).
counts = pd.read_csv(
    'sacPom/sacPom.cdbg.fasta.5138.FASTA.tsv',
    sep='\t',
    quotechar='"',
    na_values='None',
)

In [27]:
# Load the same kind of count summary for the bcalm unitigs (tab-separated;
# the literal string "None" is read as a missing value).
bcalm = pd.read_csv(
    'sacPom/bcalm-cdna/files.unitigs.fa.tsv',
    sep='\t',
    quotechar='"',
    na_values='None',
)

In [29]:
# Rows of the bcalm table whose `max` exceeds 1 (none in the recorded run).
bcalm.loc[bcalm['max'].gt(1)]


Out[29]:
name max median start end internal_max

In [74]:
# Break the "ID=<id> L=<length> type=<kind>" header stored in `name` into its
# own columns. The header is split on whitespace once; every token has the form
# "key=value", and partition('=')[2] keeps the value part (still a string).
name_tokens = counts.name.str.split(expand=True)
for column, position in (('L', 1), ('type', 2), ('ID', 0)):
    counts[column] = name_tokens[position].str.partition('=', expand=True)[2]

In [75]:
# Nodes whose `internal_max` is greater than 1; rows where it is missing are
# excluded by the comparison.
bad = counts.loc[counts['internal_max'].gt(1)]

In [76]:
# Display the filtered rows (internal_max > 1) selected in the previous cell.
bad


Out[76]:
name max median start end internal_max L type ID
114 ID=6827 L=132 type=FULL 4 2.0 4 4 2.0 132 FULL 6827
115 ID=6826 L=83 type=FULL 4 2.0 4 4 2.0 83 FULL 6826
117 ID=6824 L=391 type=FULL 4 2.0 4 4 2.0 391 FULL 6824
634 ID=6242 L=87 type=FULL 3 1.0 3 3 2.0 87 FULL 6242
1185 ID=5620 L=71 type=TIP 4 2.0 1 4 2.0 71 TIP 5620
1186 ID=5619 L=83 type=FULL 4 2.0 4 4 2.0 83 FULL 5619
1188 ID=5617 L=68 type=FULL 4 2.0 4 4 2.0 68 FULL 5617
1189 ID=5616 L=83 type=FULL 4 2.0 4 4 2.0 83 FULL 5616
1191 ID=5614 L=486 type=FULL 4 2.0 4 4 2.0 486 FULL 5614
1192 ID=5613 L=145 type=FULL 4 2.0 4 4 2.0 145 FULL 5613
1194 ID=5611 L=162 type=FULL 4 2.0 4 4 2.0 162 FULL 5611
1195 ID=5610 L=97 type=FULL 5 2.0 4 5 2.0 97 FULL 5610
1197 ID=5608 L=368 type=TIP 4 1.0 4 1 2.0 368 TIP 5608
1199 ID=5606 L=370 type=FULL 5 2.0 5 4 2.0 370 FULL 5606
1503 ID=5265 L=58 type=FULL 4 1.5 3 4 2.0 58 FULL 5265
1505 ID=5263 L=83 type=FULL 4 2.0 4 4 2.0 83 FULL 5263
1506 ID=5262 L=59 type=FULL 4 2.0 4 4 2.0 59 FULL 5262
1507 ID=5261 L=56 type=FULL 4 2.0 4 4 2.0 56 FULL 5261
1509 ID=5259 L=83 type=FULL 4 2.0 4 4 2.0 83 FULL 5259
1510 ID=5258 L=242 type=FULL 4 2.0 4 4 2.0 242 FULL 5258
1590 ID=5176 L=186 type=FULL 4 2.0 4 4 2.0 186 FULL 5176
1592 ID=5174 L=83 type=FULL 4 2.0 4 4 2.0 83 FULL 5174
1593 ID=5173 L=106 type=FULL 4 2.0 4 4 2.0 106 FULL 5173
1595 ID=5171 L=83 type=FULL 4 2.0 4 4 2.0 83 FULL 5171
1596 ID=5170 L=92 type=FULL 4 2.0 4 4 2.0 92 FULL 5170
1598 ID=5168 L=83 type=FULL 4 2.0 4 4 2.0 83 FULL 5168
1599 ID=5167 L=44 type=FULL 4 3.0 4 4 2.0 44 FULL 5167
1602 ID=5164 L=73 type=FULL 4 2.0 4 4 2.0 73 FULL 5164
1604 ID=5162 L=47 type=FULL 4 2.0 4 4 2.0 47 FULL 5162
1605 ID=5161 L=83 type=FULL 4 2.0 4 4 2.0 83 FULL 5161
... ... ... ... ... ... ... ... ... ...
1613 ID=5152 L=656 type=FULL 4 2.0 4 4 2.0 656 FULL 5152
3112 ID=1334 L=83 type=FULL 4 2.0 4 4 2.0 83 FULL 1334
4053 ID=43 L=607 type=ISLAND 4 2.0 2 2 4.0 607 ISLAND 43
4098 ID=3856 L=1227 type=ISLAND 5 2.0 2 2 5.0 1227 ISLAND 3856
4104 ID=3859 L=165 type=FULL 4 2.0 4 4 2.0 165 FULL 3859
4108 ID=3861 L=83 type=FULL 4 2.0 4 4 2.0 83 FULL 3861
4222 ID=3923 L=83 type=FULL 4 2.0 4 4 2.0 83 FULL 3923
4226 ID=3925 L=729 type=FULL 4 2.0 4 3 2.0 729 FULL 3925
4229 ID=3927 L=43 type=FULL 4 4.0 4 4 2.0 43 FULL 3927
4235 ID=3930 L=44 type=FULL 4 3.0 4 4 2.0 44 FULL 3930
4237 ID=3931 L=80 type=FULL 4 2.0 4 4 2.0 80 FULL 3931
4239 ID=3932 L=59 type=FULL 4 2.0 4 4 2.0 59 FULL 3932
4241 ID=3933 L=65 type=FULL 4 2.0 4 4 2.0 65 FULL 3933
4246 ID=3936 L=73 type=FULL 4 2.0 4 4 2.0 73 FULL 3936
4435 ID=631 L=4064 type=ISLAND 4 2.0 2 2 4.0 4064 ISLAND 631
5259 ID=2525 L=83 type=FULL 4 2.0 4 4 2.0 83 FULL 2525
5261 ID=2529 L=272 type=FULL 4 2.0 4 4 2.0 272 FULL 2529
5262 ID=2530 L=83 type=FULL 4 2.0 4 4 2.0 83 FULL 2530
5264 ID=2532 L=44 type=FULL 4 3.0 4 4 2.0 44 FULL 2532
5266 ID=2534 L=83 type=FULL 4 2.0 4 4 2.0 83 FULL 2534
5267 ID=2535 L=67 type=FULL 4 2.0 4 4 2.0 67 FULL 2535
5269 ID=2537 L=115 type=FULL 4 2.0 4 4 2.0 115 FULL 2537
5270 ID=2538 L=47 type=FULL 4 2.0 4 4 2.0 47 FULL 2538
5271 ID=2539 L=83 type=FULL 4 2.0 4 4 2.0 83 FULL 2539
5273 ID=2543 L=83 type=FULL 4 1.0 3 4 2.0 83 FULL 2543
5486 ID=2781 L=82 type=FULL 4 2.0 4 4 2.0 82 FULL 2781
5487 ID=2782 L=384 type=TIP 4 1.0 4 1 2.0 384 TIP 2782
5488 ID=2783 L=136 type=FULL 4 2.0 4 4 2.0 136 FULL 2783
5489 ID=2785 L=83 type=FULL 4 2.0 4 4 2.0 83 FULL 2785
5490 ID=2787 L=83 type=FULL 4 2.0 4 4 2.0 83 FULL 2787

64 rows × 9 columns


In [60]:
# Pick one of the flagged nodes (ID=5173, listed in `bad`) for a closer look.
# S is the parsed record stored in `sequences`, not a plain string.
S = sequences['ID=5173 L=106 type=FULL']

In [64]:
# Show the selected record (its name and sequence).
S


Out[64]:
Sequence(name="ID=5173 L=106 type=FULL", sequence="CAGGGAAAGTACAAGGATCCAACAAAGGTGATCGTTTAACTAAAACATTTGAAGGTTTTAGAAATCAATTGGACAAAGTTCAATTTATAAGGAAACTCATGTCAAA")

In [70]:
# Take the first interior 41-mer of S (the k-mers at both ends are skipped) and
# report every record whose sequence contains it, together with the offset of
# the first match. Only that single k-mer is examined.
interior_kmers = list(S.kmers(41))[1:-1]
for kmer in interior_kmers[:1]:
    for seq in sequences.values():
        offset = seq.sequence.find(kmer)
        if offset != -1:
            print(kmer, seq.name, seq, offset)


AGGGAAAGTACAAGGATCCAACAAAGGTGATCGTTTAACTA ID=5173 L=106 type=FULL CAGGGAAAGTACAAGGATCCAACAAAGGTGATCGTTTAACTAAAACATTTGAAGGTTTTAGAAATCAATTGGACAAAGTTCAATTTATAAGGAAACTCATGTCAAA 1
AGGGAAAGTACAAGGATCCAACAAAGGTGATCGTTTAACTA ID=631 L=4064 type=ISLAND GGAGAAAGAACTTTGGAAGGACTGTTATCCCTTTTGAAATCTCCCAAAGGGAAACATATACAATGTCCTACGCAAATTATCGTTATATGAAAGCAAGAGCAAAACGATGGAGACCAGAGAATTTGGATGGAATTCAAACATCAGACGAACATTTAATAAACCTTTTTGCAAAAATATTATCGAAGCATGTACCAGAGATAGGGAAATTCGATCCTAATAAGGATGTTGAAAGTTACATTTCAAAACTTGATCAACACTTTACTGAATACCCTTCATTATTCCCAAATGAGCATACTAAAAGACAGTATACATTGAATCACCTAGAAGAATTAGAGCAACAATTCGCTGAACGCATGTTTTCTGAGAATGGAAGTCTTACATGGCAAGAATTACTCAGACAAACAGGGAAAGTACAAGGATCCAACAAAGGTGATCGTTTAACTAAAACATTTGAAGGTTTTAGAAATCAATTGGACAAAGTTCAATTTATAAGGAAACTCATGTCAAAAGCAAATGTTGATGATTTCCATACTCGCTTGTTTATATTATGGATGCTGCCATATTCCTTAAGGAAATTAAAGGAAAGAAATTACTGGAAATCAGAAATCAGTGAAATTTATGACTTTTTAGAGGACAAAAGAACAGCCTCGTATGGTAAAACTCACAAGCGTTTTCAACTGCAAAATAAAAATCTAGGAAAAGAGTCCCTTTCAAAGAAAAATAACACCACTAATAGCAGAAACCTGAGGAAGACAAATGTTTCGAGAATAGAATACTCATCTAACAAATTCCTAAATCATACTAGGAAACGTTACGAAATGGTATTACAAGCTGAACTTCCAGACTTCAAGTGCTCAATACCCTGTCTAATCGATACGGGCGCTCAAGCAAATATTATAACAGAAGAAACTGTTCGAGCACATAAACTGCCTACCAGACCCTGGTCAAAAAGTGTGATATATGGTGGAGTTTATCCAAATAAGATTAATCGCAAAACAATAAAACTTAACATAAGTCTAAATGGAATATCAATCAAAACAGAATTCTTGGTTGTAAAGAAATTTTCGCATCCAGCTGCTATCTCCTTCACAACATTATATGACAATAACATTGAAATATCTAGCAGTAAACACACGCTCTCTCAAATGAACAAAGTTTCAAATATTGTCAAGGAACCTGAGTTACCAGATATCTATAAAGAATTCAAAGACATTACTGCAGAAACCAATACGGAAAAGCTACCAAAGCCAATAAAAGGGTTAGAATTTGAAGTTGAACTAACTCAAGAAAACTACAGATTACCTATCAGAAATTACCCGCTACCACCGGGAAAAATGCAAGCTATGAATGATGAAATTAATCAAGGATTAAAAAGTGGAATTATACGAGAATCTAAAGCCATTAACGCCTGTCCAGTAATGTTCGTTCCGAAAAAGGAAGGCACCTTGAGAATGGTGGTTGACTACAAACCTTTAAATAAGTATGTCAAACCCAATATATATCCGTTACCACTTATTGAACAATTACTTGCTAAAATACAAGGTTCTACAATTTTTACTAAACTTGACCTCAAAAGTGCCTATCACTTGATACGAGTAAGAAAAGGAGATGAACATAAACTTGCTTTTCGCTGTCCTCGTGGAGTTTTTGAATATCTAGTAATGCCTTATGGCATATCTACAGCTCCAGCACATTTTCAATACTTTATCAATACAATACTTGGTGAAGCCAAAGAATCACATGTAGTATGTTATATGGATGATATTTTAATTCATTCAAAATCGGAATCTGAACATGTAAAACATGTTAAAGACGTTCTACAGAAATTGAAAAATGCGAACTTAATTATCAATCAAGCAAAATGTGAATTTCACCAATCACAAGTAAAATTTATAGGGTATCACATTTCGGAAAAAGGATTTACGCCTTGTC
AAGAAAATATAGACAAAGTCTTACAATGGAAGCAACCTAAGAATCGTAAAGAATTACGACAATTTCTAGGTTCTGTCAATTATCTTAGGAAATTCATTCCAAAGACATCACAATTAACACATCCACTCAATAATCTTTTGAAAAAGGATGTACGCTGGAAATGGACACCAACACAAACCCAAGCGATAGAAAACATTAAACAATGTTTAGTTTCTCCTCCGGTGCTACGACACTTTGATTTCAGTAAAAAGATTCTACTGGAAACTGATGCTTCAGATGTCGCTGTAGGAGCCGTATTGTCTCAAAAACATGATGATGATAAATACTATCCTGTTGGATACTATTCAGCAAAGATGTCTAAAGCACAATTAAATTATAGCGTATCGGACAAAGAAATGCTTGCAATCATTAAGTCTCTCAAACATTGGAGACACTATTTAGAATCCACTATCGAACCTTTCAAAATTTTAACAGACCATCGAAACTTAATTGGTCGCATTACTAACGAATCCGAGCCTGAAAACAAACGTTTAGCTCGTTGGCAATTATTTTTACAAGACTTCAACTTTGAAATTAACTACAGACCTGGATCAGCAAATCACATAGCTGATGCCTTATCCAGAATTGTTGACGAAACAGAACCAATTCCAAAAGATTCAGAAGACAATAGTATCAACTTTGTTAATCAAATCTCGATAACCGATGATTTTAAAAACCAAGTGGTTACAGAATATACGAATGATACAAAATTGTTGAATTTACTAAACAATGAAGACAAACGAGTGGAAGAGAATATCCAACTCAAAGATGGCTTACTAATTAACAGTAAAGACCAAATCTTATTACCTAATGATACTCAGCTGACTAGGACAATTATTAAAAAGTATCATGAAGAAGGTAAATTGATTCATCCAGGCATTGAACTTCTTACAAACATTATATTACGTAGATTTACGTGGAAAGGAATAAGAAAACAAATACAAGAATATGTACAGAACTGCCATACATGTCAAATAAACAAATCTAGGAATCATAAACCTTATGGACCTTTACAACCAATTCCCCCATCAGAAAGACCTTGGGAATCTTTATCAATGGATTTTATTACAGCTTTACCAGAATCATCTGGTTATAATGCACTTTTCGTGGTAGTTGACCGATTTTCAAAAATGGCAATCTTAGTACCTTGTACGAAATCCATTACAGCAGAGCAAACAGCTCGAATGTTTGATCAACGAGTTATTGCTTATTTCGGCAATCCAAAAGAAATCATTGCAGATAATGATCATATTTTTACTTCCCAAACGTGGAAAGATTTCGCACATAAATATAATTTCGTTATGAAATTTTCGTTACCATACAGACCACAAACTGATGGACAAACTGAGCGTACAAACCAAACTGTGGAGAAATTACTAAGATGTGTATGTAGCACACATCCAAATACATGGGTAGATCATATATCCCTAGTGCAACAATCTTACAACAATGCGATACATTCAGCAACTCAAATGACACCTTTTGAGATAGTACATCGCTATTCACCAGCTTTATCACCTTTAGAGTTACCTAGCTTTAGTGACAAAACTGACGAAAACTCTCAGGAAACGATCCAAGTATTTCAAACAGTTAAAGAACACTTGAATACAAACAACATAAAGATGAAAAAGTATTTCGATATGAAAATACAAGAAATTGAAGAATTTCAACCTGGAGACCTAGTTATGGTCAAAAGAACGAAAACAGGTTTTCTTCATAAATCCAATAAATTAGCACCTAGTTTTGCAGGACCGTTCTATGTGTTACAGAAGTCGGGTCCAAACAACTATGAATTGGATCTTCCAGATTCAATCAAGCACATGTTTTCATCTACTTTTCATGTTTCTCACCTAGAAAAGTATCGACATAATTCAGAACTCAATTACGCTACCATTGATGAGTCTGATATTGGAACAATTCTTCATATCCTAGAACATAAAAACAGAGAACAAGTACTCTAC
TTAAATGTCAAGTACATTTCGAATCTAAATCCGAGTACTATTATGTCAGGATGGACTACATTAGCTACAGCGCTACAAGCGGACAAAGCAATTGTCAATGATTATATTAAAAACAATAATCTAAATATCTGA 403

In [72]:
# Check whether the full sequence of node ID=5173 occurs verbatim inside the
# ID=631 island sequence.
node_5173_sequence = 'CAGGGAAAGTACAAGGATCCAACAAAGGTGATCGTTTAACTAAAACATTTGAAGGTTTTAGAAATCAATTGGACAAAGTTCAATTTATAAGGAAACTCATGTCAAA'
island_631_sequence = sequences['ID=631 L=4064 type=ISLAND'].sequence
node_5173_sequence in island_631_sequence


Out[72]:
True

In [ ]: