In [79]:
%config InlineBackend


InlineBackend options
-------------------
InlineBackend.close_figures=<Bool>
    Current: True
    Close all figures at the end of each cell.
    When True, ensures that each cell starts with no active figures, but it also
    means that one must keep track of references in order to edit or redraw
    figures in subsequent cells. This mode is ideal for the notebook, where
    residual plots from other cells might be surprising.
    When False, one must call figure() to create new figures. This means that
    gcf() and getfigs() can reference figures created in other cells, and the
    active figure can continue to be edited with pylab/pyplot methods that
    reference the current active figure. This mode facilitates iterative editing
    of figures, and behaves most consistently with other matplotlib backends,
    but figure barriers between cells must be explicit.
InlineBackend.figure_format=<Unicode>
    Current: 'retina'
    The figure format to enable (deprecated use `figure_formats` instead)
InlineBackend.figure_formats=<Set>
    Current: {'retina'}
    A set of figure formats to enable: 'png', 'retina', 'jpeg', 'svg', 'pdf'.
InlineBackend.print_figure_kwargs=<Dict>
    Current: {'bbox_inches': 'tight'}
    Extra kwargs to be passed to fig.canvas.print_figure.
    Logical examples include: bbox_inches, quality (for jpeg figures), etc.
InlineBackend.rc=<Dict>
    Current: {'figure.figsize': (6.0, 4.0), 'figure.facecolor': (1, 1, 1, 0), 'figure.edgecolor': (1, 1, 1, 0), 'font.size': 10, 'figure.dpi': 72, 'figure.subplot.bottom': 0.125}
    Subset of matplotlib rcParams that should be different for the inline
    backend.

In [93]:
%pylab inline
%config InlineBackend.figure_format = 'svg'

from metadata import parse_mmetsp_metadata, BOINK_DIR, DATA_DIR
import py
import pandas as pd
import seaborn as sns
from ficus import FigureManager
from astroML.plotting import hist

from collections import OrderedDict
import glob
import os
import re
from IPython.display import FileLink
import pyprind
sns.set_style('ticks')


Populating the interactive namespace from numpy and matplotlib

In [22]:
# Load the MMETSP run metadata from the SRA run-info subset CSV located under DATA_DIR.
# NOTE(review): parse_mmetsp_metadata comes from the local `metadata` module; the later
# cells use its result like a pandas DataFrame (column selection, .iterrows()) — confirm.
mmetsp_metadata = parse_mmetsp_metadata(os.path.join(DATA_DIR, 'MMETSP_SraRunInfo_subset.csv'))

In [106]:
# Display the species / sample-name pair recorded for each metadata row.
mmetsp_metadata[['ScientificName', 'SampleName']]


Out[106]:
ScientificName SampleName
0 Erythrolobus australicus MMETSP1353
1 Erythrolobus madagascarensis MMETSP1354
2 Fabrea salina MMETSP1345
3 Minutocellus polymorphus MMETSP1322
4 Dictyocha speculum MMETSP1174
5 Licmophora paradoxa MMETSP1360
6 Staurosira MMETSP1361
7 Leptocylindrus danicus MMETSP1362
8 Asterionellopsis glacialis MMETSP1394
9 Pseudo-nitzschia heimii MMETSP1423
10 Bathycoccus prasinos MMETSP1399
11 Micromonas MMETSP1400
12 Micromonas pusilla MMETSP1401
13 Micromonas pusilla MMETSP1402
14 Micromonas pusilla MMETSP1403
15 Micromonas pusilla MMETSP1404
16 Moneuplotes crassus MMETSP1380
17 Pseudokeronopsis MMETSP1396
18 Climacostomum virens MMETSP1397
19 Skeletonema marinoi MMETSP1428
20 Chaetoceros sp. MMETSP1429
21 Pseudo-nitzschia delicatissima MMETSP1432
22 Prorocentrum lima MMETSP0252
23 Prorocentrum micans MMETSP0251_2
24 Pyrocystis lunula MMETSP0229_2
25 Lessardia elongata MMETSP1147
26 Gyrodinium dominans MMETSP1148
27 Symbiodinium sp. MMETSP1370
28 Symbiodinium sp. MMETSP1371
29 Oxyrrhis marina MMETSP1424
30 Oxyrrhis marina MMETSP1425
31 Oxyrrhis marina MMETSP1426
32 Elphidium margaritaceum MMETSP1385
33 Thalassiosira weissflogii MMETSP1405
34 Thalassiosira weissflogii MMETSP1406
35 Thalassiosira weissflogii MMETSP1407
36 Thalassiosira weissflogii MMETSP1408
37 Thalassiosira weissflogii MMETSP1409
38 Thalassiosira weissflogii MMETSP1410
39 Thalassiosira weissflogii MMETSP1412
40 Thalassiosira weissflogii MMETSP1413
41 Thalassiosira weissflogii MMETSP1414
42 Thalassiosira weissflogii MMETSP1415
43 Thalassiosira weissflogii MMETSP1416
44 Thalassiosira weissflogii MMETSP1417
45 Thalassiosira weissflogii MMETSP1418
46 Thalassiosira weissflogii MMETSP1419
47 Thalassiosira weissflogii MMETSP1420
48 Thalassiosira weissflogii MMETSP1422

In [71]:
# Build one summary record per sample from its partitioned-assembly global stats.
# Column names for global-stats.csv, in file order (the file is read without a header row).
stats_columns = ['n_transcripts', 'n_components', 'n_tags', 'n_kmers']

partition_records = []
samples_without_stats = []
for _, row in mmetsp_metadata.iterrows():
    path = os.path.join(row.ReadsDir, 'partitioned-assembly', 'global-stats.csv')
    try:
        data = pd.read_csv(path, header=None, names=stats_columns)
    except FileNotFoundError:
        # Best effort: a sample without a stats file is skipped, but remembered so the
        # omission is reported below instead of disappearing silently.
        samples_without_stats.append(row.SampleName)
        continue
    if data.empty:
        # A file with no rows has no "last row" to summarise; treat it like a missing file.
        samples_without_stats.append(row.SampleName)
        continue

    # Take the last row as a plain dict so the edits below never write into an object
    # derived from `data` (avoids pandas' chained-assignment pitfalls).
    record = data.iloc[-1].to_dict()
    # n_kmers is replaced by the sum over all rows; the other counts keep the last row's value.
    record['n_kmers'] = data.n_kmers.sum()
    record['Species'] = row.ScientificName
    record['Sample'] = row.SampleName
    record['AssemblyPath'] = row.AssemblyPath
    partition_records.append(record)

# Explicit columns keep the column order stable and give the frame its expected
# columns even when no sample produced a record.
partition_data = pd.DataFrame(
    partition_records,
    columns=stats_columns + ['Species', 'Sample', 'AssemblyPath'],
)

if samples_without_stats:
    print('No global-stats.csv data for {0} sample(s): {1}'.format(
        len(samples_without_stats), ', '.join(str(s) for s in samples_without_stats)))

In [85]:
# Joint distribution of per-sample transcript count against component count.
sns.jointplot(x=partition_data['n_transcripts'], y=partition_data['n_components'])


Out[85]:
<seaborn.axisgrid.JointGrid at 0x2ae855560978>

In [103]:
# Overlay the per-sample distributions of component and transcript counts on one axis.
with FigureManager(show=True, figsize=(10, 10)) as (fig, ax):
    for column, label in (('n_components', 'Components'),
                          ('n_transcripts', 'Transcripts')):
        sns.distplot(partition_data[column], bins=15, ax=ax, label=label)
    ax.legend()



In [19]:
# n, components, tags, new kmers

def parse_partition_data_dir(stats_dir, include_global=False, verbose=True):
    """Read every ``*.stats.csv`` file in ``stats_dir`` into a DataFrame.

    Each file name is expected to begin with an integer followed by a dot
    (``<N>.stats.csv``); that integer is used both as the sort key and as the
    key of the returned mapping.

    Parameters
    ----------
    stats_dir : str
        Directory that is searched (non-recursively) for ``*.stats.csv`` files.
    include_global : bool, optional
        Accepted for interface compatibility; the body never reads it.
    verbose : bool, optional
        When true, print a one-line summary and advance a pyprind progress bar
        once per parsed file.

    Returns
    -------
    collections.OrderedDict
        Maps the integer file-name prefix to a DataFrame with the columns
        ``comp_id``, ``n_tags`` and ``mean_cov``, in ascending prefix order.

    Raises
    ------
    ValueError
        If a matching file name does not begin with an integer.
    """
    def leading_number(path):
        # '<N>.stats.csv' -> N
        return int(os.path.basename(path).split('.')[0])

    pattern = os.path.join(stats_dir, '*.stats.csv')
    ordered_paths = sorted((os.path.abspath(p) for p in glob.glob(pattern)),
                           key=leading_number)

    progress = None
    if verbose:
        print('Parsing {0} partition files from {1}'.format(len(ordered_paths), stats_dir))
        progress = pyprind.ProgBar(len(ordered_paths), monitor=False)

    frames = OrderedDict()
    for path in ordered_paths:
        frames[leading_number(path)] = pd.read_csv(
            path, header=None, names=['comp_id', 'n_tags', 'mean_cov'])
        if progress is not None:
            progress.update()
    return frames

In [20]:
# Try the parser on a single sample directory; the cell output is the returned
# OrderedDict of per-file DataFrames.
parse_partition_data_dir('data/Skeletonema_marinoi/SRR1300462/partitioned-assembly/')


0%  100%
[##  ] | ETA: 00:00:00 | ETA: 00:00:00
Parsing 4 partition files from data/Skeletonema_marinoi/SRR1300462/partitioned-assembly/
[####] | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00
Total time elapsed: 00:00:00
Out[20]:
OrderedDict([(10000,       comp_id  n_tags  mean_cov
              0           0      33  1.000000
              1           1      24  1.000000
              2           2      33  1.000000
              3           3      99  1.000000
              4           4      22  1.000000
              5           5      21  1.000000
              6           6      18  1.000000
              7           7      55  1.000000
              8           8      30  1.000000
              9           9      15  1.000000
              10         10      52  1.000000
              11         11      81  1.000000
              12         12      26  1.000000
              13         13      31  1.000000
              14         14      34  1.000000
              15         15      43  1.000000
              16         16      37  1.000000
              17         17      26  1.000000
              18         18      35  1.000000
              19         19      52  1.000000
              20         20      34  1.000000
              21         21      64  1.000000
              22         22       8  1.000000
              23         23      30  1.000000
              24         24      15  1.000000
              25         25      20  1.000000
              26         26      41  1.000000
              27         27      42  1.976190
              28         28      42  1.000000
              29         29      17  1.000000
              ...       ...     ...       ...
              8790     8831       5  1.000000
              8791     8832       5  1.000000
              8792     8833      25  1.000000
              8793     8834      59  1.830508
              8794     8835      15  1.333333
              8795     8836      15  1.000000
              8796     8837      29  1.000000
              8797     8838      15  1.000000
              8798     8839      17  1.000000
              8799     8840      12  1.000000
              8800     8841       7  1.000000
              8801     8842      33  1.878788
              8802     8843       9  1.000000
              8803     8844      11  1.000000
              8804     8845      11  1.000000
              8805     8846      13  1.000000
              8806     8847      12  1.000000
              8807     8848      42  1.000000
              8808     8849      19  1.000000
              8809     8850       5  1.000000
              8810     8851       8  1.000000
              8811     8852      14  1.000000
              8812     8853       5  1.000000
              8813     8854      10  1.700000
              8814     8855      33  1.000000
              8815     8856      26  1.000000
              8816     8857      19  1.000000
              8817     8858      16  1.000000
              8818     8859       6  1.000000
              8819     8860      11  1.000000
              
              [8820 rows x 3 columns]),
             (20000,        comp_id  n_tags  mean_cov
              0            0      33  1.000000
              1            1      24  1.000000
              2            2      33  1.000000
              3            3      99  1.000000
              4            4      22  1.000000
              5            5      21  1.000000
              6            6      18  1.000000
              7            7      55  1.000000
              8            8      30  1.000000
              9            9      15  1.000000
              10          10      52  1.000000
              11          11      81  1.000000
              12          12      26  1.000000
              13          13      31  1.000000
              14          14      34  1.000000
              15          15      43  1.000000
              16          16      37  1.000000
              17          17      26  1.000000
              18          18      35  1.000000
              19          19      52  1.000000
              20          20      34  1.000000
              21          21      64  1.000000
              22          22       8  1.000000
              23          23      30  1.000000
              24          24      15  1.000000
              25          25      20  1.000000
              26          26      41  1.000000
              27          27      42  1.976190
              28          28      42  1.000000
              29          29      17  1.000000
              ...        ...     ...       ...
              17224    17378      48  1.000000
              17225    17379      66  2.000000
              17226    17380      68  1.000000
              17227    17381      82  1.487805
              17228    17382       8  1.000000
              17229    17383      41  1.951220
              17230    17384      12  1.500000
              17231    17385      16  1.250000
              17232    17386      36  1.000000
              17233    17387      27  1.925926
              17234    17388      12  1.000000
              17235    17389       6  1.000000
              17236    17390       7  1.000000
              17237    17391       7  1.000000
              17238    17392       6  1.000000
              17239    17393      53  1.716981
              17240    17394      36  1.861111
              17241    17395      24  1.833333
              17242    17396      11  1.000000
              17243    17397      13  1.000000
              17244    17398      28  1.000000
              17245    17399      76  1.644737
              17246    17400      18  1.000000
              17247    17401       7  1.000000
              17248    17402      49  1.000000
              17249    17403      86  1.000000
              17250    17404       7  1.000000
              17251    17405      52  1.692308
              17252    17406       5  1.000000
              17253    17407      44  1.000000
              
              [17254 rows x 3 columns]),
             (30000,        comp_id  n_tags  mean_cov
              0            0      33  1.000000
              1            1      24  1.000000
              2            2      33  1.000000
              3            3      99  1.000000
              4            4      22  1.000000
              5            5      21  1.000000
              6            6      18  1.000000
              7            7      55  1.000000
              8            8      30  1.000000
              9            9      15  1.000000
              10          10      52  1.000000
              11          11      81  1.000000
              12          12      26  1.000000
              13          13      31  1.000000
              14          14      34  1.000000
              15          15      43  1.000000
              16          16      37  1.000000
              17          17      26  1.000000
              18          18      35  1.000000
              19          19      52  1.000000
              20          20      34  1.000000
              21          21      64  1.000000
              22          22       8  1.000000
              23          23      30  1.000000
              24          24      15  1.000000
              25          25      20  1.000000
              26          26      41  1.000000
              27          27      42  1.976190
              28          28      42  1.000000
              29          29      17  1.000000
              ...        ...     ...       ...
              25792    26035      21  1.000000
              25793    26036       6  1.000000
              25794    26037      11  1.000000
              25795    26038      19  1.000000
              25796    26039      19  1.000000
              25797    26040       5  1.000000
              25798    26041       6  1.000000
              25799    26042      27  1.259259
              25800    26043      27  1.000000
              25801    26044      15  1.000000
              25802    26045      38  1.000000
              25803    26046      36  1.000000
              25804    26047       9  1.000000
              25805    26048       9  1.000000
              25806    26049      40  1.000000
              25807    26050      69  1.000000
              25808    26051      18  1.000000
              25809    26052      60  1.000000
              25810    26053      29  1.000000
              25811    26054      11  1.000000
              25812    26055       8  1.000000
              25813    26056      34  1.941176
              25814    26057      17  1.647059
              25815    26058      12  1.000000
              25816    26059      14  1.000000
              25817    26060      47  1.000000
              25818    26061      26  1.000000
              25819    26062      92  1.000000
              25820    26063      25  1.920000
              25821    26064       6  1.000000
              
              [25822 rows x 3 columns]),
             (32731,        comp_id  n_tags  mean_cov
              0            0      33   1.00000
              1            1      24   1.00000
              2            2      33   1.00000
              3            3      99   1.00000
              4            4      22   1.00000
              5            5      21   1.00000
              6            6      18   1.00000
              7            7      55   1.00000
              8            8      30   1.00000
              9            9      15   1.00000
              10          10      52   1.00000
              11          11      81   1.00000
              12          12      26   1.00000
              13          13      31   1.00000
              14          14      34   1.00000
              15          15      43   1.00000
              16          16      37   1.00000
              17          17      26   1.00000
              18          18      35   1.00000
              19          19      52   1.00000
              20          20      34   1.00000
              21          21      64   1.00000
              22          22       8   1.00000
              23          23      30   1.00000
              24          24      15   1.00000
              25          25      20   1.00000
              26          26      41   1.00000
              27          27      42   1.97619
              28          28      42   1.00000
              29          29      17   1.00000
              ...        ...     ...       ...
              28211    28466      30   1.00000
              28212    28467      45   1.00000
              28213    28468      65   1.00000
              28214    28469      22   1.00000
              28215    28470      10   1.00000
              28216    28471      34   1.00000
              28217    28472      33   1.00000
              28218    28473      22   1.00000
              28219    28474      28   1.00000
              28220    28475      39   1.00000
              28221    28476      19   1.00000
              28222    28477      44   1.00000
              28223    28478      52   1.00000
              28224    28479      84   1.00000
              28225    28480      35   1.00000
              28226    28481      36   1.00000
              28227    28482       5   1.00000
              28228    28483      40   1.00000
              28229    28484      45   1.00000
              28230    28485      26   1.00000
              28231    28486      25   1.00000
              28232    28487       7   1.00000
              28233    28488      22   1.00000
              28234    28489      55   1.00000
              28235    28490      31   1.00000
              28236    28491      41   1.00000
              28237    28492      37   1.00000
              28238    28493      54   1.00000
              28239    28494      43   1.00000
              28240    28495      24   1.00000
              
              [28241 rows x 3 columns])])

In [10]:
# Recursively list every global-stats.csv below the current working directory.
glob.glob('**/global-stats.csv', recursive=True)


Out[10]:
['data/Skeletonema_marinoi/SRR1300462/partitioned-assembly/global-stats.csv',
 'data/Asterionellopsis_glacialis/SRR1300451/partitioned-assembly/global-stats.csv',
 'data/Lessardia_elongata/SRR1300468/partitioned-assembly/global-stats.csv',
 'data/Moneuplotes_crassus/SRR1300459/partitioned-assembly/global-stats.csv',
 'data/Chaetoceros_sp./SRR1300463/partitioned-assembly/global-stats.csv',
 'data/Climacostomum_virens/SRR1300461/partitioned-assembly/global-stats.csv',
 'data/Elphidium_margaritaceum/SRR1300475/partitioned-assembly/global-stats.csv',
 'data/Pseudo-nitzschia_heimii/SRR1300452/partitioned-assembly/global-stats.csv',
 'data/Prorocentrum_lima/SRR1300465/partitioned-assembly/global-stats.csv',
 'data/Thalassiosira_weissflogii/SRR1300491/partitioned-assembly/global-stats.csv',
 'data/Thalassiosira_weissflogii/SRR1300482/partitioned-assembly/global-stats.csv',
 'data/Thalassiosira_weissflogii/SRR1300488/partitioned-assembly/global-stats.csv',
 'data/Thalassiosira_weissflogii/SRR1300485/partitioned-assembly/global-stats.csv',
 'data/Thalassiosira_weissflogii/SRR1300478/partitioned-assembly/global-stats.csv',
 'data/Thalassiosira_weissflogii/SRR1300476/partitioned-assembly/global-stats.csv',
 'data/Thalassiosira_weissflogii/SRR1300481/partitioned-assembly/global-stats.csv',
 'data/Thalassiosira_weissflogii/SRR1300486/partitioned-assembly/global-stats.csv',
 'data/Thalassiosira_weissflogii/SRR1300479/partitioned-assembly/global-stats.csv',
 'data/Thalassiosira_weissflogii/SRR1300484/partitioned-assembly/global-stats.csv',
 'data/Thalassiosira_weissflogii/SRR1300489/partitioned-assembly/global-stats.csv',
 'data/Thalassiosira_weissflogii/SRR1300483/partitioned-assembly/global-stats.csv',
 'data/Thalassiosira_weissflogii/SRR1300490/partitioned-assembly/global-stats.csv',
 'data/Thalassiosira_weissflogii/SRR1300487/partitioned-assembly/global-stats.csv',
 'data/Thalassiosira_weissflogii/SRR1300480/partitioned-assembly/global-stats.csv',
 'data/Thalassiosira_weissflogii/SRR1300477/partitioned-assembly/global-stats.csv',
 'data/Fabrea_salina/SRR1300445/partitioned-assembly/global-stats.csv',
 'data/Gyrodinium_dominans/SRR1300469/partitioned-assembly/global-stats.csv',
 'data/Pseudokeronopsis/SRR1300460/partitioned-assembly/global-stats.csv',
 'data/Licmophora_paradoxa/SRR1300448/partitioned-assembly/global-stats.csv',
 'data/Dictyocha_speculum/SRR1300447/partitioned-assembly/global-stats.csv',
 'data/Symbiodinium_sp./SRR1300471/partitioned-assembly/global-stats.csv',
 'data/Symbiodinium_sp./SRR1300470/partitioned-assembly/global-stats.csv',
 'data/Oxyrrhis_marina/SRR1300474/partitioned-assembly/global-stats.csv',
 'data/Oxyrrhis_marina/SRR1300473/partitioned-assembly/global-stats.csv',
 'data/Oxyrrhis_marina/SRR1300472/partitioned-assembly/global-stats.csv',
 'data/Erythrolobus_madagascarensis/SRR1300444/partitioned-assembly/global-stats.csv',
 'data/Erythrolobus_australicus/SRR1300443/partitioned-assembly/global-stats.csv',
 'data/Micromonas_pusilla/SRR1300457/partitioned-assembly/global-stats.csv',
 'data/Micromonas_pusilla/SRR1300455/partitioned-assembly/global-stats.csv',
 'data/Micromonas_pusilla/SRR1300458/partitioned-assembly/global-stats.csv',
 'data/Micromonas_pusilla/SRR1300456/partitioned-assembly/global-stats.csv',
 'data/Bathycoccus_prasinos/SRR1300453/partitioned-assembly/global-stats.csv',
 'data/Pseudo-nitzschia_delicatissima/SRR1300464/partitioned-assembly/global-stats.csv',
 'data/Staurosira/SRR1300449/partitioned-assembly/global-stats.csv',
 'data/Minutocellus_polymorphus/SRR1300446/partitioned-assembly/global-stats.csv',
 'data/Micromonas/SRR1300454/partitioned-assembly/global-stats.csv',
 'data/Leptocylindrus_danicus/SRR1300450/partitioned-assembly/global-stats.csv']

In [ ]: