In [79]:
%config InlineBackend
InlineBackend options
-------------------
InlineBackend.close_figures=<Bool>
Current: True
Close all figures at the end of each cell.
When True, ensures that each cell starts with no active figures, but it also
means that one must keep track of references in order to edit or redraw
figures in subsequent cells. This mode is ideal for the notebook, where
residual plots from other cells might be surprising.
When False, one must call figure() to create new figures. This means that
gcf() and getfigs() can reference figures created in other cells, and the
active figure can continue to be edited with pylab/pyplot methods that
reference the current active figure. This mode facilitates iterative editing
of figures, and behaves most consistently with other matplotlib backends,
but figure barriers between cells must be explicit.
InlineBackend.figure_format=<Unicode>
Current: 'retina'
The figure format to enable (deprecated use `figure_formats` instead)
InlineBackend.figure_formats=<Set>
Current: {'retina'}
A set of figure formats to enable: 'png', 'retina', 'jpeg', 'svg', 'pdf'.
InlineBackend.print_figure_kwargs=<Dict>
Current: {'bbox_inches': 'tight'}
Extra kwargs to be passed to fig.canvas.print_figure.
Logical examples include: bbox_inches, quality (for jpeg figures), etc.
InlineBackend.rc=<Dict>
Current: {'figure.figsize': (6.0, 4.0), 'figure.facecolor': (1, 1, 1, 0), 'figure.edgecolor': (1, 1, 1, 0), 'font.size': 10, 'figure.dpi': 72, 'figure.subplot.bottom': 0.125}
Subset of matplotlib rcParams that should be different for the inline
backend.
In [93]:
%pylab inline
%config InlineBackend.figure_format = 'svg'
from metadata import parse_mmetsp_metadata, BOINK_DIR, DATA_DIR
import py
import pandas as pd
import seaborn as sns
from ficus import FigureManager
from astroML.plotting import hist
from collections import OrderedDict
import glob
import os
import re
from IPython.display import FileLink
import pyprind
sns.set_style('ticks')
Populating the interactive namespace from numpy and matplotlib
In [22]:
# Parse the SRA run-info subset CSV that lives under DATA_DIR using the
# project's metadata helper. Later cells treat the result as a table with
# (at least) ScientificName, SampleName, ReadsDir and AssemblyPath fields.
mmetsp_metadata = parse_mmetsp_metadata(os.path.join(DATA_DIR, 'MMETSP_SraRunInfo_subset.csv'))
In [106]:
# Show which species each MMETSP sample ID belongs to.
mmetsp_metadata[['ScientificName', 'SampleName']]
Out[106]:
ScientificName
SampleName
0
Erythrolobus australicus
MMETSP1353
1
Erythrolobus madagascarensis
MMETSP1354
2
Fabrea salina
MMETSP1345
3
Minutocellus polymorphus
MMETSP1322
4
Dictyocha speculum
MMETSP1174
5
Licmophora paradoxa
MMETSP1360
6
Staurosira
MMETSP1361
7
Leptocylindrus danicus
MMETSP1362
8
Asterionellopsis glacialis
MMETSP1394
9
Pseudo-nitzschia heimii
MMETSP1423
10
Bathycoccus prasinos
MMETSP1399
11
Micromonas
MMETSP1400
12
Micromonas pusilla
MMETSP1401
13
Micromonas pusilla
MMETSP1402
14
Micromonas pusilla
MMETSP1403
15
Micromonas pusilla
MMETSP1404
16
Moneuplotes crassus
MMETSP1380
17
Pseudokeronopsis
MMETSP1396
18
Climacostomum virens
MMETSP1397
19
Skeletonema marinoi
MMETSP1428
20
Chaetoceros sp.
MMETSP1429
21
Pseudo-nitzschia delicatissima
MMETSP1432
22
Prorocentrum lima
MMETSP0252
23
Prorocentrum micans
MMETSP0251_2
24
Pyrocystis lunula
MMETSP0229_2
25
Lessardia elongata
MMETSP1147
26
Gyrodinium dominans
MMETSP1148
27
Symbiodinium sp.
MMETSP1370
28
Symbiodinium sp.
MMETSP1371
29
Oxyrrhis marina
MMETSP1424
30
Oxyrrhis marina
MMETSP1425
31
Oxyrrhis marina
MMETSP1426
32
Elphidium margaritaceum
MMETSP1385
33
Thalassiosira weissflogii
MMETSP1405
34
Thalassiosira weissflogii
MMETSP1406
35
Thalassiosira weissflogii
MMETSP1407
36
Thalassiosira weissflogii
MMETSP1408
37
Thalassiosira weissflogii
MMETSP1409
38
Thalassiosira weissflogii
MMETSP1410
39
Thalassiosira weissflogii
MMETSP1412
40
Thalassiosira weissflogii
MMETSP1413
41
Thalassiosira weissflogii
MMETSP1414
42
Thalassiosira weissflogii
MMETSP1415
43
Thalassiosira weissflogii
MMETSP1416
44
Thalassiosira weissflogii
MMETSP1417
45
Thalassiosira weissflogii
MMETSP1418
46
Thalassiosira weissflogii
MMETSP1419
47
Thalassiosira weissflogii
MMETSP1420
48
Thalassiosira weissflogii
MMETSP1422
In [71]:
# Build one summary row per sample from its partitioned-assembly
# global-stats.csv: the final row of the file, with n_kmers replaced by the
# sum of the whole n_kmers column, plus the identifying metadata fields.
# NOTE(review): presumably each row of global-stats.csv is a cumulative
# snapshot whose n_kmers column holds only the *new* k-mers, which is why the
# column is summed -- confirm against the tool that writes the file.
partition_rows = []
for _, row in mmetsp_metadata.iterrows():
    path = os.path.join(row.ReadsDir, 'partitioned-assembly', 'global-stats.csv')
    try:
        data = pd.read_csv(path, header=None,
                           names=['n_transcripts', 'n_components', 'n_tags', 'n_kmers'])
    except FileNotFoundError:
        # Samples that have not been partitioned yet are skipped on purpose.
        continue
    # Copy *before* editing so the assignments below never write through to
    # (or warn about) the frame the row was taken from.
    sample_data = data.iloc[-1].copy()
    sample_data['n_kmers'] = data.n_kmers.sum()
    sample_data['Species'] = row.ScientificName
    sample_data['Sample'] = row.SampleName
    sample_data['AssemblyPath'] = row.AssemblyPath
    partition_rows.append(sample_data)
# The per-file row labels are meaningless once stacked; renumber from zero.
partition_data = pd.DataFrame(partition_rows).reset_index(drop=True)
In [85]:
# Joint plot of n_transcripts (x) against n_components (y), one point per
# row of partition_data.
sns.jointplot(x=partition_data.n_transcripts, y=partition_data.n_components, )
Out[85]:
<seaborn.axisgrid.JointGrid at 0x2ae855560978>
In [103]:
# Overlay the component-count and transcript-count distributions on a single
# axis so the two can be compared directly.
with FigureManager(show=True, figsize=(10,10)) as (fig, ax):
    for column, label in (('n_components', 'Components'),
                          ('n_transcripts', 'Transcripts')):
        sns.distplot(partition_data[column], bins=15, ax=ax, label=label)
    ax.legend()
In [19]:
# n, components, tags, new kmers
def parse_partition_data_dir(stats_dir, include_global=False, verbose=True):
    """Load every ``N.stats.csv`` file found directly inside ``stats_dir``.

    The leading ``N`` of each file name is parsed as an integer and used both
    to order the files numerically and as the key of the returned mapping.
    Files that do not end in ``.stats.csv`` (for example ``global-stats.csv``)
    are not matched by the pattern and are therefore ignored.

    Parameters
    ----------
    stats_dir : str
        Directory to search. It is taken literally: glob metacharacters in the
        directory name (``[``, ``*``, ``?``) are escaped rather than expanded.
    include_global : bool, optional
        Accepted for backward compatibility; currently unused.
    verbose : bool, optional
        When true, print a one-line summary and advance a ``pyprind``
        progress bar once per file.

    Returns
    -------
    collections.OrderedDict
        Maps each integer ``N`` to a DataFrame with the columns ``comp_id``,
        ``n_tags`` and ``mean_cov``, in ascending order of ``N``.

    Raises
    ------
    ValueError
        If a matched file name does not start with an integer.
    """
    def read_num(filename):
        # '/path/10000.stats.csv' -> 10000
        return int(os.path.basename(filename).split('.')[0])

    # Escape the directory part so only the '*.stats.csv' tail is a wildcard.
    pattern = os.path.join(glob.escape(stats_dir), '*.stats.csv')
    # Sort numerically: a plain string sort would put '20000' before '3000'.
    stats_files = sorted((os.path.abspath(p) for p in glob.glob(pattern)),
                         key=read_num)

    bar = None
    if verbose:
        print('Parsing {0} partition files from {1}'.format(len(stats_files), stats_dir))
        bar = pyprind.ProgBar(len(stats_files), monitor=False)

    data = OrderedDict()
    for filename in stats_files:
        data[read_num(filename)] = pd.read_csv(
            filename, header=None, names=['comp_id', 'n_tags', 'mean_cov'])
        if bar is not None:
            bar.update()
    return data
In [20]:
# Try the parser on a single sample's partitioned-assembly directory.
parse_partition_data_dir('data/Skeletonema_marinoi/SRR1300462/partitioned-assembly/')
0% 100%
[## ] | ETA: 00:00:00 | ETA: 00:00:00
Parsing 4 partition files from data/Skeletonema_marinoi/SRR1300462/partitioned-assembly/
[####] | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00
Total time elapsed: 00:00:00
Out[20]:
OrderedDict([(10000, comp_id n_tags mean_cov
0 0 33 1.000000
1 1 24 1.000000
2 2 33 1.000000
3 3 99 1.000000
4 4 22 1.000000
5 5 21 1.000000
6 6 18 1.000000
7 7 55 1.000000
8 8 30 1.000000
9 9 15 1.000000
10 10 52 1.000000
11 11 81 1.000000
12 12 26 1.000000
13 13 31 1.000000
14 14 34 1.000000
15 15 43 1.000000
16 16 37 1.000000
17 17 26 1.000000
18 18 35 1.000000
19 19 52 1.000000
20 20 34 1.000000
21 21 64 1.000000
22 22 8 1.000000
23 23 30 1.000000
24 24 15 1.000000
25 25 20 1.000000
26 26 41 1.000000
27 27 42 1.976190
28 28 42 1.000000
29 29 17 1.000000
... ... ... ...
8790 8831 5 1.000000
8791 8832 5 1.000000
8792 8833 25 1.000000
8793 8834 59 1.830508
8794 8835 15 1.333333
8795 8836 15 1.000000
8796 8837 29 1.000000
8797 8838 15 1.000000
8798 8839 17 1.000000
8799 8840 12 1.000000
8800 8841 7 1.000000
8801 8842 33 1.878788
8802 8843 9 1.000000
8803 8844 11 1.000000
8804 8845 11 1.000000
8805 8846 13 1.000000
8806 8847 12 1.000000
8807 8848 42 1.000000
8808 8849 19 1.000000
8809 8850 5 1.000000
8810 8851 8 1.000000
8811 8852 14 1.000000
8812 8853 5 1.000000
8813 8854 10 1.700000
8814 8855 33 1.000000
8815 8856 26 1.000000
8816 8857 19 1.000000
8817 8858 16 1.000000
8818 8859 6 1.000000
8819 8860 11 1.000000
[8820 rows x 3 columns]),
(20000, comp_id n_tags mean_cov
0 0 33 1.000000
1 1 24 1.000000
2 2 33 1.000000
3 3 99 1.000000
4 4 22 1.000000
5 5 21 1.000000
6 6 18 1.000000
7 7 55 1.000000
8 8 30 1.000000
9 9 15 1.000000
10 10 52 1.000000
11 11 81 1.000000
12 12 26 1.000000
13 13 31 1.000000
14 14 34 1.000000
15 15 43 1.000000
16 16 37 1.000000
17 17 26 1.000000
18 18 35 1.000000
19 19 52 1.000000
20 20 34 1.000000
21 21 64 1.000000
22 22 8 1.000000
23 23 30 1.000000
24 24 15 1.000000
25 25 20 1.000000
26 26 41 1.000000
27 27 42 1.976190
28 28 42 1.000000
29 29 17 1.000000
... ... ... ...
17224 17378 48 1.000000
17225 17379 66 2.000000
17226 17380 68 1.000000
17227 17381 82 1.487805
17228 17382 8 1.000000
17229 17383 41 1.951220
17230 17384 12 1.500000
17231 17385 16 1.250000
17232 17386 36 1.000000
17233 17387 27 1.925926
17234 17388 12 1.000000
17235 17389 6 1.000000
17236 17390 7 1.000000
17237 17391 7 1.000000
17238 17392 6 1.000000
17239 17393 53 1.716981
17240 17394 36 1.861111
17241 17395 24 1.833333
17242 17396 11 1.000000
17243 17397 13 1.000000
17244 17398 28 1.000000
17245 17399 76 1.644737
17246 17400 18 1.000000
17247 17401 7 1.000000
17248 17402 49 1.000000
17249 17403 86 1.000000
17250 17404 7 1.000000
17251 17405 52 1.692308
17252 17406 5 1.000000
17253 17407 44 1.000000
[17254 rows x 3 columns]),
(30000, comp_id n_tags mean_cov
0 0 33 1.000000
1 1 24 1.000000
2 2 33 1.000000
3 3 99 1.000000
4 4 22 1.000000
5 5 21 1.000000
6 6 18 1.000000
7 7 55 1.000000
8 8 30 1.000000
9 9 15 1.000000
10 10 52 1.000000
11 11 81 1.000000
12 12 26 1.000000
13 13 31 1.000000
14 14 34 1.000000
15 15 43 1.000000
16 16 37 1.000000
17 17 26 1.000000
18 18 35 1.000000
19 19 52 1.000000
20 20 34 1.000000
21 21 64 1.000000
22 22 8 1.000000
23 23 30 1.000000
24 24 15 1.000000
25 25 20 1.000000
26 26 41 1.000000
27 27 42 1.976190
28 28 42 1.000000
29 29 17 1.000000
... ... ... ...
25792 26035 21 1.000000
25793 26036 6 1.000000
25794 26037 11 1.000000
25795 26038 19 1.000000
25796 26039 19 1.000000
25797 26040 5 1.000000
25798 26041 6 1.000000
25799 26042 27 1.259259
25800 26043 27 1.000000
25801 26044 15 1.000000
25802 26045 38 1.000000
25803 26046 36 1.000000
25804 26047 9 1.000000
25805 26048 9 1.000000
25806 26049 40 1.000000
25807 26050 69 1.000000
25808 26051 18 1.000000
25809 26052 60 1.000000
25810 26053 29 1.000000
25811 26054 11 1.000000
25812 26055 8 1.000000
25813 26056 34 1.941176
25814 26057 17 1.647059
25815 26058 12 1.000000
25816 26059 14 1.000000
25817 26060 47 1.000000
25818 26061 26 1.000000
25819 26062 92 1.000000
25820 26063 25 1.920000
25821 26064 6 1.000000
[25822 rows x 3 columns]),
(32731, comp_id n_tags mean_cov
0 0 33 1.00000
1 1 24 1.00000
2 2 33 1.00000
3 3 99 1.00000
4 4 22 1.00000
5 5 21 1.00000
6 6 18 1.00000
7 7 55 1.00000
8 8 30 1.00000
9 9 15 1.00000
10 10 52 1.00000
11 11 81 1.00000
12 12 26 1.00000
13 13 31 1.00000
14 14 34 1.00000
15 15 43 1.00000
16 16 37 1.00000
17 17 26 1.00000
18 18 35 1.00000
19 19 52 1.00000
20 20 34 1.00000
21 21 64 1.00000
22 22 8 1.00000
23 23 30 1.00000
24 24 15 1.00000
25 25 20 1.00000
26 26 41 1.00000
27 27 42 1.97619
28 28 42 1.00000
29 29 17 1.00000
... ... ... ...
28211 28466 30 1.00000
28212 28467 45 1.00000
28213 28468 65 1.00000
28214 28469 22 1.00000
28215 28470 10 1.00000
28216 28471 34 1.00000
28217 28472 33 1.00000
28218 28473 22 1.00000
28219 28474 28 1.00000
28220 28475 39 1.00000
28221 28476 19 1.00000
28222 28477 44 1.00000
28223 28478 52 1.00000
28224 28479 84 1.00000
28225 28480 35 1.00000
28226 28481 36 1.00000
28227 28482 5 1.00000
28228 28483 40 1.00000
28229 28484 45 1.00000
28230 28485 26 1.00000
28231 28486 25 1.00000
28232 28487 7 1.00000
28233 28488 22 1.00000
28234 28489 55 1.00000
28235 28490 31 1.00000
28236 28491 41 1.00000
28237 28492 37 1.00000
28238 28493 54 1.00000
28239 28494 43 1.00000
28240 28495 24 1.00000
[28241 rows x 3 columns])])
In [10]:
# Recursively list every global-stats.csv below the current working
# directory.
glob.glob('**/global-stats.csv', recursive=True)
Out[10]:
['data/Skeletonema_marinoi/SRR1300462/partitioned-assembly/global-stats.csv',
'data/Asterionellopsis_glacialis/SRR1300451/partitioned-assembly/global-stats.csv',
'data/Lessardia_elongata/SRR1300468/partitioned-assembly/global-stats.csv',
'data/Moneuplotes_crassus/SRR1300459/partitioned-assembly/global-stats.csv',
'data/Chaetoceros_sp./SRR1300463/partitioned-assembly/global-stats.csv',
'data/Climacostomum_virens/SRR1300461/partitioned-assembly/global-stats.csv',
'data/Elphidium_margaritaceum/SRR1300475/partitioned-assembly/global-stats.csv',
'data/Pseudo-nitzschia_heimii/SRR1300452/partitioned-assembly/global-stats.csv',
'data/Prorocentrum_lima/SRR1300465/partitioned-assembly/global-stats.csv',
'data/Thalassiosira_weissflogii/SRR1300491/partitioned-assembly/global-stats.csv',
'data/Thalassiosira_weissflogii/SRR1300482/partitioned-assembly/global-stats.csv',
'data/Thalassiosira_weissflogii/SRR1300488/partitioned-assembly/global-stats.csv',
'data/Thalassiosira_weissflogii/SRR1300485/partitioned-assembly/global-stats.csv',
'data/Thalassiosira_weissflogii/SRR1300478/partitioned-assembly/global-stats.csv',
'data/Thalassiosira_weissflogii/SRR1300476/partitioned-assembly/global-stats.csv',
'data/Thalassiosira_weissflogii/SRR1300481/partitioned-assembly/global-stats.csv',
'data/Thalassiosira_weissflogii/SRR1300486/partitioned-assembly/global-stats.csv',
'data/Thalassiosira_weissflogii/SRR1300479/partitioned-assembly/global-stats.csv',
'data/Thalassiosira_weissflogii/SRR1300484/partitioned-assembly/global-stats.csv',
'data/Thalassiosira_weissflogii/SRR1300489/partitioned-assembly/global-stats.csv',
'data/Thalassiosira_weissflogii/SRR1300483/partitioned-assembly/global-stats.csv',
'data/Thalassiosira_weissflogii/SRR1300490/partitioned-assembly/global-stats.csv',
'data/Thalassiosira_weissflogii/SRR1300487/partitioned-assembly/global-stats.csv',
'data/Thalassiosira_weissflogii/SRR1300480/partitioned-assembly/global-stats.csv',
'data/Thalassiosira_weissflogii/SRR1300477/partitioned-assembly/global-stats.csv',
'data/Fabrea_salina/SRR1300445/partitioned-assembly/global-stats.csv',
'data/Gyrodinium_dominans/SRR1300469/partitioned-assembly/global-stats.csv',
'data/Pseudokeronopsis/SRR1300460/partitioned-assembly/global-stats.csv',
'data/Licmophora_paradoxa/SRR1300448/partitioned-assembly/global-stats.csv',
'data/Dictyocha_speculum/SRR1300447/partitioned-assembly/global-stats.csv',
'data/Symbiodinium_sp./SRR1300471/partitioned-assembly/global-stats.csv',
'data/Symbiodinium_sp./SRR1300470/partitioned-assembly/global-stats.csv',
'data/Oxyrrhis_marina/SRR1300474/partitioned-assembly/global-stats.csv',
'data/Oxyrrhis_marina/SRR1300473/partitioned-assembly/global-stats.csv',
'data/Oxyrrhis_marina/SRR1300472/partitioned-assembly/global-stats.csv',
'data/Erythrolobus_madagascarensis/SRR1300444/partitioned-assembly/global-stats.csv',
'data/Erythrolobus_australicus/SRR1300443/partitioned-assembly/global-stats.csv',
'data/Micromonas_pusilla/SRR1300457/partitioned-assembly/global-stats.csv',
'data/Micromonas_pusilla/SRR1300455/partitioned-assembly/global-stats.csv',
'data/Micromonas_pusilla/SRR1300458/partitioned-assembly/global-stats.csv',
'data/Micromonas_pusilla/SRR1300456/partitioned-assembly/global-stats.csv',
'data/Bathycoccus_prasinos/SRR1300453/partitioned-assembly/global-stats.csv',
'data/Pseudo-nitzschia_delicatissima/SRR1300464/partitioned-assembly/global-stats.csv',
'data/Staurosira/SRR1300449/partitioned-assembly/global-stats.csv',
'data/Minutocellus_polymorphus/SRR1300446/partitioned-assembly/global-stats.csv',
'data/Micromonas/SRR1300454/partitioned-assembly/global-stats.csv',
'data/Leptocylindrus_danicus/SRR1300450/partitioned-assembly/global-stats.csv']
In [ ]:
Content source: camillescott/boink
Similar notebooks: