Description:

  • dev dataset
  • just using 10 random bacteria genomes
  • dataset copied from wPDF_py
  • re-coding how diffusion is modeled
    • modeling diffusion as a function of fragment length, but altering how it is implemented for speed

Workflow:

  • Fragment GC distributions
  • Community abundance and incorporation
    • Simulate communities for each gradient fraction
  • Simulate isotope incorp
  • Creating OTU tables

In [1]:
workDir = '/home/nick/notebook/SIPSim/dev/bac_genome10/'
SIPSimExe = '/home/nick/notebook/SIPSim/SIPSim'

In [2]:
import os
import sys
import numpy as np
import pandas as pd
import subprocess

In [148]:
%load_ext rpy2.ipython
%pylab inline


The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython
Populating the interactive namespace from numpy and matplotlib

In [91]:
%%R
library(ggplot2)
library(dplyr)
library(tidyr)

Indexing genomes

  • needed for in silico PCR to simulate amplicon-fragments

In [56]:
%%bash -s "$workDir" "$SIPSimExe"

cd $1

$2 indexGenomes genomes/genomes10.txt --fp ./genomes/ --np 10


0
0: 5.57%, 0:00:00.902733
0: 11.13%, 0:00:01.750471
0: 16.70%, 0:00:02.630836
0: 22.27%, 0:00:03.529660
0: 27.84%, 0:00:04.438242
0: 33.40%, 0:00:05.353085
0: 38.97%, 0:00:06.267695
0: 44.54%, 0:00:07.186757
0: 50.10%, 0:00:08.107006
0: 55.67%, 0:00:09.027410
0: 61.24%, 0:00:09.951076
0: 66.80%, 0:00:10.877679
0: 72.37%, 0:00:11.807129
0: 77.94%, 0:00:12.736229
0: 83.51%, 0:00:13.668271
0: 89.07%, 0:00:14.600871
0: 94.64%, 0:00:15.531507
Time used: 0:00:18.609964
Done.
0
0: 4.00%, 0:00:01.028918
0: 8.00%, 0:00:01.910991
0: 11.99%, 0:00:02.805818
0: 15.99%, 0:00:03.709928
0: 19.99%, 0:00:04.617608
0: 23.99%, 0:00:05.534440
0: 27.99%, 0:00:06.451912
0: 31.99%, 0:00:07.375410
0: 35.98%, 0:00:08.306406
0: 39.98%, 0:00:09.239711
0: 43.98%, 0:00:10.174125
0: 47.98%, 0:00:11.108920
0: 51.98%, 0:00:12.050724
0: 55.98%, 0:00:12.989140
0: 59.97%, 0:00:13.937604
0: 63.97%, 0:00:14.881010
0: 67.97%, 0:00:15.838552
0: 71.97%, 0:00:16.792086
0: 75.97%, 0:00:17.747156
0: 79.97%, 0:00:18.705208
0: 83.96%, 0:00:19.665240
0: 87.96%, 0:00:20.626499
0: 91.96%, 0:00:21.583753
0: 95.96%, 0:00:22.550213
0: 99.96%, 0:00:23.512523
Time used: 0:00:25.608670
Done.
0
0: 3.56%, 0:00:00.945108
0: 7.11%, 0:00:01.778443
0: 10.67%, 0:00:02.640324
0: 14.22%, 0:00:03.523211
0: 17.78%, 0:00:04.414379
0: 21.34%, 0:00:05.316149
0: 24.89%, 0:00:06.223779
0: 28.45%, 0:00:07.133292
0: 32.00%, 0:00:08.046476
0: 35.56%, 0:00:08.963513
0: 39.12%, 0:00:09.882535
0: 42.67%, 0:00:10.802835
0: 46.23%, 0:00:11.724814
0: 49.79%, 0:00:12.646435
0: 53.34%, 0:00:13.569446
0: 56.90%, 0:00:14.494987
0: 60.45%, 0:00:15.420176
0: 64.01%, 0:00:16.347579
0: 67.57%, 0:00:17.273156
0: 71.12%, 0:00:18.199351
0: 74.68%, 0:00:19.125687
0: 78.23%, 0:00:20.053831
0: 81.79%, 0:00:20.981495
0: 85.35%, 0:00:21.909594
0: 88.90%, 0:00:22.838504
0: 92.46%, 0:00:23.767832
0: 96.01%, 0:00:24.696886
0: 99.57%, 0:00:25.625652
Time used: 0:00:28.042015
Done.
0
0: 3.52%, 0:00:00.909937
0: 7.04%, 0:00:01.740612
0: 10.56%, 0:00:02.608364
0: 14.09%, 0:00:03.493020
0: 17.61%, 0:00:04.393340
0: 21.13%, 0:00:05.300633
0: 24.65%, 0:00:06.213478
0: 28.17%, 0:00:07.126171
0: 31.69%, 0:00:08.044847
0: 35.21%, 0:00:08.968903
0: 38.74%, 0:00:09.891544
0: 42.26%, 0:00:10.816965
0: 45.78%, 0:00:11.741899
0: 49.30%, 0:00:12.669051
0: 52.82%, 0:00:13.596556
0: 56.34%, 0:00:14.525049
0: 59.86%, 0:00:15.454627
0: 63.39%, 0:00:16.386951
0: 66.91%, 0:00:17.317490
0: 70.43%, 0:00:18.250243
0: 73.95%, 0:00:19.180815
0: 77.47%, 0:00:20.113655
0: 80.99%, 0:00:21.046689
0: 84.51%, 0:00:21.980954
0: 88.04%, 0:00:22.915947
0: 91.56%, 0:00:23.850717
0: 95.08%, 0:00:24.784679
0: 98.60%, 0:00:25.720333
Time used: 0:00:28.414780
Done.
0
0: 2.99%, 0:00:01.661394
0: 5.98%, 0:00:02.477636
0: 8.98%, 0:00:03.322815
0: 11.97%, 0:00:04.190007
0: 14.96%, 0:00:05.072700
0: 17.95%, 0:00:05.954877
0: 20.94%, 0:00:06.842158
0: 23.94%, 0:00:07.730794
0: 26.93%, 0:00:08.623938
0: 29.92%, 0:00:09.522985
0: 32.91%, 0:00:10.422095
0: 35.90%, 0:00:11.324526
0: 38.89%, 0:00:12.231446
0: 41.89%, 0:00:13.139983
0: 44.88%, 0:00:14.047124
0: 47.87%, 0:00:14.954538
0: 50.86%, 0:00:15.869476
0: 53.85%, 0:00:16.785328
0: 56.85%, 0:00:17.694522
0: 59.84%, 0:00:18.616149
0: 62.83%, 0:00:19.533579
0: 65.82%, 0:00:20.447863
0: 68.81%, 0:00:21.371865
0: 71.81%, 0:00:22.299280
0: 74.80%, 0:00:23.225582
0: 77.79%, 0:00:24.149575
0: 80.78%, 0:00:25.085504
0: 83.77%, 0:00:26.019765
0: 86.77%, 0:00:26.956688
0: 89.76%, 0:00:27.893923
0: 92.75%, 0:00:28.839505
0: 95.74%, 0:00:29.762381
0: 98.73%, 0:00:30.670769
Time used: 0:00:33.327412
Done.
0
0: 2.55%, 0:00:00.858724
0: 5.10%, 0:00:01.693881
0: 7.66%, 0:00:02.550313
0: 10.21%, 0:00:03.438072
0: 12.76%, 0:00:04.336405
0: 15.31%, 0:00:05.241319
0: 17.87%, 0:00:06.144364
0: 20.42%, 0:00:07.049172
0: 22.97%, 0:00:07.956333
0: 25.52%, 0:00:08.867325
0: 28.08%, 0:00:09.783363
0: 30.63%, 0:00:10.701026
0: 33.18%, 0:00:11.621655
0: 35.73%, 0:00:12.539225
0: 38.29%, 0:00:13.459041
0: 40.84%, 0:00:14.378702
0: 43.39%, 0:00:15.299871
0: 45.94%, 0:00:16.221793
0: 48.50%, 0:00:17.147305
0: 51.05%, 0:00:18.071053
0: 53.60%, 0:00:18.994758
0: 56.15%, 0:00:19.920115
0: 58.71%, 0:00:20.843612
0: 61.26%, 0:00:21.768533
0: 63.81%, 0:00:22.693296
0: 66.36%, 0:00:23.618914
0: 68.92%, 0:00:24.546541
0: 71.47%, 0:00:25.476755
0: 74.02%, 0:00:26.408778
0: 76.57%, 0:00:27.342466
0: 79.13%, 0:00:28.272325
0: 81.68%, 0:00:29.200554
0: 84.23%, 0:00:30.109796
0: 86.78%, 0:00:31.021515
0: 89.34%, 0:00:31.934116
0: 91.89%, 0:00:32.848009
0: 94.44%, 0:00:33.747427
0: 96.99%, 0:00:34.633349
0: 99.55%, 0:00:35.523877
0
0: 2.22%, 0:00:00.892335
0: 4.44%, 0:00:01.708912
0: 6.66%, 0:00:02.561273
0: 8.88%, 0:00:03.424310
0: 11.10%, 0:00:04.296200
0: 13.32%, 0:00:05.176537
0: 15.54%, 0:00:06.057391
0: 17.76%, 0:00:06.942006
0: 19.98%, 0:00:07.830262
0: 22.20%, 0:00:08.717039
0: 24.42%, 0:00:09.610359
0: 26.64%, 0:00:10.505621
0: 28.86%, 0:00:11.403694
0: 31.08%, 0:00:12.302480
0: 33.29%, 0:00:13.200844
0: 35.51%, 0:00:14.101735
0: 37.73%, 0:00:15.000843
0: 39.95%, 0:00:15.899199
0: 42.17%, 0:00:16.801676
0: 44.39%, 0:00:17.701028
0: 46.61%, 0:00:18.603529
0: 48.83%, 0:00:19.505129
0: 51.05%, 0:00:20.405807
0: 53.27%, 0:00:21.309704
0: 55.49%, 0:00:22.214323
0: 57.71%, 0:00:23.120520
0: 59.93%, 0:00:24.026976
0: 62.15%, 0:00:24.933960
0: 64.37%, 0:00:25.841751
0: 66.59%, 0:00:26.749717
0: 68.81%, 0:00:27.660184
0: 71.03%, 0:00:28.571226
0: 73.25%, 0:00:29.490395
0: 75.47%, 0:00:30.409647
0: 77.69%, 0:00:31.331949
0: 79.91%, 0:00:32.255575
0: 82.13%, 0:00:33.176761
0: 84.35%, 0:00:34.100990
0: 86.57%, 0:00:35.025609
0: 88.79%, 0:00:35.952435
0: 91.01%, 0:00:36.874929
0: 93.23%, 0:00:37.753115
0: 95.45%, 0:00:38.631416
0: 97.66%, 0:00:39.507389
0: 99.88%, 0:00:40.384462
0
0: 2.08%, 0:00:00.908054
0: 4.17%, 0:00:01.725911
0: 6.25%, 0:00:02.568511
0: 8.34%, 0:00:03.429888
0: 10.42%, 0:00:04.300306
0: 12.50%, 0:00:05.175118
0: 14.59%, 0:00:06.057660
0: 16.67%, 0:00:06.944505
0: 18.76%, 0:00:07.832884
0: 20.84%, 0:00:08.723972
0: 22.92%, 0:00:09.618731
0: 25.01%, 0:00:10.515744
0: 27.09%, 0:00:11.419670
0: 29.18%, 0:00:12.323939
0: 31.26%, 0:00:13.226952
0: 33.34%, 0:00:14.129080
0: 35.43%, 0:00:15.029882
0: 37.51%, 0:00:15.929691
0: 39.60%, 0:00:16.832536
0: 41.68%, 0:00:17.735150
0: 43.76%, 0:00:18.640288
0: 45.85%, 0:00:19.543688
0: 47.93%, 0:00:20.451828
0: 50.02%, 0:00:21.360671
0: 52.10%, 0:00:22.270387
0: 54.18%, 0:00:23.179729
0: 56.27%, 0:00:24.089605
0: 58.35%, 0:00:24.999872
0: 60.44%, 0:00:25.911252
0: 62.52%, 0:00:26.823734
0: 64.60%, 0:00:27.736250
0: 66.69%, 0:00:28.651250
0: 68.77%, 0:00:29.551430
0: 70.86%, 0:00:30.442043
0: 72.94%, 0:00:31.338521
0: 75.02%, 0:00:32.231894
0: 77.11%, 0:00:33.122956
0: 79.19%, 0:00:34.000177
0: 81.28%, 0:00:34.864336
0: 83.36%, 0:00:35.734929
0: 85.44%, 0:00:36.591159
0: 87.53%, 0:00:37.454551
0: 89.61%, 0:00:38.321678
0: 91.70%, 0:00:39.196501
0: 93.78%, 0:00:40.065737
0: 95.86%, 0:00:40.946377
0: 97.95%, 0:00:41.790969
0
0: 1.94%, 0:00:00.951242
0: 3.88%, 0:00:01.794846
0: 5.82%, 0:00:02.667006
0: 7.76%, 0:00:03.551187
0: 9.70%, 0:00:04.441202
0: 11.64%, 0:00:05.342244
0: 13.58%, 0:00:06.247676
0: 15.52%, 0:00:07.157040
0: 17.46%, 0:00:08.070762
0: 19.40%, 0:00:08.988298
0: 21.34%, 0:00:09.908908
0: 23.29%, 0:00:10.827542
0: 25.23%, 0:00:11.742986
0: 27.17%, 0:00:12.663153
0: 29.11%, 0:00:13.586644
0: 31.05%, 0:00:14.512693
0: 32.99%, 0:00:15.443198
0: 34.93%, 0:00:16.365303
0: 36.87%, 0:00:17.297534
0: 38.81%, 0:00:18.230641
0: 40.75%, 0:00:19.162600
0: 42.69%, 0:00:20.097425
0: 44.63%, 0:00:21.030746
0: 46.57%, 0:00:21.967851
0: 48.51%, 0:00:22.906118
0: 50.45%, 0:00:23.846036
0: 52.39%, 0:00:24.782105
0: 54.33%, 0:00:25.726327
0: 56.27%, 0:00:26.671289
0: 58.21%, 0:00:27.614760
0: 60.15%, 0:00:28.544992
0: 62.09%, 0:00:29.448444
0: 64.03%, 0:00:30.371104
0: 65.98%, 0:00:31.296590
0: 67.92%, 0:00:32.223630
0: 69.86%, 0:00:33.148571
0: 71.80%, 0:00:34.078863
0: 73.74%, 0:00:35.010081
0: 75.68%, 0:00:35.929663
0: 77.62%, 0:00:36.860249
0: 79.56%, 0:00:37.766751
0: 81.50%, 0:00:38.674503
0: 83.44%, 0:00:39.579214
0: 85.38%, 0:00:40.488211
0: 87.32%, 0:00:41.398771
0: 89.26%, 0:00:42.307408
0: 91.20%, 0:00:43.221306
0: 93.14%, 0:00:44.147268
0: 95.08%, 0:00:45.073692
0: 97.02%, 0:00:45.994200
0: 98.96%, 0:00:46.918155
0
0: 1.91%, 0:00:00.948519
0: 3.83%, 0:00:01.773969
0: 5.74%, 0:00:02.634785
0: 7.66%, 0:00:03.501953
0: 9.57%, 0:00:04.379412
0: 11.49%, 0:00:05.267462
0: 13.40%, 0:00:06.158393
0: 15.32%, 0:00:07.053159
0: 17.23%, 0:00:07.951057
0: 19.15%, 0:00:08.852067
0: 21.06%, 0:00:09.757055
0: 22.98%, 0:00:10.664229
0: 24.89%, 0:00:11.572720
0: 26.81%, 0:00:12.481633
0: 28.72%, 0:00:13.392651
0: 30.63%, 0:00:14.305493
0: 32.55%, 0:00:15.220417
0: 34.46%, 0:00:16.138144
0: 36.38%, 0:00:17.057524
0: 38.29%, 0:00:17.977246
0: 40.21%, 0:00:18.896178
0: 42.12%, 0:00:19.817074
0: 44.04%, 0:00:20.739793
0: 45.95%, 0:00:21.664235
0: 47.87%, 0:00:22.591650
0: 49.78%, 0:00:23.516518
0: 51.70%, 0:00:24.443138
0: 53.61%, 0:00:25.368750
0: 55.52%, 0:00:26.300989
0: 57.44%, 0:00:27.229699
0: 59.35%, 0:00:28.157103
0: 61.27%, 0:00:29.047285
0: 63.18%, 0:00:29.953218
0: 65.10%, 0:00:30.857228
0: 67.01%, 0:00:31.763378
0: 68.93%, 0:00:32.671601
0: 70.84%, 0:00:33.578952
0: 72.76%, 0:00:34.490313
0: 74.67%, 0:00:35.403030
0: 76.59%, 0:00:36.315716
0: 78.50%, 0:00:37.216834
0: 80.42%, 0:00:38.102423
0: 82.33%, 0:00:38.989425
0: 84.24%, 0:00:39.877367
0: 86.16%, 0:00:40.765941
0: 88.07%, 0:00:41.657457
0: 89.99%, 0:00:42.545818
0: 91.90%, 0:00:43.439129
0: 93.82%, 0:00:44.299637
0: 95.73%, 0:00:45.159822
0: 97.65%, 0:00:46.021600
0: 99.56%, 0:00:46.885604
Time used: 0:00:49.538535
Done.
#-- All genomes indexed --#
Indexing: "Roseobacter_litoralis_Och_149"
Indexing: "Idiomarina_loihiensis_GSL_199"
Indexing: "Bacillus_cereus_F837_76"
Indexing: "Micrococcus_luteus_NCTC_2665"
Indexing: "Geobacter_lovleyi_SZ"
Indexing: "Cyanobium_gracile_PCC_6307"
Indexing: "Leuconostoc_citreum_KM20"
Indexing: "Escherichia_coli_APEC_O78"
Indexing: "Xanthomonas_axonopodis_Xac29-1"
Indexing: "Nitrosomonas_europaea_ATCC_19718"
Traceback (most recent call last):
  File "/home/nick/notebook/SIPSim/bin/../lib/chilli/mfe_index_db.py", line 227, in <module>
    main()
  File "/home/nick/notebook/SIPSim/bin/../lib/chilli/mfe_index_db.py", line 224, in main
    index(options.filename, options.k)
  File "/home/nick/notebook/SIPSim/bin/../lib/chilli/mfe_index_db.py", line 214, in index
    insert_db(conn, mer_count, plus, minus)
  File "/home/nick/notebook/SIPSim/bin/../lib/chilli/mfe_index_db.py", line 59, in insert_db
    [mer_id, plus[mer_id], minus[mer_id]])
sqlite3.IntegrityError: UNIQUE constraint failed: pos.mer_id
Traceback (most recent call last):
  File "/home/nick/notebook/SIPSim/bin/../lib/chilli/mfe_index_db.py", line 227, in <module>
    main()
  File "/home/nick/notebook/SIPSim/bin/../lib/chilli/mfe_index_db.py", line 224, in main
    index(options.filename, options.k)
  File "/home/nick/notebook/SIPSim/bin/../lib/chilli/mfe_index_db.py", line 214, in index
    insert_db(conn, mer_count, plus, minus)
  File "/home/nick/notebook/SIPSim/bin/../lib/chilli/mfe_index_db.py", line 59, in insert_db
    [mer_id, plus[mer_id], minus[mer_id]])
sqlite3.IntegrityError: UNIQUE constraint failed: pos.mer_id
Traceback (most recent call last):
  File "/home/nick/notebook/SIPSim/bin/../lib/chilli/mfe_index_db.py", line 227, in <module>
    main()
  File "/home/nick/notebook/SIPSim/bin/../lib/chilli/mfe_index_db.py", line 224, in main
    index(options.filename, options.k)
  File "/home/nick/notebook/SIPSim/bin/../lib/chilli/mfe_index_db.py", line 214, in index
    insert_db(conn, mer_count, plus, minus)
  File "/home/nick/notebook/SIPSim/bin/../lib/chilli/mfe_index_db.py", line 59, in insert_db
    [mer_id, plus[mer_id], minus[mer_id]])
sqlite3.IntegrityError: UNIQUE constraint failed: pos.mer_id
Traceback (most recent call last):
  File "/home/nick/notebook/SIPSim/bin/../lib/chilli/mfe_index_db.py", line 227, in <module>
    main()
  File "/home/nick/notebook/SIPSim/bin/../lib/chilli/mfe_index_db.py", line 224, in main
    index(options.filename, options.k)
  File "/home/nick/notebook/SIPSim/bin/../lib/chilli/mfe_index_db.py", line 214, in index
    insert_db(conn, mer_count, plus, minus)
  File "/home/nick/notebook/SIPSim/bin/../lib/chilli/mfe_index_db.py", line 59, in insert_db
    [mer_id, plus[mer_id], minus[mer_id]])
sqlite3.IntegrityError: UNIQUE constraint failed: pos.mer_id

Simulating fragments and calculating G+C


In [285]:
%%bash -s "$workDir" "$SIPSimExe"
# amplicon fragments

cd $1

$2 fragGC genomes/genomes10.txt \
    --fp ./genomes/ --fr 515Fm-927Rm.fna \
    --np 10 > genome10_ampFragGC.txt


Processing: "Roseobacter_litoralis_Och_149"
Processing: "Idiomarina_loihiensis_GSL_199"
Processing: "Bacillus_cereus_F837_76"
Processing: "Micrococcus_luteus_NCTC_2665"
Processing: "Geobacter_lovleyi_SZ"
Processing: "Cyanobium_gracile_PCC_6307"
Processing: "Leuconostoc_citreum_KM20"
Processing: "Escherichia_coli_APEC_O78"
Processing: "Xanthomonas_axonopodis_Xac29-1"
Processing: "Nitrosomonas_europaea_ATCC_19718"

In [286]:
%%bash -s "$workDir" "$SIPSimExe"
# shotgun fragments

cd $1

$2 fragGC genomes/genomes10.txt \
    --fp ./genomes/ --np 10 \
    > genome10_shotFragGC.txt


Processing: "Roseobacter_litoralis_Och_149"
Processing: "Idiomarina_loihiensis_GSL_199"
Processing: "Bacillus_cereus_F837_76"
Processing: "Micrococcus_luteus_NCTC_2665"
Processing: "Geobacter_lovleyi_SZ"
Processing: "Cyanobium_gracile_PCC_6307"
Processing: "Leuconostoc_citreum_KM20"
Processing: "Escherichia_coli_APEC_O78"
Processing: "Xanthomonas_axonopodis_Xac29-1"
Processing: "Nitrosomonas_europaea_ATCC_19718"

Simulating gradient communities


In [287]:
%%bash -s "$workDir" "$SIPSimExe"

cd $1

$2 gradientComms genomes/genomes10.txt \
    --fp ./genomes/  --pf grinder_profile \
    > genome10_comm_n3.txt


Overall diversity = 10 genomes
Percent shared   = 100.0 % (10 genomes)
Percent permuted = 100.0 % (10 top genomes)

Simulating isotope incorporation


In [288]:
import os

# making config file
config = """
[library 1]
  # baseline: no incorp
  
  [[intraPopDist 1]]
  distribution = uniform
  weight = 1

    [[[start]]]

      [[[[interPopDist 1]]]]
        distribution = uniform
        start = 0
        end = 0

    [[[end]]]

      [[[[interPopDist 1]]]]
        distribution = uniform
        start = 0
        end = 0

[library 2]
  # split intra-populations
  ## to get some taxa with split; use inter-pop mixture for 2nd intra-pop mu, where mixture is highly uneven
  
  [[intraPopDist 1]]
  distribution = normal
  weight = 0.5

    [[[mu]]]

      [[[[interPopDist 1]]]]
        distribution = normal
        mu = 90
        sigma = 2

    [[[sigma]]]

      [[[[interPopDist 1]]]]
        distribution = normal
        mu = 5
        sigma = 2

  [[intraPopDist 2]]
  distribution = normal
  weight = 0.5

    [[[mu]]]

      [[[[interPopDist 1]]]]
        distribution = normal
        mu = 5
        sigma = 2

    [[[sigma]]]

      [[[[interPopDist 1]]]]
        distribution = normal
        mu = 5
        sigma = 2


[library 3]
  # split inter-pop distribution (some approx. full; others none)
  
  [[intraPopDist 1]]
  distribution = normal
  weight = 1

    [[[mu]]]

      [[[[interPopDist 1]]]]
        distribution = normal
        mu = 90
        sigma = 2

      # these taxa in the community get no incorp
      [[[[interPopDist 2]]]]
        distribution = uniform
        start = 0
        end = 0

    [[[sigma]]]

      [[[[interPopDist 1]]]]
        distribution = normal
        mu = 5
        sigma = 2
        

"""

outfile = os.path.join(workDir, 'genome10_n3.config')

outf = open(outfile, 'wb')
outf.write(config)
outf.close()

In [289]:
import os

# making config file
config = """
[library 1]
  # baseline: no incorp
  
  [[intraPopDist 1]]
  distribution = uniform
  weight = 1

    [[[start]]]

      [[[[interPopDist 1]]]]
        distribution = uniform
        start = 0
        end = 0

    [[[end]]]

      [[[[interPopDist 1]]]]
        distribution = uniform
        start = 0
        end = 0

[library 2]
  # full incorp
  
  [[intraPopDist 1]]
  distribution = uniform
  weight = 1

    [[[start]]]

      [[[[interPopDist 1]]]]
        distribution = uniform
        start = 100
        end = 100

    [[[end]]]

      [[[[interPopDist 1]]]]
        distribution = uniform
        start = 100
        end = 100

[library 3]
  # split inter-pop distribution (some approx. full; others none)
  
  [[intraPopDist 1]]
  distribution = normal
  weight = 1

    [[[mu]]]

      [[[[interPopDist 1]]]]
        distribution = normal
        mu = 90
        sigma = 2

      # these taxa in the community get no incorp
      [[[[interPopDist 2]]]]
        distribution = uniform
        start = 0
        end = 0

    [[[sigma]]]

      [[[[interPopDist 1]]]]
        distribution = normal
        mu = 5
        sigma = 2
        

"""

outfile = os.path.join(workDir, 'genome10_n3.config')

outf = open(outfile, 'wb')
outf.write(config)
outf.close()

In [290]:
%%bash -s "$workDir" "$SIPSimExe"

cd $1

$2 isoIncorp genome10_comm_n3.txt genome10_n3.config > genome10_comm_n3_incorp.txt

Plotting in R


In [291]:
%%R
library(ggplot2)
library(dplyr)

In [292]:
%%R -i workDir

infile = paste(c(workDir, 'genome10_comm_n3_incorp.txt'), collapse='/')

tbl = read.csv(infile, sep='\t')

In [293]:
%%R -w 700 

tbl$taxon_name = reorder(tbl$taxon_name, tbl$param_value, max)

ggplot(tbl, aes(taxon_name, param_value, color=param)) +
    geom_point() +
    facet_grid(param ~ library, scales='free_y') +
    theme(
        text = element_text(size=16),
        axis.text.x = element_text(angle=90, hjust=1),
        axis.title.x = element_blank(),
        axis.title.y = element_blank()
    )


Simulating gradient fractions


In [294]:
%%bash -s "$workDir" "$SIPSimExe"

cd $1

$2 fractions genome10_comm_n3.txt > genome10_comm_n3_fracs.txt

In [295]:
%%R -i workDir

infile = paste(c(workDir, 'genome10_comm_n3_fracs.txt'), collapse='/')

tbl = read.csv(infile, sep='\t')

In [296]:
%%R -w 700 

#tbl$taxon_name = reorder(tbl$taxon_name, tbl$param_value, max)
tbl$library = as.character(tbl$library)

ggplot(tbl, aes(library, fraction_size)) +
    geom_boxplot()



In [297]:
%%R -h 300
tbl_sum = group_by(tbl, library) %>%
    summarize( n_fracs = n())

ggplot(tbl_sum, aes(library, n_fracs)) +
    geom_bar(stat='identity')


Creating OTU table & plotting simulated fragment info


In [298]:
%%bash -s "$workDir" "$SIPSimExe"

cd $1

$2 OTU_table genome10_ampFragGC.txt genome10_comm_n3.txt \
    genome10_comm_n3_incorp.txt genome10_comm_n3_fracs.txt \
    --abs_abund 1e4 --log genome10_OTU_abnd1e4_log.txt \
    > genome10_OTU_abnd1e4.txt


Processing library: "1"
  Processing taxon: "Bacillus_cereus_F837_76"
    N-fragments:   1130
    Time elapsed:  0.0 sec
  Processing taxon: "Roseobacter_litoralis_Och_149"
    N-fragments:   1098
    Time elapsed:  0.0 sec
  Processing taxon: "Xanthomonas_axonopodis_Xac29-1"
    N-fragments:   1068
    Time elapsed:  0.0 sec
  Processing taxon: "Escherichia_coli_APEC_O78"
    N-fragments:   1039
    Time elapsed:  0.0 sec
  Processing taxon: "Leuconostoc_citreum_KM20"
    N-fragments:   1010
    Time elapsed:  0.0 sec
  Processing taxon: "Cyanobium_gracile_PCC_6307"
    N-fragments:   983
    Time elapsed:  0.0 sec
  Processing taxon: "Geobacter_lovleyi_SZ"
    N-fragments:   955
    Time elapsed:  0.0 sec
  Processing taxon: "Nitrosomonas_europaea_ATCC_19718"
    N-fragments:   929
    Time elapsed:  0.0 sec
  Processing taxon: "Micrococcus_luteus_NCTC_2665"
    N-fragments:   904
    Time elapsed:  0.0 sec
  Processing taxon: "Idiomarina_loihiensis_GSL_199"
    N-fragments:   879
    Time elapsed:  0.0 sec
Processing library: "2"
  Processing taxon: "Bacillus_cereus_F837_76"
    N-fragments:   518
    Time elapsed:  0.0 sec
  Processing taxon: "Roseobacter_litoralis_Och_149"
    N-fragments:   1713
    Time elapsed:  0.0 sec
  Processing taxon: "Xanthomonas_axonopodis_Xac29-1"
    N-fragments:   1150
    Time elapsed:  0.0 sec
  Processing taxon: "Escherichia_coli_APEC_O78"
    N-fragments:   425
    Time elapsed:  0.0 sec
  Processing taxon: "Leuconostoc_citreum_KM20"
    N-fragments:   1404
    Time elapsed:  0.0 sec
  Processing taxon: "Cyanobium_gracile_PCC_6307"
    N-fragments:   633
    Time elapsed:  0.0 sec
  Processing taxon: "Geobacter_lovleyi_SZ"
    N-fragments:   2090
    Time elapsed:  0.0 sec
  Processing taxon: "Nitrosomonas_europaea_ATCC_19718"
    N-fragments:   772
    Time elapsed:  0.0 sec
  Processing taxon: "Micrococcus_luteus_NCTC_2665"
    N-fragments:   942
    Time elapsed:  0.0 sec
  Processing taxon: "Idiomarina_loihiensis_GSL_199"
    N-fragments:   348
    Time elapsed:  0.0 sec
Processing library: "3"
  Processing taxon: "Bacillus_cereus_F837_76"
    N-fragments:   657
    Time elapsed:  0.0 sec
  Processing taxon: "Roseobacter_litoralis_Och_149"
    N-fragments:   1671
    Time elapsed:  0.0 sec
  Processing taxon: "Xanthomonas_axonopodis_Xac29-1"
    N-fragments:   452
    Time elapsed:  0.0 sec
  Processing taxon: "Escherichia_coli_APEC_O78"
    N-fragments:   545
    Time elapsed:  0.0 sec
  Processing taxon: "Leuconostoc_citreum_KM20"
    N-fragments:   954
    Time elapsed:  0.0 sec
  Processing taxon: "Cyanobium_gracile_PCC_6307"
    N-fragments:   792
    Time elapsed:  0.0 sec
  Processing taxon: "Geobacter_lovleyi_SZ"
    N-fragments:   375
    Time elapsed:  0.0 sec
  Processing taxon: "Nitrosomonas_europaea_ATCC_19718"
    N-fragments:   1150
    Time elapsed:  0.0 sec
  Processing taxon: "Micrococcus_luteus_NCTC_2665"
    N-fragments:   1386
    Time elapsed:  0.0 sec
  Processing taxon: "Idiomarina_loihiensis_GSL_199"
    N-fragments:   2014
    Time elapsed:  0.0 sec
Fragment info written to: genome10_OTU_abnd1e4_log.txt

Plotting out fragment info


In [299]:
%%R -i workDir

# loading file
infile = paste(c(workDir, 'genome10_OTU_abnd1e4_log.txt'), collapse='/')
tbl = read.csv(infile, sep='\t')

# reformat
tbl$taxon = gsub("(.*?_.*?)_(.+)", "\\1\n\\2", tbl$taxon)
tbl$fragment_length = tbl$fragment_length / 1000

In [300]:
%%R -w 600 -h 1000
# plot

ggplot(tbl, aes(fragment_length, fragment_GC)) +
    geom_density2d() +
    facet_grid(taxon ~ library) +
    labs(x="fragment length [kb]", y="fragment G+C") +
    theme(
        text = element_text(size=18),
        axis.text.x = element_text(angle=90)
    )



In [301]:
%%R -w 1000 -h 700
# plot

tbl.l1 = tbl %>% filter(library == 1)

ggplot(tbl.l1, aes(fragment_length, fragment_GC)) +
    geom_density2d() +
    facet_wrap( ~ taxon) +
    labs(x="fragment length [kb]", y="fragment G+C") +
    theme(
        text = element_text(size=18),
        axis.text.x = element_text(angle=90)
    )


Creating community of larger total abundance


In [302]:
%%bash -s "$workDir" "$SIPSimExe"

cd $1

$2 OTU_table genome10_shotFragGC.txt genome10_comm_n3.txt \
    genome10_comm_n3_incorp.txt genome10_comm_n3_fracs.txt \
    --abs_abund 1e5 > genome10_OTU_abnd1e6.txt


Processing library: "1"
  Processing taxon: "Bacillus_cereus_F837_76"
    N-fragments:   11300
    Time elapsed:  0.2 sec
  Processing taxon: "Roseobacter_litoralis_Och_149"
    N-fragments:   10989
    Time elapsed:  0.1 sec
  Processing taxon: "Xanthomonas_axonopodis_Xac29-1"
    N-fragments:   10687
    Time elapsed:  0.1 sec
  Processing taxon: "Escherichia_coli_APEC_O78"
    N-fragments:   10393
    Time elapsed:  0.1 sec
  Processing taxon: "Leuconostoc_citreum_KM20"
    N-fragments:   10107
    Time elapsed:  0.1 sec
  Processing taxon: "Cyanobium_gracile_PCC_6307"
    N-fragments:   9830
    Time elapsed:  0.1 sec
  Processing taxon: "Geobacter_lovleyi_SZ"
    N-fragments:   9559
    Time elapsed:  0.1 sec
  Processing taxon: "Nitrosomonas_europaea_ATCC_19718"
    N-fragments:   9296
    Time elapsed:  0.1 sec
  Processing taxon: "Micrococcus_luteus_NCTC_2665"
    N-fragments:   9041
    Time elapsed:  0.1 sec
  Processing taxon: "Idiomarina_loihiensis_GSL_199"
    N-fragments:   8792
    Time elapsed:  0.1 sec
Processing library: "2"
  Processing taxon: "Bacillus_cereus_F837_76"
    N-fragments:   5188
    Time elapsed:  0.1 sec
  Processing taxon: "Roseobacter_litoralis_Och_149"
    N-fragments:   17134
    Time elapsed:  0.2 sec
  Processing taxon: "Xanthomonas_axonopodis_Xac29-1"
    N-fragments:   11505
    Time elapsed:  0.1 sec
  Processing taxon: "Escherichia_coli_APEC_O78"
    N-fragments:   4251
    Time elapsed:  0.1 sec
  Processing taxon: "Leuconostoc_citreum_KM20"
    N-fragments:   14040
    Time elapsed:  0.2 sec
  Processing taxon: "Cyanobium_gracile_PCC_6307"
    N-fragments:   6331
    Time elapsed:  0.1 sec
  Processing taxon: "Geobacter_lovleyi_SZ"
    N-fragments:   20909
    Time elapsed:  0.3 sec
  Processing taxon: "Nitrosomonas_europaea_ATCC_19718"
    N-fragments:   7726
    Time elapsed:  0.1 sec
  Processing taxon: "Micrococcus_luteus_NCTC_2665"
    N-fragments:   9428
    Time elapsed:  0.1 sec
  Processing taxon: "Idiomarina_loihiensis_GSL_199"
    N-fragments:   3483
    Time elapsed:  0.0 sec
Processing library: "3"
  Processing taxon: "Bacillus_cereus_F837_76"
    N-fragments:   6571
    Time elapsed:  0.1 sec
  Processing taxon: "Roseobacter_litoralis_Och_149"
    N-fragments:   16714
    Time elapsed:  0.2 sec
  Processing taxon: "Xanthomonas_axonopodis_Xac29-1"
    N-fragments:   4523
    Time elapsed:  0.1 sec
  Processing taxon: "Escherichia_coli_APEC_O78"
    N-fragments:   5452
    Time elapsed:  0.1 sec
  Processing taxon: "Leuconostoc_citreum_KM20"
    N-fragments:   9546
    Time elapsed:  0.1 sec
  Processing taxon: "Cyanobium_gracile_PCC_6307"
    N-fragments:   7920
    Time elapsed:  0.1 sec
  Processing taxon: "Geobacter_lovleyi_SZ"
    N-fragments:   3753
    Time elapsed:  0.1 sec
  Processing taxon: "Nitrosomonas_europaea_ATCC_19718"
    N-fragments:   11505
    Time elapsed:  0.2 sec
  Processing taxon: "Micrococcus_luteus_NCTC_2665"
    N-fragments:   13867
    Time elapsed:  0.2 sec
  Processing taxon: "Idiomarina_loihiensis_GSL_199"
    N-fragments:   20145
    Time elapsed:  0.3 sec

Plotting out values


In [303]:
%%R -i workDir

# loading file
infile = paste(c(workDir, 'genome10_OTU_abnd1e6.txt'), collapse='/')
tbl = read.csv(infile, sep='\t')

In [304]:
%%R

# formatting table
tbl$BD_min = gsub('-.+', '', tbl$fractions)
tbl$BD_min = as.numeric(tbl$BD_min)
tbl$BD_max = gsub('.+-', '', tbl$fractions)
tbl$BD_max = as.numeric(tbl$BD_max)

In [305]:
%%R

# summarizing counts (should be approx. total abundance)
tbl %>%
    group_by(library) %>%
    summarize(sum(count))


Source: local data frame [3 x 2]

  library sum(count)
1       1      99994
2       2      99995
3       3      99996

In [306]:
%%R -w 800
# plotting absolute abundances

## BD for G+C of 0 or 100
BD.GCp0 = 0 * 0.098 + 1.66
BD.GCp100 = 1 * 0.098 + 1.66

## plot
ggplot(tbl, aes(BD_min, count, fill=taxon, group=taxon)) +
    geom_area(stat='identity', alpha=0.5, position='dodge') +
    geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
    facet_grid(library ~ .) +
    labs(x='Buoyant density') +
    theme( text = element_text(size=16) )



In [307]:
%%R -w 800

# plotting relative abundances
ggplot(tbl, aes(BD_min, count, fill=taxon, group=taxon)) +
    geom_area(stat='identity', alpha=0.8, position='fill') +
    geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
    facet_grid(library ~ .) +
    labs(x='Buoyant density') +
    theme( text = element_text(size=16) )


Notes:

  • Limited 'noise'; taxa not present in (nearly) all fractions as seen empirically
    • This could be caused by:
      • Too low of total abundance?
      • Smaller fragments present, which have high diffusion

Simulating community with some smaller fragments (no min. size limit)


In [359]:
!cd $workDir; \
    $SIPSimExe fragGC genomes/genomes10.txt \
    --fp ./genomes/ \
    --fr 515Fm-927Rm.fna \
    --fld skewed-normal,9000,2500,-5  \
    --flr 500,None \
    --np 10 \
    > genome10_ampFragGC_difSkew.txt


Processing: "Roseobacter_litoralis_Och_149"
Processing: "Idiomarina_loihiensis_GSL_199"
Processing: "Bacillus_cereus_F837_76"
Processing: "Micrococcus_luteus_NCTC_2665"
Processing: "Geobacter_lovleyi_SZ"
Processing: "Cyanobium_gracile_PCC_6307"
Processing: "Leuconostoc_citreum_KM20"
Processing: "Escherichia_coli_APEC_O78"
Processing: "Xanthomonas_axonopodis_Xac29-1"
Processing: "Nitrosomonas_europaea_ATCC_19718"

Plotting fragment sizes


In [360]:
%%R -i workDir

setwd(workDir)
infile = 'genome10_ampFragGC_difSkew.txt'

tbl = read.delim(infile, sep='\t')

tbl$frag_len = abs(tbl$fragEnd - tbl$fragStart) + 1

In [361]:
%%R -h 350 -w 650

# stats
print(length(tbl[tbl$frag_len < 4000, 'frag_len']) / nrow(tbl))

# plot
ggplot(tbl, aes(frag_len)) +
    geom_histogram(binwidth=100) +
    geom_vline(xintercept=4000, linetype='dashed', alpha=0.5) +
    theme(
        text = element_text(size=18)
    )


[1] 0.04442

Simulating community


In [336]:
%%bash -s "$workDir" "$SIPSimExe"

cd $1

$2 OTU_table genome10_ampFragGC_difSkew.txt genome10_comm_n3.txt \
    genome10_comm_n3_incorp.txt genome10_comm_n3_fracs.txt \
    --abs_abund 1e8 > genome10_OTU_abnd1e8.txt


Processing library: "1"
  Processing taxon: "Bacillus_cereus_F837_76"
    N-fragments:   11300354
    Time elapsed:  150.0 sec
  Processing taxon: "Roseobacter_litoralis_Och_149"
    N-fragments:   10989669
    Time elapsed:  144.4 sec
  Processing taxon: "Xanthomonas_axonopodis_Xac29-1"
    N-fragments:   10687525
    Time elapsed:  141.2 sec
  Processing taxon: "Escherichia_coli_APEC_O78"
    N-fragments:   10393689
    Time elapsed:  136.2 sec
  Processing taxon: "Leuconostoc_citreum_KM20"
    N-fragments:   10107931
    Time elapsed:  131.4 sec
  Processing taxon: "Cyanobium_gracile_PCC_6307"
    N-fragments:   9830030
    Time elapsed:  128.2 sec
  Processing taxon: "Geobacter_lovleyi_SZ"
    N-fragments:   9559769
    Time elapsed:  125.2 sec
  Processing taxon: "Nitrosomonas_europaea_ATCC_19718"
    N-fragments:   9296938
    Time elapsed:  121.2 sec
  Processing taxon: "Micrococcus_luteus_NCTC_2665"
    N-fragments:   9041334
    Time elapsed:  119.6 sec
  Processing taxon: "Idiomarina_loihiensis_GSL_199"
    N-fragments:   8792757
    Time elapsed:  114.7 sec
Processing library: "2"
  Processing taxon: "Bacillus_cereus_F837_76"
    N-fragments:   5188078
    Time elapsed:  67.9 sec
  Processing taxon: "Roseobacter_litoralis_Och_149"
    N-fragments:   17134534
    Time elapsed:  225.9 sec
  Processing taxon: "Xanthomonas_axonopodis_Xac29-1"
    N-fragments:   11505806
    Time elapsed:  150.2 sec
  Processing taxon: "Escherichia_coli_APEC_O78"
    N-fragments:   4251370
    Time elapsed:  54.9 sec
  Processing taxon: "Leuconostoc_citreum_KM20"
    N-fragments:   14040891
    Time elapsed:  185.1 sec
  Processing taxon: "Cyanobium_gracile_PCC_6307"
    N-fragments:   6331173
    Time elapsed:  83.1 sec
  Processing taxon: "Geobacter_lovleyi_SZ"
    N-fragments:   20909802
    Time elapsed:  276.7 sec
  Processing taxon: "Nitrosomonas_europaea_ATCC_19718"
    N-fragments:   7726126
    Time elapsed:  101.0 sec
  Processing taxon: "Micrococcus_luteus_NCTC_2665"
    N-fragments:   9428431
    Time elapsed:  123.7 sec
  Processing taxon: "Idiomarina_loihiensis_GSL_199"
    N-fragments:   3483784
    Time elapsed:  45.4 sec
Processing library: "3"
  Processing taxon: "Bacillus_cereus_F837_76"
    N-fragments:   6571355
    Time elapsed:  94.9 sec
  Processing taxon: "Roseobacter_litoralis_Och_149"
    N-fragments:   16714389
    Time elapsed:  242.0 sec
  Processing taxon: "Xanthomonas_axonopodis_Xac29-1"
    N-fragments:   4523563
    Time elapsed:  65.6 sec
  Processing taxon: "Escherichia_coli_APEC_O78"
    N-fragments:   5452150
    Time elapsed:  79.1 sec
  Processing taxon: "Leuconostoc_citreum_KM20"
    N-fragments:   9546170
    Time elapsed:  138.8 sec
  Processing taxon: "Cyanobium_gracile_PCC_6307"
    N-fragments:   7920307
    Time elapsed:  114.1 sec
  Processing taxon: "Geobacter_lovleyi_SZ"
    N-fragments:   3753130
    Time elapsed:  54.0 sec
  Processing taxon: "Nitrosomonas_europaea_ATCC_19718"
    N-fragments:   11505785
    Time elapsed:  168.2 sec
  Processing taxon: "Micrococcus_luteus_NCTC_2665"
    N-fragments:   13867666
    Time elapsed:  199.5 sec
  Processing taxon: "Idiomarina_loihiensis_GSL_199"
    N-fragments:   20145479
    Time elapsed:  292.6 sec

Plotting out distributions


In [342]:
%%R -i workDir

# loading file
infile = paste(c(workDir, 'genome10_OTU_abnd1e8.txt'), collapse='/')
tbl = read.csv(infile, sep='\t')

In [343]:
%%R

# formatting table
tbl$BD_min = gsub('-.+', '', tbl$fractions)
tbl$BD_min = as.numeric(tbl$BD_min)
tbl$BD_max = gsub('.+-', '', tbl$fractions)
tbl$BD_max = as.numeric(tbl$BD_max)

In [344]:
%%R

# summarizing counts (should be approx. total abundance)
tbl %>%
    group_by(library) %>%
    summarize(sum(count))


Source: local data frame [3 x 2]

  library sum(count)
1       1   99999996
2       2   99999995
3       3   99999994

In [345]:
%%R -w 800
# plotting absolute abundances

## BD for G+C of 0 or 100
BD.GCp0 = 0 * 0.098 + 1.66
BD.GCp100 = 1 * 0.098 + 1.66

## plot
ggplot(tbl, aes(BD_min, count, fill=taxon, group=taxon)) +
    geom_area(stat='identity', alpha=0.5, position='dodge') +
    geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
    facet_grid(library ~ .) +
    labs(x='Buoyant density') +
    theme( text = element_text(size=16) )



In [346]:
%%R -w 800

# plotting relative abundances
ggplot(tbl, aes(BD_min, count, fill=taxon, group=taxon)) +
    geom_area(stat='identity', alpha=0.8, position='fill') +
    geom_vline(xintercept=c(BD.GCp0, BD.GCp100), linetype='dashed', alpha=0.5) +
    facet_grid(library ~ .) +
    labs(x='Buoyant density') +
    theme( text = element_text(size=16) )







In [ ]:


In [ ]:


In [367]:
%%R -i workDir

infile = paste(c(workDir, 'genome10_OTU_abnd1e6.txt'), collapse='/')

tbl = read.csv(infile, sep='\t', row.names=1)
tbl$taxon_name = rownames(tbl)

In [368]:
%%R
tbl.m = melt(tbl, id.var=c('taxon_name'))
colnames(tbl.m) = c('taxon_name', 'variable', 'abundance')

tbl.m$lib = gsub('X|\\..+', '', tbl.m$variable)
tbl.m$BD_min = gsub('X[0-9]+\\.([0-9]+\\.[0-9]+).+', '\\1', tbl.m$variable)
tbl.m$BD_min = as.numeric(tbl.m$BD_min)
tbl.m$BD_max = gsub('X[0-9]+\\.[0-9]+\\.[0-9]+\\.(.+)', '\\1', tbl.m$variable)
tbl.m$BD_max = as.numeric(tbl.m$BD_max)

In [369]:
%%R -w 800

ggplot(tbl.m, aes(BD_min, abundance, color=taxon_name, group=taxon_name)) +
    geom_point(size=1.5) +
    geom_line(alpha=0.5) +
    facet_grid(lib ~ .) +
    theme( text = element_text(size=16) )



In [370]:
%%R -w 800

ggplot(tbl.m, aes(BD_min, abundance, fill=taxon_name, group=taxon_name)) +
    geom_area(stat='identity', alpha=0.5, position='dodge') +
    facet_grid(lib ~ .) +
    labs(x='Buoyant density') +
    theme( text = element_text(size=16) )



In [373]:
%%R -w 800 -h 800

tbl.m2 = tbl.m
tbl.m2$taxon_name = gsub("_","\n", tbl.m2$taxon_name)

ggplot(tbl.m2, aes(BD_min, abundance, fill=taxon_name, group=taxon_name)) +
    geom_area(stat='identity', alpha=0.5, position='dodge') +
    facet_grid(taxon_name ~ lib) +
    labs(x='Buoyant density') +
    theme( text = element_text(size=16) )



In [372]:
%%R -w 800 -h 800

tbl.m2 = tbl.m
tbl.m2$taxon_name = gsub("_","\n", tbl.m2$taxon_name)

ggplot(tbl.m2, aes(BD_min, abundance, fill=taxon_name, group=taxon_name)) +
    geom_area(stat='identity', position='fill') +
    facet_grid(lib ~ .) +
    theme( text = element_text(size=16) )


Trying with same incorporation distributions

  • Just looking at variability via 'noise' and G+C KDE distributions

In [285]:
import os

# making config file
config = """
[library 1]
  # no incorp
  
  [[intraPopDist 1]]
  distribution = uniform
  weight = 1

    [[[start]]]

      [[[[interPopDist 1]]]]
        distribution = uniform
        start = 0
        end = 0

    [[[end]]]

      [[[[interPopDist 1]]]]
        distribution = uniform
        start = 0
        end = 0

[library 2]
  # no incorp
  
  [[intraPopDist 1]]
  distribution = uniform
  weight = 1

    [[[start]]]

      [[[[interPopDist 1]]]]
        distribution = uniform
        start = 0
        end = 0

    [[[end]]]

      [[[[interPopDist 1]]]]
        distribution = uniform
        start = 0
        end = 0

[library 3]
  # no incorp
  
  [[intraPopDist 1]]
  distribution = uniform
  weight = 1

    [[[start]]]

      [[[[interPopDist 1]]]]
        distribution = uniform
        start = 0
        end = 0

    [[[end]]]

      [[[[interPopDist 1]]]]
        distribution = uniform
        start = 0
        end = 0
"""

outfile = os.path.join(workDir, 'genome10_n3_noInc.config')

outf = open(outfile, 'wb')
outf.write(config)
outf.close()

In [290]:
%%bash -s "$workDir"

cd $1

../../../SIPSim isoIncorp --percTaxa 50 genome10_comm_n3.txt genome10_n3_noInc.config \
> genome10_comm_n3_noIncorp.txt

In [304]:
%%bash -s "$workDir"
%time

cd $1

../../../SIPSim OTU_table \
genome10_shotFragGC.txt genome10_comm_n3.txt \
genome10_comm_n3_noIncorp.txt genome10_comm_n3_fracs.txt \
--abs_abund 1e6 > genome10_OTU_noInc_abnd1e6.txt


bash: line 1: fg: no job control
Processing library: "1"
  Processing taxon: "Escherichia_coli_APEC_O78"
  Processing taxon: "Xanthomonas_axonopodis_Xac29-1"
  Processing taxon: "Micrococcus_luteus_NCTC_2665"
  Processing taxon: "Idiomarina_loihiensis_GSL_199"
  Processing taxon: "Leuconostoc_citreum_KM20"
  Processing taxon: "Geobacter_lovleyi_SZ"
  Processing taxon: "Bacillus_cereus_F837_76"
  Processing taxon: "Nitrosomonas_europaea_ATCC_19718"
  Processing taxon: "Roseobacter_litoralis_Och_149"
  Processing taxon: "Cyanobium_gracile_PCC_6307"
Processing library: "2"
  Processing taxon: "Idiomarina_loihiensis_GSL_199"
  Processing taxon: "Micrococcus_luteus_NCTC_2665"
  Processing taxon: "Geobacter_lovleyi_SZ"
  Processing taxon: "Roseobacter_litoralis_Och_149"
  Processing taxon: "Nitrosomonas_europaea_ATCC_19718"
  Processing taxon: "Cyanobium_gracile_PCC_6307"
  Processing taxon: "Bacillus_cereus_F837_76"
  Processing taxon: "Escherichia_coli_APEC_O78"
  Processing taxon: "Leuconostoc_citreum_KM20"
  Processing taxon: "Xanthomonas_axonopodis_Xac29-1"
Processing library: "3"
  Processing taxon: "Cyanobium_gracile_PCC_6307"
  Processing taxon: "Nitrosomonas_europaea_ATCC_19718"
  Processing taxon: "Idiomarina_loihiensis_GSL_199"
  Processing taxon: "Micrococcus_luteus_NCTC_2665"
  Processing taxon: "Escherichia_coli_APEC_O78"
  Processing taxon: "Leuconostoc_citreum_KM20"
  Processing taxon: "Roseobacter_litoralis_Och_149"
  Processing taxon: "Bacillus_cereus_F837_76"
  Processing taxon: "Geobacter_lovleyi_SZ"
  Processing taxon: "Xanthomonas_axonopodis_Xac29-1"

In [347]:
%%R -i workDir

infile = paste(c(workDir, 'genome10_OTU_noInc_abnd1e6.txt'), collapse='/')

tbl = read.csv(infile, sep='\t', row.names=1)
tbl$taxon_name = rownames(tbl)

# editing table
tbl.m = melt(tbl, id.var=c('taxon_name'))
colnames(tbl.m) = c('taxon_name', 'variable', 'abundance')

tbl.m$lib = gsub('X|\\..+', '', tbl.m$variable)
tbl.m$BD_min = gsub('X[0-9]+\\.([0-9]+\\.[0-9]+).+', '\\1', tbl.m$variable)
tbl.m$BD_min = as.numeric(tbl.m$BD_min)
tbl.m$BD_max = gsub('X[0-9]+\\.[0-9]+\\.[0-9]+\\.(.+)', '\\1', tbl.m$variable)
tbl.m$BD_max = as.numeric(tbl.m$BD_max)

In [351]:
%%R -w 800

ggplot(tbl.m, aes(BD_min, abundance, fill=taxon_name, group=taxon_name)) +
    geom_area(stat='identity', alpha=0.5, position='dodge') +
    facet_grid(lib ~ .) +
    labs(title='No isotope incorporation', x='Buoyant density') +
    theme( text = element_text(size=16) )



In [352]:
%%R -w 800

ggplot(tbl.m, aes(BD_min, abundance, fill=taxon_name, group=taxon_name)) +
    geom_area(stat='identity', position='fill') +
    facet_grid(lib ~ .) +
    labs(title='No isotope incorporation', x='Buoyant density') +
    theme( text = element_text(size=16) )


Trying with same normal distributions


In [308]:
import os

# making config file
config = """
[library 1]
  # normal distribution
  
  [[intraPopDist 1]]
  distribution = normal
  weight = 1

    [[[mu]]]

      [[[[interPopDist 1]]]]
        distribution = normal
        mu = 90
        sigma = 2

    [[[sigma]]]

      [[[[interPopDist 1]]]]
        distribution = normal
        mu = 2
        sigma = 0.1

[library 2]
  # normal distribution
  
  [[intraPopDist 1]]
  distribution = normal
  weight = 1

    [[[mu]]]

      [[[[interPopDist 1]]]]
        distribution = normal
        mu = 90
        sigma = 2

    [[[sigma]]]

      [[[[interPopDist 1]]]]
        distribution = normal
        mu = 2
        sigma = 0.1

[library 3]
  # normal distribution
  
  [[intraPopDist 1]]
  distribution = normal
  weight = 1

    [[[mu]]]

      [[[[interPopDist 1]]]]
        distribution = normal
        mu = 90
        sigma = 2

    [[[sigma]]]

      [[[[interPopDist 1]]]]
        distribution = normal
        mu = 2
        sigma = 0.1
"""

outfile = os.path.join(workDir, 'genome10_n3_norm.config')

outf = open(outfile, 'wb')
outf.write(config)
outf.close()

In [309]:
%%bash -s "$workDir"

cd $1


../../../SIPSim isoIncorp --percTaxa 100 \
genome10_comm_n3.txt genome10_n3_norm.config \
> genome10_comm_n3_normIncorp.txt

In [310]:
%%R -i workDir

infile = paste(c(workDir, 'genome10_comm_n3_normIncorp.txt'), collapse='/')

tbl = read.csv(infile, sep='\t')

In [311]:
%%R -w 700 

tbl$taxon_name = reorder(tbl$taxon_name, tbl$param_value, max)

ggplot(tbl, aes(taxon_name, param_value, color=param)) +
    geom_point() +
    facet_grid(param ~ library, scales='free_y') +
    theme(
        text = element_text(size=16),
        axis.text.x = element_text(angle=90)
    )



In [312]:
%%bash -s "$workDir"

cd $1

../../../SIPSim OTU_table \
genome10_shotFragGC.txt genome10_comm_n3.txt \
genome10_comm_n3_normIncorp.txt genome10_comm_n3_fracs.txt \
--abs_abund 1e4 > genome10_OTU_normInc_abnd1e4.txt


Processing library: "1"
  Processing taxon: "Escherichia_coli_APEC_O78"
  Processing taxon: "Xanthomonas_axonopodis_Xac29-1"
  Processing taxon: "Micrococcus_luteus_NCTC_2665"
  Processing taxon: "Idiomarina_loihiensis_GSL_199"
  Processing taxon: "Leuconostoc_citreum_KM20"
  Processing taxon: "Geobacter_lovleyi_SZ"
  Processing taxon: "Bacillus_cereus_F837_76"
  Processing taxon: "Nitrosomonas_europaea_ATCC_19718"
  Processing taxon: "Roseobacter_litoralis_Och_149"
  Processing taxon: "Cyanobium_gracile_PCC_6307"
Processing library: "2"
  Processing taxon: "Idiomarina_loihiensis_GSL_199"
  Processing taxon: "Micrococcus_luteus_NCTC_2665"
  Processing taxon: "Geobacter_lovleyi_SZ"
  Processing taxon: "Roseobacter_litoralis_Och_149"
  Processing taxon: "Nitrosomonas_europaea_ATCC_19718"
  Processing taxon: "Cyanobium_gracile_PCC_6307"
  Processing taxon: "Bacillus_cereus_F837_76"
  Processing taxon: "Escherichia_coli_APEC_O78"
  Processing taxon: "Leuconostoc_citreum_KM20"
  Processing taxon: "Xanthomonas_axonopodis_Xac29-1"
Processing library: "3"
  Processing taxon: "Cyanobium_gracile_PCC_6307"
  Processing taxon: "Nitrosomonas_europaea_ATCC_19718"
  Processing taxon: "Idiomarina_loihiensis_GSL_199"
  Processing taxon: "Micrococcus_luteus_NCTC_2665"
  Processing taxon: "Escherichia_coli_APEC_O78"
  Processing taxon: "Leuconostoc_citreum_KM20"
  Processing taxon: "Roseobacter_litoralis_Och_149"
  Processing taxon: "Bacillus_cereus_F837_76"
  Processing taxon: "Geobacter_lovleyi_SZ"
  Processing taxon: "Xanthomonas_axonopodis_Xac29-1"

In [313]:
%%R -i workDir

infile = paste(c(workDir, 'genome10_OTU_normInc_abnd1e4.txt'), collapse='/')

tbl = read.csv(infile, sep='\t', row.names=1)
tbl$taxon_name = rownames(tbl)

# editing table
tbl.m = melt(tbl, id.var=c('taxon_name'))
colnames(tbl.m) = c('taxon_name', 'variable', 'abundance')

tbl.m$lib = gsub('X|\\..+', '', tbl.m$variable)
tbl.m$BD_min = gsub('X[0-9]+\\.([0-9]+\\.[0-9]+).+', '\\1', tbl.m$variable)
tbl.m$BD_min = as.numeric(tbl.m$BD_min)
tbl.m$BD_max = gsub('X[0-9]+\\.[0-9]+\\.[0-9]+\\.(.+)', '\\1', tbl.m$variable)
tbl.m$BD_max = as.numeric(tbl.m$BD_max)

In [314]:
%%R -w 1000

ggplot(tbl.m, aes(BD_min, abundance, fill=taxon_name, group=taxon_name)) +
    #geom_area(stat='identity', alpha=0.5, position='dodge') +
    geom_point(aes(color=taxon_name)) +
    geom_line(aes(color=taxon_name)) +
    facet_grid(lib ~ .) +
    theme( text = element_text(size=16) )



In [315]:
%%R -w 600 -h 900

tbl.m2 = tbl.m
tbl.m2$taxon_name = gsub("_","\n", tbl.m2$taxon_name)

ggplot(tbl.m2, aes(BD_min, abundance, fill=taxon_name, group=taxon_name)) +
    geom_area(stat='identity', alpha=0.5, position='dodge') +
    facet_grid(taxon_name ~ lib) +
    theme( text = element_text(size=16),
           legend.position = 'None'
        )



In [ ]:
%%R -w 1000

ggplot(tbl.m, aes(BD_min, abundance, fill=taxon_name, group=taxon_name)) +
    geom_area(stat='identity', alpha=0.5, position='fill') +
    facet_grid(lib ~ .) +
    theme( text = element_text(size=16) )








Sandbox

Grinder testing

  • required options:
    • -rf
      • edit to use individual genome fasta files?
  • individual fasta file input:
    • need to edit database
    • 2950sub database_create
    • 1800sub community_read_abundances
    • 1471 $self->{c_structs} = $self->community_structures

Distributions in python


In [241]:
from scipy import linspace
from scipy import pi,sqrt,exp
from scipy.special import erf

from pylab import plot,show

def pdf(x):
    return 1/sqrt(2*pi) * exp(-x**2/2)

def cdf(x):
    return (1 + erf(x/sqrt(2))) / 2

def skew(x,e=0,w=1,a=0):
    t = (x-e) / w
    return 2 / w * pdf(t) * cdf(a*t)
    # You can of course use the scipy.stats.norm versions
    # return 2 * norm.pdf(t) * norm.cdf(a*t)


n = 2**10

e = 1.0 # location
w = 10.0 # scale

x = linspace(-30,10,n) 

for a in range(-5,0):
    p = skew(x,e,w,a)
    plot(x,p)

show()



In [253]:
import scipy.stats as ss

In [259]:
print ss.norm.stats(moments='mvsk')


(array(0.0), array(1.0), array(0.0), array(0.0))

In [144]:
import scipy
a, b = 0.1, 2.0
tn = scipy.stats.truncnorm(a,b)

In [164]:
from ggplot import *
import pandas as pd

In [263]:
import pymc

In [274]:
help(pymc.distributions.rskew_normal)


Help on function rskew_normal in module pymc.distributions:

rskew_normal(mu, tau, alpha, size=())
    Skew-normal random variates.


In [304]:
df = pd.DataFrame(pymc.distributions.rskew_normal(11000,10000,-100,size=10000)**10, columns=['x'])

In [328]:
import math

In [ ]:


In [358]:
df = pd.DataFrame(pymc.distributions.rnormal(math.log(10),math.log(10),size=10000)**2, columns=['x'])

In [369]:
df = pd.DataFrame(np.random.triangular(-3,7,8,10000), columns=['x'])

In [370]:
ggplot(aes(x='x'), data=df) + \
    geom_histogram()


Out[370]:
<ggplot: (8737283685749)>

skewed normal dist


In [250]:
import scipy.stats as ss
class skew_norm_gen(ss.rv_continuous):
    def pdf(self, x, s):
        return 2 * ss.norm.pdf(x) * ss.norm.cdf(x * s)
skew_norm = skew_norm_gen(name='skew_norm', shapes='s')

skew_norm.pdf(np.array([1,2,3]), 4)


Out[250]:
array([ 0.48392612,  0.10798193,  0.0088637 ])

In [243]:


In [117]:
a = set([1,2,3])
b = set([2])

a - b


Out[117]:
{1, 3}

In [125]:
print tree.all_intervals


set([Interval(1, 10, ['a', 10]), Interval(3, 6, ['c', 4]), Interval(4, 7, ['c', 4]), Interval(3, 6, ['b', 4])])

In [46]:
from intervaltree import Interval, IntervalTree

In [47]:
tree = IntervalTree()

In [126]:
tree.addi(1,10, ['a',10])
tree.addi(3,6, ['b',4])
tree.addi(3,6, ['c',4])

print tree.all_intervals


set([Interval(1, 10, ['a', 10]), Interval(3, 6, ['c', 4]), Interval(4, 7, ['c', 4]), Interval(3, 6, ['b', 4])])

In [49]:
tree.print_structure()


Node<3, balance=1>
||||:
 Interval(1, 10, ['a', 10])
 Interval(3, 6, ['b', 4])
>>>>:Node<4, balance=0>
    ||||:
     Interval(4, 7, ['c', 4])


In [108]:
def calcPercOverlap(iv1, iv2):
    if not iv1.overlaps(iv2):
        return 0.0
        
    tmpTree = IntervalTree()
    tmpTree.addi(iv1.begin, iv1.end, iv1.data)
    tmpTree.addi(iv2.begin, iv2.end, iv2.data)

    tmpTree.split_overlaps()
    
    if len(tmpTree) == 1:
        return 100.0
    
    for iv in tmpTree.iter():
        overlaps = tmpTree.search(iv.begin, iv.end)
        if len(overlaps) == 2:
            ivo = overlaps.pop()
            return float(ivo.end - ivo.begin) / ivo.data[1] * 100        
        elif len(overlaps) > 2:
            raise ValueError()

In [109]:
for iv in tree.iter():
    overlaps = tree.search(iv.begin, iv.end)
    for iv2 in overlaps:
        print calcPercOverlap(iv1, iv2)


100.0
75.0
30.0
30.0
75.0
100.0
30.0
75.0
100.0

In [50]:
iv1 = [x for x in tree.all_intervals][0]
iv2 = [x for x in tree.all_intervals][1]

In [72]:
tmpTree = IntervalTree()
tmpTree.addi(iv1.begin, iv1.end, iv1.data)
tmpTree.addi(iv2.begin, iv2.end, iv2.data)

tmpTree.split_overlaps()

for iv in tmpTree.iter():
    overlaps = tmpTree.search(iv.begin, iv.end)
    if len(overlaps) == 2:
        ivo = overlaps.pop()
        print float(ivo.end - ivo.begin) / ivo.data[1]        
    elif len(overlaps) > 2:
        raise ValueError()


0.75
0.75

In [70]:
[[x.begin,x.end,x.data] for x in overlaps]


Out[70]:
[[4, 7, ['c', 4]], [4, 7, ['a', 10]]]

In [ ]:


In [73]:
import itertools

In [77]:
for x in itertools.product(range(3), range(3)):
    print x


(0, 0)
(0, 1)
(0, 2)
(1, 0)
(1, 1)
(1, 2)
(2, 0)
(2, 1)
(2, 2)

In [79]:
def calcPercOverlap(iv1, iv2):
    tmpTree = IntervalTree()
    tmpTree.addi(iv1.begin, iv1.end, iv1.data)
    tmpTree.addi(iv2.begin, iv2.end, iv2.data)

    tmpTree.split_overlaps()
    if

    for iv in tmpTree.iter():
        overlaps = tmpTree.search(iv.begin, iv.end)
        if len(overlaps) == 2:
            ivo = overlaps.pop()
            print float(ivo.end - ivo.begin) / ivo.data[1]        
        elif len(overlaps) > 2:
            raise ValueError()

In [87]:
for ints in itertools.product(tree.all_intervals, tree.all_intervals):
    print '---'
    if ints[0].overlaps(ints[1]):
        #tmpTree = IntervalTree()
        #tmpTree.addi(ints[0].begin, ints[0].end, ints[0].data)
        #tmpTree.addi(ints[1].begin, ints[1].end, ints[1].data)        
        #print float(ivo.end - ivo.begin) / ivo.data[1]   
        calcPercOverlap(ints[0], ints[1])
    else:
        pass
#    elif len(overlaps) > 2:
#        raise ValueError()
#    else:


---
---
0.75
0.75
---
0.3
0.3
---
0.75
0.75
---
---
0.5
0.5
---
0.75
0.75
---
0.5
0.5
---

In [90]:
for ints in itertools.product(tree.all_intervals, tree.all_intervals):
    print sorted(ints, key=lambda x: x.data[1])[0]


Interval(1, 10, ['a', 10])
Interval(4, 7, ['c', 4])
Interval(3, 6, ['b', 4])
Interval(4, 7, ['c', 4])
Interval(4, 7, ['c', 4])
Interval(4, 7, ['c', 4])
Interval(3, 6, ['b', 4])
Interval(3, 6, ['b', 4])
Interval(3, 6, ['b', 4])