Description:

dev dataset
just using 10 random bacteria genomes
dataset copied from wPDF_py

Workflow:

Fragment GC distributions
Community abundance and incorporation
- Simulate communities for each gradient fraction
Simulate isotope incorp
Creating OTU tables



In [1]:

    
workDir = '/home/nick/notebook/grinderSIP/dev/PDF-PDF/bac_genome10/'
scriptDir = '/home/nick/notebook/grinderSIP/'



In [2]:

    
%load_ext rpy2.ipython









    



/opt/anaconda/lib/python2.7/site-packages/pytz/__init__.py:29: UserWarning: Module argparse was already imported from /opt/anaconda/lib/python2.7/argparse.pyc, but /opt/anaconda/lib/python2.7/site-packages is being added to sys.path
  from pkg_resources import resource_stream



In [3]:

    
%%R
library(ggplot2)
library(reshape)

Indexing genomes

@ system76-server:~/notebook/grinderSIP/dev/PDF-PDF/bac_genome10

Needed for in-silico PCR required for amplicon-fragment simulation

$ ../../../SIPSim indexGenomes genomes/genomes10.txt --fp ~/notebook/grinderSIP/dev/PDF-PDF/bac_genome10/genomes/ | less

Simulating fragments and calculating G+C

@ system76-server:~/notebook/grinderSIP/dev/PDF-PDF/bac_genome10

16S amplicons

$ ../../../SIPSim fragGC genomes/genomes10.txt --fp ~/notebook/grinderSIP/dev/PDF-PDF/bac_genome10/genomes/ \
--fr 515Fm-927Rm.fna --np 10 > genome10_ampFragGC.txt

Shotgun metagenome reads

~~~ $ ../../../SIPSim fragGC genomes/genomes10.txt --fp ~/notebook/grinderSIP/dev/PDF-PDF/bac_genome10/genomes/ \ --np 10 > genome10_shotFragGC.txt

Simulating gradient communities



In [91]:

    
@ system76-server:~/notebook/grinderSIP/dev/PDF-PDF/bac_genome10

~~~
$ ../../../SIPSim gradientComms genomes/genomes10.txt --fp ~/notebook/grinderSIP/dev/PDF-PDF/bac_genome10/genomes/ \
--pf grinder_profile --exe /home/nick/notebook/grinderSIP/bin/grinder/script/grinder > genome10_comm_n3.txt
~~~

Simulating isotope incorporation



In [267]:

    
import os

# making config file
config = """
[library 1]
  # baseline: no incorp
  
  [[intraPopDist 1]]
  distribution = uniform
  weight = 1

    [[[start]]]

      [[[[interPopDist 1]]]]
        distribution = uniform
        start = 0
        end = 0

    [[[end]]]

      [[[[interPopDist 1]]]]
        distribution = uniform
        start = 0
        end = 0

[library 2]
  # split intra-populations
  ## to get some taxa with split; use inter-pop mixture for 2nd intra-pop mu, where mixture is highly uneven
  
  [[intraPopDist 1]]
  distribution = normal
  weight = 0.5

    [[[mu]]]

      [[[[interPopDist 1]]]]
        distribution = normal
        mu = 90
        sigma = 2

    [[[sigma]]]

      [[[[interPopDist 1]]]]
        distribution = normal
        mu = 5
        sigma = 2

  [[intraPopDist 2]]
  distribution = normal
  weight = 0.5

    [[[mu]]]

      [[[[interPopDist 1]]]]
        distribution = normal
        mu = 5
        sigma = 2

    [[[sigma]]]

      [[[[interPopDist 1]]]]
        distribution = normal
        mu = 5
        sigma = 2


[library 3]
  # split inter-pop distribution (some approx. full; others none)
  
  [[intraPopDist 1]]
  distribution = normal
  weight = 1

    [[[mu]]]

      [[[[interPopDist 1]]]]
        distribution = normal
        mu = 90
        sigma = 2

      # these taxa in the community get no incorp
      [[[[interPopDist 2]]]]
        distribution = uniform
        start = 0
        end = 0

    [[[sigma]]]

      [[[[interPopDist 1]]]]
        distribution = normal
        mu = 5
        sigma = 2
        

"""

outfile = os.path.join(workDir, 'genome10_n3.config')

outf = open(outfile, 'wb')
outf.write(config)
outf.close()



In [269]:

    
%%bash -s "$workDir"

cd $1

../../../SIPSim isoIncorp genome10_comm_n3.txt genome10_n3.config\
> genome10_comm_n3_incorp.txt

Plotting in R



In [270]:

    
%%R
library(ggplot2)
library(dplyr)



In [354]:

    
%%R -i workDir

infile = paste(c(workDir, 'genome10_comm_n3_incorp.txt'), collapse='/')

tbl = read.csv(infile, sep='\t')



In [358]:

    
%%R -w 700 

tbl$taxon_name = reorder(tbl$taxon_name, tbl$param_value, max)

ggplot(tbl, aes(taxon_name, param_value, color=param)) +
    geom_point() +
    facet_grid(param ~ library, scales='free_y') +
    theme(
        text = element_text(size=16),
        axis.text.x = element_text(angle=90, hjust=1),
        axis.title.x = element_blank(),
        axis.title.y = element_blank()
    )

Simulating gradient fractions



In [273]:

    
%%bash -s "$workDir"

cd $1

../../../SIPSim fractions genome10_comm_n3.txt \
> genome10_comm_n3_fracs.txt



In [274]:

    
%%R -i workDir

infile = paste(c(workDir, 'genome10_comm_n3_fracs.txt'), collapse='/')

tbl = read.csv(infile, sep='\t')



In [275]:

    
%%R -w 700 

#tbl$taxon_name = reorder(tbl$taxon_name, tbl$param_value, max)
tbl$library = as.character(tbl$library)

ggplot(tbl, aes(library, fraction_size)) +
    geom_boxplot()



In [276]:

    
%%R -h 300
tbl_sum = group_by(tbl, library) %>%
    summarize( n_fracs = n())

ggplot(tbl_sum, aes(library, n_fracs)) +
    geom_bar(stat='identity')

Creating OTU table



In [330]:

    
%%bash -s "$workDir"
%%timeit

cd $1

../../../SIPSim OTU_table \
genome10_shotFragGC.txt genome10_comm_n3.txt \
genome10_comm_n3_incorp.txt genome10_comm_n3_fracs.txt \
--abs_abund 1e6 > genome10_OTU_abnd1e6.txt









    



bash: line 1: fg: no job control
Processing library: "1"
  Processing taxon: "Escherichia_coli_APEC_O78"
  Processing taxon: "Xanthomonas_axonopodis_Xac29-1"
  Processing taxon: "Micrococcus_luteus_NCTC_2665"
  Processing taxon: "Idiomarina_loihiensis_GSL_199"
  Processing taxon: "Leuconostoc_citreum_KM20"
  Processing taxon: "Geobacter_lovleyi_SZ"
  Processing taxon: "Bacillus_cereus_F837_76"
  Processing taxon: "Nitrosomonas_europaea_ATCC_19718"
  Processing taxon: "Roseobacter_litoralis_Och_149"
  Processing taxon: "Cyanobium_gracile_PCC_6307"
Processing library: "2"
  Processing taxon: "Idiomarina_loihiensis_GSL_199"
  Processing taxon: "Micrococcus_luteus_NCTC_2665"
  Processing taxon: "Geobacter_lovleyi_SZ"
  Processing taxon: "Roseobacter_litoralis_Och_149"
  Processing taxon: "Nitrosomonas_europaea_ATCC_19718"
  Processing taxon: "Cyanobium_gracile_PCC_6307"
  Processing taxon: "Bacillus_cereus_F837_76"
  Processing taxon: "Escherichia_coli_APEC_O78"
  Processing taxon: "Leuconostoc_citreum_KM20"
  Processing taxon: "Xanthomonas_axonopodis_Xac29-1"
Processing library: "3"
  Processing taxon: "Cyanobium_gracile_PCC_6307"
  Processing taxon: "Nitrosomonas_europaea_ATCC_19718"
  Processing taxon: "Idiomarina_loihiensis_GSL_199"
  Processing taxon: "Micrococcus_luteus_NCTC_2665"
  Processing taxon: "Escherichia_coli_APEC_O78"
  Processing taxon: "Leuconostoc_citreum_KM20"
  Processing taxon: "Roseobacter_litoralis_Och_149"
  Processing taxon: "Bacillus_cereus_F837_76"
  Processing taxon: "Geobacter_lovleyi_SZ"
  Processing taxon: "Xanthomonas_axonopodis_Xac29-1"

Plotting out values



In [367]:

    
%%R -i workDir

infile = paste(c(workDir, 'genome10_OTU_abnd1e6.txt'), collapse='/')

tbl = read.csv(infile, sep='\t', row.names=1)
tbl$taxon_name = rownames(tbl)



In [368]:

    
%%R
tbl.m = melt(tbl, id.var=c('taxon_name'))
colnames(tbl.m) = c('taxon_name', 'variable', 'abundance')

tbl.m$lib = gsub('X|\\..+', '', tbl.m$variable)
tbl.m$BD_min = gsub('X[0-9]+\\.([0-9]+\\.[0-9]+).+', '\\1', tbl.m$variable)
tbl.m$BD_min = as.numeric(tbl.m$BD_min)
tbl.m$BD_max = gsub('X[0-9]+\\.[0-9]+\\.[0-9]+\\.(.+)', '\\1', tbl.m$variable)
tbl.m$BD_max = as.numeric(tbl.m$BD_max)



In [369]:

    
%%R -w 800

ggplot(tbl.m, aes(BD_min, abundance, color=taxon_name, group=taxon_name)) +
    geom_point(size=1.5) +
    geom_line(alpha=0.5) +
    facet_grid(lib ~ .) +
    theme( text = element_text(size=16) )



In [370]:

    
%%R -w 800

ggplot(tbl.m, aes(BD_min, abundance, fill=taxon_name, group=taxon_name)) +
    geom_area(stat='identity', alpha=0.5, position='dodge') +
    facet_grid(lib ~ .) +
    labs(x='Buoyant density') +
    theme( text = element_text(size=16) )



In [373]:

    
%%R -w 800 -h 800

tbl.m2 = tbl.m
tbl.m2$taxon_name = gsub("_","\n", tbl.m2$taxon_name)

ggplot(tbl.m2, aes(BD_min, abundance, fill=taxon_name, group=taxon_name)) +
    geom_area(stat='identity', alpha=0.5, position='dodge') +
    facet_grid(taxon_name ~ lib) +
    labs(x='Buoyant density') +
    theme( text = element_text(size=16) )



In [372]:

    
%%R -w 800 -h 800

tbl.m2 = tbl.m
tbl.m2$taxon_name = gsub("_","\n", tbl.m2$taxon_name)

ggplot(tbl.m2, aes(BD_min, abundance, fill=taxon_name, group=taxon_name)) +
    geom_area(stat='identity', position='fill') +
    facet_grid(lib ~ .) +
    theme( text = element_text(size=16) )

Trying with same incorporation distributions

Just looking at variability via 'noise' and G+C KDE distributions



In [285]:

    
import os

# making config file
config = """
[library 1]
  # no incorp
  
  [[intraPopDist 1]]
  distribution = uniform
  weight = 1

    [[[start]]]

      [[[[interPopDist 1]]]]
        distribution = uniform
        start = 0
        end = 0

    [[[end]]]

      [[[[interPopDist 1]]]]
        distribution = uniform
        start = 0
        end = 0

[library 2]
  # no incorp
  
  [[intraPopDist 1]]
  distribution = uniform
  weight = 1

    [[[start]]]

      [[[[interPopDist 1]]]]
        distribution = uniform
        start = 0
        end = 0

    [[[end]]]

      [[[[interPopDist 1]]]]
        distribution = uniform
        start = 0
        end = 0

[library 3]
  # no incorp
  
  [[intraPopDist 1]]
  distribution = uniform
  weight = 1

    [[[start]]]

      [[[[interPopDist 1]]]]
        distribution = uniform
        start = 0
        end = 0

    [[[end]]]

      [[[[interPopDist 1]]]]
        distribution = uniform
        start = 0
        end = 0
"""

outfile = os.path.join(workDir, 'genome10_n3_noInc.config')

outf = open(outfile, 'wb')
outf.write(config)
outf.close()



In [290]:

    
%%bash -s "$workDir"

cd $1

../../../SIPSim isoIncorp --percTaxa 50 genome10_comm_n3.txt genome10_n3_noInc.config \
> genome10_comm_n3_noIncorp.txt



In [304]:

    
%%bash -s "$workDir"
%time

cd $1

../../../SIPSim OTU_table \
genome10_shotFragGC.txt genome10_comm_n3.txt \
genome10_comm_n3_noIncorp.txt genome10_comm_n3_fracs.txt \
--abs_abund 1e6 > genome10_OTU_noInc_abnd1e6.txt









    



bash: line 1: fg: no job control
Processing library: "1"
  Processing taxon: "Escherichia_coli_APEC_O78"
  Processing taxon: "Xanthomonas_axonopodis_Xac29-1"
  Processing taxon: "Micrococcus_luteus_NCTC_2665"
  Processing taxon: "Idiomarina_loihiensis_GSL_199"
  Processing taxon: "Leuconostoc_citreum_KM20"
  Processing taxon: "Geobacter_lovleyi_SZ"
  Processing taxon: "Bacillus_cereus_F837_76"
  Processing taxon: "Nitrosomonas_europaea_ATCC_19718"
  Processing taxon: "Roseobacter_litoralis_Och_149"
  Processing taxon: "Cyanobium_gracile_PCC_6307"
Processing library: "2"
  Processing taxon: "Idiomarina_loihiensis_GSL_199"
  Processing taxon: "Micrococcus_luteus_NCTC_2665"
  Processing taxon: "Geobacter_lovleyi_SZ"
  Processing taxon: "Roseobacter_litoralis_Och_149"
  Processing taxon: "Nitrosomonas_europaea_ATCC_19718"
  Processing taxon: "Cyanobium_gracile_PCC_6307"
  Processing taxon: "Bacillus_cereus_F837_76"
  Processing taxon: "Escherichia_coli_APEC_O78"
  Processing taxon: "Leuconostoc_citreum_KM20"
  Processing taxon: "Xanthomonas_axonopodis_Xac29-1"
Processing library: "3"
  Processing taxon: "Cyanobium_gracile_PCC_6307"
  Processing taxon: "Nitrosomonas_europaea_ATCC_19718"
  Processing taxon: "Idiomarina_loihiensis_GSL_199"
  Processing taxon: "Micrococcus_luteus_NCTC_2665"
  Processing taxon: "Escherichia_coli_APEC_O78"
  Processing taxon: "Leuconostoc_citreum_KM20"
  Processing taxon: "Roseobacter_litoralis_Och_149"
  Processing taxon: "Bacillus_cereus_F837_76"
  Processing taxon: "Geobacter_lovleyi_SZ"
  Processing taxon: "Xanthomonas_axonopodis_Xac29-1"



In [347]:

    
%%R -i workDir

infile = paste(c(workDir, 'genome10_OTU_noInc_abnd1e6.txt'), collapse='/')

tbl = read.csv(infile, sep='\t', row.names=1)
tbl$taxon_name = rownames(tbl)

# editing table
tbl.m = melt(tbl, id.var=c('taxon_name'))
colnames(tbl.m) = c('taxon_name', 'variable', 'abundance')

tbl.m$lib = gsub('X|\\..+', '', tbl.m$variable)
tbl.m$BD_min = gsub('X[0-9]+\\.([0-9]+\\.[0-9]+).+', '\\1', tbl.m$variable)
tbl.m$BD_min = as.numeric(tbl.m$BD_min)
tbl.m$BD_max = gsub('X[0-9]+\\.[0-9]+\\.[0-9]+\\.(.+)', '\\1', tbl.m$variable)
tbl.m$BD_max = as.numeric(tbl.m$BD_max)



In [351]:

    
%%R -w 800

ggplot(tbl.m, aes(BD_min, abundance, fill=taxon_name, group=taxon_name)) +
    geom_area(stat='identity', alpha=0.5, position='dodge') +
    facet_grid(lib ~ .) +
    labs(title='No isotope incorporation', x='Buoyant density') +
    theme( text = element_text(size=16) )



In [352]:

    
%%R -w 800

ggplot(tbl.m, aes(BD_min, abundance, fill=taxon_name, group=taxon_name)) +
    geom_area(stat='identity', position='fill') +
    facet_grid(lib ~ .) +
    labs(title='No isotope incorporation', x='Buoyant density') +
    theme( text = element_text(size=16) )

Trying with same normal distributions



In [308]:

    
import os

# making config file
config = """
[library 1]
  # normal distribution
  
  [[intraPopDist 1]]
  distribution = normal
  weight = 1

    [[[mu]]]

      [[[[interPopDist 1]]]]
        distribution = normal
        mu = 90
        sigma = 2

    [[[sigma]]]

      [[[[interPopDist 1]]]]
        distribution = normal
        mu = 2
        sigma = 0.1

[library 2]
  # normal distribution
  
  [[intraPopDist 1]]
  distribution = normal
  weight = 1

    [[[mu]]]

      [[[[interPopDist 1]]]]
        distribution = normal
        mu = 90
        sigma = 2

    [[[sigma]]]

      [[[[interPopDist 1]]]]
        distribution = normal
        mu = 2
        sigma = 0.1

[library 3]
  # normal distribution
  
  [[intraPopDist 1]]
  distribution = normal
  weight = 1

    [[[mu]]]

      [[[[interPopDist 1]]]]
        distribution = normal
        mu = 90
        sigma = 2

    [[[sigma]]]

      [[[[interPopDist 1]]]]
        distribution = normal
        mu = 2
        sigma = 0.1
"""

outfile = os.path.join(workDir, 'genome10_n3_norm.config')

outf = open(outfile, 'wb')
outf.write(config)
outf.close()



In [309]:

    
%%bash -s "$workDir"

cd $1


../../../SIPSim isoIncorp --percTaxa 100 \
genome10_comm_n3.txt genome10_n3_norm.config \
> genome10_comm_n3_normIncorp.txt



In [310]:

    
%%R -i workDir

infile = paste(c(workDir, 'genome10_comm_n3_normIncorp.txt'), collapse='/')

tbl = read.csv(infile, sep='\t')



In [311]:

    
%%R -w 700 

tbl$taxon_name = reorder(tbl$taxon_name, tbl$param_value, max)

ggplot(tbl, aes(taxon_name, param_value, color=param)) +
    geom_point() +
    facet_grid(param ~ library, scales='free_y') +
    theme(
        text = element_text(size=16),
        axis.text.x = element_text(angle=90)
    )



In [312]:

    
%%bash -s "$workDir"

cd $1

../../../SIPSim OTU_table \
genome10_shotFragGC.txt genome10_comm_n3.txt \
genome10_comm_n3_normIncorp.txt genome10_comm_n3_fracs.txt \
--abs_abund 1e4 > genome10_OTU_normInc_abnd1e4.txt









    



Processing library: "1"
  Processing taxon: "Escherichia_coli_APEC_O78"
  Processing taxon: "Xanthomonas_axonopodis_Xac29-1"
  Processing taxon: "Micrococcus_luteus_NCTC_2665"
  Processing taxon: "Idiomarina_loihiensis_GSL_199"
  Processing taxon: "Leuconostoc_citreum_KM20"
  Processing taxon: "Geobacter_lovleyi_SZ"
  Processing taxon: "Bacillus_cereus_F837_76"
  Processing taxon: "Nitrosomonas_europaea_ATCC_19718"
  Processing taxon: "Roseobacter_litoralis_Och_149"
  Processing taxon: "Cyanobium_gracile_PCC_6307"
Processing library: "2"
  Processing taxon: "Idiomarina_loihiensis_GSL_199"
  Processing taxon: "Micrococcus_luteus_NCTC_2665"
  Processing taxon: "Geobacter_lovleyi_SZ"
  Processing taxon: "Roseobacter_litoralis_Och_149"
  Processing taxon: "Nitrosomonas_europaea_ATCC_19718"
  Processing taxon: "Cyanobium_gracile_PCC_6307"
  Processing taxon: "Bacillus_cereus_F837_76"
  Processing taxon: "Escherichia_coli_APEC_O78"
  Processing taxon: "Leuconostoc_citreum_KM20"
  Processing taxon: "Xanthomonas_axonopodis_Xac29-1"
Processing library: "3"
  Processing taxon: "Cyanobium_gracile_PCC_6307"
  Processing taxon: "Nitrosomonas_europaea_ATCC_19718"
  Processing taxon: "Idiomarina_loihiensis_GSL_199"
  Processing taxon: "Micrococcus_luteus_NCTC_2665"
  Processing taxon: "Escherichia_coli_APEC_O78"
  Processing taxon: "Leuconostoc_citreum_KM20"
  Processing taxon: "Roseobacter_litoralis_Och_149"
  Processing taxon: "Bacillus_cereus_F837_76"
  Processing taxon: "Geobacter_lovleyi_SZ"
  Processing taxon: "Xanthomonas_axonopodis_Xac29-1"



In [313]:

    
%%R -i workDir

infile = paste(c(workDir, 'genome10_OTU_normInc_abnd1e4.txt'), collapse='/')

tbl = read.csv(infile, sep='\t', row.names=1)
tbl$taxon_name = rownames(tbl)

# editing table
tbl.m = melt(tbl, id.var=c('taxon_name'))
colnames(tbl.m) = c('taxon_name', 'variable', 'abundance')

tbl.m$lib = gsub('X|\\..+', '', tbl.m$variable)
tbl.m$BD_min = gsub('X[0-9]+\\.([0-9]+\\.[0-9]+).+', '\\1', tbl.m$variable)
tbl.m$BD_min = as.numeric(tbl.m$BD_min)
tbl.m$BD_max = gsub('X[0-9]+\\.[0-9]+\\.[0-9]+\\.(.+)', '\\1', tbl.m$variable)
tbl.m$BD_max = as.numeric(tbl.m$BD_max)



In [314]:

    
%%R -w 1000

ggplot(tbl.m, aes(BD_min, abundance, fill=taxon_name, group=taxon_name)) +
    #geom_area(stat='identity', alpha=0.5, position='dodge') +
    geom_point(aes(color=taxon_name)) +
    geom_line(aes(color=taxon_name)) +
    facet_grid(lib ~ .) +
    theme( text = element_text(size=16) )



In [315]:

    
%%R -w 600 -h 900

tbl.m2 = tbl.m
tbl.m2$taxon_name = gsub("_","\n", tbl.m2$taxon_name)

ggplot(tbl.m2, aes(BD_min, abundance, fill=taxon_name, group=taxon_name)) +
    geom_area(stat='identity', alpha=0.5, position='dodge') +
    facet_grid(taxon_name ~ lib) +
    theme( text = element_text(size=16),
           legend.position = 'None'
        )



In [ ]:

    
%%R -w 1000

ggplot(tbl.m, aes(BD_min, abundance, fill=taxon_name, group=taxon_name)) +
    geom_area(stat='identity', alpha=0.5, position='fill') +
    facet_grid(lib ~ .) +
    theme( text = element_text(size=16) )

Sandbox

Grinder testing

required options:
- -rf
  - edit to use individual genome fasta files?

individual fasta file input:
- need to edit database
- 2950sub database_create
- 1800sub community_read_abundances
- 1471 $self->{c_structs} = $self->community_structures

Distributions in python



In [241]:

    
from scipy import linspace
from scipy import pi,sqrt,exp
from scipy.special import erf

from pylab import plot,show

def pdf(x):
    return 1/sqrt(2*pi) * exp(-x**2/2)

def cdf(x):
    return (1 + erf(x/sqrt(2))) / 2

def skew(x,e=0,w=1,a=0):
    t = (x-e) / w
    return 2 / w * pdf(t) * cdf(a*t)
    # You can of course use the scipy.stats.norm versions
    # return 2 * norm.pdf(t) * norm.cdf(a*t)


n = 2**10

e = 1.0 # location
w = 10.0 # scale

x = linspace(-30,10,n) 

for a in range(-5,0):
    p = skew(x,e,w,a)
    plot(x,p)

show()



In [253]:

    
import scipy.stats as ss



In [259]:

    
print ss.norm.stats(moments='mvsk')









    



(array(0.0), array(1.0), array(0.0), array(0.0))



In [144]:

    
import scipy
a, b = 0.1, 2.0
tn = scipy.stats.truncnorm(a,b)



In [164]:

    
from ggplot import *
import pandas as pd



In [263]:

    
import pymc



In [274]:

    
help(pymc.distributions.rskew_normal)









    



Help on function rskew_normal in module pymc.distributions:

rskew_normal(mu, tau, alpha, size=())
    Skew-normal random variates.



In [304]:

    
df = pd.DataFrame(pymc.distributions.rskew_normal(11000,10000,-100,size=10000)**10, columns=['x'])



In [328]:

    
import math



In [ ]:



In [358]:

    
df = pd.DataFrame(pymc.distributions.rnormal(math.log(10),math.log(10),size=10000)**2, columns=['x'])



In [369]:

    
df = pd.DataFrame(np.random.triangular(-3,7,8,10000), columns=['x'])



In [370]:

    
ggplot(aes(x='x'), data=df) + \
    geom_histogram()









    












    Out[370]:





<ggplot: (8737283685749)>

skewed normal dist



In [250]:

    
import scipy.stats as ss
class skew_norm_gen(ss.rv_continuous):
    def pdf(self, x, s):
        return 2 * ss.norm.pdf(x) * ss.norm.cdf(x * s)
skew_norm = skew_norm_gen(name='skew_norm', shapes='s')

skew_norm.pdf(np.array([1,2,3]), 4)









    Out[250]:





array([ 0.48392612,  0.10798193,  0.0088637 ])



In [243]:



In [117]:

    
a = set([1,2,3])
b = set([2])

a - b









    Out[117]:





{1, 3}



In [125]:

    
print tree.all_intervals









    



set([Interval(1, 10, ['a', 10]), Interval(3, 6, ['c', 4]), Interval(4, 7, ['c', 4]), Interval(3, 6, ['b', 4])])



In [46]:

    
from intervaltree import Interval, IntervalTree



In [47]:

    
tree = IntervalTree()



In [126]:

    
tree.addi(1,10, ['a',10])
tree.addi(3,6, ['b',4])
tree.addi(3,6, ['c',4])

print tree.all_intervals









    



set([Interval(1, 10, ['a', 10]), Interval(3, 6, ['c', 4]), Interval(4, 7, ['c', 4]), Interval(3, 6, ['b', 4])])



In [49]:

    
tree.print_structure()









    



Node<3, balance=1>
||||:
 Interval(1, 10, ['a', 10])
 Interval(3, 6, ['b', 4])
>>>>:Node<4, balance=0>
    ||||:
     Interval(4, 7, ['c', 4])



In [108]:

    
def calcPercOverlap(iv1, iv2):
    if not iv1.overlaps(iv2):
        return 0.0
        
    tmpTree = IntervalTree()
    tmpTree.addi(iv1.begin, iv1.end, iv1.data)
    tmpTree.addi(iv2.begin, iv2.end, iv2.data)

    tmpTree.split_overlaps()
    
    if len(tmpTree) == 1:
        return 100.0
    
    for iv in tmpTree.iter():
        overlaps = tmpTree.search(iv.begin, iv.end)
        if len(overlaps) == 2:
            ivo = overlaps.pop()
            return float(ivo.end - ivo.begin) / ivo.data[1] * 100        
        elif len(overlaps) > 2:
            raise ValueError()



In [109]:

    
for iv in tree.iter():
    overlaps = tree.search(iv.begin, iv.end)
    for iv2 in overlaps:
        print calcPercOverlap(iv1, iv2)



In [50]:

    
iv1 = [x for x in tree.all_intervals][0]
iv2 = [x for x in tree.all_intervals][1]



In [72]:

    
tmpTree = IntervalTree()
tmpTree.addi(iv1.begin, iv1.end, iv1.data)
tmpTree.addi(iv2.begin, iv2.end, iv2.data)

tmpTree.split_overlaps()

for iv in tmpTree.iter():
    overlaps = tmpTree.search(iv.begin, iv.end)
    if len(overlaps) == 2:
        ivo = overlaps.pop()
        print float(ivo.end - ivo.begin) / ivo.data[1]        
    elif len(overlaps) > 2:
        raise ValueError()



In [70]:

    
[[x.begin,x.end,x.data] for x in overlaps]









    Out[70]:





[[4, 7, ['c', 4]], [4, 7, ['a', 10]]]



In [ ]:



In [73]:

    
import itertools



In [77]:

    
for x in itertools.product(range(3), range(3)):
    print x









    



(0, 0)
(0, 1)
(0, 2)
(1, 0)
(1, 1)
(1, 2)
(2, 0)
(2, 1)
(2, 2)



In [79]:

    
def calcPercOverlap(iv1, iv2):
    tmpTree = IntervalTree()
    tmpTree.addi(iv1.begin, iv1.end, iv1.data)
    tmpTree.addi(iv2.begin, iv2.end, iv2.data)

    tmpTree.split_overlaps()
    if

    for iv in tmpTree.iter():
        overlaps = tmpTree.search(iv.begin, iv.end)
        if len(overlaps) == 2:
            ivo = overlaps.pop()
            print float(ivo.end - ivo.begin) / ivo.data[1]        
        elif len(overlaps) > 2:
            raise ValueError()



In [87]:

    
for ints in itertools.product(tree.all_intervals, tree.all_intervals):
    print '---'
    if ints[0].overlaps(ints[1]):
        #tmpTree = IntervalTree()
        #tmpTree.addi(ints[0].begin, ints[0].end, ints[0].data)
        #tmpTree.addi(ints[1].begin, ints[1].end, ints[1].data)        
        #print float(ivo.end - ivo.begin) / ivo.data[1]   
        calcPercOverlap(ints[0], ints[1])
    else:
        pass
#    elif len(overlaps) > 2:
#        raise ValueError()
#    else:









    



---
---
0.75
0.75
---
0.3
0.3
---
0.75
0.75
---
---
0.5
0.5
---
0.75
0.75
---
0.5
0.5
---



In [90]:

    
for ints in itertools.product(tree.all_intervals, tree.all_intervals):
    print sorted(ints, key=lambda x: x.data[1])[0]









    



Interval(1, 10, ['a', 10])
Interval(4, 7, ['c', 4])
Interval(3, 6, ['b', 4])
Interval(4, 7, ['c', 4])
Interval(4, 7, ['c', 4])
Interval(4, 7, ['c', 4])
Interval(3, 6, ['b', 4])
Interval(3, 6, ['b', 4])
Interval(3, 6, ['b', 4])