In [9]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [10]:
from SuchTree import SuchTree, SuchLinkedTrees, pearson
import pandas as pd
import numpy as np
import seaborn

In [11]:
T1 = SuchTree( '../fishpoo/mcgee_trimmed.tree' )
T2 = SuchTree( 'http://edhar.genomecenter.ucdavis.edu/~russell/fishpoo/fishpoo2_p200_c2_unique_2_clustalo_fasttree.tree' )
links = pd.read_csv( 'http://edhar.genomecenter.ucdavis.edu/~russell/fishpoo/fishpoo2_p200_c2_host_count_table.tsv', 
                    sep='\t', index_col='Host')
links.index = map( lambda x : x.replace(' ','_'), links.index )

Exploring our example data

Before we start with the science, let's use SuchTree to help us explore the example data. We'll start by plotting a tree of T1 using ete. We will use T1 later as the phylogeny of host organisms.


In [12]:
from ete2 import Tree, TreeStyle, NodeStyle, TextFace
from numpy import linspace

ts = TreeStyle()
ts.mode = 'r'
ts.show_leaf_name = True
ts.branch_vertical_margin = 2
ts.scale = 1000
ts.show_leaf_name = False
ts.show_scale = False

nstyle = NodeStyle()
nstyle['size'] = 0

ete_tree = Tree( '../fishpoo/mcgee_trimmed.tree' )

for node in ete_tree.traverse() :
    node.set_style(nstyle)
    if node.is_leaf :
        tf = TextFace( node.name.replace('_',' ').replace('\'','') )
        tf.fsize = 10
        tf.hz_align = 100
        node.add_face( tf, 0 )

ete_tree.render("%%inline", w=120, units="mm", tree_style=ts)


Out[12]:

Calculating distances

SuchTree has two ways to calculate distances; one pair at a time using SuchTree.distance(), or several large groups at once using SuchTree.distances(). The later avoids the interpreter's overhead and is much faster.

Note that the dendrograms in the cluster maps shown below are re-computed from the distances in the matrix using neighbor joining, and so they do not exactly match the phylogeny. Send complaints to the authors of seaborn.

First, here we'll calculate the matrix of patristic distances for T1 using SuchTree.distance().


In [13]:
D1 = zeros( ( len(T1.leafs),len(T1.leafs) ) )
for i,a in enumerate(T1.leafs.values()) :
    for j,b in enumerate( T1.leafs.values() ) :
        D1[i,j] = T1.distance( a, b )
        
seaborn.clustermap(D1)


Out[13]:
<seaborn.matrix.ClusterGrid at 0x7f3c57146b50>

Here is the same calculation in batch mode using SuchTree.distances().


In [14]:
D2_list = []
for i,a in enumerate(T1.leafs.values()) :
    for j,b in enumerate( T1.leafs.values() ) :
        D2_list.append( ( a, b ) )
D2_array = array( D2_list )
D2 = T1.distances( D2_array )
D2 = D2.reshape( ( len(T1.leafs), len(T1.leafs) ) )
seaborn.clustermap(D2)


Out[14]:
<seaborn.matrix.ClusterGrid at 0x7f3c53fdb490>

In [15]:
%%time
SLT = SuchLinkedTrees( T1, T2, links )


139897135584512 allocating columns in 1460819224
bulding default subset.
bulding default link list.
CPU times: user 9min 45s, sys: 1.79 s, total: 9min 47s
Wall time: 9min 46s

In [16]:
SLT.TreeB.get_leafs( 7027 )


Out[16]:
array([6936, 7040, 7046, 7052, 6880, 6938, 6940, 6942, 6994, 7028, 7034,
       7042, 7044, 7048, 7050, 7054, 7056, 6882, 6944, 6950, 6996, 6998,
       7014, 7030, 7032, 7036, 7038, 6884, 6886, 6926, 6932, 6934, 6946,
       6948, 6952, 6954, 7000, 7008, 6914, 6916, 6918, 6920, 6928, 6930,
       6972, 6986, 7002, 7010, 7012, 7016, 7024, 7026, 6888, 6890, 6892,
       6922, 6924, 6956, 6958, 6960, 6962, 6964, 6974, 6980, 6992, 7004,
       7006, 7022, 6894, 6896, 6898, 6906, 6966, 6976, 6978, 6982, 6984,
       6988, 6990, 7018, 7020, 6900, 6908, 6968, 6970, 6902, 6904, 6910,
       6912])

In [17]:
SLT.subset_b( 7027 )
print 'subset size, a :', SLT.subset_a_size
print 'subset size, b :', SLT.subset_b_size
print 'subset links   :', SLT.subset_n_links
print 'link pairs     :', ( SLT.subset_n_links * ( SLT.subset_n_links -1 ) ) / 2


subset size, a : 14
subset size, b : 89
subset links   : 337
link pairs     : 56616

In [18]:
print SLT.col_ids
print SLT.subset_columns
print SLT.subset_b_leafs


[ 77012 106174 100632 ...,  60496  40312  67012]
[37633 25708 51090 17894 20203 28821 30994 51224 29565 37352 27576 37745
  5083  7167 29305 21102  2900 45778 12785 43434 28605 50204 42422 37275
 33623 28599   702 22655  8894 16086 52753  7036 22235 29536 25752 19840
 25913 45302  7254 11458 46359 17008 17559 13866 13594 30210 42254 35039
 51425 38488 26463 19919 21829 33133  7334 23312 43075 15132 11588 14504
 38995 14011 31929 26999 51838  5457 31613 17337 50120 27007 49497 28213
 45249 47995 41296 44556  4512  5212 16829 50626 26848 50728   386 32460
 19676 48859 24163  5066 36179]
[6936 7040 7046 7052 6880 6938 6940 6942 6994 7028 7034 7042 7044 7048 7050
 7054 7056 6882 6944 6950 6996 6998 7014 7030 7032 7036 7038 6884 6886 6926
 6932 6934 6946 6948 6952 6954 7000 7008 6914 6916 6918 6920 6928 6930 6972
 6986 7002 7010 7012 7016 7024 7026 6888 6890 6892 6922 6924 6956 6958 6960
 6962 6964 6974 6980 6992 7004 7006 7022 6894 6896 6898 6906 6966 6976 6978
 6982 6984 6988 6990 7018 7020 6900 6908 6968 6970 6902 6904 6910 6912]

In [19]:
result = SLT.linked_distances()
seaborn.jointplot( result['TreeA'], result['TreeB'] )


Out[19]:
<seaborn.axisgrid.JointGrid at 0x7f3c53fede10>

In [20]:
SLT.subset_a( 1 )
print 'subset size, a :', SLT.subset_a_size
print 'subset size, b :', SLT.subset_b_size
print 'subset links   :', SLT.subset_n_links
print 'link pairs     :', ( SLT.subset_n_links * ( SLT.subset_n_links -1 ) ) / 2


subset size, a : 2
subset size, b : 89
subset links   : 13
link pairs     : 78

In [21]:
result = SLT.linked_distances()
seaborn.jointplot( result['TreeA'], result['TreeB'] )


Out[21]:
<seaborn.axisgrid.JointGrid at 0x7f3c4f3fc390>

In [22]:
SLT.subset_a( SLT.TreeA.root )
print 'subset size, a :', SLT.subset_a_size
print 'subset size, b :', SLT.subset_b_size
print 'subset links   :', SLT.subset_n_links
print 'link pairs     :', ( SLT.subset_n_links * ( SLT.subset_n_links -1 ) ) / 2


subset size, a : 14
subset size, b : 89
subset links   : 337
link pairs     : 56616

In [23]:
SLT.get_column_leafs( 0, as_row_ids=True )


Out[23]:
array([11])

In [24]:
SLT.get_column_links( 0 )


Out[24]:
array([False, False, False, False, False, False, False, False, False,
       False, False,  True, False, False], dtype=bool)

In [25]:
result_sampled = SLT.sample_linked_distances(sigma=0.05, n=10000, buckets=10)
result_sampled


Out[25]:
{'TreeA': array([ 0.33099398,  0.27956101,  0.27074298, ...,  0.36522502,
         0.        ,  0.36843902]),
 'TreeB': array([ 0.0181 ,  0.01805,  0.01688, ...,  0.00948,  0.02258,  0.01256]),
 'deviation_a': 0.0011495854705572128,
 'deviation_b': 4.0235561755253e-05,
 'n_pairs': 56616,
 'n_samples': 100000}

In [26]:
seaborn.jointplot( result_sampled['TreeA'], result_sampled['TreeB'] )


Out[26]:
<seaborn.axisgrid.JointGrid at 0x7f3c5712cb90>

In [27]:
seaborn.kdeplot(result['TreeB'])


Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3c51794a50>

In [28]:
seaborn.kdeplot(result_sampled['TreeB'])


Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3c53526110>

In [29]:
sd = sorted(list(set(result_sampled['TreeB'])))
ad = sorted(list(set(result['TreeB'])))

plot( linspace(0,1,len(sd)), sd )
plot( linspace(0,1,len(ad)), ad )


Out[29]:
[<matplotlib.lines.Line2D at 0x7f3c56b2cb10>]

In [30]:
seaborn.kdeplot(array(sd))


Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3c55366cd0>

In [31]:
seaborn.kdeplot(array(ad))


Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3c4fd42250>

In [ ]:
import pyprind

p = pyprind.ProgBar( len( list( SLT.TreeB.get_internal_nodes() ) ), monitor=True, title='sampling trees...' )

big_nodes = []
table = {}
for n,node in enumerate( SLT.TreeB.get_internal_nodes() ) :
    p.update()
    SLT.subset_b( node )
    if SLT.subset_n_links > 4000 :
        big_nodes.append( node )
        result = SLT.sample_linked_distances( sigma=0.05, n=1000, buckets=100)
    else :
        result = SLT.linked_distances()
    table[node] = { 'n_leafs'    : SLT.subset_b_size, 
                    'n_links'    : SLT.subset_n_links,
                    'n_pairs'    : result['n_pairs'],
                    'n_samples'  : result['n_samples'],
                    'deviatnon_a': result['deviation_a'],
                    'deviation_b': result['deviation_b'],
                    'r'          : pearson( result['TreeA'], result['TreeB'] ) }

In [32]:
#C.to_csv( 'docs/fishpoo.csv' )
C = pd.DataFrame.from_csv( 'docs/fishpoo.csv' )

In [33]:
#C = pd.DataFrame( table ).T
seaborn.jointplot( 'n_links', 'r', data=C )


Out[33]:
<seaborn.axisgrid.JointGrid at 0x7f3c559e0910>

In [34]:
seaborn.jointplot( 'n_links', 'r', data=C.query('n_leafs > 5 and r > 0.05')  )


Out[34]:
<seaborn.axisgrid.JointGrid at 0x7f3c5245f510>

In [35]:
CC = C.query('n_leafs > 5 and r > 0.2').sort_values('r', ascending=False)
print CC.shape
CC.head()


(93, 7)
Out[35]:
deviation_b deviatnon_a n_leafs n_links n_pairs n_samples r
15421 NaN NaN 10.0 10.0 45.0 45.0 0.979979
23887 NaN NaN 33.0 34.0 561.0 561.0 0.836901
14909 NaN NaN 6.0 6.0 15.0 15.0 0.776373
42253 NaN NaN 46.0 53.0 1378.0 1378.0 0.722426
23003 NaN NaN 22.0 22.0 231.0 231.0 0.699994

In [36]:
from scipy.stats import kendalltau, pearsonr

pearson_p   = {}
kendall_tau = {}
kendall_p   = {}

for n,node in enumerate( CC.index ) :
    SLT.subset_b(node)
    result = SLT.linked_distances()
    p_r,p_p = pearsonr(   result['TreeA'], result['TreeB'] )
    k_t,k_p = kendalltau( result['TreeA'], result['TreeB'] )
    pearson_p[node]  = p_p
    kendall_tau[node] = k_t
    kendall_p[node]   = k_p

CC['pearson_p'] = pd.Series(pearson_p)
CC['kendall_tau'] = pd.Series(kendall_tau)
CC['kendall_p'] = pd.Series(kendall_p)
#CC.head()
#CC

In [37]:
seaborn.jointplot( 'r', 'kendall_tau', data=CC )


Out[37]:
<seaborn.axisgrid.JointGrid at 0x7f3c523b2f10>

In [38]:
seaborn.jointplot( 'n_links', 'kendall_tau', data=CC )


Out[38]:
<seaborn.axisgrid.JointGrid at 0x7f3c50d77190>

In [51]:
figure(figsize=(10,8))

for n,(node_id,row) in enumerate( CC.iterrows() ) :
    data = dict(row)
    subplot(4,4,n+1)
    SLT.subset_b( node_id )
    result = SLT.linked_distances()
    scatter( result['TreeA'], result['TreeB'], marker='o', s=3, c='black', alpha=0.4 )
    xticks([])
    yticks([])
    xlim((-0.1,max( result['TreeA'] )+0.1))
    ylim((-0.1,max( result['TreeB'] )+0.1))
    title( str(node_id) + ' : ' + str(data['kendall_tau']) )
    if n == 15 : break

tight_layout()



In [42]:
from skbio import TreeNode

skt = TreeNode.read('http://edhar.genomecenter.ucdavis.edu/~russell/fishpoo/fishpoo2_p200_c2_unique_2_clustalo_fasttree.tree', convert_underscores=False)

In [70]:
cladeid = 42333
SLT.subset_b(cladeid)

sfeal = dict( zip(SLT.TreeB.leafs.values(), SLT.TreeB.leafs.keys() ) )
clade_leafs = map( lambda x : sfeal[x], SLT.subset_b_leafs )

clade = skt.shear( clade_leafs )
clade.children[0].length = 0
clade.write( str(cladeid) + '.tree' )

with open( str(cladeid) + '.tree' ) as f1 :
    with open( str(cladeid) + '_noquote.tree', 'w' ) as f2 :
        f2.write( f1.read().replace('\'','') )

In [71]:
from ete2 import Tree, TreeStyle, NodeStyle, TextFace
from numpy import linspace

ts = TreeStyle()
ts.mode = 'r'
ts.show_leaf_name = True
ts.branch_vertical_margin = 2
ts.scale = 30000
ts.show_leaf_name = False
ts.show_scale = False

nstyle = NodeStyle()
nstyle['size'] = 0

ete_tree = Tree( str(cladeid) + '.tree' )

for node in ete_tree.traverse() :
    node.set_style(nstyle)
    if node.is_leaf :
        tf = TextFace( node.name.replace('_',' ').replace('\'','') )
        tf.fsize = 100
        tf.hz_align = 100
        node.add_face( tf, 0 )

ete_tree.render("%%inline", w=120, units="mm", tree_style=ts)


Out[71]:

In [72]:
clinks = links[ clade_leafs ]
uclinks = clinks.applymap( bool ).unstack()
uclinks = uclinks[ uclinks ]
with open( str(cladeid) + '.links', 'w' ) as f :
    for pair in list(uclinks.index) :
        f.write( '\t'.join(pair) + '\n' )

In [73]:
with open( str(cladeid) + '.rev_links', 'w' ) as f :
    for pair in list(uclinks.index) :
        f.write( '\t'.join(pair[::-1]) + '\n' )

In [74]:
from screed import ScreedDB, read_fasta_sequences

read_fasta_sequences( 'fishpoo2_p200_c2_unique_2_clustalo.fasta' )

db = ScreedDB( 'fishpoo2_p200_c2_unique_2_clustalo.fasta' )

In [75]:
with open( str(cladeid) + '.aln.fasta', 'w' ) as f :
    for leaf in clade_leafs :
        a = db[leaf]
        f.write( '>' + a.name + '\n' + str(a.sequence) + '\n' )

In [49]:
%load_ext rpy2.ipython


The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython

In [77]:
%%R -w 30 -h 20 -u cm

library('phytools')

tr1 <- read.tree( 'mcgee.tree' )
tr2 <- read.newick( '42333_noquote.tree' )
tr2 <- collapse.singles(tr2)
assoc = as.matrix(read.csv( '42333.rev_links', sep='\t', header=FALSE ))
colnames(assoc)<-c('tips.tr1','tips.tr2')

obj <- cophylo( tr1, tr2, assoc=assoc )
plot(obj)


Rotating nodes to optimize matching...
Done.