In [9]:
%pylab inline
In [10]:
from SuchTree import SuchTree, SuchLinkedTrees, pearson
import pandas as pd
import numpy as np
import seaborn
In [11]:
T1 = SuchTree( '../fishpoo/mcgee_trimmed.tree' )
T2 = SuchTree( 'http://edhar.genomecenter.ucdavis.edu/~russell/fishpoo/fishpoo2_p200_c2_unique_2_clustalo_fasttree.tree' )
links = pd.read_csv( 'http://edhar.genomecenter.ucdavis.edu/~russell/fishpoo/fishpoo2_p200_c2_host_count_table.tsv',
sep='\t', index_col='Host')
links.index = map( lambda x : x.replace(' ','_'), links.index )
Before we start with the science, let's use SuchTree
to help us explore
the example data. We'll start by plotting a tree of T1
using ete
.
We will use T1
later as the phylogeny of host organisms.
In [12]:
from ete2 import Tree, TreeStyle, NodeStyle, TextFace
from numpy import linspace
ts = TreeStyle()
ts.mode = 'r'
ts.show_leaf_name = True
ts.branch_vertical_margin = 2
ts.scale = 1000
ts.show_leaf_name = False
ts.show_scale = False
nstyle = NodeStyle()
nstyle['size'] = 0
ete_tree = Tree( '../fishpoo/mcgee_trimmed.tree' )
for node in ete_tree.traverse() :
node.set_style(nstyle)
if node.is_leaf :
tf = TextFace( node.name.replace('_',' ').replace('\'','') )
tf.fsize = 10
tf.hz_align = 100
node.add_face( tf, 0 )
ete_tree.render("%%inline", w=120, units="mm", tree_style=ts)
Out[12]:
SuchTree
has two ways to calculate distances; one pair at a time using
SuchTree.distance()
, or several large groups at once using SuchTree.distances()
.
The later avoids the interpreter's overhead and is much faster.
Note that the dendrograms in the cluster maps shown below are re-computed from
the distances in the matrix using neighbor joining, and so they do not exactly match
the phylogeny. Send complaints to the authors of seaborn
.
First, here we'll calculate the matrix of patristic distances for T1
using
SuchTree.distance()
.
In [13]:
D1 = zeros( ( len(T1.leafs),len(T1.leafs) ) )
for i,a in enumerate(T1.leafs.values()) :
for j,b in enumerate( T1.leafs.values() ) :
D1[i,j] = T1.distance( a, b )
seaborn.clustermap(D1)
Out[13]:
Here is the same calculation in batch mode using SuchTree.distances()
.
In [14]:
D2_list = []
for i,a in enumerate(T1.leafs.values()) :
for j,b in enumerate( T1.leafs.values() ) :
D2_list.append( ( a, b ) )
D2_array = array( D2_list )
D2 = T1.distances( D2_array )
D2 = D2.reshape( ( len(T1.leafs), len(T1.leafs) ) )
seaborn.clustermap(D2)
Out[14]:
In [15]:
%%time
SLT = SuchLinkedTrees( T1, T2, links )
In [16]:
SLT.TreeB.get_leafs( 7027 )
Out[16]:
In [17]:
SLT.subset_b( 7027 )
print 'subset size, a :', SLT.subset_a_size
print 'subset size, b :', SLT.subset_b_size
print 'subset links :', SLT.subset_n_links
print 'link pairs :', ( SLT.subset_n_links * ( SLT.subset_n_links -1 ) ) / 2
In [18]:
print SLT.col_ids
print SLT.subset_columns
print SLT.subset_b_leafs
In [19]:
result = SLT.linked_distances()
seaborn.jointplot( result['TreeA'], result['TreeB'] )
Out[19]:
In [20]:
SLT.subset_a( 1 )
print 'subset size, a :', SLT.subset_a_size
print 'subset size, b :', SLT.subset_b_size
print 'subset links :', SLT.subset_n_links
print 'link pairs :', ( SLT.subset_n_links * ( SLT.subset_n_links -1 ) ) / 2
In [21]:
result = SLT.linked_distances()
seaborn.jointplot( result['TreeA'], result['TreeB'] )
Out[21]:
In [22]:
SLT.subset_a( SLT.TreeA.root )
print 'subset size, a :', SLT.subset_a_size
print 'subset size, b :', SLT.subset_b_size
print 'subset links :', SLT.subset_n_links
print 'link pairs :', ( SLT.subset_n_links * ( SLT.subset_n_links -1 ) ) / 2
In [23]:
SLT.get_column_leafs( 0, as_row_ids=True )
Out[23]:
In [24]:
SLT.get_column_links( 0 )
Out[24]:
In [25]:
result_sampled = SLT.sample_linked_distances(sigma=0.05, n=10000, buckets=10)
result_sampled
Out[25]:
In [26]:
seaborn.jointplot( result_sampled['TreeA'], result_sampled['TreeB'] )
Out[26]:
In [27]:
seaborn.kdeplot(result['TreeB'])
Out[27]:
In [28]:
seaborn.kdeplot(result_sampled['TreeB'])
Out[28]:
In [29]:
sd = sorted(list(set(result_sampled['TreeB'])))
ad = sorted(list(set(result['TreeB'])))
plot( linspace(0,1,len(sd)), sd )
plot( linspace(0,1,len(ad)), ad )
Out[29]:
In [30]:
seaborn.kdeplot(array(sd))
Out[30]:
In [31]:
seaborn.kdeplot(array(ad))
Out[31]:
In [ ]:
import pyprind
p = pyprind.ProgBar( len( list( SLT.TreeB.get_internal_nodes() ) ), monitor=True, title='sampling trees...' )
big_nodes = []
table = {}
for n,node in enumerate( SLT.TreeB.get_internal_nodes() ) :
p.update()
SLT.subset_b( node )
if SLT.subset_n_links > 4000 :
big_nodes.append( node )
result = SLT.sample_linked_distances( sigma=0.05, n=1000, buckets=100)
else :
result = SLT.linked_distances()
table[node] = { 'n_leafs' : SLT.subset_b_size,
'n_links' : SLT.subset_n_links,
'n_pairs' : result['n_pairs'],
'n_samples' : result['n_samples'],
'deviatnon_a': result['deviation_a'],
'deviation_b': result['deviation_b'],
'r' : pearson( result['TreeA'], result['TreeB'] ) }
In [32]:
#C.to_csv( 'docs/fishpoo.csv' )
C = pd.DataFrame.from_csv( 'docs/fishpoo.csv' )
In [33]:
#C = pd.DataFrame( table ).T
seaborn.jointplot( 'n_links', 'r', data=C )
Out[33]:
In [34]:
seaborn.jointplot( 'n_links', 'r', data=C.query('n_leafs > 5 and r > 0.05') )
Out[34]:
In [35]:
CC = C.query('n_leafs > 5 and r > 0.2').sort_values('r', ascending=False)
print CC.shape
CC.head()
Out[35]:
In [36]:
from scipy.stats import kendalltau, pearsonr
pearson_p = {}
kendall_tau = {}
kendall_p = {}
for n,node in enumerate( CC.index ) :
SLT.subset_b(node)
result = SLT.linked_distances()
p_r,p_p = pearsonr( result['TreeA'], result['TreeB'] )
k_t,k_p = kendalltau( result['TreeA'], result['TreeB'] )
pearson_p[node] = p_p
kendall_tau[node] = k_t
kendall_p[node] = k_p
CC['pearson_p'] = pd.Series(pearson_p)
CC['kendall_tau'] = pd.Series(kendall_tau)
CC['kendall_p'] = pd.Series(kendall_p)
#CC.head()
#CC
In [37]:
seaborn.jointplot( 'r', 'kendall_tau', data=CC )
Out[37]:
In [38]:
seaborn.jointplot( 'n_links', 'kendall_tau', data=CC )
Out[38]:
In [51]:
figure(figsize=(10,8))
for n,(node_id,row) in enumerate( CC.iterrows() ) :
data = dict(row)
subplot(4,4,n+1)
SLT.subset_b( node_id )
result = SLT.linked_distances()
scatter( result['TreeA'], result['TreeB'], marker='o', s=3, c='black', alpha=0.4 )
xticks([])
yticks([])
xlim((-0.1,max( result['TreeA'] )+0.1))
ylim((-0.1,max( result['TreeB'] )+0.1))
title( str(node_id) + ' : ' + str(data['kendall_tau']) )
if n == 15 : break
tight_layout()
In [42]:
from skbio import TreeNode
skt = TreeNode.read('http://edhar.genomecenter.ucdavis.edu/~russell/fishpoo/fishpoo2_p200_c2_unique_2_clustalo_fasttree.tree', convert_underscores=False)
In [70]:
cladeid = 42333
SLT.subset_b(cladeid)
sfeal = dict( zip(SLT.TreeB.leafs.values(), SLT.TreeB.leafs.keys() ) )
clade_leafs = map( lambda x : sfeal[x], SLT.subset_b_leafs )
clade = skt.shear( clade_leafs )
clade.children[0].length = 0
clade.write( str(cladeid) + '.tree' )
with open( str(cladeid) + '.tree' ) as f1 :
with open( str(cladeid) + '_noquote.tree', 'w' ) as f2 :
f2.write( f1.read().replace('\'','') )
In [71]:
from ete2 import Tree, TreeStyle, NodeStyle, TextFace
from numpy import linspace
ts = TreeStyle()
ts.mode = 'r'
ts.show_leaf_name = True
ts.branch_vertical_margin = 2
ts.scale = 30000
ts.show_leaf_name = False
ts.show_scale = False
nstyle = NodeStyle()
nstyle['size'] = 0
ete_tree = Tree( str(cladeid) + '.tree' )
for node in ete_tree.traverse() :
node.set_style(nstyle)
if node.is_leaf :
tf = TextFace( node.name.replace('_',' ').replace('\'','') )
tf.fsize = 100
tf.hz_align = 100
node.add_face( tf, 0 )
ete_tree.render("%%inline", w=120, units="mm", tree_style=ts)
Out[71]:
In [72]:
clinks = links[ clade_leafs ]
uclinks = clinks.applymap( bool ).unstack()
uclinks = uclinks[ uclinks ]
with open( str(cladeid) + '.links', 'w' ) as f :
for pair in list(uclinks.index) :
f.write( '\t'.join(pair) + '\n' )
In [73]:
with open( str(cladeid) + '.rev_links', 'w' ) as f :
for pair in list(uclinks.index) :
f.write( '\t'.join(pair[::-1]) + '\n' )
In [74]:
from screed import ScreedDB, read_fasta_sequences
read_fasta_sequences( 'fishpoo2_p200_c2_unique_2_clustalo.fasta' )
db = ScreedDB( 'fishpoo2_p200_c2_unique_2_clustalo.fasta' )
In [75]:
with open( str(cladeid) + '.aln.fasta', 'w' ) as f :
for leaf in clade_leafs :
a = db[leaf]
f.write( '>' + a.name + '\n' + str(a.sequence) + '\n' )
In [49]:
%load_ext rpy2.ipython
In [77]:
%%R -w 30 -h 20 -u cm
library('phytools')
tr1 <- read.tree( 'mcgee.tree' )
tr2 <- read.newick( '42333_noquote.tree' )
tr2 <- collapse.singles(tr2)
assoc = as.matrix(read.csv( '42333.rev_links', sep='\t', header=FALSE ))
colnames(assoc)<-c('tips.tr1','tips.tr2')
obj <- cophylo( tr1, tr2, assoc=assoc )
plot(obj)