In [9]:
%pylab inline
In [10]:
from SuchTree import SuchTree, SuchLinkedTrees, pearson
import pandas as pd
import numpy as np
import seaborn
In [11]:
T1 = SuchTree( '../fishpoo/mcgee_trimmed.tree' )
T2 = SuchTree( 'http://edhar.genomecenter.ucdavis.edu/~russell/fishpoo/fishpoo2_p200_c2_unique_2_clustalo_fasttree.tree' )
links = pd.read_csv( 'http://edhar.genomecenter.ucdavis.edu/~russell/fishpoo/fishpoo2_p200_c2_host_count_table.tsv',
sep='\t', index_col='Host')
links.index = map( lambda x : x.replace(' ','_'), links.index )
Before we start with the science, let's use SuchTree to help us explore
the example data. We'll start by plotting a tree of T1 using ete.
We will use T1 later as the phylogeny of host organisms.
In [12]:
from ete2 import Tree, TreeStyle, NodeStyle, TextFace
from numpy import linspace
ts = TreeStyle()
ts.mode = 'r'
ts.show_leaf_name = True
ts.branch_vertical_margin = 2
ts.scale = 1000
ts.show_leaf_name = False
ts.show_scale = False
nstyle = NodeStyle()
nstyle['size'] = 0
ete_tree = Tree( '../fishpoo/mcgee_trimmed.tree' )
for node in ete_tree.traverse() :
node.set_style(nstyle)
if node.is_leaf :
tf = TextFace( node.name.replace('_',' ').replace('\'','') )
tf.fsize = 10
tf.hz_align = 100
node.add_face( tf, 0 )
ete_tree.render("%%inline", w=120, units="mm", tree_style=ts)
Out[12]:
SuchTree has two ways to calculate distances; one pair at a time using
SuchTree.distance(), or several large groups at once using SuchTree.distances().
The later avoids the interpreter's overhead and is much faster.
Note that the dendrograms in the cluster maps shown below are re-computed from
the distances in the matrix using neighbor joining, and so they do not exactly match
the phylogeny. Send complaints to the authors of seaborn.
First, here we'll calculate the matrix of patristic distances for T1 using
SuchTree.distance().
In [13]:
D1 = zeros( ( len(T1.leafs),len(T1.leafs) ) )
for i,a in enumerate(T1.leafs.values()) :
for j,b in enumerate( T1.leafs.values() ) :
D1[i,j] = T1.distance( a, b )
seaborn.clustermap(D1)
Out[13]:
Here is the same calculation in batch mode using SuchTree.distances().
In [14]:
D2_list = []
for i,a in enumerate(T1.leafs.values()) :
for j,b in enumerate( T1.leafs.values() ) :
D2_list.append( ( a, b ) )
D2_array = array( D2_list )
D2 = T1.distances( D2_array )
D2 = D2.reshape( ( len(T1.leafs), len(T1.leafs) ) )
seaborn.clustermap(D2)
Out[14]:
In [15]:
%%time
SLT = SuchLinkedTrees( T1, T2, links )
In [16]:
SLT.TreeB.get_leafs( 7027 )
Out[16]:
In [17]:
SLT.subset_b( 7027 )
print 'subset size, a :', SLT.subset_a_size
print 'subset size, b :', SLT.subset_b_size
print 'subset links :', SLT.subset_n_links
print 'link pairs :', ( SLT.subset_n_links * ( SLT.subset_n_links -1 ) ) / 2
In [18]:
print SLT.col_ids
print SLT.subset_columns
print SLT.subset_b_leafs
In [19]:
result = SLT.linked_distances()
seaborn.jointplot( result['TreeA'], result['TreeB'] )
Out[19]:
In [20]:
SLT.subset_a( 1 )
print 'subset size, a :', SLT.subset_a_size
print 'subset size, b :', SLT.subset_b_size
print 'subset links :', SLT.subset_n_links
print 'link pairs :', ( SLT.subset_n_links * ( SLT.subset_n_links -1 ) ) / 2
In [21]:
result = SLT.linked_distances()
seaborn.jointplot( result['TreeA'], result['TreeB'] )
Out[21]:
In [22]:
SLT.subset_a( SLT.TreeA.root )
print 'subset size, a :', SLT.subset_a_size
print 'subset size, b :', SLT.subset_b_size
print 'subset links :', SLT.subset_n_links
print 'link pairs :', ( SLT.subset_n_links * ( SLT.subset_n_links -1 ) ) / 2
In [23]:
SLT.get_column_leafs( 0, as_row_ids=True )
Out[23]:
In [24]:
SLT.get_column_links( 0 )
Out[24]:
In [25]:
result_sampled = SLT.sample_linked_distances(sigma=0.05, n=10000, buckets=10)
result_sampled
Out[25]:
In [26]:
seaborn.jointplot( result_sampled['TreeA'], result_sampled['TreeB'] )
Out[26]:
In [27]:
seaborn.kdeplot(result['TreeB'])
Out[27]:
In [28]:
seaborn.kdeplot(result_sampled['TreeB'])
Out[28]:
In [29]:
sd = sorted(list(set(result_sampled['TreeB'])))
ad = sorted(list(set(result['TreeB'])))
plot( linspace(0,1,len(sd)), sd )
plot( linspace(0,1,len(ad)), ad )
Out[29]:
In [30]:
seaborn.kdeplot(array(sd))
Out[30]:
In [31]:
seaborn.kdeplot(array(ad))
Out[31]:
In [ ]:
import pyprind
p = pyprind.ProgBar( len( list( SLT.TreeB.get_internal_nodes() ) ), monitor=True, title='sampling trees...' )
big_nodes = []
table = {}
for n,node in enumerate( SLT.TreeB.get_internal_nodes() ) :
p.update()
SLT.subset_b( node )
if SLT.subset_n_links > 4000 :
big_nodes.append( node )
result = SLT.sample_linked_distances( sigma=0.05, n=1000, buckets=100)
else :
result = SLT.linked_distances()
table[node] = { 'n_leafs' : SLT.subset_b_size,
'n_links' : SLT.subset_n_links,
'n_pairs' : result['n_pairs'],
'n_samples' : result['n_samples'],
'deviatnon_a': result['deviation_a'],
'deviation_b': result['deviation_b'],
'r' : pearson( result['TreeA'], result['TreeB'] ) }
In [32]:
#C.to_csv( 'docs/fishpoo.csv' )
C = pd.DataFrame.from_csv( 'docs/fishpoo.csv' )
In [33]:
#C = pd.DataFrame( table ).T
seaborn.jointplot( 'n_links', 'r', data=C )
Out[33]:
In [34]:
seaborn.jointplot( 'n_links', 'r', data=C.query('n_leafs > 5 and r > 0.05') )
Out[34]:
In [35]:
CC = C.query('n_leafs > 5 and r > 0.2').sort_values('r', ascending=False)
print CC.shape
CC.head()
Out[35]:
In [36]:
from scipy.stats import kendalltau, pearsonr
pearson_p = {}
kendall_tau = {}
kendall_p = {}
for n,node in enumerate( CC.index ) :
SLT.subset_b(node)
result = SLT.linked_distances()
p_r,p_p = pearsonr( result['TreeA'], result['TreeB'] )
k_t,k_p = kendalltau( result['TreeA'], result['TreeB'] )
pearson_p[node] = p_p
kendall_tau[node] = k_t
kendall_p[node] = k_p
CC['pearson_p'] = pd.Series(pearson_p)
CC['kendall_tau'] = pd.Series(kendall_tau)
CC['kendall_p'] = pd.Series(kendall_p)
#CC.head()
#CC
In [37]:
seaborn.jointplot( 'r', 'kendall_tau', data=CC )
Out[37]:
In [38]:
seaborn.jointplot( 'n_links', 'kendall_tau', data=CC )
Out[38]:
In [51]:
figure(figsize=(10,8))
for n,(node_id,row) in enumerate( CC.iterrows() ) :
data = dict(row)
subplot(4,4,n+1)
SLT.subset_b( node_id )
result = SLT.linked_distances()
scatter( result['TreeA'], result['TreeB'], marker='o', s=3, c='black', alpha=0.4 )
xticks([])
yticks([])
xlim((-0.1,max( result['TreeA'] )+0.1))
ylim((-0.1,max( result['TreeB'] )+0.1))
title( str(node_id) + ' : ' + str(data['kendall_tau']) )
if n == 15 : break
tight_layout()
In [42]:
from skbio import TreeNode
skt = TreeNode.read('http://edhar.genomecenter.ucdavis.edu/~russell/fishpoo/fishpoo2_p200_c2_unique_2_clustalo_fasttree.tree', convert_underscores=False)
In [70]:
cladeid = 42333
SLT.subset_b(cladeid)
sfeal = dict( zip(SLT.TreeB.leafs.values(), SLT.TreeB.leafs.keys() ) )
clade_leafs = map( lambda x : sfeal[x], SLT.subset_b_leafs )
clade = skt.shear( clade_leafs )
clade.children[0].length = 0
clade.write( str(cladeid) + '.tree' )
with open( str(cladeid) + '.tree' ) as f1 :
with open( str(cladeid) + '_noquote.tree', 'w' ) as f2 :
f2.write( f1.read().replace('\'','') )
In [71]:
from ete2 import Tree, TreeStyle, NodeStyle, TextFace
from numpy import linspace
ts = TreeStyle()
ts.mode = 'r'
ts.show_leaf_name = True
ts.branch_vertical_margin = 2
ts.scale = 30000
ts.show_leaf_name = False
ts.show_scale = False
nstyle = NodeStyle()
nstyle['size'] = 0
ete_tree = Tree( str(cladeid) + '.tree' )
for node in ete_tree.traverse() :
node.set_style(nstyle)
if node.is_leaf :
tf = TextFace( node.name.replace('_',' ').replace('\'','') )
tf.fsize = 100
tf.hz_align = 100
node.add_face( tf, 0 )
ete_tree.render("%%inline", w=120, units="mm", tree_style=ts)
Out[71]:
In [72]:
clinks = links[ clade_leafs ]
uclinks = clinks.applymap( bool ).unstack()
uclinks = uclinks[ uclinks ]
with open( str(cladeid) + '.links', 'w' ) as f :
for pair in list(uclinks.index) :
f.write( '\t'.join(pair) + '\n' )
In [73]:
with open( str(cladeid) + '.rev_links', 'w' ) as f :
for pair in list(uclinks.index) :
f.write( '\t'.join(pair[::-1]) + '\n' )
In [74]:
from screed import ScreedDB, read_fasta_sequences
read_fasta_sequences( 'fishpoo2_p200_c2_unique_2_clustalo.fasta' )
db = ScreedDB( 'fishpoo2_p200_c2_unique_2_clustalo.fasta' )
In [75]:
with open( str(cladeid) + '.aln.fasta', 'w' ) as f :
for leaf in clade_leafs :
a = db[leaf]
f.write( '>' + a.name + '\n' + str(a.sequence) + '\n' )
In [49]:
%load_ext rpy2.ipython
In [77]:
%%R -w 30 -h 20 -u cm
library('phytools')
tr1 <- read.tree( 'mcgee.tree' )
tr2 <- read.newick( '42333_noquote.tree' )
tr2 <- collapse.singles(tr2)
assoc = as.matrix(read.csv( '42333.rev_links', sep='\t', header=FALSE ))
colnames(assoc)<-c('tips.tr1','tips.tr2')
obj <- cophylo( tr1, tr2, assoc=assoc )
plot(obj)