In [14]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import re
import math
from sys import argv
In [15]:
sns.set_style("whitegrid")
sns.set_style("ticks")
sns.set_context("poster")
In [16]:
def read_clusters_tsv(path):
df = pd.read_csv(path, sep='\t', header=0, index_col=0)
df = df.rename(columns={'Unnamed: 1': 'cluster_size'})
col = [int(re.sub(r'SIZE:([0-9]*)', r'\1', i)) for i in df['cluster_size']]
df['cluster_size'] = col
return df
In [17]:
def plot_hist_tara(df_all_clusters, df_missing_clusters, bin_max, bin_size, max_frac, num_genomes, ymax, fig_path):
fig, ax = plt.subplots()
sns.distplot(df_all_clusters.cluster_size, kde=False, color='b', bins=np.arange(0,bin_max+bin_size,bin_size), label='All gene clusters')
sns.distplot(df_missing_clusters.cluster_size, kde=False, color='r', bins=np.arange(0,bin_max+bin_size,bin_size), label='Missing from Tara metagenomes')
sns.despine(offset=10)
xticks = np.array(np.arange(0, max_frac, 0.2) * num_genomes)
xticklabels = xticks / num_genomes
plt.xticks(xticks, xticklabels)
plt.xlim(0, max_frac*num_genomes)
plt.xlabel('Cluster copy number (per genome)')
plt.yscale('log')
plt.ylim(0.5, 1e4)
yticks = np.array([1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000])
yticklabels = ['1', '2', '5', '10', '20', '50', '100', '200', '500', '1000', '2000', '5000', '10000']
plt.yticks(yticks, yticklabels)
plt.ylabel('Number of clusters')
plt.legend()
fig.set_size_inches(12, 8)
plt.savefig(fig_path)
In [18]:
def merge_cluster_counts(path):
# Paths of input files, containing cluster counts
paths = pd.Series.from_csv(path, header=-1, sep='\t', index_col=None)
# Data frame containing all samples cluster counts (NaN if missing)
pieces = []
for path in paths:
fullpath = "/Users/luke/singlecell/tara/PROK-139/%s" % path
counts = pd.DataFrame.from_csv(fullpath, header=-1, sep='\t', index_col=0)
pieces.append(counts)
frame = pd.concat(pieces, axis=1)
headings = paths.tolist()
frame.columns = headings
return frame
In [19]:
def make_df_size_count(df_all, df_counts):
df_size_count = pd.DataFrame()
df_size_count['cluster_size'] = df_all.cluster_size
df_size_count['tara_count'] = df_counts.count(axis=1)
df_size_count.fillna(0, inplace=True)
df_size_count.tara_count = df_size_count.tara_count.astype(int)
return(df_size_count)
In [20]:
def make_groupby_size_count(df_size_count):
# matrix style of cluster_size and tara_count
groupby_size_count = df_size_count.groupby(['cluster_size', 'tara_count']).size().unstack().transpose()
max_size = df_size_count.cluster_size.max()
max_count = int(df_size_count.tara_count.max())
# add empty columns
for i in range(1, max_size+1):
if not i in groupby_size_count:
groupby_size_count[i] = np.nan
# add empty rows (might not be any)
for i in range(1, max_count+1):
if not i in groupby_size_count.index:
groupby_size_count.loc[i] = np.nan
groupby_size_count.sort_index(axis=1, inplace=True)
#groupby_size_count.fillna(0, inplace=True)
return(groupby_size_count, max_size, max_count)
In [21]:
def plot_heatmap_tara(df, num_genomes, max_size, max_count, max_frac, xinches, yinches, species, fig_path):
fig, ax = plt.subplots()
myfig = sns.heatmap(np.log(df.iloc[::-1]), square=False, robust=True, cmap='inferno')
xticks = np.array(np.arange(0, 5, 0.2) * num_genomes)
xticklabels = xticks / num_genomes
ax.set_xticks(xticks)
ax.set_xticklabels(xticklabels)
ax.set_xlabel('Cluster copy number (per genome)')
ylabels = np.array(df.index[0::20].tolist() + [max_count])
ax.set_yticks(ylabels+0.5)
ax.set_yticklabels(ylabels)
ax.set_ylabel('Tara metagenomes cluster found in')
ax.set_title('Density heatmap of %s gene clusters' % species)
ax.axis([0, max_frac*num_genomes, 0, max_count+2])
fig.set_size_inches(xinches, yinches)
fig.savefig(fig_path)
In [22]:
# missing clusters
path = '/Users/luke/singlecell/tara/table_missing_proch_all_1e-5.tsv'
num_genomes = 145
fig_full = '/Users/luke/singlecell/tara/hist_missing_proch_all_1e-5_full.pdf'
fig_zoom = '/Users/luke/singlecell/tara/hist_missing_proch_all_1e-5_zoom.pdf'
df_missing_clusters = read_clusters_tsv(path)
In [23]:
# all clusters
path = '/Users/luke/singlecell/tara/table_all_proch.tsv'
df_all_clusters = read_clusters_tsv(path)
In [24]:
# tara counts by cluster
species = 'proch'
evalue = '1e-5'
path = '/Users/luke/singlecell/tara/paths_%s_%s.list' % (species, evalue)
df_counts = merge_cluster_counts(path)
# col_SRF = [col for col in list(df_counts.columns) if 'SRF' in col]
# df_counts = df_counts[col_SRF]
In [26]:
# full hist
plot_hist_tara(df_all_clusters, df_missing_clusters, 240, 5.8, 1.6001, num_genomes, 10000, fig_full)
In [27]:
# zoom hist
# plot_hist_tara(df_all_clusters, df_missing_clusters, 1.6201, num_genomes, 30, fig_zoom)
In [28]:
df_size_count = make_df_size_count(df_all_clusters, df_counts)
groupby_size_count, max_size, max_count = make_groupby_size_count(df_size_count)
In [29]:
print groupby_size_count.max().max()
In [30]:
# def fourth_root(num):
# return num**0.25
# def square_root(num):
# return num**0.5
# groupby_size_count_fourth_root = groupby_size_count_nonan.apply(fourth_root)
# groupby_size_count_square_root = groupby_size_count_nonan.apply(square_root)
In [31]:
fig_path = '/Users/luke/singlecell/tara/heatmap_tara_proch.pdf'
plot_heatmap_tara(groupby_size_count, num_genomes, max_size, max_count, 1.1, 12, 8, 'Prochlorococcus', fig_path) # 1.1 was 1.62
In [32]:
# jointplot of cluster_size and tara_count
fig = sns.jointplot(x='cluster_size', y='tara_count', data=df_size_count)
In [33]:
# missing clusters
path = '/Users/luke/singlecell/tara/table_missing_pelag_all_1e-5.tsv'
num_genomes = 47
fig_full = '/Users/luke/singlecell/tara/hist_missing_pelag_all_1e-5_full.pdf'
fig_zoom = '/Users/luke/singlecell/tara/hist_missing_pelag_all_1e-5_zoom.pdf'
df_missing_clusters = read_clusters_tsv(path)
In [34]:
# all clusters
path = '/Users/luke/singlecell/tara/table_all_pelag.tsv'
df_all_clusters = read_clusters_tsv(path)
In [35]:
# tara counts by cluster
species = 'pelag'
evalue = '1e-5'
path = '/Users/luke/singlecell/tara/paths_%s_%s.list' % (species, evalue)
df_counts = merge_cluster_counts(path)
# col_SRF = [col for col in list(df_counts.columns) if 'SRF' in col]
# df_counts = df_counts[col_SRF]
In [36]:
df_counts.shape
Out[36]:
In [37]:
# full hist
plot_hist_tara(df_all_clusters, df_missing_clusters, 100, 1.88, 1.6001, num_genomes, 5000, fig_full)
In [38]:
# zoom hist
#plot_hist_tara(df_all_clusters, df_missing_clusters, 1.6201, num_genomes, 30, fig_zoom)
In [39]:
df_size_count = make_df_size_count(df_all_clusters, df_counts)
groupby_size_count, max_size, max_count = make_groupby_size_count(df_size_count)
In [40]:
print groupby_size_count.max().max()
In [44]:
groupby_size_count.max().max()
Out[44]:
In [41]:
fig_path = '/Users/luke/singlecell/tara/heatmap_tara_pelag.pdf'
plot_heatmap_tara(groupby_size_count, num_genomes, max_size, max_count, 1.1, 12, 8, 'Pelagibacter', fig_path) # 1.1 was 1.62