In [ ]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import os
import re
import seaborn as sns
import time # for filenames
In [ ]:
# load in the spreadsheet tabulating # of reads per organism. Data not normalized (as of 150104)
d = pd.read_csv("/gscratch/lidstrom/meta4/analysis/assemble_summaries/summary_genome.dat", sep="\t")
In [ ]:
# preview the columns
d.columns[1:5]
In [ ]:
# Trim names like SUM(summary_week14_O2High.LakWasM130_HOW14_2_reads_mapped) to 130_HOW14
def extract_colname(string):
return re.search(pattern=r'summary_week[0-9]+_O2[A-z]+.[A-z]+([0-9]+_[H,L]OW[0-9]+)_2_reads_mapped',
string=string).group(1)
# function demo
extract_colname("summary_week14_O2High.LakWasM128_HOW14_2_reads_mapped")
In [ ]:
# replace the columns with a new list of column names. Don't change the 0th one, which is 'genome'.
d.columns = [d.columns[0]] + map(extract_colname, d.columns[1:])
In [ ]:
d.head()
In [ ]:
# Need to set indexes or seaborn will be confused why genome column doesn't have numeric values.
d_indexed = d.set_index(['genome'])
d_indexed.head()
In [ ]:
# ask for inline plots (assumes interactive .ipnb session)
%matplotlib inline
In [ ]:
# make a demo plot using fake data
uniform_data = np.random.rand(10, 12)
#print uniform_data
ax = sns.heatmap(uniform_data)
In [ ]:
# Show it works for our data (but will need to facet/organize)
sns.heatmap(data = d_indexed)
In [ ]:
# melt the data. Seaborn facets ask for melted data.
# You pivot it back into a rectangle before plotting; see below.
d_melt = pd.melt(d, id_vars='genome', var_name='id', value_name='reads')
d_melt.head()
In [ ]:
# read in the metadata.
# Produced by an R script: /Users/janet/Dropbox/meta4/data/151229_sample_meta_info/sample_meta_info.tsv
md = pd.read_csv('sample_meta_info.tsv', sep='\t')
md.head()
# tidy things up
md = md.rename(columns = {'oxy':'oxygen'})
del md['name']
del md['week_long']
# make the oxygen amounts lowercase.
md['oxygen'] = md['oxygen'].str.lower()
In [ ]:
md.head()
In [ ]:
# merge on the metadata.
df_melt = pd.merge(left=d_melt, right = md, how='outer')
In [ ]:
df_melt.head()
In [ ]:
# now cast it so we can make seaborn plots.
df = df_melt.pivot_table(index=['rep', 'genome', 'oxygen'], columns=['week'], values='reads')
df.head()
In [ ]:
# get subsets of the data. http://pandas.pydata.org/pandas-docs/stable/advanced.html
idx = pd.IndexSlice
# demo:
df.loc[idx[:,"Arthrobacter sp. 31Y", 'high'],]
# any index 1 (rep), only genome == "Arthrobacter sp. 31Y", only oxygen == 'high'
In [ ]:
# accessing subsets of the data using the multi-indexes
print df.loc[idx[:,:, 'high'],].head()
sns.heatmap(data = df.loc[idx[:,:, 'high'],])
In [ ]:
# Make a new column, called 'facet rep' to use as facet titles
df_melt['facet rep'] = 'replicate ' + df_melt['rep'].astype(str)
In [ ]:
df_melt.head()
In [ ]:
# Make a plot directory.
def make_dir(directory):
if not os.path.exists(directory):
os.makedirs(directory)
make_dir('plots')
In [ ]:
# Make facetd heatmaps.
# http://stackoverflow.com/questions/34552770/getting-a-legend-in-a-seaborn-facetgrid-heatmap-plot
def facet_heatmap(data, color, **kwargs):
""" Tells the individual plots in the FacetGrid what to do.
Also passes on info about the legend via the **kws"""
print kwargs # holds info about the colorbar. E.g. cbar_ax=cbar_ax
# use pivot_table to cast the data how seaborn wants.
# Note that the default way to handle duplicate rows would be to average the values.
# We overwrite this by telling it to use the sum.
data = data.pivot_table(index='genome', columns='week', values='reads', aggfunc=np.sum)
g = sns.heatmap(data, cmap='Blues', **kwargs) # <-- Pass kwargs to heatmap
def plot_by_oxygen_tension(data, filename, oxygen='all', dir='plots'):
""" Make and save a seaborn FacetGrid plot for the data."""
# prepare filename
filename = dir +"/" + time.strftime("%Y%m%d") + '_' + filename + '--' + oxygen + "_O2"+ '.pdf'
print filename
if oxygen=='all':
print "using all data"
else:
# .loc[idx[:,:,['C1','C3']],idx[:,'foo']]
data = data.loc[data['oxygen'] == oxygen]
print data['oxygen'].unique()
# make the replicate number a facet label
col_order = ['replicate 1','replicate 2','replicate 3','replicate 4']
# need to facet by replicate.
# use the seaborn.plotting_context to change the settings for just the current plot:
# http://stackoverflow.com/questions/25328003/how-can-i-change-the-font-size-using-seaborn-facetgrid
num_genomes = len(data['genome'].unique())
with sns.plotting_context(font_scale=8):
g = sns.FacetGrid(data, col="facet rep", col_wrap=4,
size=1+.2*num_genomes, aspect=50./num_genomes/4, # size is height of each facet.
col_order=col_order)
cbar_ax = g.fig.add_axes([.92, .3, .02, .4]) # <-- Create a colorbar axes
g = g.map_dataframe(facet_heatmap, cbar_ax=cbar_ax)
# , vmin=0, vmax=1 # <-- Specify the colorbar axes and limits
g.set_titles(col_template="{col_name}", fontweight='bold', fontsize=18)
g.set_axis_labels('week')
g.fig.subplots_adjust(right=.9) # <-- Add space so the colorbar doesn't overlap the plot
g.savefig(filename)
In [ ]:
# Make the first handful of plots.
plot_by_oxygen_tension(df_melt, filename = 'reads_per_isolate_genome')
plot_by_oxygen_tension(df_melt, filename = 'reads_per_isolate_genome', oxygen='high')
plot_by_oxygen_tension(df_melt, filename = 'reads_per_isolate_genome', oxygen='low')
In [ ]:
# We want to be able to see subsets of the organisms.
# For example, we may only want to include organisms who had high experssion in at least one sample
def genomes_with_min_expression(data, min_val):
""" give back a genome names for genomes with at least one read sum over the mean_val """
return data.loc[data['reads'] > min_val]['genome'].unique()
# demo:
genomes_with_min_expression(data= df_melt, min_val = 10**6)
In [ ]:
def subset_by_threshold(df, min_val):
""" Subset DataFrame to genomes with names in a list. """
return df[df['genome'].isin(genomes_with_min_expression(data=df, min_val = min_val))]
In [ ]:
# Demo of subset_by_threshold, which leverages genomes_with_min_expression()
subset_by_threshold(df=df_melt, min_val=10**6).head()
In [ ]:
plot_by_oxygen_tension(data = subset_by_threshold(df=df_melt, min_val=10**5),
filename = 'reads_per_isolate_genome--limit_10_to_5th')
plot_by_oxygen_tension(data = subset_by_threshold(df=df_melt, min_val=10**6),
filename = 'reads_per_isolate_genome--limit_10_to_6th')
plot_by_oxygen_tension(data = subset_by_threshold(df=df_melt, min_val=10**5), oxygen='high',
filename = 'reads_per_isolate_genome--limit_10_to_5th')
plot_by_oxygen_tension(data = subset_by_threshold(df=df_melt, min_val=10**6), oxygen='high',
filename = 'reads_per_isolate_genome--limit_10_to_6th')
plot_by_oxygen_tension(data = subset_by_threshold(df=df_melt, min_val=10**5), oxygen='low',
filename = 'reads_per_isolate_genome--limit_10_to_5th')
plot_by_oxygen_tension(data = subset_by_threshold(df=df_melt, min_val=10**6), oxygen='low',
filename = 'reads_per_isolate_genome--limit_10_to_6th')
#plot_by_oxygen_tension(df_melt, filename = 'reads_per_isolate_genome', oxygen='high')
#plot_by_oxygen_tension(df_melt, filename = 'reads_per_isolate_genome', oxygen='low')