notebook.community

Edit and run



In [ ]:

    
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import os
import re
import seaborn as sns
import time  # for filenames



In [ ]:

    
# load in the spreadsheet tabulating # of reads per organism.  Data not normalized (as of 150104)
d = pd.read_csv("/gscratch/lidstrom/meta4/analysis/assemble_summaries/summary_genome.dat", sep="\t")



In [ ]:

    
# preview the columns
d.columns[1:5]



In [ ]:

    
# Trim names like SUM(summary_week14_O2High.LakWasM130_HOW14_2_reads_mapped) to 130_HOW14
def extract_colname(string):
    return re.search(pattern=r'summary_week[0-9]+_O2[A-z]+.[A-z]+([0-9]+_[H,L]OW[0-9]+)_2_reads_mapped', 
                     string=string).group(1)
# function demo
extract_colname("summary_week14_O2High.LakWasM128_HOW14_2_reads_mapped")



In [ ]:

    
# replace the columns with a new list of column names.  Don't change the 0th one, which is 'genome'. 
d.columns = [d.columns[0]] + map(extract_colname, d.columns[1:])



In [ ]:

    
d.head()



In [ ]:

    
# Need to set indexes or seaborn will be confused why genome column doesn't have numeric values. 
d_indexed = d.set_index(['genome'])
d_indexed.head()



In [ ]:

    
# ask for inline plots (assumes interactive .ipnb session)
%matplotlib inline



In [ ]:

    
# make a demo plot using fake data
uniform_data = np.random.rand(10, 12)
#print uniform_data
ax = sns.heatmap(uniform_data)



In [ ]:

    
# Show it works for our data (but will need to facet/organize)
sns.heatmap(data = d_indexed)



In [ ]:

    
# melt the data.  Seaborn facets ask for melted data.  
# You pivot it back into a rectangle before plotting; see below. 
d_melt = pd.melt(d, id_vars='genome', var_name='id', value_name='reads')
d_melt.head()



In [ ]:

    
# read in the metadata. 
# Produced by an R script:  /Users/janet/Dropbox/meta4/data/151229_sample_meta_info/sample_meta_info.tsv
md = pd.read_csv('sample_meta_info.tsv', sep='\t')
md.head()
# tidy things up
md = md.rename(columns = {'oxy':'oxygen'})
del md['name']
del md['week_long']
# make the oxygen amounts lowercase. 
md['oxygen'] = md['oxygen'].str.lower()



In [ ]:

    
md.head()



In [ ]:

    
# merge on the metadata. 
df_melt = pd.merge(left=d_melt, right = md, how='outer')



In [ ]:

    
df_melt.head()



In [ ]:

    
# now cast it so we can make seaborn plots. 
df = df_melt.pivot_table(index=['rep', 'genome', 'oxygen'], columns=['week'], values='reads')
df.head()



In [ ]:

    
# get subsets of the data. http://pandas.pydata.org/pandas-docs/stable/advanced.html
idx = pd.IndexSlice
# demo: 
df.loc[idx[:,"Arthrobacter sp. 31Y", 'high'],]   
# any index 1 (rep), only genome == "Arthrobacter sp. 31Y", only oxygen == 'high'



In [ ]:

    
# accessing subsets of the data using the multi-indexes
print df.loc[idx[:,:, 'high'],].head()
sns.heatmap(data = df.loc[idx[:,:, 'high'],])



In [ ]:

    
# Make a new column, called 'facet rep' to use as facet titles
df_melt['facet rep'] = 'replicate ' + df_melt['rep'].astype(str)



In [ ]:

    
df_melt.head()



In [ ]:

    
# Make a plot directory. 
def make_dir(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)
        
make_dir('plots')



In [ ]:

    
# Make facetd heatmaps. 
# http://stackoverflow.com/questions/34552770/getting-a-legend-in-a-seaborn-facetgrid-heatmap-plot

def facet_heatmap(data, color, **kwargs):
    """ Tells the individual plots in the FacetGrid what to do.
    Also passes on info about the legend via the **kws"""
    print kwargs  # holds info about the colorbar.  E.g. cbar_ax=cbar_ax
    # use pivot_table to cast the data how seaborn wants.  
    # Note that the default way to handle duplicate rows would be to average the values.
    # We overwrite this by telling it to use the sum. 
    data = data.pivot_table(index='genome', columns='week', values='reads', aggfunc=np.sum)
    g = sns.heatmap(data, cmap='Blues', **kwargs) # <-- Pass kwargs to heatmap    

def plot_by_oxygen_tension(data, filename, oxygen='all', dir='plots'):
    """ Make and save a seaborn FacetGrid plot for the data."""
    # prepare filename
    filename = dir +"/" + time.strftime("%Y%m%d") + '_' + filename + '--' + oxygen + "_O2"+ '.pdf'
    print filename
    if oxygen=='all':
        print "using all data"
    else: 
        # .loc[idx[:,:,['C1','C3']],idx[:,'foo']]
        data = data.loc[data['oxygen'] == oxygen]
        print data['oxygen'].unique() 
    # make the replicate number a facet label
    col_order = ['replicate 1','replicate 2','replicate 3','replicate 4']
    # need to facet by replicate. 
    # use the seaborn.plotting_context to change the settings for just the current plot:
        # http://stackoverflow.com/questions/25328003/how-can-i-change-the-font-size-using-seaborn-facetgrid
    num_genomes = len(data['genome'].unique())
    with sns.plotting_context(font_scale=8):
        g = sns.FacetGrid(data, col="facet rep", col_wrap=4, 
                          size=1+.2*num_genomes, aspect=50./num_genomes/4,  # size is height of each facet.
                          col_order=col_order)  
        cbar_ax = g.fig.add_axes([.92, .3, .02, .4])  # <-- Create a colorbar axes
        g = g.map_dataframe(facet_heatmap, cbar_ax=cbar_ax) 
               # ,  vmin=0, vmax=1 # <-- Specify the colorbar axes and limits
        g.set_titles(col_template="{col_name}", fontweight='bold', fontsize=18)
        g.set_axis_labels('week')
        g.fig.subplots_adjust(right=.9)  # <-- Add space so the colorbar doesn't overlap the plot
        g.savefig(filename)



In [ ]:

    
# Make the first handful of plots.         
plot_by_oxygen_tension(df_melt, filename = 'reads_per_isolate_genome')
plot_by_oxygen_tension(df_melt, filename = 'reads_per_isolate_genome', oxygen='high')
plot_by_oxygen_tension(df_melt, filename = 'reads_per_isolate_genome', oxygen='low')



In [ ]:

    
# We want to be able to see subsets of the organisms.  
# For example, we may only want to include organisms who had high experssion in at least one sample 
def genomes_with_min_expression(data, min_val):
    """ give back a genome names for genomes with at least one read sum over the mean_val """ 
    return data.loc[data['reads'] > min_val]['genome'].unique()

# demo: 
genomes_with_min_expression(data= df_melt, min_val = 10**6)



In [ ]:

    
def subset_by_threshold(df, min_val):
    """ Subset DataFrame to genomes with names in a list.  """
    return df[df['genome'].isin(genomes_with_min_expression(data=df, min_val = min_val))]



In [ ]:

    
# Demo of subset_by_threshold, which leverages genomes_with_min_expression()
subset_by_threshold(df=df_melt, min_val=10**6).head()



In [ ]:

    
plot_by_oxygen_tension(data = subset_by_threshold(df=df_melt, min_val=10**5), 
                       filename = 'reads_per_isolate_genome--limit_10_to_5th')
plot_by_oxygen_tension(data = subset_by_threshold(df=df_melt, min_val=10**6), 
                       filename = 'reads_per_isolate_genome--limit_10_to_6th')

plot_by_oxygen_tension(data = subset_by_threshold(df=df_melt, min_val=10**5), oxygen='high', 
                       filename = 'reads_per_isolate_genome--limit_10_to_5th')
plot_by_oxygen_tension(data = subset_by_threshold(df=df_melt, min_val=10**6), oxygen='high', 
                       filename = 'reads_per_isolate_genome--limit_10_to_6th')

plot_by_oxygen_tension(data = subset_by_threshold(df=df_melt, min_val=10**5), oxygen='low', 
                       filename = 'reads_per_isolate_genome--limit_10_to_5th')
plot_by_oxygen_tension(data = subset_by_threshold(df=df_melt, min_val=10**6), oxygen='low', 
                       filename = 'reads_per_isolate_genome--limit_10_to_6th')

#plot_by_oxygen_tension(df_melt, filename = 'reads_per_isolate_genome', oxygen='high')
#plot_by_oxygen_tension(df_melt, filename = 'reads_per_isolate_genome', oxygen='low')