In [1]:
import matplotlib as mpl
mpl.use('TkAgg')
import matplotlib.pyplot as plt
%matplotlib inline
import datetime
import os
import pandas as pd
import seaborn as sns
In [2]:
import datetime
Option to save plots upon regeneration (makes a new file since date string is appended.):
In [3]:
mpl.rcParams.update({
'font.size': 16, 'axes.titlesize': 17, 'axes.labelsize': 15,
'xtick.labelsize': 10, 'ytick.labelsize': 13,
#'font.family': 'Lato',
'font.weight': 600,
'axes.labelweight': 600, 'axes.titleweight': 600
#'figure.autolayout': True
})
In [4]:
regen_plots = False
In [5]:
if regen_plots:
date_string = datetime.date.today().strftime("%y%m%d")
plotpath = "./plots/" + date_string + "_read_alignments_to_bins"+ "/"
print(plotpath)
if not os.path.exists(plotpath):
os.makedirs(plotpath)
In [6]:
# Note: we are going to change the names of these files.
# I had originally used summary.dat but learned these are RPKM values even though
# they are integers.
read_counts_path = \
"/gscratch/lidstrom/meta4_bins/analysis/assemble_summaries/summary_counts.xls"
gene_data = pd.read_csv(read_counts_path, sep = '\t')
In [7]:
gene_data.head(2)
Out[7]:
In [8]:
gene_data.set_index(['genome', 'locus_tag', 'product'], inplace=True)
In [9]:
gene_data.head()
Out[9]:
In [10]:
sample_sums = pd.DataFrame({'read counts':gene_data.sum(index=0)})
In [11]:
sample_sums['reads/10^6'] = sample_sums['read counts']/(10**6)
In [12]:
sample_sums.head()
Out[12]:
In [13]:
ax = sample_sums['reads/10^6'].plot.hist(bins = 60) #range=[6.5, 12.5])
fig = ax.get_figure()
ax.set_xlabel("million reads mapped to metagenome bin")
ax.set_ylabel("# of metagenome bins \n with that many million reads")
ax.set_title("The number of reads per bin varies more than an order of magnitude across samples")
if regen_plots:
fig.savefig(plotpath + 'reads_per_sample--unnormalized.pdf')
Whats the one with less than a million?
In [14]:
sample_sums.sort(columns='reads/10^6', axis=0, ascending=True)
Out[14]:
In [15]:
gene_sums = pd.DataFrame({'gene sum':gene_data.sum(axis=1, index=2)})
In [16]:
gene_sums['gene sum'] = gene_sums['gene sum'].astype('int')
In [17]:
gene_sums = gene_sums[gene_sums['gene sum'] > 0]
In [18]:
gene_sums.head()
Out[18]:
In [19]:
gene_sums.shape
Out[19]:
In [20]:
fig, ax = plt.subplots()
gene_sums['gene sum'].hist(ax=ax, bins=100, )
ax.set_yscale('log')
ax.set_xlabel('number of reads mapped')
ax.set_xlabel('number of genes with that many reads')
ax.set_title("Most genes have almost no reads")
if regen_plots:
fig.savefig(plotpath + 'most_genes_have_no_reads--zoom0.pdf')
In [21]:
fig, ax = plt.subplots()
#ax.set_xlim(0, 10**5)
gene_sums[gene_sums['gene sum'] < 10**7]['gene sum'].hist(ax=ax, bins=100, )
ax.set_yscale('log')
ax.set_xlabel('number of reads mapped')
ax.set_xlabel('number of genes with that many reads')
ax.set_title("Most genes have almost no reads")
if regen_plots:
fig.savefig(plotpath + 'most_genes_have_no_reads--zoom1.pdf')
In [22]:
fig, ax = plt.subplots()
#ax.set_xlim(0, 10**5)
gene_sums[gene_sums['gene sum'] > 10**3]['gene sum'].hist(ax=ax, bins=100, )
ax.set_yscale('log')
ax.set_xlabel('number of reads mapped')
ax.set_xlabel('number of genes with that many reads')
ax.set_title("Most genes have almost no reads")
if regen_plots:
fig.savefig(plotpath + 'most_genes_have_no_reads--zoom_high.pdf')
In [23]:
gene_sums[gene_sums['gene sum'] > 5].head()
Out[23]:
In [24]:
gene_sums[gene_sums['gene sum'] > 10**6].head()
Out[24]:
In [25]:
fig, ax = plt.subplots()
gene_sums[gene_sums['gene sum'] > 0]['gene sum'].hist(ax=ax, bins=100, )
ax.set_yscale('log')
ax.set_xlabel('number of genes with ')
Out[25]:
In [26]:
fig, ax = plt.subplots()
gene_sums[gene_sums['gene sum'] > 10**4]['gene sum'].hist(ax=ax, bins=100, )
ax.set_yscale('log')
ax.set_xlabel('number of genes with ')
Out[26]:
In [ ]: