In [1]:
import matplotlib as mpl
mpl.use('TkAgg')
import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
import seaborn as sns

In [2]:
gene_data = pd.read_csv('summary.dat', sep = '\t')

In [3]:
gene_data.head()


Out[3]:
genome locus_tag product LakWasM100_LOW12_2_rpkm LakWasMe97_LOW12_2_rpkm LakWasMe98_LOW12_2_rpkm LakWasMe99_LOW12_2_rpkm LakWasM104_HOW12_2_rpkm LakWasM105_HOW12_2_rpkm LakWasM106_HOW12_2_rpkm ... LakWasMet56_HOW8_2_rpkm LakWasMet57_HOW8_2_rpkm LakWasMet58_HOW8_2_rpkm LakWasMet61_LOW9_2_rpkm LakWasMet62_LOW9_2_rpkm LakWasMet63_LOW9_2_rpkm LakWasMet64_LOW9_2_rpkm LakWasMet67_HOW9_2_rpkm LakWasMet69_HOW9_2_rpkm LakWasMet70_HOW9_2_rpkm
0 Acidovora-69x (UID4105) Ga0081644_10011 serine/threonine protein kinase 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 Acidovora-69x (UID4105) Ga0081644_10012 simple sugar transport system permease protein 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 Acidovora-69x (UID4105) Ga0081644_10013 nucleoside ABC transporter ATP-binding protein 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 0 1 0 0
3 Acidovora-69x (UID4105) Ga0081644_10021 hypothetical protein 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 Acidovora-69x (UID4105) Ga0081644_10022 D-amino-acid dehydrogenase 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 2 0 2

5 rows × 86 columns


In [4]:
gene_data.set_index(['genome', 'locus_tag', 'product'], inplace=True)

In [5]:
gene_data.head()


Out[5]:
LakWasM100_LOW12_2_rpkm LakWasMe97_LOW12_2_rpkm LakWasMe98_LOW12_2_rpkm LakWasMe99_LOW12_2_rpkm LakWasM104_HOW12_2_rpkm LakWasM105_HOW12_2_rpkm LakWasM106_HOW12_2_rpkm LakWasM109_LOW13_2_rpkm LakWasM110_LOW13_2_rpkm LakWasM111_LOW13_2_rpkm ... LakWasMet56_HOW8_2_rpkm LakWasMet57_HOW8_2_rpkm LakWasMet58_HOW8_2_rpkm LakWasMet61_LOW9_2_rpkm LakWasMet62_LOW9_2_rpkm LakWasMet63_LOW9_2_rpkm LakWasMet64_LOW9_2_rpkm LakWasMet67_HOW9_2_rpkm LakWasMet69_HOW9_2_rpkm LakWasMet70_HOW9_2_rpkm
genome locus_tag product
Acidovora-69x (UID4105) Ga0081644_10011 serine/threonine protein kinase 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
Ga0081644_10012 simple sugar transport system permease protein 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
Ga0081644_10013 nucleoside ABC transporter ATP-binding protein 0 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 0 1 0 0
Ga0081644_10021 hypothetical protein 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
Ga0081644_10022 D-amino-acid dehydrogenase 0 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 2 0 2

5 rows × 83 columns


In [6]:
sample_sums = pd.DataFrame({'read counts':gene_data.sum(index=0)})

In [7]:
sample_sums.head()


Out[7]:
read counts
LakWasM100_LOW12_2_rpkm 1266898
LakWasMe97_LOW12_2_rpkm 1116377
LakWasMe98_LOW12_2_rpkm 1119813
LakWasMe99_LOW12_2_rpkm 1242902
LakWasM104_HOW12_2_rpkm 1269758

In [8]:
sample_sums['reads/10^6'] = sample_sums['read counts']/(10**6)

In [ ]:
sample_sums[]

In [13]:
sample_sums.index


Out[13]:
Index(['LakWasM100_LOW12_2_rpkm', 'LakWasMe97_LOW12_2_rpkm',
       'LakWasMe98_LOW12_2_rpkm', 'LakWasMe99_LOW12_2_rpkm',
       'LakWasM104_HOW12_2_rpkm', 'LakWasM105_HOW12_2_rpkm',
       'LakWasM106_HOW12_2_rpkm', 'LakWasM109_LOW13_2_rpkm',
       'LakWasM110_LOW13_2_rpkm', 'LakWasM111_LOW13_2_rpkm',
       'LakWasM112_LOW13_2_rpkm', 'LakWasM115_HOW13_2_rpkm',
       'LakWasM116_HOW13_2_rpkm', 'LakWasM117_HOW13_2_rpkm',
       'LakWasM118_HOW13_2_rpkm', 'LakWasM121_LOW14_2_rpkm',
       'LakWasM122_LOW14_2_rpkm', 'LakWasM123_LOW14_2_rpkm',
       'LakWasM124_LOW14_2_rpkm', 'LakWasM127_HOW14_2_rpkm',
       'LakWasM128_HOW14_2_rpkm', 'LakWasM129_HOW14_2_rpkm',
       'LakWasM130_HOW14_2_rpkm', 'LakWasMe73_LOW10_2_rpkm',
       'LakWasMe74_LOW10_2_rpkm', 'LakWasMe75_LOW10_2_rpkm',
       'LakWasMe76_LOW10_2_rpkm', 'LakWasMe79_HOW10_2_rpkm',
       'LakWasMe80_HOW10_2_rpkm', 'LakWasMe81_HOW10_2_rpkm',
       'LakWasMe82_HOW10_2_rpkm', 'LakWasMe85_LOW11_2_rpkm',
       'LakWasMe86_LOW11_2_rpkm', 'LakWasMe91_HOW11_2_rpkm',
       'LakWasMe92_HOW11_2_rpkm', 'LakWasMe93_HOW11_2_rpkm',
       'LakWasMe94_HOW11_2_rpkm', 'LakWasMeta1_LOW4_2_rpkm',
       'LakWasMeta2_LOW4_2_rpkm', 'LakWasMeta3_LOW4_2_rpkm',
       'LakWasMeta4_LOW4_2_rpkm', 'LakWasMet10_HOW4_2_rpkm',
       'LakWasMeta7_HOW4_2_rpkm', 'LakWasMeta8_HOW4_2_rpkm',
       'LakWasMeta9_HOW4_2_rpkm', 'LakWasMet13_LOW5_2_rpkm',
       'LakWasMet14_LOW5_2_rpkm', 'LakWasMet15_LOW5_2_rpkm',
       'LakWasMet16_LOW5_2_rpkm', 'LakWasMet19_HOW5_2_rpkm',
       'LakWasMet20_HOW5_2_rpkm', 'LakWasMet21_HOW5_2_rpkm',
       'LakWasMet22_HOW5_2_rpkm', 'LakWasMet25_LOW6_2_rpkm',
       'LakWasMet26_LOW6_2_rpkm', 'LakWasMet27_LOW6_2_rpkm',
       'LakWasMet28_LOW6_2_rpkm', 'LakWasMet31_HOW6_2_rpkm',
       'LakWasMet32_HOW6_2_rpkm', 'LakWasMet33_HOW6_2_rpkm',
       'LakWasMet34_HOW6_2_rpkm', 'LakWasMet37_LOW7_2_rpkm',
       'LakWasMet38_LOW7_2_rpkm', 'LakWasMet39_LOW7_2_rpkm',
       'LakWasMet40_LOW7_2_rpkm', 'LakWasMet43_HOW7_2_rpkm',
       'LakWasMet44_HOW7_2_rpkm', 'LakWasMet45_HOW7_2_rpkm',
       'LakWasMet49_LOW8_2_rpkm', 'LakWasMet50_LOW8_2_rpkm',
       'LakWasMet51_LOW8_2_rpkm', 'LakWasMet52_LOW8_2_rpkm',
       'LakWasMet55_HOW8_2_rpkm', 'LakWasMet56_HOW8_2_rpkm',
       'LakWasMet57_HOW8_2_rpkm', 'LakWasMet58_HOW8_2_rpkm',
       'LakWasMet61_LOW9_2_rpkm', 'LakWasMet62_LOW9_2_rpkm',
       'LakWasMet63_LOW9_2_rpkm', 'LakWasMet64_LOW9_2_rpkm',
       'LakWasMet67_HOW9_2_rpkm', 'LakWasMet69_HOW9_2_rpkm',
       'LakWasMet70_HOW9_2_rpkm'],
      dtype='object')

In [18]:
sample_sums.loc['LakWasM106_HOW12_2_rpkm']


Out[18]:
read counts    1001033.000000
reads/10^6           1.001033
Name: LakWasM106_HOW12_2_rpkm, dtype: float64

In [9]:
sample_sums.head()


Out[9]:
read counts reads/10^6
LakWasM100_LOW12_2_rpkm 1266898 1.266898
LakWasMe97_LOW12_2_rpkm 1116377 1.116377
LakWasMe98_LOW12_2_rpkm 1119813 1.119813
LakWasMe99_LOW12_2_rpkm 1242902 1.242902
LakWasM104_HOW12_2_rpkm 1269758 1.269758

In [10]:
ax = sample_sums['reads/10^6'].plot.hist(range=[0, 1.5], bins = 60) #range=[6.5, 12.5])
fig = ax.get_figure()
fig.savefig('reads_per_sample--unnormalized.pdf')



In [11]:
gene_sums = pd.DataFrame({'gene sum':gene_data.sum(axis=1, index=2)})

In [ ]:
gene_sums['gene sum'] = gene_sums['gene sum'].astype('int')

In [ ]:
gene_sums = gene_sums[gene_sums['gene sum'] > 0]
import numpy as np
gene_sums['log10(gene sum)'] = np.log10(gene_sums['gene sum'])

In [ ]:
gene_sums.head()

In [ ]:
gene_sums.head() #.plot(kind='hist')

In [ ]:
gene_sums.shape

In [ ]:
fig, ax = plt.subplots()
gene_sums['gene sum'].hist(ax=ax, bins=100, )
ax.set_yscale('log')
ax.set_xlabel('...')

In [ ]:
gene_sums[gene_sums['gene sum'] > 5].head()

In [ ]:
gene_sums[gene_sums['gene sum'] > 10**6].head()

In [ ]:
fig, ax = plt.subplots()
gene_sums[gene_sums['gene sum'] > 0]['gene sum'].hist(ax=ax, bins=100, )
ax.set_yscale('log')
ax.set_xlabel('number of genes with ')

In [ ]:
fig, ax = plt.subplots()
gene_sums[gene_sums['gene sum'] > 10**4]['gene sum'].hist(ax=ax, bins=100, )
ax.set_yscale('log')
ax.set_xlabel('number of genes with ')

In [ ]: