In [1]:
import matplotlib as mpl
mpl.use('TkAgg')
import matplotlib.pyplot as plt
%matplotlib inline

import os
import pandas as pd
import seaborn as sns

In [2]:
save_plots = True

In [3]:
plot_dir = "./plots/"
if not os.path.exists(plot_dir):
    os.makedirs(plot_dir)

In [4]:
sns.set_style("ticks")

In [5]:
sns.despine()


<matplotlib.figure.Figure at 0x10e5e0c18>

In [6]:
mpl.rcParams.update({
    'font.size': 16, 'axes.titlesize': 17, 'axes.labelsize': 15,
    'xtick.labelsize': 10, 'ytick.labelsize': 13,
    #'font.family': 'Lato', 
    'font.weight': 600,
    'axes.labelweight': 600, 'axes.titleweight': 600,
    'figure.autolayout': True,
    })

In [7]:
!ls -l '../../statistics.xls'


-rw-r--r--  1 janet  staff  11503 Mar 14 16:21 ../../statistics.xls

In [8]:
df = pd.read_csv('../../statistics.xls', sep="\t")

In [9]:
df.head()


Out[9]:
sample shortd total_reads total_reads_mapped s.total_reads_mapped / s.total_reads reads_mapped_to_rRNA s.reads_mapped_to_rRNA / s.total_reads_mapped reads_mapped_to_tRNA s.reads_mapped_to_tRNA / s.total_reads_mapped reads_mapped_to_CDS s.reads_mapped_to_CDS / s.total_reads_mapped reads_mapped_to_hypo s.reads_mapped_to_hypo / s.total_reads_mapped CDS_max_hits_per_kb CDS_mean_hits_per_kb
0 LakWasM100_LOW12_2 LakWasM100_LOW12_2 49751684 36402439 0.7317 568151 0.0156 13301 0.0004 19281601 0.5297 1747581 0.0480 4392450.0 144.1740
1 LakWasM104_HOW12_2 LakWasM104_HOW12_2 44057367 40004440 0.9080 1199481 0.0300 16308 0.0004 21358502 0.5339 2552060 0.0638 1685880.0 160.1830
2 LakWasM105_HOW12_2 LakWasM105_HOW12_2 45238626 39941526 0.8829 3498624 0.0876 13137 0.0003 13091596 0.3278 1936590 0.0485 1357410.0 111.6500
3 LakWasM106_HOW12_2 LakWasM106_HOW12_2 41878161 13552702 0.3236 2838281 0.2094 2479 0.0002 4648171 0.3430 597351 0.0441 427542.0 27.4376
4 LakWasM109_LOW13_2 LakWasM109_LOW13_2 36236191 34764674 0.9594 321671 0.0093 5118 0.0001 9683358 0.2785 124209 0.0036 3223020.0 64.1973

In [10]:
df['ID'] = df.shortd.str.extract('[A-z]+([0-9]+_[LH]OW[0-9]+)')


-c:1: FutureWarning: currently extract(expand=None) means expand=False (return Index/Series/DataFrame) but in a future version of pandas this will be changed to expand=True (return DataFrame)

In [11]:
df.head()


Out[11]:
sample shortd total_reads total_reads_mapped s.total_reads_mapped / s.total_reads reads_mapped_to_rRNA s.reads_mapped_to_rRNA / s.total_reads_mapped reads_mapped_to_tRNA s.reads_mapped_to_tRNA / s.total_reads_mapped reads_mapped_to_CDS s.reads_mapped_to_CDS / s.total_reads_mapped reads_mapped_to_hypo s.reads_mapped_to_hypo / s.total_reads_mapped CDS_max_hits_per_kb CDS_mean_hits_per_kb ID
0 LakWasM100_LOW12_2 LakWasM100_LOW12_2 49751684 36402439 0.7317 568151 0.0156 13301 0.0004 19281601 0.5297 1747581 0.0480 4392450.0 144.1740 100_LOW12
1 LakWasM104_HOW12_2 LakWasM104_HOW12_2 44057367 40004440 0.9080 1199481 0.0300 16308 0.0004 21358502 0.5339 2552060 0.0638 1685880.0 160.1830 104_HOW12
2 LakWasM105_HOW12_2 LakWasM105_HOW12_2 45238626 39941526 0.8829 3498624 0.0876 13137 0.0003 13091596 0.3278 1936590 0.0485 1357410.0 111.6500 105_HOW12
3 LakWasM106_HOW12_2 LakWasM106_HOW12_2 41878161 13552702 0.3236 2838281 0.2094 2479 0.0002 4648171 0.3430 597351 0.0441 427542.0 27.4376 106_HOW12
4 LakWasM109_LOW13_2 LakWasM109_LOW13_2 36236191 34764674 0.9594 321671 0.0093 5118 0.0001 9683358 0.2785 124209 0.0036 3223020.0 64.1973 109_LOW13

In [12]:
!ls "../../sample_meta_info.tsv"


../../sample_meta_info.tsv

In [13]:
smi = pd.read_csv('../../sample_meta_info.tsv', sep = '\t')

In [14]:
smi.head()


Out[14]:
ID oxy rep week project
0 1_LOW4 Low 1 4 1056013
1 13_LOW5 Low 1 5 1056037
2 25_LOW6 Low 1 6 1056061
3 37_LOW7 Low 1 7 1056085
4 49_LOW8 Low 1 8 1056109

Merge on sample info so I can get at the names for plotting.


In [15]:
df.shape


Out[15]:
(82, 16)

In [16]:
pd.merge(smi, df).shape


Out[16]:
(82, 20)

In [17]:
df = pd.merge(smi, df)

In [18]:
df.columns


Out[18]:
Index(['ID', 'oxy', 'rep', 'week', 'project', 'sample', 'shortd',
       'total_reads', 'total_reads_mapped',
       's.total_reads_mapped / s.total_reads', 'reads_mapped_to_rRNA',
       's.reads_mapped_to_rRNA / s.total_reads_mapped', 'reads_mapped_to_tRNA',
       's.reads_mapped_to_tRNA / s.total_reads_mapped', 'reads_mapped_to_CDS',
       's.reads_mapped_to_CDS / s.total_reads_mapped', 'reads_mapped_to_hypo',
       's.reads_mapped_to_hypo / s.total_reads_mapped', 'CDS_max_hits_per_kb',
       'CDS_mean_hits_per_kb'],
      dtype='object')

In [19]:
ax = df['total_reads'].plot(kind='hist')
ax.set_xlabel("total reads / 10^7")
ax.set_ylabel("number of samples")
ax.set_title("total reads (mapped & unmapped)", y=1.05)
ax.set_xlim([0, max(df['total_reads'])])
if save_plots:
    ax.figure.savefig(plot_dir+"total_reads_histogram.pdf")


/Users/janet/miniconda3/envs/m4_janalysis/lib/python3.5/site-packages/matplotlib/figure.py:1744: UserWarning: This figure includes Axes that are not compatible with tight_layout, so its results might be incorrect.
  warnings.warn("This figure includes Axes that are not "
df['unmapped reads'] = (df['total_reads'] - df['reads_mapped_to_rRNA']

In [20]:
df.head(3)


Out[20]:
ID oxy rep week project sample shortd total_reads total_reads_mapped s.total_reads_mapped / s.total_reads reads_mapped_to_rRNA s.reads_mapped_to_rRNA / s.total_reads_mapped reads_mapped_to_tRNA s.reads_mapped_to_tRNA / s.total_reads_mapped reads_mapped_to_CDS s.reads_mapped_to_CDS / s.total_reads_mapped reads_mapped_to_hypo s.reads_mapped_to_hypo / s.total_reads_mapped CDS_max_hits_per_kb CDS_mean_hits_per_kb
0 1_LOW4 Low 1 4 1056013 LakWasMeta1_LOW4_2 LakWasMeta1_LOW4_2 49467855 34913644 0.7058 18052199 0.5171 3244 0.0001 4212642 0.1207 463270 0.0133 755669.0 33.8984
1 13_LOW5 Low 1 5 1056037 LakWasMet13_LOW5_2 LakWasMet13_LOW5_2 40105612 29735707 0.7414 11809949 0.3972 6779 0.0002 6439789 0.2166 799935 0.0269 993211.0 53.3784
2 37_LOW7 Low 1 7 1056085 LakWasMet37_LOW7_2 LakWasMet37_LOW7_2 33363991 28420269 0.8518 2993615 0.1053 5624 0.0002 13473120 0.4741 1212334 0.0427 2740460.0 99.3911

In [21]:
# !! 160602: this x-axis label is misleading.  Was thre a mistake? 
ax = (df['total_reads'] - df['reads_mapped_to_rRNA']).plot(kind='hist')
ax.set_xlabel("unmapped reads / 10^7")
ax.set_ylabel("number of samples")
ax.set_title('unmapped reads', y=1.05)
if save_plots:
    ax.figure.savefig(plot_dir+"unmapped_reads_histogram.pdf")


/Users/janet/miniconda3/envs/m4_janalysis/lib/python3.5/site-packages/matplotlib/figure.py:1744: UserWarning: This figure includes Axes that are not compatible with tight_layout, so its results might be incorrect.
  warnings.warn("This figure includes Axes that are not "

In [22]:
ax = df.plot.scatter(x='total_reads', y='total_reads_mapped', figsize=(5,5))
sns.despine()
#low = 3*10**7
low = 0
high = 6.5*10**7
plt.plot([low, high], [low, high], '--', c='gray', lw=2)
plt.ylim((low, high))
plt.xlim((low, high))

ax.set_title('total mapped vs total available', y=1.05)


Out[22]:
<matplotlib.text.Text at 0x1116f8be0>
/Users/janet/miniconda3/envs/m4_janalysis/lib/python3.5/site-packages/matplotlib/figure.py:1744: UserWarning: This figure includes Axes that are not compatible with tight_layout, so its results might be incorrect.
  warnings.warn("This figure includes Axes that are not "

In [23]:
# http://pyhogs.github.io/colormap-examples.html

import numpy as np 

x = np.linspace(-np.pi, np.pi, 50) 
y = np.linspace(-np.pi, np.pi, 50)

X,Y = np.meshgrid(x,y)
Z = np.sin(X + Y/4)


def custom_div_cmap(numcolors=11, name='custom_div_cmap',
                    mincol='blue', midcol='white', maxcol='red'):
    """ Create a custom diverging colormap with three colors
    
    Default is blue to white to red with 11 colors.  Colors can be specified
    in any way understandable by matplotlib.colors.ColorConverter.to_rgb()
    """

    from matplotlib.colors import LinearSegmentedColormap 
    
    cmap = LinearSegmentedColormap.from_list(name=name, 
                                             colors =[mincol, midcol, maxcol],
                                             N=numcolors)
    return cmap

custom_map = custom_div_cmap(11, mincol='g', midcol='0.9' ,maxcol='CornflowerBlue')
plt.pcolormesh(X, Y, Z, cmap=custom_map)
plt.axis([-3, 3, -2, 3])
plt.colorbar()
plt.title('green-gray-blue custom colormap')


Out[23]:
<matplotlib.text.Text at 0x11187fa20>
/Users/janet/miniconda3/envs/m4_janalysis/lib/python3.5/site-packages/matplotlib/figure.py:1744: UserWarning: This figure includes Axes that are not compatible with tight_layout, so its results might be incorrect.
  warnings.warn("This figure includes Axes that are not "

In [24]:
# from http://pyhogs.github.io/colormap-examples.html  : 
def custom_seq_cmap(numcolors=11, name='custom_div_cmap',
                    mincol='blue', maxcol='red'):
    """ Create a custom diverging colormap with three colors
    
    Default is blue to white to red with 11 colors.  Colors can be specified
    in any way understandable by matplotlib.colors.ColorConverter.to_rgb()
    """

    from matplotlib.colors import LinearSegmentedColormap 
    
    cmap = LinearSegmentedColormap.from_list(name=name, 
                                             colors =[mincol, maxcol],
                                             N=numcolors)
    return cmap

In [25]:
custom_map = custom_seq_cmap(numcolors=11, name='custom_div_cmap',
                    mincol='#bdbdbd', maxcol='#e34a33')
fig, ax = plt.subplots()
df.plot.scatter(x='total_reads', y='total_reads_mapped', 
                c='reads_mapped_to_rRNA', s= 60,
                sharex=False, # hack to get back the x label
                # hack from: https://github.com/pydata/pandas/issues/10611
                figsize=(6,6),
                cmap=custom_map,
                colorbar = True,
                ax=ax
                )
sns.despine()
ax.set_title('CDS vs total mapped', y=1.08)

#plt.tight_layout()
#ax.figure.savefig(plot_dir + 'CDS_vs_total_mapped--color_by_rRNA_reads_mapped.pdf')

# Add a dashed line for the expected result. 
#low = 3*10**7
low = 0
high = 6.5*10**7
plt.plot([low, high], [low, high], '--', c='gray', lw=2)
plt.ylim((low, high))
plt.xlim((low, high))

ax.set_title('total mapped vs total available', y=1.05)

ax.figure.savefig(plot_dir + '160602_motivate_unmapped_read_hunt.pdf')
ax.figure.savefig(plot_dir + '160602_motivate_unmapped_read_hunt.svg')


/Users/janet/miniconda3/envs/m4_janalysis/lib/python3.5/site-packages/matplotlib/figure.py:1744: UserWarning: This figure includes Axes that are not compatible with tight_layout, so its results might be incorrect.
  warnings.warn("This figure includes Axes that are not "

In [26]:
ax = df.plot.scatter(x='total_reads', y='total_reads_mapped', 
                     c='reads_mapped_to_rRNA', s= 60,
                     sharex=False, # hack to get back the x label
                     # hack from: https://github.com/pydata/pandas/issues/10611
                     figsize=(6,6),
                     cmap=custom_map
                    )
sns.despine()


/Users/janet/miniconda3/envs/m4_janalysis/lib/python3.5/site-packages/matplotlib/figure.py:1744: UserWarning: This figure includes Axes that are not compatible with tight_layout, so its results might be incorrect.
  warnings.warn("This figure includes Axes that are not "

In [27]:
ax = df.plot.scatter(x='total_reads', y='reads_mapped_to_CDS')
ax.set_title('CDS vs total', y=1.05)


Out[27]:
<matplotlib.text.Text at 0x111aec198>
/Users/janet/miniconda3/envs/m4_janalysis/lib/python3.5/site-packages/matplotlib/figure.py:1744: UserWarning: This figure includes Axes that are not compatible with tight_layout, so its results might be incorrect.
  warnings.warn("This figure includes Axes that are not "

In [28]:
ax = df.plot.scatter(x='total_reads_mapped', 
                     y='reads_mapped_to_CDS', 
                     c='reads_mapped_to_rRNA', s= 60,
                     sharex=False # hack to get back the x label
                     # hack from: https://github.com/pydata/pandas/issues/10611
                    )
ax.set_title('CDS vs total mapped', y=1.05)
plt.tight_layout()
ax.figure.savefig(plot_dir + 'CDS_vs_total_mapped--color_by_rRNA_reads_mapped.pdf')


/Users/janet/miniconda3/envs/m4_janalysis/lib/python3.5/site-packages/matplotlib/figure.py:1744: UserWarning: This figure includes Axes that are not compatible with tight_layout, so its results might be incorrect.
  warnings.warn("This figure includes Axes that are not "

In [29]:
def label_point(x, y, val, ax):
    a = pd.concat({'x': x, 'y': y, 'val': val}, axis=1)
    for i, point in a.iterrows():
        ax.text(point['x'], point['y'], str(point['val']),
        color='silver', fontsize=12)

In [30]:
ax = df.plot.scatter(x='total_reads_mapped', 
                     y='reads_mapped_to_CDS', 
                     c='reads_mapped_to_rRNA', s= 100,
                     sharex=False # hack to get back the x label
                     # hack from: https://github.com/pydata/pandas/issues/10611
                    )
ax.set_title('CDS vs total mapped', y=1.05)
ax.figure.savefig(plot_dir + 
                  'CDS_vs_total_mapped--color_by_rRNA_reads_mapped.pdf')
label_point(df.total_reads_mapped, df.reads_mapped_to_CDS, 
            df.ID, ax)
ax.figure.set_size_inches(14, 10)
ax.figure.savefig(plot_dir + 'CDS_vs_total_mapped--color_by_rRNA_reads_mapped--labeled.pdf')


/Users/janet/miniconda3/envs/m4_janalysis/lib/python3.5/site-packages/matplotlib/figure.py:1744: UserWarning: This figure includes Axes that are not compatible with tight_layout, so its results might be incorrect.
  warnings.warn("This figure includes Axes that are not "
ax = df.plot.scatter(x='total_reads', y='reads_mapped_to_CDS', c='total_reads_mapped') #ax.set_title('CDS vs mapped') ax.set_xlabel('total reads mapped') ax.figure.savefig(plot_dir + 'CDS_vs_total_mapped--color_by_rRNA_reads_mapped.pdf')

In [31]:
df.head()


Out[31]:
ID oxy rep week project sample shortd total_reads total_reads_mapped s.total_reads_mapped / s.total_reads reads_mapped_to_rRNA s.reads_mapped_to_rRNA / s.total_reads_mapped reads_mapped_to_tRNA s.reads_mapped_to_tRNA / s.total_reads_mapped reads_mapped_to_CDS s.reads_mapped_to_CDS / s.total_reads_mapped reads_mapped_to_hypo s.reads_mapped_to_hypo / s.total_reads_mapped CDS_max_hits_per_kb CDS_mean_hits_per_kb
0 1_LOW4 Low 1 4 1056013 LakWasMeta1_LOW4_2 LakWasMeta1_LOW4_2 49467855 34913644 0.7058 18052199 0.5171 3244 0.0001 4212642 0.1207 463270 0.0133 755669.0 33.8984
1 13_LOW5 Low 1 5 1056037 LakWasMet13_LOW5_2 LakWasMet13_LOW5_2 40105612 29735707 0.7414 11809949 0.3972 6779 0.0002 6439789 0.2166 799935 0.0269 993211.0 53.3784
2 37_LOW7 Low 1 7 1056085 LakWasMet37_LOW7_2 LakWasMet37_LOW7_2 33363991 28420269 0.8518 2993615 0.1053 5624 0.0002 13473120 0.4741 1212334 0.0427 2740460.0 99.3911
3 49_LOW8 Low 1 8 1056109 LakWasMet49_LOW8_2 LakWasMet49_LOW8_2 48116830 28931614 0.6013 1356581 0.0469 8572 0.0003 17123824 0.5919 1735830 0.0600 2848280.0 124.2090
4 61_LOW9 Low 1 9 1056133 LakWasMet61_LOW9_2 LakWasMet61_LOW9_2 42124631 37974206 0.9015 830569 0.0219 16935 0.0004 19753275 0.5202 2251523 0.0593 2915610.0 154.0810

In [32]:
df.columns


Out[32]:
Index(['ID', 'oxy', 'rep', 'week', 'project', 'sample', 'shortd',
       'total_reads', 'total_reads_mapped',
       's.total_reads_mapped / s.total_reads', 'reads_mapped_to_rRNA',
       's.reads_mapped_to_rRNA / s.total_reads_mapped', 'reads_mapped_to_tRNA',
       's.reads_mapped_to_tRNA / s.total_reads_mapped', 'reads_mapped_to_CDS',
       's.reads_mapped_to_CDS / s.total_reads_mapped', 'reads_mapped_to_hypo',
       's.reads_mapped_to_hypo / s.total_reads_mapped', 'CDS_max_hits_per_kb',
       'CDS_mean_hits_per_kb'],
      dtype='object')

In [33]:
# s.total_reads_mapped / s.total_reads
#my_cmap = (matplotlib.color.LinearSegmentedColormap.
#        from_list('blueWhiteRed', ['blue', 'white', 'red']))

ax = df.plot.scatter(x='total_reads', y='total_reads_mapped', 
                     c='s.reads_mapped_to_CDS / s.total_reads_mapped',
                    cmap=plt.cm.bone,
                    sharex=False # hack to get back the x label
                     # hack from: https://github.com/pydata/pandas/issues/10611
                    )
#ax.set_title('CDS vs mapped')
ax.set_xlabel('total reads (mapped + unmapped)')
ax.figure.savefig(plot_dir + 
                  'total_mapped_vs_total--color_by_frac_to_CDS.pdf')


/Users/janet/miniconda3/envs/m4_janalysis/lib/python3.5/site-packages/matplotlib/figure.py:1744: UserWarning: This figure includes Axes that are not compatible with tight_layout, so its results might be incorrect.
  warnings.warn("This figure includes Axes that are not "

In [34]:
df["ufunc 'sqrt' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''"]


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/Users/janet/miniconda3/envs/m4_janalysis/lib/python3.5/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
   1875             try:
-> 1876                 return self._engine.get_loc(key)
   1877             except KeyError:

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4027)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:3891)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12408)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12359)()

KeyError: "ufunc 'sqrt' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''"

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-34-4fdc2e728a12> in <module>()
----> 1 df["ufunc 'sqrt' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''"]

/Users/janet/miniconda3/envs/m4_janalysis/lib/python3.5/site-packages/pandas/core/frame.py in __getitem__(self, key)
   1990             return self._getitem_multilevel(key)
   1991         else:
-> 1992             return self._getitem_column(key)
   1993 
   1994     def _getitem_column(self, key):

/Users/janet/miniconda3/envs/m4_janalysis/lib/python3.5/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   1997         # get column
   1998         if self.columns.is_unique:
-> 1999             return self._get_item_cache(key)
   2000 
   2001         # duplicate columns & possible reduce dimensionality

/Users/janet/miniconda3/envs/m4_janalysis/lib/python3.5/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
   1343         res = cache.get(item)
   1344         if res is None:
-> 1345             values = self._data.get(item)
   1346             res = self._box_item_values(item, values)
   1347             cache[item] = res

/Users/janet/miniconda3/envs/m4_janalysis/lib/python3.5/site-packages/pandas/core/internals.py in get(self, item, fastpath)
   3223 
   3224             if not isnull(item):
-> 3225                 loc = self.items.get_loc(item)
   3226             else:
   3227                 indexer = np.arange(len(self.items))[isnull(self.items)]

/Users/janet/miniconda3/envs/m4_janalysis/lib/python3.5/site-packages/pandas/indexes/base.py in get_loc(self, key, method, tolerance)
   1876                 return self._engine.get_loc(key)
   1877             except KeyError:
-> 1878                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   1879 
   1880         indexer = self.get_indexer([key], method=method, tolerance=tolerance)

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:4027)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:3891)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12408)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12359)()

KeyError: "ufunc 'sqrt' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''"

In [ ]:
# s.total_reads_mapped / s.total_reads
#my_cmap = (matplotlib.color.LinearSegmentedColormap.
#        from_list('blueWhiteRed', ['blue', 'white', 'red']))

ax = df.plot.scatter(x='total_reads', y='total_reads_mapped', 
                     c='s.reads_mapped_to_CDS / s.total_reads_mapped',
                    cmap=plt.cm.bone,
                     s='s.reads_mapped_to_rRNA / s.total_reads_mapped',
                    sharex=False # hack to get back the x label
                     # hack from: https://github.com/pydata/pandas/issues/10611
                    )
label_point(df.total_reads, df.total_reads_mapped, 
            df.ID, ax)
ax.figure.set_size_inches(14, 10)
#ax.figure.savefig(plot_dir + 
#                  'total_mapped_vs_total--color_by_frac_to_CDS--labeled.pdf')

In [ ]:
# s.total_reads_mapped / s.total_reads
#my_cmap = (matplotlib.color.LinearSegmentedColormap.
#        from_list('blueWhiteRed', ['blue', 'white', 'red']))

ax = df.plot.scatter(x='total_reads', y='total_reads_mapped', 
                     c='s.reads_mapped_to_CDS / s.total_reads_mapped',
                    cmap=plt.cm.bone,
                     s=100,
                    sharex=False # hack to get back the x label
                     # hack from: https://github.com/pydata/pandas/issues/10611
                    )
label_point(df.total_reads, df.total_reads_mapped, 
            df.ID, ax)
ax.figure.set_size_inches(14, 10)
ax.figure.savefig(plot_dir + 
                  'total_mapped_vs_total--color_by_frac_to_CDS--labeled.pdf')

In [ ]:
ax = df.plot.scatter(x='total_reads', y='reads_mapped_to_rRNA')
ax.set_title('rRNA vs total')

In [ ]:
ax = df['reads_mapped_to_rRNA'].plot(kind='hist')
ax.set_xlabel("reads mapped to rRNA / 10^7")
ax.set_ylabel("number of samples")

In [ ]:
ax = df['s.total_reads_mapped / s.total_reads'].plot(kind='hist')
ax.set_xlabel("fraction of reads mapped")
ax.set_ylabel("number of samples")

In [ ]: