In [1]:
import matplotlib as mpl
mpl.use('TkAgg')
import matplotlib.pyplot as plt
%matplotlib inline
import os
import pandas as pd
import seaborn as sns
In [2]:
save_plots = True
In [3]:
plot_dir = "./plots/"
if not os.path.exists(plot_dir):
os.makedirs(plot_dir)
In [4]:
sns.set_style("ticks")
In [5]:
sns.despine()
In [6]:
mpl.rcParams.update({
'font.size': 16, 'axes.titlesize': 17, 'axes.labelsize': 15,
'xtick.labelsize': 10, 'ytick.labelsize': 13,
#'font.family': 'Lato',
'font.weight': 600,
'axes.labelweight': 600, 'axes.titleweight': 600,
'figure.autolayout': True,
})
In [7]:
!ls -l '../../statistics.xls'
In [8]:
df = pd.read_csv('../../statistics.xls', sep="\t")
In [9]:
df.head()
Out[9]:
In [10]:
df['ID'] = df.shortd.str.extract('[A-z]+([0-9]+_[LH]OW[0-9]+)')
In [11]:
df.head()
Out[11]:
In [12]:
!ls "../../sample_meta_info.tsv"
In [13]:
smi = pd.read_csv('../../sample_meta_info.tsv', sep = '\t')
In [14]:
smi.head()
Out[14]:
In [15]:
df.shape
Out[15]:
In [16]:
pd.merge(smi, df).shape
Out[16]:
In [17]:
df = pd.merge(smi, df)
In [18]:
df.columns
Out[18]:
In [19]:
ax = df['total_reads'].plot(kind='hist')
ax.set_xlabel("total reads / 10^7")
ax.set_ylabel("number of samples")
ax.set_title("total reads (mapped & unmapped)", y=1.05)
ax.set_xlim([0, max(df['total_reads'])])
if save_plots:
ax.figure.savefig(plot_dir+"total_reads_histogram.pdf")
In [20]:
df.head(3)
Out[20]:
In [21]:
# !! 160602: this x-axis label is misleading. Was thre a mistake?
ax = (df['total_reads'] - df['reads_mapped_to_rRNA']).plot(kind='hist')
ax.set_xlabel("unmapped reads / 10^7")
ax.set_ylabel("number of samples")
ax.set_title('unmapped reads', y=1.05)
if save_plots:
ax.figure.savefig(plot_dir+"unmapped_reads_histogram.pdf")
In [22]:
ax = df.plot.scatter(x='total_reads', y='total_reads_mapped', figsize=(5,5))
sns.despine()
#low = 3*10**7
low = 0
high = 6.5*10**7
plt.plot([low, high], [low, high], '--', c='gray', lw=2)
plt.ylim((low, high))
plt.xlim((low, high))
ax.set_title('total mapped vs total available', y=1.05)
Out[22]:
In [23]:
# http://pyhogs.github.io/colormap-examples.html
import numpy as np
x = np.linspace(-np.pi, np.pi, 50)
y = np.linspace(-np.pi, np.pi, 50)
X,Y = np.meshgrid(x,y)
Z = np.sin(X + Y/4)
def custom_div_cmap(numcolors=11, name='custom_div_cmap',
mincol='blue', midcol='white', maxcol='red'):
""" Create a custom diverging colormap with three colors
Default is blue to white to red with 11 colors. Colors can be specified
in any way understandable by matplotlib.colors.ColorConverter.to_rgb()
"""
from matplotlib.colors import LinearSegmentedColormap
cmap = LinearSegmentedColormap.from_list(name=name,
colors =[mincol, midcol, maxcol],
N=numcolors)
return cmap
custom_map = custom_div_cmap(11, mincol='g', midcol='0.9' ,maxcol='CornflowerBlue')
plt.pcolormesh(X, Y, Z, cmap=custom_map)
plt.axis([-3, 3, -2, 3])
plt.colorbar()
plt.title('green-gray-blue custom colormap')
Out[23]:
In [24]:
# from http://pyhogs.github.io/colormap-examples.html :
def custom_seq_cmap(numcolors=11, name='custom_div_cmap',
mincol='blue', maxcol='red'):
""" Create a custom diverging colormap with three colors
Default is blue to white to red with 11 colors. Colors can be specified
in any way understandable by matplotlib.colors.ColorConverter.to_rgb()
"""
from matplotlib.colors import LinearSegmentedColormap
cmap = LinearSegmentedColormap.from_list(name=name,
colors =[mincol, maxcol],
N=numcolors)
return cmap
In [25]:
custom_map = custom_seq_cmap(numcolors=11, name='custom_div_cmap',
mincol='#bdbdbd', maxcol='#e34a33')
fig, ax = plt.subplots()
df.plot.scatter(x='total_reads', y='total_reads_mapped',
c='reads_mapped_to_rRNA', s= 60,
sharex=False, # hack to get back the x label
# hack from: https://github.com/pydata/pandas/issues/10611
figsize=(6,6),
cmap=custom_map,
colorbar = True,
ax=ax
)
sns.despine()
ax.set_title('CDS vs total mapped', y=1.08)
#plt.tight_layout()
#ax.figure.savefig(plot_dir + 'CDS_vs_total_mapped--color_by_rRNA_reads_mapped.pdf')
# Add a dashed line for the expected result.
#low = 3*10**7
low = 0
high = 6.5*10**7
plt.plot([low, high], [low, high], '--', c='gray', lw=2)
plt.ylim((low, high))
plt.xlim((low, high))
ax.set_title('total mapped vs total available', y=1.05)
ax.figure.savefig(plot_dir + '160602_motivate_unmapped_read_hunt.pdf')
ax.figure.savefig(plot_dir + '160602_motivate_unmapped_read_hunt.svg')
In [26]:
ax = df.plot.scatter(x='total_reads', y='total_reads_mapped',
c='reads_mapped_to_rRNA', s= 60,
sharex=False, # hack to get back the x label
# hack from: https://github.com/pydata/pandas/issues/10611
figsize=(6,6),
cmap=custom_map
)
sns.despine()
In [27]:
ax = df.plot.scatter(x='total_reads', y='reads_mapped_to_CDS')
ax.set_title('CDS vs total', y=1.05)
Out[27]:
In [28]:
ax = df.plot.scatter(x='total_reads_mapped',
y='reads_mapped_to_CDS',
c='reads_mapped_to_rRNA', s= 60,
sharex=False # hack to get back the x label
# hack from: https://github.com/pydata/pandas/issues/10611
)
ax.set_title('CDS vs total mapped', y=1.05)
plt.tight_layout()
ax.figure.savefig(plot_dir + 'CDS_vs_total_mapped--color_by_rRNA_reads_mapped.pdf')
In [29]:
def label_point(x, y, val, ax):
a = pd.concat({'x': x, 'y': y, 'val': val}, axis=1)
for i, point in a.iterrows():
ax.text(point['x'], point['y'], str(point['val']),
color='silver', fontsize=12)
In [30]:
ax = df.plot.scatter(x='total_reads_mapped',
y='reads_mapped_to_CDS',
c='reads_mapped_to_rRNA', s= 100,
sharex=False # hack to get back the x label
# hack from: https://github.com/pydata/pandas/issues/10611
)
ax.set_title('CDS vs total mapped', y=1.05)
ax.figure.savefig(plot_dir +
'CDS_vs_total_mapped--color_by_rRNA_reads_mapped.pdf')
label_point(df.total_reads_mapped, df.reads_mapped_to_CDS,
df.ID, ax)
ax.figure.set_size_inches(14, 10)
ax.figure.savefig(plot_dir + 'CDS_vs_total_mapped--color_by_rRNA_reads_mapped--labeled.pdf')
In [31]:
df.head()
Out[31]:
In [32]:
df.columns
Out[32]:
In [33]:
# s.total_reads_mapped / s.total_reads
#my_cmap = (matplotlib.color.LinearSegmentedColormap.
# from_list('blueWhiteRed', ['blue', 'white', 'red']))
ax = df.plot.scatter(x='total_reads', y='total_reads_mapped',
c='s.reads_mapped_to_CDS / s.total_reads_mapped',
cmap=plt.cm.bone,
sharex=False # hack to get back the x label
# hack from: https://github.com/pydata/pandas/issues/10611
)
#ax.set_title('CDS vs mapped')
ax.set_xlabel('total reads (mapped + unmapped)')
ax.figure.savefig(plot_dir +
'total_mapped_vs_total--color_by_frac_to_CDS.pdf')
In [34]:
df["ufunc 'sqrt' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''"]
In [ ]:
# s.total_reads_mapped / s.total_reads
#my_cmap = (matplotlib.color.LinearSegmentedColormap.
# from_list('blueWhiteRed', ['blue', 'white', 'red']))
ax = df.plot.scatter(x='total_reads', y='total_reads_mapped',
c='s.reads_mapped_to_CDS / s.total_reads_mapped',
cmap=plt.cm.bone,
s='s.reads_mapped_to_rRNA / s.total_reads_mapped',
sharex=False # hack to get back the x label
# hack from: https://github.com/pydata/pandas/issues/10611
)
label_point(df.total_reads, df.total_reads_mapped,
df.ID, ax)
ax.figure.set_size_inches(14, 10)
#ax.figure.savefig(plot_dir +
# 'total_mapped_vs_total--color_by_frac_to_CDS--labeled.pdf')
In [ ]:
# s.total_reads_mapped / s.total_reads
#my_cmap = (matplotlib.color.LinearSegmentedColormap.
# from_list('blueWhiteRed', ['blue', 'white', 'red']))
ax = df.plot.scatter(x='total_reads', y='total_reads_mapped',
c='s.reads_mapped_to_CDS / s.total_reads_mapped',
cmap=plt.cm.bone,
s=100,
sharex=False # hack to get back the x label
# hack from: https://github.com/pydata/pandas/issues/10611
)
label_point(df.total_reads, df.total_reads_mapped,
df.ID, ax)
ax.figure.set_size_inches(14, 10)
ax.figure.savefig(plot_dir +
'total_mapped_vs_total--color_by_frac_to_CDS--labeled.pdf')
In [ ]:
ax = df.plot.scatter(x='total_reads', y='reads_mapped_to_rRNA')
ax.set_title('rRNA vs total')
In [ ]:
ax = df['reads_mapped_to_rRNA'].plot(kind='hist')
ax.set_xlabel("reads mapped to rRNA / 10^7")
ax.set_ylabel("number of samples")
In [ ]:
ax = df['s.total_reads_mapped / s.total_reads'].plot(kind='hist')
ax.set_xlabel("fraction of reads mapped")
ax.set_ylabel("number of samples")
In [ ]: