This notebook is for the analysis of outputs from the peakcalling pipeline
There are severals stats that you want collected and graphed (topics covered in this notebook in bold).
These are:
This notebook takes the sqlite3 database created by CGAT peakcalling_pipeline.py and uses it for plotting the above statistics
It assumes a file directory of:
location of database = project_folder/csvdb
location of this notebook = project_folder/notebooks.dir/
Firstly lets load all the things that might be needed
In [ ]:
import sqlite3
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import CGATPipelines.Pipeline as P
import os
import statistics
import collections
#load R and the R packages required
%load_ext rpy2.ipython
%R require(ggplot2)
# use these functions to display tables nicely as html
from IPython.display import display, HTML
plt.style.use('bmh')
#look at other available styles for plotting
#plt.style.available
This is where we are and when the notebook was run
In [ ]:
!pwd
!date
First lets set the output path for where we want our plots to be saved and the database path and see what tables it contains
In [ ]:
database_path ='../csvdb'
output_path = '.'
This code adds a button to see/hide code in html
In [ ]:
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
The code below provides functions for accessing the project database and extract a table names so you can see what tables have been loaded into the database and are available for plotting. It also has a function for geting table from the database and indexing the table with the track name
In [ ]:
def getTableNamesFromDB(database_path):
# Create a SQL connection to our SQLite database
con = sqlite3.connect(database_path)
cur = con.cursor()
# the result of a "cursor.execute" can be iterated over by row
cur.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;")
available_tables = (cur.fetchall())
#Be sure to close the connection.
con.close()
return available_tables
db_tables = getTableNamesFromDB(database_path)
print('Tables contained by the database:')
for x in db_tables:
print('\t\t%s' % x[0])
#This function retrieves a table from sql database and indexes it with track name
def getTableFromDB(statement,database_path):
'''gets table from sql database depending on statement
and set track as index if contains track in column names'''
conn = sqlite3.connect(database_path)
df = pd.read_sql_query(statement,conn)
if 'track' in df.columns:
df.index = df['track']
return df
Here are some functions we need
In [ ]:
###These are functions used to manipulate the table so order of chromsomes is consistent with numbers
def StrIsInt(string):
'''function that takes string and tests if it can be represented as an int
e.g. returns true for "3", but False for "Chr3" '''
try:
int(string)
return True
except ValueError:
return False
def orderListOfChr(unordered_chr_list):
'''take a list of chromosomes and return them in
order of chromosome number not string order
e.g. input = ["chr1",'chr11","chr2","chrM"]
output = ["chr1",'chr2","chr11","chrM"]'''
#make a empty list same length as chromosomes
chr_id = [None]* len(unordered_chr_list)
for value in unordered_chr_list:
x = value.split("chr")[-1]
# check if chr name is int or str
if StrIsInt(x):
chr_id[int(x)-1] = value
else:
chr_id.append(value)
#remove none values from list
ordered_chr_list = [x for x in chr_id if x is not None]
return ordered_chr_list
def reorderDFbyChrOrder(df):
'''Takes a dataframe indexed on chr name
and returns dataframe so that index is sorted based on the
chromosome number
e.g.dataframe with index chr1,chr11,chr12,chr2,chrM
will be returned with rows in the order "chr1, chr2, chr11, chr12, chrM" '''
list_of_reordered_chr = orderListOfChr(df.index)
return df.reindex(list_of_reordered_chr)
# this subsets dataframe so only includes columns containing chr
def getChrNames(df):
'''takes dataframe with chromocome names in columns and returns a list of the chromosomes present'''
to_keep = []
for item in df.columns:
if 'chr' in item:
to_keep.append(item)
return to_keep
1) lets get IDXstats tabel from database lets look at total number of maapped reads per chromosome for each sample
In [ ]:
idxstats_df = getTableFromDB('select * from idxstats_reads_per_chromosome;',database_path)
idxstats_df.index = idxstats_df['region']
reads_per_chr_df = reorderDFbyChrOrder(idxstats_df.drop('region', 1))
print ('this table shows million reads per chromosome')
reads_per_chr_df.divide(1000000)
Contigs that have been filtered should clearly show up with 0 reads across the row
In [ ]:
def makeReadsPerChrPlot(df,path):
'''takes table from database of chromosome lengths and makes individual plot for
each sample of how many reads map to each chromosome'''
to_keep = []
for item in df.columns:
if 'chr' in item:
to_keep.append(item)
df = df[to_keep]
df = df.divide(1000000)
#where plot will be sent to
file_path = "/".join([path,'mapped_reads_per_chromosome_plot.pdf'])
print ('figure_saved_to %s' % file_path)
ax = df.T.plot(figsize=(11,5),
xticks = range(len(to_keep)),
title = 'Million reads mapped to each chromosome',
ylim=(0,10))
#set labels for plots
ax.set_xlabel("Contig")
ax.set_ylabel("million reads")
fig = matplotlib.figure.Figure()
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
matplotlib.pyplot.savefig(file_path, bbox_inches='tight')
matplotlib.pyplot.show()
makeReadsPerChrPlot(reads_per_chr_df.T,output_path)
In [ ]:
def makePercentReadsPerChrPlot(df,path):
'''takes the idxstats_reads_per_chromosome table from database and calculates percentage
of reads mapping to each chromosome and plots this for each chromosome and returns
percentage table'''
c = df.copy()
for item in c.columns:
if 'chr' not in item and item != 'total_reads':
c = c.drop(item,1)
y = c.div(c.total_reads, axis ='index')*100
y = y.drop('total_reads',1)
file_path = "/".join([path,'percentage_mapped_reads_per_chromosome_plot.pdf'])
print ('figure_saved_to %s' % file_path)
ax = y.T.plot(figsize=(10,5),
xticks = range(len(y.columns)),
title = 'Percentage of total input reads that map to each contig',
ylim=(0,100))
ax.set_xlabel("Contig")
ax.set_ylabel("percentage_reads")
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
fig = matplotlib.figure.Figure()
matplotlib.pyplot.savefig(file_path, bbox_inches='tight')
matplotlib.pyplot.show()
return y
percent_idxdf = makePercentReadsPerChrPlot(reads_per_chr_df.T,output_path)
percent_idxdf.T
In [ ]:
len(reads_per_chr_df.columns)
In [ ]:
def makeReadsPerSampleChrPlot(df,path,subplot_dims):
'''takes table from database of chromosome lengths and makes individual plot for
each sample of how many reads map to each chromosome
subplot dims = tuples of the format (num_rows,num_cols)'''
to_keep = []
for item in df.columns:
if 'chr' in item:
to_keep.append(item)
df = df[to_keep]
df = df.divide(1000000)
#where plot will be sent to
file_path = "/".join([path,'mapped_reads_per_chromosome_per_sample_plot.pdf'])
print ('figure_saved_to %s' % file_path)
#plot as subplots-
# can change layout to be better layout=(num_rows,num_cols)
# returns a list of axis of the subplots - select the right axis to add labels
ax = df.T.plot(subplots=True,
figsize=(10,10),
layout = subplot_dims,
xticks = range(len(to_keep)),
title = 'Million reads mapped to each chromosome per sample',
ylim=(0,10))
#set labels for plots
bottom_plot = ax[-1][0]
middle_plot = ((int(subplot_dims[0]/2), int(subplot_dims[1]/2)))
a = ax[middle_plot]
a.set_ylabel("million reads")
fig = matplotlib.figure.Figure()
matplotlib.pyplot.savefig(file_path, bbox_inches='tight')
matplotlib.pyplot.show()
makeReadsPerSampleChrPlot(reads_per_chr_df.T,output_path,(len(reads_per_chr_df.T.columns),1))
In [ ]:
def makePercentReadsPerSampleChrPlot(df,path,subplot_dims):
'''takes the idxstats_reads_per_chromosome table from database and calculates percentage
of reads mapping to each chromosome and plots this for each chromosome and returns
percentage table'''
c = df.copy()
for item in c.columns:
if 'chr' not in item and item != 'total_reads':
c = c.drop(item,1)
y = c.div(c['total_reads'], axis ='index')*100
y = y.drop('total_reads',1)
file_path = "/".join([path,'percentage_mapped_reads_per_chromosome_per_sample_plot.pdf'])
print ('figure_saved_to %s' % file_path)
ax = y.T.plot(subplots=True,
layout = subplot_dims,
figsize=(10,10),
xticks = range(len(y.columns)),
title = 'Percentage of total input reads that map to each contig',
ylim=(0,100))
ax[-1][0].set_xlabel("Contig")
middle_plot = ((int(subplot_dims[0]/2), int(subplot_dims[1]/2)))
ax[middle_plot].set_ylabel("percentage_reads")
fig = matplotlib.figure.Figure()
matplotlib.pyplot.savefig(file_path, bbox_inches='tight')
matplotlib.pyplot.show()
makePercentReadsPerSampleChrPlot(reads_per_chr_df.T,output_path,(len(reads_per_chr_df.columns),1))
From this notebook you will see how many reads map to each contig - hopefully it will show no reads mapping to any that you filtered out in the peakcalling pipeline - it shpould also shwo you id some chromsomes unexpectedly high mapping rates compared to others - remember chromsomes are often names in order of size so in therory chr1 is more likely to have the most reads mapping to it purely becuase it is the biggest
In [ ]:
x_vs_y_df = idxstats_df.drop('region', 1).T[['chrX','chrY']].copy()
print (x_vs_y_df.head())
x_vs_y_df['total_xy'] = x_vs_y_df.chrX + x_vs_y_df.chrY
x_vs_y_df['percentX'] = x_vs_y_df.chrX/x_vs_y_df.total_xy * 100
x_vs_y_df['percentY'] = x_vs_y_df.chrY/x_vs_y_df.total_xy * 100
display(x_vs_y_df)
#plot bar graph of number of thousand reads mapping to chrX vs chrY
ax = x_vs_y_df[['chrX','chrY']].divide(1000).plot.bar()
ax.set_ylabel('Thousand Reads (not pairs)')
ax.legend(['chrX','chrY'], loc=2,bbox_to_anchor=(1.05, 1),borderaxespad=0. )
ax.set_title('number of reads (not pairs) \n mapping to chrX or chrY')
# plot graph of percentage of reads mapping to either chr X or Y
ax = x_vs_y_df[['percentX', 'percentY']].plot.bar(stacked=True)
ax.legend(['chrX','chrY'], loc=2,bbox_to_anchor=(1.05, 1),borderaxespad=0. )
ax.set_ylabel('percentage reads')
ax.set_title('percentage of sex chromosome reads mapping \n to chrX or chrY')
ax.set_ylim((0,110))
WRITE YOUR COMMENTS HERE
From the plots above you should be able to see which samples are male and which are female depending on the percentage of reads mapping to the Y chromosome
In [ ]:
In [ ]:
def add_expt_to_df(dataframe):
''' splits track name for example HsTh1-RATotal-R1.star into expt
featues, expt, sample_treatment and replicate and adds these as
collumns to the dataframe'''
expt = []
treatment = []
replicate = []
for value in dataframe.track:
#remone star label
#print value
x = value.split(".")
# split into design features
y = x[0].split('-')
expt.append(y[0])
treatment.append(y[1])
replicate.append(y[2])
if len(expt) == len(treatment) and len(expt)== len(replicate):
print ('all values in list correctly')
else:
print ('error in loading values into lists')
#add collums to dataframe
dataframe['expt_name'] = expt
dataframe['sample_treatment'] = treatment
dataframe['replicate'] = replicate
return dataframe