In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import matplotlib as plt
#plt.use('TkAgg')
import operator
import pylab
pylab.show()
%pylab inline
In [2]:
fileUrl = "https://dl.dropboxusercontent.com/u/753166/jbrowse/gbrowse_repeats.gff3.Gypsy.gff3"
FULL_FIG_W , FULL_FIG_H = 16, 8
CHROM_FIG_W, CHROM_FIG_H = FULL_FIG_W, 20
In [3]:
class size_controller(object):
def __init__(self, w, h):
self.w = w
self.h = h
def __enter__(self):
self.o = rcParams['figure.figsize']
rcParams['figure.figsize'] = self.w, self.h
return None
def __exit__(self, type, value, traceback):
rcParams['figure.figsize'] = self.o
In [37]:
col_type_int = np.int64
col_type_flo = np.float64
col_type_str = np.str_ #np.object
col_type_char = np.character
col_info =[
[ "chromosome", col_type_str ],
[ "source" , col_type_str ],
[ "type" , col_type_str ],
[ "start" , col_type_int ],
[ "end" , col_type_int ],
[ "qual" , col_type_int ],
[ "strand" , col_type_char ],
[ "frame" , col_type_char ],
[ "info" , col_type_str ],
]
col_names=[cf[0] for cf in col_info]
col_types=dict(zip([c[0] for c in col_info], [c[1] for c in col_info]))
col_types
Out[37]:
In [14]:
info_keys = set()
def filter_conv(fi):
global info_keys
vs = []
for pair in fi.split(";"):
kv = pair.split("=")
info_keys.add(kv[0])
if len(kv) == 2:
#in case of key/value pairs
vs.append(kv)
else:
#in case of flags such as INDEL
vs.append([kv[0], True])
x = dict(zip([x[0] for x in vs], [x[1] for x in vs]))
#z = pd.Series(x)
#print z
return x
In [43]:
CONVERTERS = {
'info': filter_conv
}
SKIP_ROWS = 3
NROWS = None
#index_col=['chromosome', 'start'], usecols=col_names,
gffData = pd.read_csv(fileUrl, header=None, names=col_names, dtype=col_types, nrows=NROWS, skiprows=SKIP_ROWS, converters=CONVERTERS, verbose=True, delimiter="\t", comment="#")
print gffData.shape
gffData.head()
Out[43]:
In [44]:
gffData['length'] = gffData['end'] - gffData['start']
gffData.head()
Out[44]:
In [45]:
info_keys = list(info_keys)
info_keys.sort()
info_keys
Out[45]:
In [ ]:
info_keys_types = {
'score': col_type_int
}
In [46]:
def gen_val_extracter(info_keys_g):
def val_extracter_l(info_row, **kwargs):
vals = [None] * len(info_keys_g)
for k,v in info_row.items():
if k in info_keys_g:
vals[info_keys_g.index(k)] = v
else:
pass
return vals
return val_extracter_l
gffData[info_keys] = gffData['info'].apply(gen_val_extracter(info_keys), axis=1).apply(pd.Series, 1)
gffData.head()
Out[46]:
http://pandas.pydata.org/pandas-docs/dev/visualization.html
https://bespokeblog.wordpress.com/2011/07/11/basic-data-plotting-with-matplotlib-part-3-histograms/
http://nbviewer.ipython.org/github/mwaskom/seaborn/blob/master/examples/plotting_distributions.ipynb
http://nbviewer.ipython.org/github/herrfz/dataanalysis/blob/master/week3/exploratory_graphs.ipynb
http://pandas.pydata.org/pandas-docs/version/0.15.0/visualization.html
http://www.gregreda.com/2013/10/26/working-with-pandas-dataframes/
In [47]:
gffData.dtypes
Out[47]:
In [48]:
gffData.describe()
Out[48]:
In [49]:
chromosomes = np.unique(gffData['chromosome'].values)
chromosomes
Out[49]:
In [50]:
with size_controller(FULL_FIG_W, FULL_FIG_H):
bq = gffData.boxplot(column='qual', return_type='dict')
In [51]:
with size_controller(FULL_FIG_W, FULL_FIG_H):
bqc = gffData.boxplot(column='qual', by='chromosome', return_type='dict')
In [52]:
with size_controller(FULL_FIG_W, FULL_FIG_H):
bqc = gffData.boxplot(column='start', by='chromosome', return_type='dict')
In [53]:
with size_controller(FULL_FIG_W, FULL_FIG_H):
hs = gffData['start'].hist()
In [54]:
hsc = gffData['start'].hist(by=gffData['chromosome'], figsize=(CHROM_FIG_W, CHROM_FIG_H), layout=(len(chromosomes),1))
In [55]:
with size_controller(FULL_FIG_W, FULL_FIG_H):
hl = gffData['length'].hist()
In [56]:
hlc = gffData['length'].hist(by=gffData['chromosome'], figsize=(CHROM_FIG_W, CHROM_FIG_H), layout=(len(chromosomes),1))
In [ ]:
#http://stackoverflow.com/questions/27934885/how-to-hide-code-from-cells-in-ipython-notebook-visualized-with-nbviewer
from IPython.display import HTML
HTML('''<script>
code_show=true;
function code_toggle() {
var classes_to_hide = ['div.input', 'div.output_stderr', 'div.output_prompt', 'div.input_prompt', 'div.prompt'];
if (code_show){
for ( var c in classes_to_hide ) {
$(classes_to_hide[c]).hide();
}
} else {
for ( var c in classes_to_hide ) {
$(classes_to_hide[c]).show();
}
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Clickk here to toggle on/off the raw code."></form>''')