In [92]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import matplotlib as plt
#plt.use('TkAgg')
import operator
import pylab
pylab.show()
%pylab inline
In [93]:
fileUrl = "probes/extraGff/ITAG2.3_repeats.gff3"
#fileUrl = "probes/extraGff/ITAG2.3_repeats.gff3.Copia.agp"
#fileUrl = "probes/extraGff/ITAG2.3_repeats.gff3.Gypsy.agp"
#fileUrl = "probes/extraGff/ITAG2.3_repeats.gff3.Low_complexity.agp"
#fileUrl = "probes/extraGff/ITAG2.3_repeats.gff3.LTR.agp"
#fileUrl = "probes/extraGff/ITAG2.3_repeats.gff3.Simple_repeat.agp"
#fileUrl = "probes/extraGff/ITAG2.3_repeats.gff3.SINE.agp"
FULL_FIG_W , FULL_FIG_H = 16, 8
CHROM_FIG_W, CHROM_FIG_H = FULL_FIG_W, 20
In [94]:
class size_controller(object):
def __init__(self, w, h):
self.w = w
self.h = h
def __enter__(self):
self.o = rcParams['figure.figsize']
rcParams['figure.figsize'] = self.w, self.h
return None
def __exit__(self, type, value, traceback):
rcParams['figure.figsize'] = self.o
In [95]:
col_type_int = np.int64
col_type_flo = np.float64
col_type_str = np.str_ #np.object
col_type_char = np.character
col_info =[
[ "chromosome", col_type_str ],
[ "source" , col_type_str ],
[ "type" , col_type_str ],
[ "start" , col_type_int ],
[ "end" , col_type_int ],
[ "qual" , col_type_int ],
[ "strand" , col_type_char ],
[ "frame" , col_type_char ],
[ "info" , col_type_str ],
]
col_names=[cf[0] for cf in col_info]
col_types=dict(zip([c[0] for c in col_info], [c[1] for c in col_info]))
col_types
Out[95]:
In [96]:
info_keys = set()
def filter_conv(fi):
global info_keys
vs = []
for pair in fi.split(";"):
kv = pair.split("=")
info_keys.add(kv[0])
if len(kv) == 2:
#in case of key/value pairs
vs.append(kv)
else:
#in case of flags such as INDEL
vs.append([kv[0], True])
x = dict(zip([x[0] for x in vs], [x[1] for x in vs]))
#z = pd.Series(x)
#print z
return x
In [97]:
CONVERTERS = {
'info': filter_conv
}
SKIP_ROWS = 3
NROWS = None
#index_col=['chromosome', 'start'], usecols=col_names,
gffData = pd.read_csv(fileUrl, header=None, names=col_names, dtype=col_types, nrows=NROWS, skiprows=SKIP_ROWS, converters=CONVERTERS, verbose=True, delimiter="\t", comment="#")
print gffData.shape
gffData.head()
Out[97]:
In [98]:
gffData['length'] = gffData['end'] - gffData['start']
gffData.head()
Out[98]:
In [99]:
info_keys = list(info_keys)
info_keys.sort()
info_keys
Out[99]:
In [100]:
info_keys_types = {
'score': col_type_int
}
In [101]:
def gen_val_extracter(info_keys_g):
def val_extracter_l(info_row, **kwargs):
vals = [None] * len(info_keys_g)
for k,v in info_row.items():
if k in info_keys_g:
vals[info_keys_g.index(k)] = v
else:
pass
return vals
return val_extracter_l
#gffData[info_keys] = gffData['info'].apply(gen_val_extracter(info_keys), axis=1).apply(pd.Series, 1)
gffData.head()
Out[101]:
http://pandas.pydata.org/pandas-docs/dev/visualization.html
https://bespokeblog.wordpress.com/2011/07/11/basic-data-plotting-with-matplotlib-part-3-histograms/
http://nbviewer.ipython.org/github/mwaskom/seaborn/blob/master/examples/plotting_distributions.ipynb
http://nbviewer.ipython.org/github/herrfz/dataanalysis/blob/master/week3/exploratory_graphs.ipynb
http://pandas.pydata.org/pandas-docs/version/0.15.0/visualization.html
http://www.gregreda.com/2013/10/26/working-with-pandas-dataframes/
In [102]:
gffData.dtypes
Out[102]:
In [103]:
gffData.describe()
Out[103]:
In [104]:
maxPos = gffData['end' ].max()
print "sum ", gffData['length'].sum()
print "avg ", gffData['length'].sum() * 1.0 / 950000000
In [105]:
chromosomes = np.unique(gffData['chromosome'].values)
chromstats = {}
for chrom in chromosomes:
chromdata = gffData['length'][gffData['chromosome'] == chrom]
chromdatasize = gffData['end' ][gffData['chromosome'] == chrom].max() - gffData['start' ][gffData['chromosome'] == chrom].min()
chromdatasum = chromdata.sum()
#print "chrom", chrom
#print " size ", chromdatasize
#print " count", chromdata.count()
#print " sum ", chromdatasum
#print " avg ", chromdatasum * 1.0 / chromdatasize
chromstats[ chrom ] = { 'size': chromdatasize, 'count': chromdata.count(), 'sum': chromdatasum, 'avg': chromdatasum * 1.0 / chromdatasize }
chromstats = pd.DataFrame.from_dict(chromstats, orient='index')
#print "col types ", chromstats.dtypes
#print "col names ", chromstats.columns
#print "col indexes", chromstats.index
print "median.avg ", chromstats['avg'].median()
print "MAD.avg ", chromstats['avg'].mad()
print chromstats
In [106]:
gffData.median()
Out[106]:
In [107]:
gffData.mad()
Out[107]:
In [108]:
chromosomes = np.unique(gffData['chromosome'].values)
chromosomes
Out[108]:
In [109]:
with size_controller(FULL_FIG_W, FULL_FIG_H):
bq = gffData.boxplot(column='qual', return_type='dict')
In [110]:
with size_controller(FULL_FIG_W, FULL_FIG_H):
bqc = gffData.boxplot(column='qual', by='chromosome', return_type='dict')
In [111]:
with size_controller(FULL_FIG_W, FULL_FIG_H):
bqc = gffData.boxplot(column='start', by='chromosome', return_type='dict')
In [112]:
with size_controller(FULL_FIG_W, FULL_FIG_H):
hs = gffData['start'].hist()
In [113]:
hsc = gffData['start'].hist(by=gffData['chromosome'], figsize=(CHROM_FIG_W, CHROM_FIG_H), layout=(len(chromosomes),1))
In [114]:
with size_controller(FULL_FIG_W, FULL_FIG_H):
hl = gffData['length'].hist()
In [115]:
hlc = gffData['length'].hist(by=gffData['chromosome'], figsize=(CHROM_FIG_W, CHROM_FIG_H), layout=(len(chromosomes),1))
In [116]:
#http://stackoverflow.com/questions/27934885/how-to-hide-code-from-cells-in-ipython-notebook-visualized-with-nbviewer
from IPython.display import HTML
HTML('''<script>
code_show=true;
function code_toggle() {
var classes_to_hide = ['div.input', 'div.output_stderr', 'div.output_prompt', 'div.input_prompt', 'div.prompt'];
if (code_show){
for ( var c in classes_to_hide ) {
$(classes_to_hide[c]).hide();
}
} else {
for ( var c in classes_to_hide ) {
$(classes_to_hide[c]).show();
}
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Clickk here to toggle on/off the raw code."></form>''')
Out[116]: