In [3]:
import pandas as pd
import numpy as np
import os

# Path to the tab-separated coverage table that this notebook loads.
# The original location is kept as the default so existing runs behave the same;
# set the CRISPRML_COVERAGE_FILE environment variable to point the notebook at a
# copy stored elsewhere without editing this cell. expanduser() allows an
# override to start with "~"; an empty override falls back to the default.
DEFAULT_COVERAGE_FILE = "/Users/zamparol/projects/crisprML/data/annotations/coverage_count.bed"
coverage_file = os.path.expanduser(
    os.environ.get("CRISPRML_COVERAGE_FILE") or DEFAULT_COVERAGE_FILE
)

In [6]:
# Load the headerless, tab-delimited coverage table, then label its columns.
# Assigning the labels after the read raises if the file does not have exactly
# as many columns as there are names below.
bed_column_names = ["chr", "start", "end", "transcript", "gene", "strand", "coverage"]
coverages = pd.read_csv(coverage_file, header=None, sep='\t')
coverages.columns = bed_column_names

In [7]:
# Preview the first five rows to sanity-check the column labels.
coverages.head(n=5)


Out[7]:
chr start end transcript gene strand coverage
0 chr1 4807891 4846734 uc007afh.1 Lypla1 + 4
1 chr1 4807891 4842826 uc007afg.1 Lypla1 + 4
2 chr1 4857692 4897908 uc007afi.2 Tcea1 + 6
3 chr1 4857692 4897908 uc011wht.1 Tcea1 + 6
4 chr1 4858326 4897908 uc011whu.1 Tcea1 + 5

In [15]:
# Per-gene summary of the coverage table: the median of the "coverage" column
# and the number of non-null "coverage" entries within each gene.
# NOTE(review): the output labels treat "coverage" as exons per transcript and
# each row as one transcript -- confirm against how the BED file was produced.
# TODO: split into bins & plot.
coverage_by_gene = coverages.groupby("gene")["coverage"]
avg_tx_per_gene = coverage_by_gene.agg(["median", "count"]).rename(
    columns={"median": "median exons per tx", "count": "number of tx"}
)

In [16]:
# Show the per-gene summary table; pandas abbreviates the middle rows of long
# frames in the rendered output.
avg_tx_per_gene


Out[16]:
median exons per tx number of tx
gene
0610005C13Rik 11.0 2
0610007P14Rik 4.0 1
0610009B22Rik 2.5 2
0610009L18Rik 2.0 1
0610009O20Rik 6.0 3
0610010B08Rik 12.0 5
0610010F05Rik 14.0 5
0610010K14Rik 12.0 7
0610011F06Rik 4.0 1
0610012G03Rik 1.0 1
0610030E20Rik 6.0 1
0610031O16Rik 7.5 2
0610037L13Rik 8.0 3
0610038B21Rik 2.0 1
0610039H22Rik 3.0 1
0610039K10Rik 1.0 1
0610040B10Rik 2.0 1
0610040F04Rik 7.0 2
0610040J01Rik 4.0 1
0610043K17Rik 5.0 1
1010001N08Rik 8.0 3
1110001J03Rik 2.0 1
1110002L01Rik 4.0 2
1110004E09Rik 4.0 2
1110004F10Rik 7.0 2
1110006O24Rik 1.0 1
1110008F13Rik 4.5 2
1110008L16Rik 5.0 2
1110008P14Rik 2.0 1
1110012L19Rik 3.0 1
... ... ...
Zswim7 4.0 1
Zswim8 8.0 4
Zufsp 10.0 5
Zw10 7.0 3
Zwilch 6.0 2
Zwint 4.0 3
Zxda 1.0 1
Zxdb 1.0 1
Zxdc 5.0 4
Zyg11a 4.0 1
Zyg11b 6.0 2
Zyx 7.0 4
Zzef1 6.0 4
Zzz3 10.5 4
a 6.5 2
abParts 18.0 5
env 2.0 1
l7Rn6 7.0 5
mFLJ00251 6.0 1
mFLJ00385 8.0 1
mKIAA0013 2.0 1
mKIAA0208 3.0 1
mKIAA0217 1.0 1
mKIAA0383 3.0 1
mKIAA1554 8.0 1
mKIAA1630 1.0 1
mannose receptor precursor-like 7.0 1
mdab1 2.0 1
rjs 1.0 1
unknown 3.0 1

32723 rows × 2 columns


In [ ]: