In [1]:
%run ~/relmapping/annot/notebooks/__init__.ipynb


/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
os.getcwd(): /mnt/b2/scratch/ahringer/jj374/lab/relmapping

In [2]:
# Load the operon annotation (tab-separated, no header row); column names come from yp.NAMES_GTF
df_operon = pd.read_csv('WS260_ce10/WS260_ce10.operon.gtf', sep='\t', names=yp.NAMES_GTF)
df_operon.head()


Out[2]:
chrom source feature start end score strand frame attribute
0 chrI operon operon 47467 54426 . + . Operon "CEOP1971" ; Gene "WBGene00021677" ; Ge...
1 chrI operon operon 71425 92877 . + . Operon "CEOP1940" ; Gene "WBGene00000812" ; Ge...
2 chrI operon operon 96544 110107 . - . Operon "CEOP1957" ; Gene "WBGene00004274" ; Ge...
3 chrI operon operon 112285 113721 . + . Operon "CEOP1959" ; Gene "WBGene00018774"
4 chrI operon operon 323856 337840 . + . Operon "CEOP1968" ; Gene "WBGene00000917" ; Ge...

In [3]:
def n_genes(s):
    """Count the `Gene "<id>"` fields in a ';'-separated attribute string.

    The string is split on ';' and every piece on whitespace. A piece is
    counted when its first token is exactly 'Gene' and it carries a value.
    Pieces that are empty (e.g. after a trailing ';') or have no value are
    skipped instead of raising.

    Parameters
    ----------
    s : str
        Attribute string such as 'Operon "CEOP1971" ; Gene "WBGene00021677"'.

    Returns
    -------
    int
        Number of Gene fields found; 0 for an empty string.
    """
    n = 0
    for field in s.split(';'):
        tokens = field.split()
        # Require key + value so that blank or malformed pieces are ignored
        if len(tokens) >= 2 and tokens[0] == 'Gene':
            n += 1
    return n

# Per-operon gene count, derived from each row's attribute string
df_operon['no_of_genes'] = [n_genes(attr) for attr in df_operon['attribute']]

In [4]:
# Distribution of operon sizes: index = number of genes in an operon, value = number of such operons
df_operon['no_of_genes'].value_counts().sort_index()


Out[4]:
1     16
2    914
3    295
4    104
5     37
6     14
7      5
8      3
Name: no_of_genes, dtype: int64

In [5]:
# Operons annotated with only one gene -- unexpected; list them for inspection
# (TODO: find out why the annotation contains single-gene operons)
df_operon.query('no_of_genes == 1')


Out[5]:
chrom source feature start end score strand frame attribute no_of_genes
3 chrI operon operon 112285 113721 . + . Operon "CEOP1959" ; Gene "WBGene00018774" 1
133 chrI operon operon 6441639 6446467 . - . Operon "CEOP1002" ; Gene "WBGene00020742" 1
249 chrI operon operon 10823352 10828443 . + . Operon "CEOP1001" ; Gene "WBGene00016457" 1
353 chrII operon operon 4995545 4998029 . + . Operon "CEOP2727" ; Gene "WBGene00004382" 1
477 chrII operon operon 10720172 10729193 . + . Operon "CEOP2001" ; Gene "WBGene00007490" 1
521 chrII operon operon 12915348 12918949 . - . Operon "CEOP2002" ; Gene "WBGene00010264" 1
584 chrIII operon operon 2473858 2486610 . + . Operon "CEOP3846" ; Gene "WBGene00004884" 1
797 chrIII operon operon 10238581 10244719 . + . Operon "CEOP3821" ; Gene "WBGene00013499" 1
828 chrIII operon operon 11245021 11252182 . + . Operon "CEOP3825" ; Gene "WBGene00012929" 1
894 chrIV operon operon 1275865 1277091 . + . Operon "CEOP4001" ; Gene "WBGene00007476" 1
913 chrIV operon operon 2918439 2919778 . + . Operon "CEOP4637" ; Gene "WBGene00044493" 1
945 chrIV operon operon 5356530 5364298 . - . Operon "CEOP4679" ; Gene "WBGene00018319" 1
1201 chrV operon operon 10011847 10028134 . + . Operon "CEOP5548" ; Gene "WBGene00002263" 1
1356 chrX operon operon 10769223 10782908 . + . Operon "CEOPX001" ; Gene "WBGene00004862" 1
1374 chrX operon operon 14703645 14706789 . - . Operon "CEOPX003" ; Gene "WBGene00009574" 1
1379 chrX operon operon 15689381 15696355 . + . Operon "CEOPX002" ; Gene "WBGene00007284" 1

In [6]:
# Subtract one gene per operon so that only the remaining genes are counted as downstream
n_ = (df_operon['no_of_genes'] - 1).sum()
print('%d genes annotated as downstream in an operon' % n_)


2085 genes annotated as downstream in an operon

In [8]:
# Disabled debugging aid: print the raw attribute string of the first 20 operons
#for s in df_operon['attribute'].head(20):
#    print(s)
# Preview the first rows of the operon table
df_operon.head()


Out[8]:
chrom source feature start end score strand frame attribute no_of_genes
0 chrI operon operon 47467 54426 . + . Operon "CEOP1971" ; Gene "WBGene00021677" ; Ge... 2
1 chrI operon operon 71425 92877 . + . Operon "CEOP1940" ; Gene "WBGene00000812" ; Ge... 3
2 chrI operon operon 96544 110107 . - . Operon "CEOP1957" ; Gene "WBGene00004274" ; Ge... 2
3 chrI operon operon 112285 113721 . + . Operon "CEOP1959" ; Gene "WBGene00018774" 1
4 chrI operon operon 323856 337840 . + . Operon "CEOP1968" ; Gene "WBGene00000917" ; Ge... 2

In [21]:
def _gtf_attr_values(attr, key):
    """Return the unquoted values of every `key "value"` field in a ';'-separated attribute string."""
    values = []
    for field in attr.split(';'):
        tokens = field.split()
        # Skip empty pieces (e.g. after a trailing ';') and keys that carry no value
        if len(tokens) >= 2 and tokens[0] == key:
            values.append(tokens[1].strip('"'))
    return values


def df_gene_operon(fp='WS260_ce10/WS260_ce10.operon.gtf'):
    """Build a long-format gene -> operon table from an operon annotation file.

    Parameters
    ----------
    fp : str, optional
        Path of the tab-separated operon annotation; defaults to the WS260/ce10
        file this notebook works with. Column names are taken from
        `yp.NAMES_GTF` (defined outside this cell -- presumably by the
        notebook's `%run` initialisation; verify).

    Returns
    -------
    pandas.DataFrame
        Columns `gene_id` and `operon_id`, one row per Gene field of every
        operon record, in file order, with a fresh 0..n-1 index.

    Raises
    ------
    IndexError
        If a record has no `Operon "<id>"` field.
    """
    df_ = pd.read_csv(fp, sep='\t', names=yp.NAMES_GTF)
    # First Operon value is the operon id; all Gene values are kept, space-joined
    df_['operon_id'] = [_gtf_attr_values(attr, 'Operon')[0] for attr in df_['attribute']]
    df_['gene_ids'] = [' '.join(_gtf_attr_values(attr, 'Gene')) for attr in df_['attribute']]
    # Split the space-joined ids into one row per gene:
    # http://stackoverflow.com/questions/17116814/pandas-how-do-i-split-text-in-a-column-into-multiple-rows
    s_gene_id = pd.DataFrame(list(df_.gene_ids.str.split())).stack()
    # Drop the per-gene position level so the index matches df_'s rows for the join
    s_gene_id.index = s_gene_id.index.droplevel(-1)
    s_gene_id.name = 'gene_id'
    df_r = df_.join(s_gene_id)[['gene_id', 'operon_id']].reset_index(drop=True)
    return df_r

# Build the gene -> operon table and display it
df_gene_operon()


Out[21]:
gene_id operon_id
0 WBGene00021677 CEOP1971
1 WBGene00044345 CEOP1971
2 WBGene00000812 CEOP1940
3 WBGene00021682 CEOP1940
4 WBGene00021683 CEOP1940
5 WBGene00004274 CEOP1957
6 WBGene00021681 CEOP1957
7 WBGene00018774 CEOP1959
8 WBGene00000917 CEOP1968
9 WBGene00021660 CEOP1968
10 WBGene00021657 CEOP1997
11 WBGene00021658 CEOP1997
12 WBGene00004143 CEOP1004
13 WBGene00020089 CEOP1004
14 WBGene00006593 CEOP1008
15 WBGene00021026 CEOP1008
16 WBGene00022035 CEOP1972
17 WBGene00022036 CEOP1972
18 WBGene00007009 CEOP1934
19 WBGene00022042 CEOP1934
20 WBGene00018923 CEOP1012
21 WBGene00022029 CEOP1012
22 WBGene00021208 CEOP1014
23 WBGene00021209 CEOP1014
24 WBGene00021210 CEOP1014
25 WBGene00021329 CEOP1908
26 WBGene00021331 CEOP1908
27 WBGene00021332 CEOP1908
28 WBGene00000165 CEOP1939
29 WBGene00021689 CEOP1939
... ... ...
3443 WBGene00045291 CEOPX163
3444 WBGene00045292 CEOPX163
3445 WBGene00008353 CEOPX168
3446 WBGene00044134 CEOPX168
3447 WBGene00009574 CEOPX003
3448 WBGene00008975 CEOPX130
3449 WBGene00008980 CEOPX130
3450 WBGene00007668 CEOPX132
3451 WBGene00008979 CEOPX132
3452 WBGene00001649 CEOPX136
3453 WBGene00010397 CEOPX136
3454 WBGene00010725 CEOPX156
3455 WBGene00010726 CEOPX156
3456 WBGene00007284 CEOPX002
3457 WBGene00023497 CEOPX140
3458 WBGene00023498 CEOPX140
3459 WBGene00012414 CEOPX155
3460 WBGene00044155 CEOPX155
3461 WBGene00000803 CEOPX182
3462 WBGene00010011 CEOPX182
3463 WBGene00001837 CEOPX150
3464 WBGene00015673 CEOPX150
3465 WBGene00001537 CEOPX152
3466 WBGene00016268 CEOPX152
3467 WBGene00020366 CEOPX144
3468 WBGene00020812 CEOPX144
3469 WBGene00016900 CEOPX157
3470 WBGene00044770 CEOPX157
3471 WBGene00017341 CEOPX161
3472 WBGene00017342 CEOPX161

3473 rows × 2 columns


In [ ]: