In [1]:
%run ~/relmapping/annot/notebooks/__init__.ipynb
In [2]:
# Load the operon annotation table; column names are supplied explicitly
# via yp.NAMES_GTF, so the first line of the file is read as data.
df_operon = pd.read_csv(
    'WS260_ce10/WS260_ce10.operon.gtf',
    sep='\t',
    names=yp.NAMES_GTF,
)
df_operon.head()
Out[2]:
In [3]:
def n_genes(s):
    """Count the ``Gene`` entries in a ';'-separated attribute string.

    Each segment is expected to look like ``Key "value"``. Segments that are
    empty (e.g. after a trailing ';') or that do not consist of exactly two
    whitespace-separated tokens are tolerated rather than raising.

    Parameters
    ----------
    s : str
        Attribute string, e.g. ``'Operon "CEOP1" ; Gene "g1" ; Gene "g2"'``.

    Returns
    -------
    int
        Number of segments whose key is exactly ``'Gene'``.
    """
    n = 0
    for s_i in s.split(';'):
        fields = s_i.split()
        # Only the key matters for counting; checking `fields` first skips
        # blank segments instead of failing on tuple unpacking.
        if fields and fields[0] == 'Gene':
            n += 1
    return n
# Per-operon gene count, derived from each row's attribute string.
df_operon['no_of_genes'] = [n_genes(attr) for attr in df_operon['attribute']]
In [4]:
# Distribution of the number of genes in an operon:
# index = gene count, value = number of operons with that count,
# ordered by gene count.
df_operon['no_of_genes'].value_counts().sort_index()
Out[4]:
In [5]:
# Sanity check: list the operons annotated with exactly one gene.
# NOTE(review): single-gene operons look unexpected -- TODO confirm whether
# they are valid in this annotation or an artefact of the source file.
df_operon.query('no_of_genes == 1')
Out[5]:
In [6]:
# Total over all operons of (gene count - 1), i.e. every gene except one
# per operon.
n_ = (df_operon['no_of_genes'] - 1).sum()
print('%d genes annotated as downstream in an operon' % (n_))
In [8]:
# Debug aid (disabled): print the raw attribute strings of the first 20 rows.
#for s in df_operon['attribute'].head(20):
# print(s)
# Preview the table, which by now also carries the no_of_genes column.
df_operon.head()
Out[8]:
In [21]:
def _operon_and_genes(attr):
    """Split one operon attribute string into ``(operon_id, [gene_id, ...])``."""
    operon_ids = []
    gene_ids = []
    for attr_i in attr.split(';'):
        fields = attr_i.split()
        # Skip blank or value-less segments (e.g. the one after a trailing ';').
        if len(fields) < 2:
            continue
        # The value is the second whitespace token with surrounding quotes removed.
        value = fields[1].strip('"')
        if fields[0] == 'Operon':
            operon_ids.append(value)
        elif fields[0] == 'Gene' and value:
            gene_ids.append(value)
    if not operon_ids:
        # Same exception type the previous `[...][0]` lookup produced.
        raise IndexError('no Operon entry in attribute string: %r' % (attr,))
    # Only the first Operon entry is used.
    return operon_ids[0], gene_ids


def df_gene_operon(fp='WS260_ce10/WS260_ce10.operon.gtf', names=None):
    """Build a long-format gene -> operon table from an operon GTF file.

    Parameters
    ----------
    fp : str, optional
        Path of the tab-separated operon GTF file. Defaults to the WS260/ce10
        operon annotation used throughout this notebook.
    names : list of str, optional
        Column names for the file; must include ``'attribute'``. Defaults to
        ``yp.NAMES_GTF``.

    Returns
    -------
    pandas.DataFrame
        Columns ``gene_id`` and ``operon_id``, one row per (gene, operon)
        pair, in file order, with a fresh 0..n-1 index. An operon without
        any ``Gene`` entry contributes a single row whose ``gene_id`` is NaN.

    Raises
    ------
    IndexError
        If a row's attribute string contains no ``Operon`` entry.
    """
    if names is None:
        names = yp.NAMES_GTF
    df_ = pd.read_csv(fp, sep='\t', names=names)

    # Build the rows directly instead of padding a ragged frame and relying
    # on stack() to drop the padding again.
    records = []
    for attr in df_['attribute']:
        operon_id, gene_ids = _operon_and_genes(attr)
        for gene_id in gene_ids or [float('nan')]:
            records.append((gene_id, operon_id))
    return pd.DataFrame(records, columns=['gene_id', 'operon_id'])
# Display the resulting gene_id / operon_id table.
df_gene_operon()
Out[21]:
In [ ]: