In [1]:
%run ~/relmapping/annot/notebooks/__init__.ipynb
In [2]:
# Gene table: read the TSV, then keep a single row per gene_id
# (drop_duplicates keeps the first occurrence by default).
fp_genes = 'WS260_ce10/WS260_ce10.genes_by_CV.tsv'
df_genes = pd.read_csv(fp_genes, sep='\t')
df_genes = df_genes.drop_duplicates(subset=['gene_id'])
print('%d gene records loaded' % len(df_genes))

# Regulatory annotation table: one row per annotated region.
fp_regl = 'annot/S2_regulatory_annotation/S2_regulatory_annotation.tsv'
df_regl = pd.read_csv(fp_regl, sep='\t')
print('%d regions loaded' % len(df_regl))
In [3]:
# Count, for every gene, how many regions are annotated as its coding promoter.
# A region can be a promoter on the forward strand, the reverse strand, or
# both, so the gene ids from the two strand-specific columns are pooled.
l_ = list(df_regl.query('annot_fwd == "coding_promoter"')['promoter_gene_id_fwd']) \
    + list(df_regl.query('annot_rev == "coding_promoter"')['promoter_gene_id_rev'])

# '.' entries are skipped (presumably the placeholder for "no gene assigned"
# -- TODO confirm against the annotation file format).
# dtype=object keeps an empty selection from becoming a float Series.
s_ = pd.Series([gene_id for gene_id in l_ if gene_id != '.'], dtype=object)

# Occurrences per gene id (index: gene id, value: number of promoters).
# The counts are used directly as a lookup table rather than being wrapped in
# pd.DataFrame(..., columns=['n_promoter']): since pandas 2.0 value_counts()
# returns a Series named 'count', and selecting a different column name in the
# DataFrame constructor would silently discard the counts.
n_promoter_by_gene = s_.value_counts()

# Map instead of merge: re-running this cell stays safe (a merge would produce
# n_promoter_x / n_promoter_y once the column already exists), and the result
# is aligned on df_genes' own index. Genes without any promoter get 0.
df_genes['n_promoter'] = (
    df_genes['gene_id'].map(n_promoter_by_gene).fillna(0).astype(int)
)
df_genes.head()
Out[3]:
In [4]:
# Count, for every gene, how many putative enhancers are associated with it.
l_ = list(df_regl.query('annot == "putative_enhancer"')['associated_gene_id'])

# associated_gene_id holds a comma-separated list of gene ids; flatten it so
# that each (enhancer, gene) pair contributes one entry. '.' entries are
# skipped (presumably the placeholder for "no associated gene" -- TODO confirm).
# dtype=object keeps an empty selection from becoming a float Series.
s_ = pd.Series(
    list(itertools.chain.from_iterable(
        gene_ids.split(',') for gene_ids in l_ if gene_ids != '.'
    )),
    dtype=object,
)

# Occurrences per gene id (index: gene id, value: number of enhancers).
# The counts are used directly as a lookup table rather than being wrapped in
# pd.DataFrame(..., columns=['n_putative_enhancer']): since pandas 2.0
# value_counts() returns a Series named 'count', and selecting a different
# column name in the DataFrame constructor would silently discard the counts.
n_putative_enhancer_by_gene = s_.value_counts()

# Map instead of merge: re-running this cell stays safe (a merge would produce
# _x / _y suffixed columns once the column already exists), and the result is
# aligned on df_genes' own index. Genes without any enhancer get 0.
df_genes['n_putative_enhancer'] = (
    df_genes['gene_id'].map(n_putative_enhancer_by_gene).fillna(0).astype(int)
)
df_genes.head()
Out[4]:
In [5]:
# Distribution check: how many genes have 0, 1, 2, ... promoters.
promoters_per_gene = df_genes['n_promoter']
promoters_per_gene.value_counts()
Out[5]:
In [6]:
# Distribution check: how many genes have 0, 1, 2, ... putative enhancers.
enhancers_per_gene = df_genes['n_putative_enhancer']
enhancers_per_gene.value_counts()
Out[6]:
In [8]:
# Fig 2C: enhancers per gene, with genes grouped by their number of promoters.
# Each entry pairs the tick label with the df_genes filter defining the group;
# genes without any promoter are not part of any group.
promoter_groups = [
    ('1', 'n_promoter == 1'),
    ('2', 'n_promoter == 2'),
    ('>2', 'n_promoter >= 3'),
]

fig, ax = plt.subplots(figsize=(2, 3))
ax.boxplot(
    [df_genes.query(condition)['n_putative_enhancer'] for label, condition in promoter_groups],
    labels=[label for label, condition in promoter_groups],
    sym='',  # Do not show outliers -- by eye, <100 in each group...
    widths=0.7,
)
ax.set_xlabel('Number of promoters')
ax.set_ylabel('Number of enhancers')
# Fixed y range and ticks for the figure panel.
ax.set_ylim(-1, 17)
ax.set_yticks([0, 5, 10, 15])
fig.savefig('annot_Apr27/Fig2C_npromoter_vs_nenhancer.pdf', bbox_inches='tight', transparent=True)
In [ ]: