Parse the `bedtools intersect` output into a tidy table, then average the Phastcons values within each TF/gene group and plot their distribution.


In [52]:
import re
import numpy as np
import pandas as pd
import seaborn as sb

%matplotlib notebook

In [47]:
# Location of the bedtools-intersect output parsed by this cell.
INTERSECT_PATH = '/Users/bergeric/data/dm6_phastcons_intersect.txt'

# Compiled once, outside the read loop. Capture groups, in order:
#   1 TF (an FBgn id), 2 score, 3 chromosome, 4 start, 5 end,
#   6 gene id (ID=...), 7 gene symbol (Name=...), 8 last captured field,
#   which is stored in the Phastcons column.
INTERSECT_PATTERN = re.compile(r'.*(FBgn\w*)\t(\S*)\t.*(chr\w*)\tFlyBase\sgene\t(\w*)\t(\w*).*ID=(\w*);Name=(\w*).*\t(\S*)')

# Column order of the resulting table.
INTERSECT_COLUMNS = ['TF', 'FBgn', 'Symbol', 'Chrom', 'Start', 'End', 'Score', 'Phastcons']


def parse_intersect_line(line, lineno=None):
    """Turn one line of the intersect file into a row tuple.

    Parameters
    ----------
    line : str
        A single raw line from the intersect file.
    lineno : int, optional
        1-based line number, used only to make error messages useful.

    Returns
    -------
    tuple
        (TF, FBgn, Symbol, Chrom, Start, End, Score, Phastcons), with
        Start/End converted to int and Score/Phastcons converted to float.

    Raises
    ------
    ValueError
        If the line does not match INTERSECT_PATTERN, or a numeric field
        cannot be converted.
    """
    match = INTERSECT_PATTERN.match(line)
    if match is None:
        # Without this check a bad line surfaces as an opaque
        # "'NoneType' object has no attribute 'group'" error.
        raise ValueError(
            'line {} does not match the expected intersect format: {!r}'.format(lineno, line)
        )
    tf, score, chrom, start, end, fbgn, symbol, phastcon = match.groups()
    try:
        # Coordinates are stored as integers so that sorting/grouping on
        # Start and End is numeric rather than lexicographic.
        return (tf, fbgn, symbol, chrom, int(start), int(end), float(score), float(phastcon))
    except ValueError as exc:
        raise ValueError('line {}: {}'.format(lineno, exc))


values = []
with open(INTERSECT_PATH) as f:
    for lineno, line in enumerate(f, start=1):
        if not line.strip():
            # Blank lines carry no record; skip them instead of failing.
            continue
        values.append(parse_intersect_line(line, lineno))

df = pd.DataFrame(values, columns=INTERSECT_COLUMNS)
df.head()
# Optional: uncomment to persist the parsed table as a tab-separated file.
#df.to_csv('/Users/bergeric/data/bedtoolsoutput_df.txt', sep='\t', index=False)


Out[47]:
TF FBgn Symbol Chrom Start End Score Phastcons
0 FBgn0036179 FBgn0051774 fred chr2L 3901928 3991713 30.5046 0.495
1 FBgn0036179 FBgn0051774 fred chr2L 3901928 3991713 30.5046 0.965
2 FBgn0036179 FBgn0051774 fred chr2L 3901928 3991713 30.5046 0.998
3 FBgn0036179 FBgn0051774 fred chr2L 3901928 3991713 30.5046 0.999
4 FBgn0036179 FBgn0051774 fred chr2L 3901928 3991713 30.5046 0.994

In [45]:
# Group on every column of df except Phastcons, so that the Phastcons
# values sharing the same remaining fields end up in one group.
group_keys = ['TF', 'FBgn', 'Symbol', 'Chrom', 'Start', 'End', 'Score']
grp = df.groupby(by=group_keys)

In [ ]:


In [49]:
# Average the remaining (non-key) column within each group; the result is
# indexed by the grouping keys.
meanframe = grp.agg('mean')
# Preview the first rows only, rather than the whole table.
meanframe.head(n=5)


Out[49]:
Phastcons
TF FBgn Symbol Chrom Start End Score
FBgn0000028 FBgn0085422 CG34393 chr2L 3174112 3200496 18.1270 0.999850
FBgn0002521_3 FBgn0011259 Sema chr2L 8541147 8672041 19.6364 1.000000
FBgn0032151 nAChRalpha6 chr2L 9792317 9886250 19.6364 0.944429
FBgn0264815 Pde1c chr2L 11813156 11928572 20.1667 1.000000
FBgn0003499 FBgn0000114 bru1 chr2L 12174004 12313438 19.5816 0.014154

In [53]:
# Distribution of the per-group mean Phastcons values.
# NOTE(review): distplot is deprecated in seaborn >= 0.11; consider
# histplot/displot if the environment is upgraded.
mean_phastcons = meanframe.loc[:, "Phastcons"]
sb.distplot(mean_phastcons)


Out[53]:
<matplotlib.axes._subplots.AxesSubplot at 0x1965c7128>

In [ ]: