In [22]:
import numpy as np
import pandas as pd

%matplotlib notebook

In [41]:
with open('/Users/bergeric/Projects/s2rnai/output/phastcons_workflow/outfile.txt') as f: 
    df = pd.read_table(f, lineterminator='\n')
df['cnt'] = 1
df.head()


Out[41]:
TF FBgn Symbol Chrom Start End q Value Phastcons cnt
0 FBgn0000028 FBgn0085422 CG34393 chr2L 3194175 3194195 0.0229 0.999850 1
1 FBgn0002521_3 FBgn0011259 Sema chr2L 8561075 8561089 0.0338 1.000000 1
2 FBgn0002521_3 FBgn0032151 nAChRalpha6 chr2L 9797252 9797266 0.0338 0.944429 1
3 FBgn0002521_3 FBgn0264815 Pde1c chr2L 11921078 11921092 0.0330 1.000000 1
4 FBgn0003499 FBgn0000114 bru1 chr2L 12195053 12195066 0.0346 0.014154 1

In [25]:
type(df)


Out[25]:
pandas.core.frame.DataFrame

In [57]:
grp = df[['TF', 'FBgn', 'cnt']].groupby(['TF','FBgn'])
df1 = grp.sum()
df1


Out[57]:
cnt
TF FBgn
FBgn0000028 FBgn0085422 1
FBgn0002521_3 FBgn0011259 1
FBgn0032151 1
FBgn0264815 1
FBgn0003499 FBgn0000114 1
FBgn0000286 1
FBgn0000529 1
FBgn0000547 3
FBgn0003502 1
FBgn0003963 1
FBgn0011676 1
FBgn0016076 1
FBgn0025777 1
FBgn0027932 2
FBgn0028888 1
FBgn0031294 1
FBgn0031299 1
FBgn0031903 1
FBgn0032006 1
FBgn0032178 2
FBgn0032350 1
FBgn0032456 1
FBgn0032817 1
FBgn0032968 2
FBgn0040992 1
FBgn0041789 1
FBgn0045852 1
FBgn0051634 1
FBgn0051676 2
FBgn0051774 1
... ... ...
FBgn0259172_2 FBgn0266888 1
FBgn0266891 1
FBgn0266901 2
FBgn0266903 1
FBgn0266909 1
FBgn0266910 1
FBgn0266912 1
FBgn0267007 1
FBgn0267017 1
FBgn0267252 5
FBgn0267264 2
FBgn0267267 1
FBgn0267268 1
FBgn0267279 1
FBgn0267283 2
FBgn0267287 1
FBgn0267486 1
FBgn0267573 1
FBgn0267579 2
FBgn0267689 3
FBgn0267727 1
FBgn0267732 1
FBgn0267738 1
FBgn0267828 1
FBgn0267923 1
FBgn0267964 18
FBgn0267993 1
FBgn0278598 1
FBgn0283651 15
FBgn0283682 1

14578 rows × 1 columns


In [59]:
df2 = df1.copy()
df2['cnt'] = 1
df3 = df2.reset_index().groupby('TF').sum()
print(df3.shape)
df3


(36, 1)
Out[59]:
cnt
TF
FBgn0000028 1
FBgn0002521_3 3
FBgn0003499 43
FBgn0003567 14
FBgn0005558_2 1
FBgn0005630_19 3
FBgn0005694_2 663
FBgn0005694_3 701
FBgn0010768 1187
FBgn0010768_2 1348
FBgn0013469_2 78
FBgn0020309 324
FBgn0020309_2 329
FBgn0020378 388
FBgn0020378_2 316
FBgn0027339 1018
FBgn0027339_2 756
FBgn0034810 217
FBgn0034810_2 89
FBgn0034946 458
FBgn0034946_2 491
FBgn0034946_3 518
FBgn0035625_3 1
FBgn0036179 204
FBgn0036179_2 509
FBgn0038787 217
FBgn0038787_2 733
FBgn0039169 128
FBgn0039169_2 208
FBgn0039350 532
FBgn0039905_2 1264
FBgn0040765_2 85
FBgn0086910 339
FBgn0086910_2 525
FBgn0259172 123
FBgn0259172_2 764

In [52]:
df1.loc['FBgn0003499'].shape


Out[52]:
(43, 1)

In [69]:
cols = ['patname', 'seqname','start', 'stop', 'strand','score','pval','qval','matchedseq']
fimodf = pd.read_table('/Users/bergeric/Projects/s2rnai/data/motif_alignments_flyFactor_dm6.2L.txt', header=None, skiprows=1, names=cols)
fimodf.head()


Out[69]:
patname seqname start stop strand score pval qval matchedseq
0 FBgn0086910 chr2L 6842555 6842573 - 27.0000 2.550000e-11 0.000403 AGGGGGGGGGGGGAGGGGT
1 FBgn0036179 chr2L 12330607 12330626 - 30.6697 3.440000e-11 0.000350 TGGGGGGGGGGGGGGAAAGT
2 FBgn0036179 chr2L 3984006 3984025 - 30.5046 4.590000e-11 0.000363 GGGGGGGGGGGGGGGAATGG
3 FBgn0036179 chr2L 19377896 19377915 + 30.3670 5.040000e-11 0.000363 GGGGGGGGGGGGGGGTTAAG
4 FBgn0036179 chr2L 21889128 21889147 - 30.2202 5.620000e-11 0.000363 GGGGGGGGGGGGGGGTTTCT

In [73]:
fimodf.patname.unique().shape


Out[73]:
(36,)

In [76]:
onthefly_df = pd.read_table('/Users/bergeric/Projects/s2rnai/output/phastcons_workflow/onTheFly/outfile.txt', lineterminator='\n')
onthefly_df['cnt']=1
onthefly_df.head()


Out[76]:
TF FBgn Symbol Chrom Start End q Value Phastcons cnt
0 OTF0039.1 FBgn0000097 aop chr2L 2158817 2158827 0.0458 0.9995 1
1 OTF0039.1 FBgn0000097 aop chr2L 2159453 2159463 0.0458 0.9982 1
2 OTF0039.1 FBgn0000097 aop chr2L 2171472 2171482 0.0273 0.9851 1
3 OTF0039.1 FBgn0000097 aop chr2L 2178400 2178410 0.0353 0.9989 1
4 OTF0039.1 FBgn0000097 aop chr2L 2178493 2178503 0.0273 0.9683 1

In [79]:
grp2 = onthefly_df[['TF', 'FBgn', 'cnt']].groupby(['TF','FBgn'])
flydf = grp2.sum()
flydf


Out[79]:
cnt
TF FBgn
OTF0039.1 FBgn0000097 5
FBgn0000114 25
FBgn0000180 2
FBgn0000182 1
FBgn0000183 1
FBgn0000239 2
FBgn0000250 1
FBgn0000251 12
FBgn0000256 4
FBgn0000273 2
FBgn0000286 2
FBgn0000299 1
FBgn0000307 3
FBgn0000308 3
FBgn0000317 2
FBgn0000320 2
FBgn0000370 1
FBgn0000392 4
FBgn0000409 1
FBgn0000464 12
FBgn0000490 4
FBgn0000497 6
FBgn0000547 21
FBgn0000636 5
FBgn0000721 5
FBgn0000964 2
FBgn0001075 4
FBgn0001114 2
FBgn0001170 2
FBgn0001301 10
... ... ...
OTF0516.1 FBgn0267257 3
FBgn0267259 2
FBgn0267264 1
FBgn0267277 2
FBgn0267281 3
FBgn0267285 1
FBgn0267293 1
FBgn0267294 6
FBgn0267486 2
FBgn0267488 1
FBgn0267573 2
FBgn0267579 1
FBgn0267689 1
FBgn0267727 4
FBgn0267732 1
FBgn0267821 1
FBgn0267828 6
FBgn0267929 4
FBgn0267964 45
FBgn0267972 1
FBgn0267993 1
FBgn0278598 2
FBgn0283440 3
FBgn0283442 3
FBgn0283467 1
FBgn0283531 2
FBgn0283651 21
FBgn0283652 1
FBgn0283682 3
FBgn0283728 1

7558 rows × 1 columns


In [80]:
flydf2 = flydf.copy()
flydf2['cnt'] = 1
flydf3 = flydf2.reset_index().groupby('TF').sum()
print(flydf3.shape)
flydf3


(13, 1)
Out[80]:
cnt
TF
OTF0039.1 663
OTF0063.1 1293
OTF0231.2 959
OTF0249.1 257
OTF0302.1 217
OTF0304.1 452
OTF0351.1 1005
OTF0353.1 425
OTF0361.1 374
OTF0397.1 307
OTF0481.1 204
OTF0494.1 215
OTF0516.1 1187

In [81]:
flydf.loc['OTF0063.1'].shape


Out[81]:
(1293, 1)

In [ ]: