In [28]:
%pylab inline
%config InlineBackend.figure_format = 'retina'
import seaborn as sns
import pandas as pd
sns.set_style('ticks')


Populating the interactive namespace from numpy and matplotlib
/home/camille/miniconda/envs/py3.assembly/lib/python3.5/site-packages/IPython/core/magics/pylab.py:161: UserWarning: pylab import has clobbered these variables: ['ma']
`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"

In [29]:
data = pd.read_csv('Ast_g.heptamerbias.csv', header=None, names=['kmer', 1, 2, 3, 4, 5, 6, 7, 8])

In [32]:
data.head()


Out[32]:
kmer 1 2 3 4 5 6 7 8
0 AAAAAAA 633772 112055 2946239 749791 253383 108725 32248 23436
1 AAAAAAT 354222 31994 1903689 365184 123257 53758 13768 4579
2 AAAAAAC 118413 20474 1138971 222564 75488 45414 13983 2549
3 AAAAAAG 159322 30781 1808843 382415 114882 42002 10321 2794
4 AAAAATA 99801 19939 1455795 235026 60661 21810 5664 189

In [30]:
figsize(16,12)
for d in range(2,9):
    sns.distplot(data[d], hist=False, label='degree={0}'.format(d))
legend()


/home/camille/miniconda/envs/py3.assembly/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[30]:
<matplotlib.legend.Legend at 0x7fb7a11d6b00>

In [33]:
P_h = pd.Series(dict(zip(data.kmer, 
                         data[list(range(1,9))].sum(axis=1) / data.set_index('kmer')[list(range(1,9))].sum(axis=1).sum())))

In [34]:
P_h.head()


Out[34]:
AAAAAAA    0.000372
AAAAAAC    0.000125
AAAAAAG    0.000195
AAAAAAT    0.000218
AAAAACA    0.000176
dtype: float64

In [35]:
P_D = data[list(range(1,9))].sum(axis=0) / data[list(range(1,9))].sum(axis=0).sum()

In [36]:
P_D


Out[36]:
1    0.039067
2    0.006898
3    0.667756
4    0.150372
5    0.064586
6    0.035847
7    0.022010
8    0.013465
dtype: float64

In [37]:
tidy = pd.melt(data, id_vars=['kmer'], value_vars=[1,2,3,4,5,6,7,8], var_name='degree', value_name='heptamer_count')

In [38]:
tidy.head()


Out[38]:
kmer degree heptamer_count
0 AAAAAAA 1 633772
1 AAAAAAT 1 354222
2 AAAAAAC 1 118413
3 AAAAAAG 1 159322
4 AAAAATA 1 99801

In [39]:
tidy['P(h|D)'] = (tidy.set_index(['degree', 'kmer']) / tidy.groupby('degree').sum()).reset_index()['heptamer_count']

In [40]:
tidy.head()


Out[40]:
kmer degree heptamer_count P(h|D)
0 AAAAAAA 1 633772 0.001242
1 AAAAAAT 1 354222 0.000694
2 AAAAAAC 1 118413 0.000232
3 AAAAAAG 1 159322 0.000312
4 AAAAATA 1 99801 0.000196

In [41]:
def p_D_h(row):
    return (row['P(h|D)'] * P_D[row.degree]) / P_h[row.kmer]

tidy['P(D|h)'] = tidy.apply(p_D_h, axis=1)

In [42]:
tidy.sort_values(['kmer', 'degree'])


Out[42]:
kmer degree heptamer_count P(h|D) P(D|h)
0 AAAAAAA 1 633772 0.001242 0.130415
16384 AAAAAAA 2 112055 0.001244 0.023058
32768 AAAAAAA 3 2946239 0.000338 0.606266
49152 AAAAAAA 4 749791 0.000382 0.154289
65536 AAAAAAA 5 253383 0.000300 0.052140
81920 AAAAAAA 6 108725 0.000232 0.022373
98304 AAAAAAA 7 32248 0.000112 0.006636
114688 AAAAAAA 8 23436 0.000133 0.004823
2 AAAAAAC 1 118413 0.000232 0.072298
16386 AAAAAAC 2 20474 0.000227 0.012500
32770 AAAAAAC 3 1138971 0.000131 0.695404
49154 AAAAAAC 4 222564 0.000113 0.135887
65538 AAAAAAC 5 75488 0.000089 0.046090
81922 AAAAAAC 6 45414 0.000097 0.027728
98306 AAAAAAC 7 13983 0.000049 0.008537
114690 AAAAAAC 8 2549 0.000014 0.001556
3 AAAAAAG 1 159322 0.000312 0.062446
16387 AAAAAAG 2 30781 0.000342 0.012065
32771 AAAAAAG 3 1808843 0.000207 0.708972
49155 AAAAAAG 4 382415 0.000195 0.149887
65539 AAAAAAG 5 114882 0.000136 0.045028
81923 AAAAAAG 6 42002 0.000090 0.016463
98307 AAAAAAG 7 10321 0.000036 0.004045
114691 AAAAAAG 8 2794 0.000016 0.001095
1 AAAAAAT 1 354222 0.000694 0.124269
16385 AAAAAAT 2 31994 0.000355 0.011224
32769 AAAAAAT 3 1903689 0.000218 0.667855
49153 AAAAAAT 4 365184 0.000186 0.128114
65537 AAAAAAT 5 123257 0.000146 0.043241
81921 AAAAAAT 6 53758 0.000115 0.018859
... ... ... ... ... ...
38228 TTTTTTA 3 1462283 0.000168 0.702995
54612 TTTTTTA 4 304387 0.000155 0.146335
70996 TTTTTTA 5 93509 0.000111 0.044955
87380 TTTTTTA 6 36263 0.000077 0.017434
103764 TTTTTTA 7 3950 0.000014 0.001899
120148 TTTTTTA 8 1676 0.000010 0.000806
5462 TTTTTTC 1 311874 0.000611 0.101950
21846 TTTTTTC 2 40853 0.000453 0.013355
38230 TTTTTTC 3 2098806 0.000241 0.686091
54614 TTTTTTC 4 426911 0.000217 0.139555
70998 TTTTTTC 5 125302 0.000149 0.040961
87382 TTTTTTC 6 33772 0.000072 0.011040
103766 TTTTTTC 7 14792 0.000051 0.004835
120150 TTTTTTC 8 6767 0.000038 0.002212
5463 TTTTTTG 1 156159 0.000306 0.062530
21847 TTTTTTG 2 37543 0.000417 0.015033
38231 TTTTTTG 3 1798205 0.000206 0.720049
54615 TTTTTTG 4 357419 0.000182 0.143120
70999 TTTTTTG 5 107241 0.000127 0.042942
87383 TTTTTTG 6 31288 0.000067 0.012529
103767 TTTTTTG 7 8535 0.000030 0.003418
120151 TTTTTTG 8 948 0.000005 0.000380
5461 TTTTTTT 1 687400 0.001347 0.120051
21845 TTTTTTT 2 168027 0.001865 0.029345
38229 TTTTTTT 3 3448386 0.000395 0.602244
54613 TTTTTTT 4 920055 0.000468 0.160683
70997 TTTTTTT 5 310907 0.000369 0.054298
87381 TTTTTTT 6 117485 0.000251 0.020518
103765 TTTTTTT 7 41443 0.000144 0.007238
120149 TTTTTTT 8 32189 0.000183 0.005622

131072 rows × 5 columns

P(D|h) = P(h|D) * P(D) / P(h)


In [31]:
tidy.sort_values('heptamer_prob', ascending=False)


Out[31]:
kmer degree heptamer_count heptamer_prob
39269 TCTTCTT 3 4287311 0.000328
42390 CTTCTTC 3 4090565 0.000313
38489 TTCTTCT 3 3976754 0.000304
33548 AAGAAGA 3 3853936 0.000295
45251 GAAGAAG 3 3719223 0.000285
35888 AGAAGAA 3 3572836 0.000274
34848 ACAACAA 3 3506415 0.000268
33288 AACAACA 3 3476987 0.000266
38749 TTGTTGT 3 3452287 0.000264
38229 TTTTTTT 3 3448386 0.000264
38245 TTTTCTT 3 3423248 0.000262
40309 TGTTGTT 3 3415981 0.000262
41350 CATCATC 3 3367580 0.000258
38485 TTCTTTT 3 3344746 0.000256
39009 TCATCAT 3 3298124 0.000252
38293 TTTCTTT 3 3209276 0.000246
38294 TTTCTTC 3 3159250 0.000242
41090 CAACAAC 3 3155236 0.000242
34328 ATCATCA 3 3153522 0.000241
45511 GATGATG 3 3117612 0.000239
46551 GTTGTTG 3 3078574 0.000236
34588 ATGATGA 3 3059253 0.000234
33536 AAGAAAA 3 3029086 0.000232
32816 AAAAGAA 3 3024434 0.000232
40049 TGATGAT 3 2994084 0.000229
32768 AAAAAAA 3 2946239 0.000226
39253 TCTTTTT 3 2939847 0.000225
38488 TTCTTCA 3 2910304 0.000223
45248 GAAGAAA 3 2881748 0.000221
32960 AAAGAAA 3 2820685 0.000216
... ... ... ... ...
118436 AGCCCTA 8 0 0.000000
128682 GTCCCCC 8 0 0.000000
93932 CGCGCGA 6 0 0.000000
122171 TGTAGCG 8 0 0.000000
122177 TGTTAAT 8 0 0.000000
122219 TGTTCCG 8 0 0.000000
128628 GTCTGTA 8 0 0.000000
118446 AGCCCGC 8 0 0.000000
128612 GTCTCTA 8 0 0.000000
124347 CTTCGCG 8 0 0.000000
118995 TAAGTAG 8 0 0.000000
128620 GTCTCGA 8 0 0.000000
118996 TAAGTTA 8 0 0.000000
128622 GTCTCGC 8 0 0.000000
128623 GTCTCGG 8 0 0.000000
118439 AGCCCTG 8 0 0.000000
126372 CGTCCTA 8 0 0.000000
119000 TAAGTCA 8 0 0.000000
122180 TGTTATA 8 0 0.000000
128631 GTCTGTG 8 0 0.000000
128633 GTCTGCT 8 0 0.000000
119001 TAAGTCT 8 0 0.000000
119003 TAAGTCG 8 0 0.000000
126365 CGTCTGT 8 0 0.000000
122191 TGTTAGG 8 0 0.000000
126363 CGTCTCG 8 0 0.000000
119004 TAAGTGA 8 0 0.000000
122185 TGTTACT 8 0 0.000000
110075 CGTGGCG 7 0 0.000000
119143 TATTCTG 8 0 0.000000

131072 rows × 4 columns


In [43]:
sns.distplot(tidy.heptamer_prob, hist=False)
vlines(.25 ** 7, 0, plt.gca().get_ylim()[1], label='Random heptamer prob')
legend()


/home/camille/miniconda/envs/py3.assembly/lib/python3.5/site-packages/statsmodels/nonparametric/kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[43]:
<matplotlib.legend.Legend at 0x7fb7270ad9b0>

In [2]:
paths = pd.read_csv('Ast_g.paths.csv')

In [13]:
paths.set_index('t', inplace=True)

In [4]:
for name, group in paths.groupby('kmer'):
    plot(group.t, group.path_len)



In [14]:
paths.head()


Out[14]:
hash kmer path_len kmer_count
t
422 13993627044986761216 TCGCAGTGTTGTTGGTGTTGATGTGGA 50 0
425 7691119506052603904 CCACCGATCTTCCACCCACTCACCCTA 50 0
495 17154751307217350656 GAAGATGTTGACACTGTCATAGATTTG 50 0
676 8555021847222198272 CTTTCTCCTTGGTGACTGATTACGAAA 50 0
843 18048606665953738752 GTTATGGAAGTTTACACAGAAGATGGT 50 0

In [27]:
ma = paths.path_len.rolling(1000).mean()
mstd = paths.path_len.rolling(1000).std()
plot(ma.index, ma)
fill_between(mstd.index, ma-2*mstd, ma+2*mstd, color='b', alpha=0.2)


Out[27]:
<matplotlib.collections.PolyCollection at 0x7fb7ae7059b0>

In [ ]: