# read, statistics, quality, crypt, decrypt, Metropolis, likelihood functions:

``````

In [2]:

import numpy as np
import math
import matplotlib.pyplot as plt
%matplotlib inline
import random
from numpy.random import rand
from copy import copy
from __future__ import division
import time

with open(filename) as f:
X = X[:wordsnumber]
X = ''.join(X)
X = X.replace('\n', '')
return X

with open(filename) as f:
X = X.replace('\n', '')
return X

def get_unicount(text):
length = len(text)
counts = np.zeros(26)
for i in xrange(length):
c = ord(text[i])
counts[c-97]+=1
#97-122
return counts

def get_bigram_stats_dic(text):
length = len(text)
dic = {}
for i in 'abcdefghijklmnopqrstuvwxyz':
for j in 'abcdefghijklmnopqrstuvwxyz':
dic[(i,j)]=0

for i in xrange(length-1):
if (text[i], text[i+1]) in dic:
dic[(text[i], text[i+1])] += 1

for k,v in dic.items():
dic[k] = v/(counts[ord(k[0])-97])
return dic

def quality(decrypted, original):
l = len(decrypted)
zipped = zip(decrypted, original)
return sum(1.0 for x,y in zipped if x == y)/l

def crypt(text):
p = range(26)
random.shuffle(p)
output=''
for ch in text:
try:
x = ord(ch) - ord('a')
output+=(chr(p[x] + ord('a')))
except:
pass
return output, p

def get_desiredPDF_bigram(permutation):
logp = 0
for i in xrange(len(encrypted)-1):
pr = stats[chr(permutation[ord(encrypted[i])-97]+97),
chr(permutation[ord(encrypted[i+1])-97]+97)]
if pr>0:
logp += math.log(pr)
else:
logp += -9 #penalty for non existant pairs
return logp

def metropolis( desiredPDF, initValue, computableRVS, skipIterations = 1000):
random_variable = initValue
random_variableDensityValue = desiredPDF( random_variable )

for i in xrange( skipIterations ):
candidate = computableRVS( random_variable )
candidateDensityValue = desiredPDF( candidate )
acceptanceProb = min(0, candidateDensityValue - random_variableDensityValue )
if math.log(random.random()) < acceptanceProb:
random_variable = candidate
random_variableDensityValue = candidateDensityValue

while True:
candidate = computableRVS( random_variable )
candidateDensityValue = desiredPDF( candidate )

acceptanceProb = min( 0, candidateDensityValue - random_variableDensityValue )

if math.log(random.random()) < acceptanceProb:
random_variable = candidate
random_variableDensityValue = candidateDensityValue
yield random_variable, random_variableDensityValue

def uniform( n ):
#initialize permutation with identical
permutation = [ i for i in xrange( n ) ]

#swap ith object with random onject from i to n - 1 enclusively
for i in xrange( n ):
j = random.randint( i, n - 1 )
permutation[ i ], permutation[ j ] = permutation[ j ], permutation[ i ]

return permutation

def applyTransposition( basePermutation ):
n = len( basePermutation )

permutation = copy( basePermutation )

#apply n random transpositions (including identical) to base permutation
#     for i in xrange( n ):
k, l = random.randint( 0, n - 1 ), random.randint( 0, n - 1 )
permutation[ k ], permutation[ l ] = permutation[ l ], permutation[ k ]

return  permutation

def decrypt(permutation, encrypted):
decrypted = []
for i in encrypted:
decrypted.append(chr(permutation[ord(i)-97]+97))
return ''.join(decrypted)

``````

fixed test text:

``````

In [6]:

fname = 'main/oliver_twist.txt'
encrypted,p = crypt(original)
print encrypted[:20]

``````
``````

lwrocgsrtlnylrxircnr

``````

vary size of train text (used War and Peace, total 3Mb)

``````

In [10]:

from sys import stdout
sizes =  [0.25,0.5,0.75,1]
qs = []
times = 4
for k in xrange(times):
for s in sizes:
print 'Start at', time.strftime('%H:%M:%S', time.localtime())
stdout.flush()
counts = get_unicount(train_text)
stats = get_bigram_stats_dic(train_text)
init_p = uniform(26)
#Metropolis here
st = time.time()
computableGen = lambda t: applyTransposition(t)
metropolisgenerator = \
metropolis(get_desiredPDF_bigram, init_p, computableGen, 1500)

print 'Sampling...'
stdout.flush()
y = -float("inf")
for i in xrange( 1000 ): #<=========
cand = metropolisgenerator.next()
if (cand[1] > y):
y = cand[1]
x = cand[0]

et =  time.time() - st
print 'metropolis time: ', et
print 'best density among 1000 last iterations: ', y
print 'corresponding permutation: ', x

decrypted = decrypt(x, encrypted)
qs.append( quality(decrypted, original))
print 'End at  ', time.strftime('%H:%M:%S', time.localtime())
time.sleep(1.0)

plt.plot(sizes[:len(qs)], qs[:len(sizes)],
sizes[:len(qs)], qs[len(sizes):2*len(sizes)],
sizes[:len(qs)], qs[2*len(sizes):3*len(sizes)],
sizes[:len(qs)], qs[3*len(sizes):4*len(sizes)],)

``````
``````

Start at 09:34:00
Sampling...
metropolis time:  167.453365088
best density among 1000 last iterations:  -56612.5920152
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   09:36:48
Start at 09:36:49
Sampling...
metropolis time:  242.96111989
best density among 1000 last iterations:  -56449.9500405
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   09:40:53
Start at 09:40:54
Sampling...
metropolis time:  248.331638813
best density among 1000 last iterations:  -56489.294617
corresponding permutation:  [8, 3, 17, 9, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 25, 2, 10, 18, 7, 13, 20, 24]
End at   09:45:04
Start at 09:45:05
Sampling...
metropolis time:  187.028258085
best density among 1000 last iterations:  -56443.5009922
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   09:48:18
Start at 09:48:19
Sampling...
metropolis time:  212.592648029
best density among 1000 last iterations:  -57907.4385259
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 2, 22, 4, 9, 10, 21, 18, 7, 13, 20, 24]
End at   09:51:52
Start at 09:51:53
Sampling...
metropolis time:  165.989115
best density among 1000 last iterations:  -56449.9500405
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   09:54:40
Start at 09:54:41
Sampling...
metropolis time:  245.037575006
best density among 1000 last iterations:  -56459.7567613
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   09:58:48
Start at 09:58:49
Sampling...
metropolis time:  236.601491928
best density among 1000 last iterations:  -69695.6823139
corresponding permutation:  [13, 18, 4, 16, 5, 23, 19, 0, 22, 25, 11, 14, 24, 10, 15, 6, 1, 17, 9, 2, 21, 3, 20, 8, 7, 12]
End at   10:02:48
Start at 10:02:49
Sampling...
metropolis time:  205.495482922
best density among 1000 last iterations:  -57606.4074425
corresponding permutation:  [8, 3, 17, 21, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 25, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   10:06:15
Start at 10:06:16
Sampling...
metropolis time:  167.041108131
best density among 1000 last iterations:  -56449.9500405
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   10:09:04
Start at 10:09:05
Sampling...
metropolis time:  165.338798046
best density among 1000 last iterations:  -56459.7567613
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   10:11:52
Start at 10:11:53
Sampling...
metropolis time:  165.96084094
best density among 1000 last iterations:  -56443.5009922
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   10:14:41
Start at 10:14:42
Sampling...
metropolis time:  165.614757061
best density among 1000 last iterations:  -56612.5920152
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   10:17:29
Start at 10:17:30
Sampling...
metropolis time:  167.244617939
best density among 1000 last iterations:  -56576.3395128
corresponding permutation:  [8, 3, 17, 16, 12, 23, 14, 11, 1, 25, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   10:20:18
Start at 10:20:19
Sampling...
metropolis time:  166.071938038
best density among 1000 last iterations:  -56459.7567613
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   10:23:07
Start at 10:23:08
Sampling...
metropolis time:  224.484329939
best density among 1000 last iterations:  -56443.5009922
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   10:26:54

Out[10]:

[<matplotlib.lines.Line2D at 0xaf02f70c>,
<matplotlib.lines.Line2D at 0xaf02faec>,
<matplotlib.lines.Line2D at 0xaf02f24c>,
<matplotlib.lines.Line2D at 0xaf032b6c>]

``````
``````

In [29]:

#qualities_1 = np.array(qs).reshape((4, 4))
print qualities_1
I = np.array([0, 1, 2, 3]) # take into account 'explosions'
means_1 = np.mean(qualities_1[I, :], 0)
stds_1 = np.std(qualities_1[I, :], 0)
sizes = [0.25, 0.5, 0.75, 1.0]
plt.title('Figure t3b2. Dependence quality on train text size.')
plt.xlabel('size, MB')
plt.ylabel('quality')
plt.plot(sizes, means_1 + stds_1, 'r:+')
plt.plot(sizes, means_1 - stds_1, 'r:+')
plt.plot(sizes, means_1, 'b-x')

``````
``````

[[ 1.          1.          0.99918831  1.        ]
[ 0.95463564  1.          1.          0.05104618]
[ 0.98656205  1.          1.          1.        ]
[ 1.          0.998557    1.          1.        ]]

``````

for small train text quality graphs prove intuition that the small sized training does not give much information.

``````

In [17]:

sizes =  [2, 3]
qs = []
times = 4
for k in xrange(times):
print '%d-th series running...' % k
for j, s in enumerate(sizes):
print j, 'Start at', time.strftime('%H:%M:%S', time.localtime())
stdout.flush()
counts = get_unicount(train_text)
stats = get_bigram_stats_dic(train_text)
init_p = uniform(26)
#Metropolis here
st = time.time()
computableGen = lambda t: applyTransposition(t)
metropolisgenerator = \
metropolis(get_desiredPDF_bigram, init_p, computableGen, 1500)

y = -float("inf")
for i in xrange( 1000 ): #<=========
cand = metropolisgenerator.next()
if (cand[1] > y):
y = cand[1]
x = cand[0]

et =  time.time() - st
print 'metropolis time: ', et
print 'best density among 1000 last iterations: ', y
print 'corresponding permutation: ', x

decrypted = decrypt(x, encrypted)
qs.append( quality(decrypted, original))
print 'End at  ', time.strftime('%H:%M:%S', time.localtime())
time.sleep(1.0)

qualities_2 = np.array(qs)
plt.plot(sizes[:len(qs)], qs[:len(sizes)],
sizes[:len(qs)], qs[len(sizes):2*len(sizes)],
sizes[:len(qs)], qs[2*len(sizes):3*len(sizes)],
sizes[:len(qs)], qs[3*len(sizes):4*len(sizes)],)

``````
``````

0-th series running...
0 Start at 10:37:00
metropolis time:  165.200503111
best density among 1000 last iterations:  -67112.4596578
corresponding permutation:  [19, 3, 17, 25, 2, 23, 14, 11, 1, 16, 0, 8, 5, 20, 15, 21, 12, 4, 9, 22, 10, 7, 13, 18, 6, 24]
End at   10:39:50
1 Start at 10:39:51
metropolis time:  165.211738825
best density among 1000 last iterations:  -71611.1805524
corresponding permutation:  [13, 20, 8, 16, 6, 23, 19, 14, 1, 25, 18, 4, 24, 10, 5, 15, 22, 17, 9, 2, 21, 3, 0, 7, 11, 12]
End at   10:42:43
1-th series running...
0 Start at 10:42:44
metropolis time:  201.486129045
best density among 1000 last iterations:  -56409.8117851
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   10:46:10
1 Start at 10:46:11
metropolis time:  208.907721043
best density among 1000 last iterations:  -71104.1219204
corresponding permutation:  [11, 13, 8, 9, 24, 23, 17, 20, 1, 25, 19, 4, 7, 22, 15, 10, 2, 18, 16, 5, 21, 3, 0, 14, 12, 6]
End at   10:49:57
2-th series running...
0 Start at 10:49:58
metropolis time:  217.643985987
best density among 1000 last iterations:  -58550.850755
corresponding permutation:  [8, 3, 17, 9, 12, 23, 14, 11, 1, 16, 0, 19, 20, 6, 15, 21, 22, 4, 25, 2, 10, 18, 7, 13, 5, 24]
End at   10:53:47
1 Start at 10:53:48
metropolis time:  210.609104156
best density among 1000 last iterations:  -56398.5046942
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   10:57:25
3-th series running...
0 Start at 10:57:26
metropolis time:  163.672365189
best density among 1000 last iterations:  -56409.8117851
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   11:00:15
1 Start at 11:00:16
metropolis time:  210.031395912
best density among 1000 last iterations:  -56398.5046942
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   11:03:52

Out[17]:

[<matplotlib.lines.Line2D at 0xaeda584c>,
<matplotlib.lines.Line2D at 0xaeda5a6c>,

``````
``````

In [31]:

qualities_2 = np.array(qs).reshape((4, 2))
print qualities_2
means_2 = np.mean(qualities_2, 0)
stds_2 = np.std(qualities_2, 0)
sizes = [2.0, 3.0]
plt.title('Figure t3b2. Dependence quality on train text size.')
plt.xlabel('size, MB')
plt.ylabel('quality')
plt.plot(sizes, means_2 + stds_2, 'r:+')
plt.plot(sizes, means_2 - stds_2, 'r:+')
plt.plot(sizes, means_2, 'b-x')

``````
``````

[[ 0.53323413  0.07593795]
[ 1.          0.04486833]
[ 0.95305736  1.        ]
[ 1.          1.        ]]

``````
``````

In [32]:

means = np.zeros(6)
means[0:4] = means_1
means[4:6] = means_2
stds = np.zeros(6)
stds[0:4] = stds_1
stds[4:6] = stds_2
full_sizes = [0.25, 0.5, 0.75, 1.0, 2.0, 3.0]
plt.title('Figure t3b_full. Dependence quality on train text size.')
plt.xlabel('size, MB')
plt.ylabel('quality')
plt.plot(full_sizes, means + stds, 'r:+')
plt.plot(full_sizes, means - stds, 'r:+')
plt.plot(full_sizes, means, 'b-x')

``````
``````

``````

this may be caused by too little col iterations, let's raise it up to 2500:

``````

In [53]:

sizes =  [0.5, 1.75, 3]
qs = []
times = 3
for k in xrange(times):
for s in sizes:

counts = get_unicount(train_text)
stats = get_bigram_stats_dic(train_text)
init_p = uniform(26)
#Metropolis here
st = time.time()
computableGen = lambda t: applyTransposition(t)
metropolisgenerator = \
metropolis(get_desiredPDF_bigram, init_p, computableGen, 2500)

y = -float("inf")
for i in xrange( 500 ): #<=========
cand = metropolisgenerator.next()
if (cand[1] > y):
y = cand[1]
x = cand[0]

et =  time.time() - st
print 'metropolis time: ', et

print 'best density among 500 last iterations: ', y
print 'corresponding permutation: ', x

decrypted = decrypt(x, encrypted)
qs.append( quality(decrypted, original))

plt.plot(sizes[:len(qs)], qs[:len(sizes)],
sizes[:len(qs)], qs[len(sizes):2*len(sizes)],
sizes[:len(qs)], qs[2*len(sizes):3*len(sizes)],
sizes[:len(qs)], qs[3*len(sizes):4*len(sizes)],)

``````
``````

metropolis time:  161.106999874
best density among 500 last iterations:  -56449.9500405
corresponding permutation:  [18, 21, 24, 4, 19, 15, 1, 3, 20, 16, 2, 8, 9, 23, 7, 10, 12, 0, 17, 11, 6, 25, 5, 13, 14, 22]
metropolis time:  161.350000143
best density among 500 last iterations:  -56410.2005938
corresponding permutation:  [18, 21, 24, 4, 19, 15, 1, 3, 20, 16, 2, 8, 9, 23, 7, 10, 12, 0, 17, 11, 6, 25, 5, 13, 14, 22]
metropolis time:  157.572000027
best density among 500 last iterations:  -56398.5046942
corresponding permutation:  [18, 21, 24, 4, 19, 15, 1, 3, 20, 16, 2, 8, 9, 23, 7, 10, 12, 0, 17, 11, 6, 25, 5, 13, 14, 22]
metropolis time:  156.710000038
best density among 500 last iterations:  -58108.899908
corresponding permutation:  [18, 21, 24, 4, 19, 15, 1, 3, 20, 16, 2, 0, 9, 23, 7, 10, 12, 8, 17, 11, 6, 25, 5, 13, 14, 22]
metropolis time:  156.017999887
best density among 500 last iterations:  -56410.2005938
corresponding permutation:  [18, 21, 24, 4, 19, 15, 1, 3, 20, 16, 2, 8, 9, 23, 7, 10, 12, 0, 17, 11, 6, 25, 5, 13, 14, 22]
metropolis time:  154.515999794
best density among 500 last iterations:  -56398.5046942
corresponding permutation:  [18, 21, 24, 4, 19, 15, 1, 3, 20, 16, 2, 8, 9, 23, 7, 10, 12, 0, 17, 11, 6, 25, 5, 13, 14, 22]
metropolis time:  153.356999874
best density among 500 last iterations:  -56449.9500405
corresponding permutation:  [18, 21, 24, 4, 19, 15, 1, 3, 20, 16, 2, 8, 9, 23, 7, 10, 12, 0, 17, 11, 6, 25, 5, 13, 14, 22]
metropolis time:  153.322999954
best density among 500 last iterations:  -56410.2005938
corresponding permutation:  [18, 21, 24, 4, 19, 15, 1, 3, 20, 16, 2, 8, 9, 23, 7, 10, 12, 0, 17, 11, 6, 25, 5, 13, 14, 22]
metropolis time:  157.593999863
best density among 500 last iterations:  -56398.5046942
corresponding permutation:  [18, 21, 24, 4, 19, 15, 1, 3, 20, 16, 2, 8, 9, 23, 7, 10, 12, 0, 17, 11, 6, 25, 5, 13, 14, 22]

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-53-621c4230538e> in <module>()
33          sizes[:len(qs)], qs[len(sizes):2*len(sizes)],
34          sizes[:len(qs)], qs[2*len(sizes):3*len(sizes)],
---> 35          sizes[:len(qs)], qs[3*len(sizes):4*len(sizes)],)

C:\Users\Maremun\Anaconda\lib\site-packages\matplotlib\pyplot.pyc in plot(*args, **kwargs)
2985         ax.hold(hold)
2986     try:
-> 2987         ret = ax.plot(*args, **kwargs)
2988         draw_if_interactive()
2989     finally:

C:\Users\Maremun\Anaconda\lib\site-packages\matplotlib\axes.pyc in plot(self, *args, **kwargs)
4135         lines = []
4136
-> 4137         for line in self._get_lines(*args, **kwargs):
4139             lines.append(line)

C:\Users\Maremun\Anaconda\lib\site-packages\matplotlib\axes.pyc in _grab_next_args(self, *args, **kwargs)
315                 return
316             if len(remaining) <= 3:
--> 317                 for seg in self._plot_args(remaining, kwargs):
318                     yield seg
319                 return

C:\Users\Maremun\Anaconda\lib\site-packages\matplotlib\axes.pyc in _plot_args(self, tup, kwargs)
293             x = np.arange(y.shape[0], dtype=float)
294
--> 295         x, y = self._xy_from_xy(x, y)
296
297         if self.command == 'plot':

C:\Users\Maremun\Anaconda\lib\site-packages\matplotlib\axes.pyc in _xy_from_xy(self, x, y)
235         y = np.atleast_1d(y)
236         if x.shape[0] != y.shape[0]:
--> 237             raise ValueError("x and y must have same first dimension")
238         if x.ndim > 2 or y.ndim > 2:
239             raise ValueError("x and y can be no greater than 2-D")

ValueError: x and y must have same first dimension

``````
``````

In [54]:

plt.plot(sizes[:len(qs)], qs[:len(sizes)],
sizes[:len(qs)], qs[len(sizes):2*len(sizes)],
sizes[:len(qs)], qs[2*len(sizes):3*len(sizes)])

``````
``````

Out[54]:

[<matplotlib.lines.Line2D at 0xd2b5358>,
<matplotlib.lines.Line2D at 0xd2b5588>,
<matplotlib.lines.Line2D at 0xd2b5b70>]

``````
``````

In [52]:

bp = np.zeros(26, dtype=int)
for i in p:
bp[p[i]] = i
q = get_desiredPDF_bigram(bp)
print bp
print q

``````
``````

[18 21 24  4 19 15  1  3 20 16  2  8  9 23  7 10 12  0 17 11  6 25  5 13 14
22]
-56398.5046942

``````

super.txt

``````

In [8]:

print supertext[:100]
print supertext[140000:140100]

``````
``````

projectgutenbergsetextofshakespearesfirstfolioplaysthisisourrdeditionofmostoftheseplaysseetheindexco
owinwordsisawomansonelyvertueipraytheeoutwithtandplaceitforherchiefevertuespitemsheisproudlaoutwitht

``````

shakespeare!