In [5]:
import numpy as np
import math
import matplotlib.pyplot as plt
%matplotlib inline
import random
from numpy.random import rand
from copy import copy
from __future__ import division
import time
def read_text_words(filename, wordsnumber):
with open(filename) as f:
X = f.readlines()
X = X[:wordsnumber]
X = ''.join(X)
X = X.replace('\n', '')
return X
def read_text_filesize(filename, size):
with open(filename) as f:
X = f.read(int(round(1024*1024*size)))
X = X.replace('\n', '')
return X
def get_unigram_stats(text):
length = len(text)
stats = np.zeros(27)
for i in xrange(length):
c = ord(text[i])
stats[c-97]+=1
stats = stats[:26]
return stats/length
def get_desiredPDF_unigram(permutation):
logp = 0
for i in xrange(len(encrypted)):
if ord(encrypted[i]) != 123:
logp += math.log(stats[permutation[ord(encrypted[i])-97]])
return logp
def quality(decrypted, original):
l = len(decrypted)
zipped = zip(decrypted, original)
return sum(1.0 for x,y in zipped if x == y)/l
def crypt(text):
p = range(26)
random.shuffle(p)
output=''
for ch in text:
try:
x = ord(ch) - ord('a')
output+=(chr(p[x] + ord('a')))
except:
pass
return output, p
def metropolis( desiredPDF, initValue, computableRVS, skipIterations = 1000):
random_variable = initValue
random_variableDensityValue = desiredPDF( random_variable )
for i in xrange( skipIterations ):
candidate = computableRVS( random_variable )
candidateDensityValue = desiredPDF( candidate )
acceptanceProb = min(0, candidateDensityValue - random_variableDensityValue )
if math.log(random.random()) < acceptanceProb:
random_variable = candidate
random_variableDensityValue = candidateDensityValue
while True:
candidate = computableRVS( random_variable )
candidateDensityValue = desiredPDF( candidate )
acceptanceProb = min( 0, candidateDensityValue - random_variableDensityValue )
if math.log(random.random()) < acceptanceProb:
random_variable = candidate
random_variableDensityValue = candidateDensityValue
yield random_variable, random_variableDensityValue
def uniform( n ):
#initialize permutation with identical
permutation = [ i for i in xrange( n ) ]
#swap ith object with random onject from i to n - 1 enclusively
for i in xrange( n ):
j = random.randint( i, n - 1 )
permutation[ i ], permutation[ j ] = permutation[ j ], permutation[ i ]
return permutation
def applyTransposition( basePermutation ):
n = len( basePermutation )
permutation = copy( basePermutation )
#apply n random transpositions (including identical) to base permutation
# for i in xrange( n ):
k, l = random.randint( 0, n - 1 ), random.randint( 0, n - 1 )
permutation[ k ], permutation[ l ] = permutation[ l ], permutation[ k ]
return permutation
def decrypt(permutation, encrypted):
decrypted = []
for i in encrypted:
decrypted.append(chr(permutation[ord(i)-97]+97))
return ''.join(decrypted)
crypt with given permutation:
In [3]:
def crypt_given(permutation, text):
output=''
for ch in text:
try:
x = ord(ch) - ord('a')
output+=(chr(permutation[x] + ord('a')))
except:
pass
return output
fixed training text (War and Peace):
In [6]:
train_text = read_text_filesize('main/super.txt',64)
stats = get_unigram_stats(train_text)
vary size of train text (used War and Peace, total 3Mb)
In [7]:
sizes = [100,200,400,800,1000,2000,4000,8000,16000]
qs = list()
fixed_permutation = uniform(26)
for s in sizes:
print 'Start at', time.strftime('%H:%M:%S', time.localtime())
print 'size of encrypted text: ',s
original = read_text_words('main/oliver_twist.txt', s)[3:]
encrypted = crypt_given(fixed_permutation, original)
init_p = uniform(26)
#Metropolis here
st = time.time()
computableGen = lambda t: applyTransposition(t)
metropolisgenerator = \
metropolis(get_desiredPDF_unigram, init_p, computableGen, 1500)
y = -float("inf")
for i in xrange( 1000 ):
cand = metropolisgenerator.next()
if (cand[1] > y):
y = cand[1]
x = cand[0]
et = time.time() - st
print 'metropolis time: ', et
print 'best density among 1000 last iterations: ', y
print 'corresponding permutation: ', x
decrypted = decrypt(x, encrypted)
qs.append(quality(decrypted, original))
print 'End at ', time.strftime('%H:%M:%S', time.localtime())
plt.plot(sizes[:len(qs)], qs)
Out[7]:
In [15]:
plt.title('Dependence quality on size of tests')
plt.xlabel('sizes')
plt.ylabel('quality')
plt.plot(sizes[:len(qs)], qs)
plt.savefig('task4 unigram shapespeare.png')
In [ ]: