In [5]:
import numpy as np
import math
import matplotlib.pyplot as plt
%matplotlib inline
import random
from numpy.random import rand
from copy import copy
from __future__ import division
import time

def read_text_words(filename, wordsnumber):    
    with open(filename) as f:
        X = f.readlines()
        X = X[:wordsnumber]
    X = ''.join(X) 
    X = X.replace('\n', '') 
    return X

def read_text_filesize(filename, size):
    with open(filename) as f:
        X = f.read(int(round(1024*1024*size)))
    X = X.replace('\n', '')
    return X

def get_unicount(text):
    length = len(text)
    counts = np.zeros(26)
    for i in xrange(length):
        c = ord(text[i])
        counts[c-97]+=1
        #97-122 
    return counts

def get_bigram_stats_dic(text):        
    length = len(text)
    dic = {}
    for i in 'abcdefghijklmnopqrstuvwxyz':
        for j in 'abcdefghijklmnopqrstuvwxyz':
            dic[(i,j)]=0

    for i in xrange(length-1):                   
        if (text[i], text[i+1]) in dic:
            dic[(text[i], text[i+1])] += 1
            
    for k,v in dic.items():        
        dic[k] = v/(counts[ord(k[0])-97])
    return dic

def quality(decrypted, original):
    l = len(decrypted)
    zipped = zip(decrypted, original)    
    return sum(1.0 for x,y in zipped if x == y)/l

def crypt(text):
    p = range(26)
    random.shuffle(p)
    output=''
    for ch in text:
            try:
                x = ord(ch) - ord('a')
                output+=(chr(p[x] + ord('a')))
            except:
                pass
    return output, p

def get_desiredPDF_bigram(permutation):
    logp = 0
    for i in xrange(len(encrypted)-1):         
        pr = stats[chr(permutation[ord(encrypted[i])-97]+97), 
                   chr(permutation[ord(encrypted[i+1])-97]+97)]
        if pr>0:
            logp += math.log(pr)
        else:
            logp += -9 #penalty for non existant pairs
    return logp

def metropolis( desiredPDF, initValue, computableRVS, skipIterations = 1000):
    random_variable = initValue
    random_variableDensityValue = desiredPDF( random_variable )
    
    for i in xrange( skipIterations ):
        candidate = computableRVS( random_variable )
        candidateDensityValue = desiredPDF( candidate )
        acceptanceProb = min(0, candidateDensityValue - random_variableDensityValue )
        if math.log(random.random()) < acceptanceProb:
            random_variable = candidate
            random_variableDensityValue = candidateDensityValue
   
    while True:
        candidate = computableRVS( random_variable )
        candidateDensityValue = desiredPDF( candidate )
    
        acceptanceProb = min( 0, candidateDensityValue - random_variableDensityValue )
       
        if math.log(random.random()) < acceptanceProb:
            random_variable = candidate
            random_variableDensityValue = candidateDensityValue
        yield random_variable, random_variableDensityValue

def uniform( n ):
    #initialize permutation with identical
    permutation = [ i for i in xrange( n ) ]
    
    #swap ith object with random onject from i to n - 1 enclusively
    for i in xrange( n ):
        j = random.randint( i, n - 1 )
        permutation[ i ], permutation[ j ] = permutation[ j ], permutation[ i ]
        
    return permutation

def applyTransposition( basePermutation ):
    n = len( basePermutation )
    
    permutation = copy( basePermutation )
    
    #apply n random transpositions (including identical) to base permutation
#     for i in xrange( n ):
    k, l = random.randint( 0, n - 1 ), random.randint( 0, n - 1 )
    permutation[ k ], permutation[ l ] = permutation[ l ], permutation[ k ]
        
    return  permutation


def decrypt(permutation, encrypted):
    decrypted = []
    for i in encrypted:
        decrypted.append(chr(permutation[ord(i)-97]+97))
    return ''.join(decrypted)

In [6]:
fname = 'main/oliver_twist.txt'
original = read_text_words(fname, 5000)[3:]
encrypted,p = crypt(original)
print encrypted[:20]


dsfuzlafpdondfqwfzof

In [12]:
train_text = read_text_filesize('main/war_and_peace.txt',3)    
counts = get_unicount(train_text)
stats = get_bigram_stats_dic(train_text)

for k in xrange(5): 
    init_p = uniform(26)
    #Metropolis here
    st = time.time()
    computableGen = lambda t: applyTransposition(t)
    metropolisgenerator = \
    metropolis(get_desiredPDF_bigram, init_p, computableGen, 2000)

    y = -float("inf")
    for i in xrange( 1000 ): #<=========
        cand = metropolisgenerator.next() 
        if (cand[1] > y):
            y = cand[1]
            x = cand[0]

    et =  time.time() - st
    print 'metropolis time: ', et

    print 'best density among 1000 last iterations: ', y
    print 'corresponding permutation: ', x

    decrypted = decrypt(x, encrypted)
    q = quality(decrypted, original)
    print q


metropolis time:  153.507999897
best density among 1000 last iterations:  -56398.5046942
corresponding permutation:  [9, 16, 11, 19, 25, 4, 0, 3, 23, 12, 24, 14, 22, 20, 6, 2, 13, 5, 7, 21, 15, 18, 1, 8, 10, 17]
1.0
metropolis time:  152.47300005
best density among 1000 last iterations:  -56398.5046942
corresponding permutation:  [9, 16, 11, 19, 25, 4, 0, 3, 23, 12, 24, 14, 22, 20, 6, 2, 13, 5, 7, 21, 15, 18, 1, 8, 10, 17]
1.0
metropolis time:  161.621000051
best density among 1000 last iterations:  -56398.5046942
corresponding permutation:  [9, 16, 11, 19, 25, 4, 0, 3, 23, 12, 24, 14, 22, 20, 6, 2, 13, 5, 7, 21, 15, 18, 1, 8, 10, 17]
1.0
metropolis time:  156.662000179
best density among 1000 last iterations:  -56398.5046942
corresponding permutation:  [9, 16, 11, 19, 25, 4, 0, 3, 23, 12, 24, 14, 22, 20, 6, 2, 13, 5, 7, 21, 15, 18, 1, 8, 10, 17]
1.0
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-12-9c831e362cb6> in <module>()
     13    y = -float("inf")
     14    for i in xrange( 1000 ): #<=========
---> 15        cand = metropolisgenerator.next()
     16        if (cand[1] > y):
     17            y = cand[1]

<ipython-input-5-c1d614a58db1> in metropolis(desiredPDF, initValue, computableRVS, skipIterations)
     81     for i in xrange( skipIterations ):
     82         candidate = computableRVS( random_variable )
---> 83         candidateDensityValue = desiredPDF( candidate )
     84         acceptanceProb = min(0, candidateDensityValue - random_variableDensityValue )
     85         if math.log(random.random()) < acceptanceProb:

<ipython-input-5-c1d614a58db1> in get_desiredPDF_bigram(permutation)
     67     logp = 0
     68     for i in xrange(len(encrypted)-1):
---> 69         pr = stats[chr(permutation[ord(encrypted[i])-97]+97), 
     70                    chr(permutation[ord(encrypted[i+1])-97]+97)]
     71         if pr>0:

KeyboardInterrupt: 

In [13]:
train_text = read_text_filesize('main/super.txt', 3)    
counts = get_unicount(train_text)
stats = get_bigram_stats_dic(train_text)

for k in xrange(3):
    init_p = uniform(26)
#Metropolis here
    st = time.time()
    computableGen = lambda t: applyTransposition(t)
    metropolisgenerator = \
    metropolis(get_desiredPDF_bigram, init_p, computableGen, 2000)

    y = -float("inf")
    for i in xrange( 1000 ): #<=========
        cand = metropolisgenerator.next() 
        if (cand[1] > y):
            y = cand[1]
            x = cand[0]

    et =  time.time() - st
    print 'metropolis time: ', et

    print 'best density among 1000 last iterations: ', y
    print 'corresponding permutation: ', x

    decrypted = decrypt(x, encrypted)
    q = quality(decrypted, original)
    print q


metropolis time:  153.292999983
best density among 1000 last iterations:  -65548.7996672
corresponding permutation:  [9, 16, 11, 8, 25, 4, 0, 24, 23, 15, 6, 14, 22, 20, 7, 12, 19, 5, 13, 10, 2, 18, 1, 3, 21, 17]
0.538870851371
metropolis time:  157.934999943
best density among 1000 last iterations:  -58104.3291223
corresponding permutation:  [9, 16, 11, 19, 25, 4, 0, 3, 23, 12, 24, 14, 22, 20, 6, 2, 13, 5, 7, 10, 15, 18, 1, 8, 21, 17]
0.980068542569
metropolis time:  155.993000031
best density among 1000 last iterations:  -58104.3291223
corresponding permutation:  [9, 16, 11, 19, 25, 4, 0, 3, 23, 12, 24, 14, 22, 20, 6, 2, 13, 5, 7, 10, 15, 18, 1, 8, 21, 17]
0.980068542569

war and peace as training text gives better results than super.txt confirming our intuition that english used in the super.txt file is not modern english