In [5]:
    
import numpy as np
import math
import matplotlib.pyplot as plt
%matplotlib inline
import random
from numpy.random import rand
from copy import copy
from __future__ import division
import time
def read_text_words(filename, wordsnumber):    
    with open(filename) as f:
        X = f.readlines()
        X = X[:wordsnumber]
    X = ''.join(X) 
    X = X.replace('\n', '') 
    return X
def read_text_filesize(filename, size):
    with open(filename) as f:
        X = f.read(int(round(1024*1024*size)))
    X = X.replace('\n', '')
    return X
def get_unigram_stats(text):
    length = len(text)
    stats = np.zeros(27)
    for i in xrange(length):
        c = ord(text[i])
        stats[c-97]+=1
    stats = stats[:26]
    return stats/length
def get_desiredPDF_unigram(permutation):
    logp = 0
    for i in xrange(len(encrypted)):
        if ord(encrypted[i]) != 123:            
            logp += math.log(stats[permutation[ord(encrypted[i])-97]])
    return logp
def quality(decrypted, original):
    l = len(decrypted)
    zipped = zip(decrypted, original)    
    return sum(1.0 for x,y in zipped if x == y)/l
def crypt(text):
    p = range(26)
    random.shuffle(p)
    output=''
    for ch in text:
            try:
                x = ord(ch) - ord('a')
                output+=(chr(p[x] + ord('a')))
            except:
                pass
    return output, p
def metropolis( desiredPDF, initValue, computableRVS, skipIterations = 1000):
    random_variable = initValue
    random_variableDensityValue = desiredPDF( random_variable )
    
    for i in xrange( skipIterations ):
        candidate = computableRVS( random_variable )
        candidateDensityValue = desiredPDF( candidate )
        acceptanceProb = min(0, candidateDensityValue - random_variableDensityValue )
        if math.log(random.random()) < acceptanceProb:
            random_variable = candidate
            random_variableDensityValue = candidateDensityValue
   
    while True:
        candidate = computableRVS( random_variable )
        candidateDensityValue = desiredPDF( candidate )
    
        acceptanceProb = min( 0, candidateDensityValue - random_variableDensityValue )
       
        if math.log(random.random()) < acceptanceProb:
            random_variable = candidate
            random_variableDensityValue = candidateDensityValue
        yield random_variable, random_variableDensityValue
def uniform( n ):
    #initialize permutation with identical
    permutation = [ i for i in xrange( n ) ]
    
    #swap ith object with random onject from i to n - 1 enclusively
    for i in xrange( n ):
        j = random.randint( i, n - 1 )
        permutation[ i ], permutation[ j ] = permutation[ j ], permutation[ i ]
        
    return permutation
def applyTransposition( basePermutation ):
    n = len( basePermutation )
    
    permutation = copy( basePermutation )
    
    #apply n random transpositions (including identical) to base permutation
#     for i in xrange( n ):
    k, l = random.randint( 0, n - 1 ), random.randint( 0, n - 1 )
    permutation[ k ], permutation[ l ] = permutation[ l ], permutation[ k ]
        
    return  permutation
def decrypt(permutation, encrypted):
    decrypted = []
    for i in encrypted:
        decrypted.append(chr(permutation[ord(i)-97]+97))
    return ''.join(decrypted)
    
crypt with given permutation:
In [3]:
    
def crypt_given(permutation, text):    
    output=''
    for ch in text:
            try:
                x = ord(ch) - ord('a')
                output+=(chr(permutation[x] + ord('a')))
            except:
                pass
    return output
    
fixed training text (War and Peace):
In [6]:
    
train_text = read_text_filesize('main/super.txt',64)
stats = get_unigram_stats(train_text)
    
vary size of train text (used War and Peace, total 3Mb)
In [7]:
    
sizes =  [100,200,400,800,1000,2000,4000,8000,16000]
qs = list()
fixed_permutation = uniform(26)
for s in sizes:   
    print 'Start at', time.strftime('%H:%M:%S', time.localtime())
    print 'size of encrypted text: ',s
    original = read_text_words('main/oliver_twist.txt', s)[3:]
    encrypted = crypt_given(fixed_permutation, original)
    init_p = uniform(26)
    #Metropolis here
    st = time.time()
    computableGen = lambda t: applyTransposition(t)
    metropolisgenerator = \
        metropolis(get_desiredPDF_unigram, init_p, computableGen, 1500)
        
    y = -float("inf")
    for i in xrange( 1000 ):
        cand = metropolisgenerator.next() 
        if (cand[1] > y):
            y = cand[1]
            x = cand[0]
    et =  time.time() - st
    print 'metropolis time: ', et
        
    print 'best density among 1000 last iterations: ', y
    print 'corresponding permutation: ', x
        
    decrypted = decrypt(x, encrypted)
    qs.append(quality(decrypted, original))
    print 'End at  ', time.strftime('%H:%M:%S', time.localtime())
    
plt.plot(sizes[:len(qs)], qs)
    
    
    Out[7]:
    
In [15]:
    
plt.title('Dependence quality on size of tests')
plt.xlabel('sizes')
plt.ylabel('quality')
plt.plot(sizes[:len(qs)], qs) 
plt.savefig('task4 unigram shapespeare.png')
    
    
In [ ]: