read, statistics, quality, crypt, decrypt, Metropolis, likelihood functions:


In [5]:
import numpy as np
import math
import matplotlib.pyplot as plt
%matplotlib inline
import random
from numpy.random import rand
from copy import copy
from __future__ import division
import time

def read_text_words(filename, wordsnumber):    
    with open(filename) as f:
        X = f.readlines()
        X = X[:wordsnumber]
    X = ''.join(X) 
    X = X.replace('\n', '') 
    return X
def read_text_filesize(filename, size):
    with open(filename) as f:
        X = f.read(int(round(1024*1024*size)))
    X = X.replace('\n', '')
    return X

def get_unigram_stats(text):
    length = len(text)
    stats = np.zeros(27)
    for i in xrange(length):
        c = ord(text[i])
        stats[c-97]+=1
    stats = stats[:26]
    return stats/length

def get_desiredPDF_unigram(permutation):
    logp = 0
    for i in xrange(len(encrypted)):
        if ord(encrypted[i]) != 123:            
            logp += math.log(stats[permutation[ord(encrypted[i])-97]])
    return logp

def quality(decrypted, original):
    l = len(decrypted)
    zipped = zip(decrypted, original)    
    return sum(1.0 for x,y in zipped if x == y)/l

def crypt(text):
    p = range(26)
    random.shuffle(p)
    output=''
    for ch in text:
            try:
                x = ord(ch) - ord('a')
                output+=(chr(p[x] + ord('a')))
            except:
                pass
    return output, p

def metropolis( desiredPDF, initValue, computableRVS, skipIterations = 1000):
    random_variable = initValue
    random_variableDensityValue = desiredPDF( random_variable )
    
    for i in xrange( skipIterations ):
        candidate = computableRVS( random_variable )
        candidateDensityValue = desiredPDF( candidate )
        acceptanceProb = min(0, candidateDensityValue - random_variableDensityValue )
        if math.log(random.random()) < acceptanceProb:
            random_variable = candidate
            random_variableDensityValue = candidateDensityValue
   
    while True:
        candidate = computableRVS( random_variable )
        candidateDensityValue = desiredPDF( candidate )
    
        acceptanceProb = min( 0, candidateDensityValue - random_variableDensityValue )
       
        if math.log(random.random()) < acceptanceProb:
            random_variable = candidate
            random_variableDensityValue = candidateDensityValue
        yield random_variable, random_variableDensityValue

def uniform( n ):
    #initialize permutation with identical
    permutation = [ i for i in xrange( n ) ]
    
    #swap ith object with random onject from i to n - 1 enclusively
    for i in xrange( n ):
        j = random.randint( i, n - 1 )
        permutation[ i ], permutation[ j ] = permutation[ j ], permutation[ i ]
        
    return permutation

def applyTransposition( basePermutation ):
    n = len( basePermutation )
    
    permutation = copy( basePermutation )
    
    #apply n random transpositions (including identical) to base permutation
#     for i in xrange( n ):
    k, l = random.randint( 0, n - 1 ), random.randint( 0, n - 1 )
    permutation[ k ], permutation[ l ] = permutation[ l ], permutation[ k ]
        
    return  permutation

def decrypt(permutation, encrypted):
    decrypted = []
    for i in encrypted:
        decrypted.append(chr(permutation[ord(i)-97]+97))
    return ''.join(decrypted)

Task 4:

crypt with given permutation:


In [3]:
def crypt_given(permutation, text):    
    output=''
    for ch in text:
            try:
                x = ord(ch) - ord('a')
                output+=(chr(permutation[x] + ord('a')))
            except:
                pass
    return output

fixed training text (War and Peace):


In [6]:
train_text = read_text_filesize('main/super.txt',64)
stats = get_unigram_stats(train_text)

vary size of train text (used War and Peace, total 3Mb)


In [7]:
sizes =  [100,200,400,800,1000,2000,4000,8000,16000]
qs = list()
fixed_permutation = uniform(26)
for s in sizes:   
    print 'Start at', time.strftime('%H:%M:%S', time.localtime())
    print 'size of encrypted text: ',s
    original = read_text_words('main/oliver_twist.txt', s)[3:]
    encrypted = crypt_given(fixed_permutation, original)
    init_p = uniform(26)
    #Metropolis here
    st = time.time()
    computableGen = lambda t: applyTransposition(t)
    metropolisgenerator = \
        metropolis(get_desiredPDF_unigram, init_p, computableGen, 1500)
        
    y = -float("inf")
    for i in xrange( 1000 ):
        cand = metropolisgenerator.next() 
        if (cand[1] > y):
            y = cand[1]
            x = cand[0]

    et =  time.time() - st
    print 'metropolis time: ', et
        
    print 'best density among 1000 last iterations: ', y
    print 'corresponding permutation: ', x
        
    decrypted = decrypt(x, encrypted)
    qs.append(quality(decrypted, original))
    print 'End at  ', time.strftime('%H:%M:%S', time.localtime())
    

plt.plot(sizes[:len(qs)], qs)


Start at 09:53:26
size of encrypted text:  100
metropolis time:  0.960999965668
best density among 1000 last iterations:  -1413.33443353
corresponding permutation:  [5, 20, 1, 17, 0, 13, 18, 10, 3, 7, 22, 9, 14, 11, 2, 16, 12, 21, 8, 19, 6, 24, 25, 23, 15, 4]
End at   09:53:27
Start at 09:53:27
size of encrypted text:  200
metropolis time:  1.85600018501
best density among 1000 last iterations:  -2751.40353007
corresponding permutation:  [24, 17, 21, 11, 8, 13, 18, 10, 22, 7, 1, 16, 0, 3, 20, 25, 2, 15, 14, 19, 5, 12, 9, 23, 6, 4]
End at   09:53:29
Start at 09:53:29
size of encrypted text:  400
metropolis time:  3.97799992561
best density among 1000 last iterations:  -5670.04772721
corresponding permutation:  [1, 17, 10, 11, 14, 0, 13, 15, 24, 8, 21, 23, 18, 12, 3, 25, 22, 5, 7, 19, 2, 20, 16, 9, 6, 4]
End at   09:53:33
Start at 09:53:33
size of encrypted text:  800
metropolis time:  7.41499996185
best density among 1000 last iterations:  -11383.665151
corresponding permutation:  [24, 17, 23, 11, 18, 19, 14, 12, 21, 8, 10, 6, 13, 15, 3, 25, 2, 1, 7, 0, 20, 5, 16, 9, 22, 4]
End at   09:53:41
Start at 09:53:41
size of encrypted text:  1000
metropolis time:  9.10099983215
best density among 1000 last iterations:  -14012.9283499
corresponding permutation:  [15, 17, 23, 3, 18, 19, 14, 12, 21, 13, 10, 1, 8, 24, 11, 25, 2, 6, 7, 0, 20, 5, 16, 9, 22, 4]
End at   09:53:50
Start at 09:53:50
size of encrypted text:  2000
metropolis time:  17.381000042
best density among 1000 last iterations:  -26730.7713568
corresponding permutation:  [6, 17, 23, 11, 7, 0, 8, 2, 24, 14, 21, 10, 13, 15, 20, 25, 12, 22, 18, 19, 3, 5, 9, 16, 1, 4]
End at   09:54:07
Start at 09:54:07
size of encrypted text:  4000
metropolis time:  34.1340000629
best density among 1000 last iterations:  -52291.5321111
corresponding permutation:  [22, 7, 10, 11, 18, 13, 8, 2, 15, 0, 1, 23, 14, 24, 20, 25, 12, 6, 17, 19, 3, 5, 9, 16, 21, 4]
End at   09:54:41
Start at 09:54:41
size of encrypted text:  8000
metropolis time:  68.8310000896
best density among 1000 last iterations:  -102975.449752
corresponding permutation:  [6, 18, 10, 11, 7, 13, 8, 20, 15, 0, 1, 23, 14, 24, 12, 25, 2, 22, 17, 19, 3, 5, 9, 16, 21, 4]
End at   09:55:50
Start at 09:55:50
size of encrypted text:  16000
metropolis time:  138.506000042
best density among 1000 last iterations:  -203705.21404
corresponding permutation:  [15, 18, 10, 11, 7, 13, 8, 20, 6, 0, 24, 23, 14, 1, 5, 25, 12, 22, 17, 19, 3, 2, 9, 16, 21, 4]
End at   09:58:09
Out[7]:
[<matplotlib.lines.Line2D at 0xa906630>]

In [15]:
plt.title('Dependence quality on size of tests')
plt.xlabel('sizes')
plt.ylabel('quality')
plt.plot(sizes[:len(qs)], qs) 
plt.savefig('task4 unigram shapespeare.png')



In [ ]: