read, statistics, quality, crypt, decrypt, Metropolis, likelihood functions:


In [6]:
import numpy as np
import math
import matplotlib.pyplot as plt
%matplotlib inline
import random
from numpy.random import rand
from copy import copy
from __future__ import division
import time

def read_text_whole(filename):
    with open(filename, 'r') as f:
        X = ''.join(f.readlines()).replace('\n', '')
    return X

def read_text_words(filename, wordsnumber):    
    with open(filename) as f:
        X = f.readlines()
        X = X[:wordsnumber]
    X = ''.join(X) 
    X = X.replace('\n', '') 
    return X

def read_text_filesize(filename, size):
    with open(filename) as f:
        X = f.read(1024*1024*size)
    X = X.replace('\n', '')
    return X

def get_unicount(text):
    length = len(text)
    counts = np.zeros(26)
    for i in xrange(length):
        c = ord(text[i])
        counts[c-97]+=1
        #97-122 
    return counts

def get_bigram_stats_dic(text):        
    length = len(text)
    dic = {}
    for i in 'abcdefghijklmnopqrstuvwxyz':
        for j in 'abcdefghijklmnopqrstuvwxyz':
            dic[(i,j)]=0

    for i in xrange(length-1):                   
        if (text[i], text[i+1]) in dic:
            dic[(text[i], text[i+1])] += 1
            
    for k,v in dic.items():        
        dic[k] = v/(counts[ord(k[0])-97])
    return dic

def quality(decrypted, original):
    l = len(decrypted)
    zipped = zip(decrypted, original)    
    return sum(1.0 for x,y in zipped if x == y)/l

def crypt(text):
    p = range(26)
    random.shuffle(p)
    output=''
    for ch in text:
            try:
                x = ord(ch) - ord('a')
                output+=(chr(p[x] + ord('a')))
            except:
                pass
    return output, p

def get_desiredPDF_bigram(permutation):
    logp = 0
    for i in xrange(len(encrypted)-1):         
        pr = stats[chr(permutation[ord(encrypted[i])-97]+97), 
                   chr(permutation[ord(encrypted[i+1])-97]+97)]
        if pr>0:
            logp += math.log(pr)
        else:
            logp += -9 #penalty for non existant pairs
    return logp

def metropolis( desiredPDF, initValue, computableRVS, skipIterations = 1000):
    random_variable = initValue
    random_variableDensityValue = desiredPDF( random_variable )
    
    for i in xrange( skipIterations ):
        candidate = computableRVS( random_variable )
        candidateDensityValue = desiredPDF( candidate )
        acceptanceProb = min(0, candidateDensityValue - random_variableDensityValue )
        if math.log(random.random()) < acceptanceProb:
            random_variable = candidate
            random_variableDensityValue = candidateDensityValue
   
    while True:
        candidate = computableRVS( random_variable )
        candidateDensityValue = desiredPDF( candidate )
    
        acceptanceProb = min( 0, candidateDensityValue - random_variableDensityValue )
       
        if math.log(random.random()) < acceptanceProb:
            random_variable = candidate
            random_variableDensityValue = candidateDensityValue
        yield random_variable, random_variableDensityValue

def uniform( n ):
    #initialize permutation with identical
    permutation = [ i for i in xrange( n ) ]
    
    #swap ith object with random onject from i to n - 1 enclusively
    for i in xrange( n ):
        j = random.randint( i, n - 1 )
        permutation[ i ], permutation[ j ] = permutation[ j ], permutation[ i ]
        
    return permutation

def applyTransposition( basePermutation ):
    n = len( basePermutation )
    
    permutation = copy( basePermutation )
    
    #apply n random transpositions (including identical) to base permutation
#     for i in xrange( n ):
    k, l = random.randint( 0, n - 1 ), random.randint( 0, n - 1 )
    permutation[ k ], permutation[ l ] = permutation[ l ], permutation[ k ]
        
    return  permutation

def decrypt(permutation, encrypted):
    decrypted = []
    for i in encrypted:
        decrypted.append(chr(permutation[ord(i)-97]+97))
    return ''.join(decrypted)

Task 4:

crypt with given permutation:


In [7]:
def crypt_given(permutation, text):    
    output=''
#     p.append(26)
    for ch in text:
            try:
                x = ord(ch) - ord('a')
                output+=(chr(permutation[x] + ord('a')))
            except:
                pass
    return output

fixed training text (War and Peace):


In [9]:
train_text = read_text_whole('main/war_and_peace.txt')
counts = get_unicount(train_text)
stats = get_bigram_stats_dic(train_text)

vary size of train text (used War and Peace, total 3Mb)


In [12]:
sizes =  [1000,2000,4000,8000,16000,32000,64000,128000]
qs = list()
fixed_permutation = uniform(26)
for s in sizes:   
    print 'size of encrypted text: ',s
    original = read_text_words('main/oliver_twist.txt', s)[3:]
    encrypted = crypt_given(fixed_permutation, original)
    init_p = uniform(26)
    #Metropolis here
    st = time.time()
    computableGen = lambda t: applyTransposition(t)
    metropolisgenerator = \
        metropolis(get_desiredPDF_bigram, init_p, computableGen, 1500)
        
    y = float('-inf')
    for i in xrange( 1000 ):
        cand = metropolisgenerator.next() 
        if (cand[1] > y):
            y = cand[1]
            x = cand[0]

    et =  time.time() - st
    print 'metropolis time: ', et
        
    print 'best density among 1000 last iterations: ', y
    print 'corresponding permutation: ', x
        
    decrypted = decrypt(x, encrypted)
    qs.append(quality(decrypted, original))

plt.plot(sizes[:len(qs)], qs)


size of encrypted text:  1000
metropolis time:  35.6748039722
best density among 1000 last iterations:  -12634.9801002
corresponding permutation:  [22, 24, 7, 10, 5, 13, 2, 17, 1, 19, 16, 8, 21, 23, 20, 25, 0, 4, 18, 11, 14, 9, 12, 3, 6, 15]
size of encrypted text:  2000
metropolis time:  70.1986711025
best density among 1000 last iterations:  -23482.4403826
corresponding permutation:  [22, 24, 7, 5, 23, 13, 2, 17, 1, 19, 16, 8, 21, 25, 20, 10, 0, 4, 18, 11, 14, 9, 12, 3, 6, 15]
size of encrypted text:  4000
metropolis time:  132.329380989
best density among 1000 last iterations:  -55905.3265833
corresponding permutation:  [6, 10, 13, 5, 23, 7, 22, 17, 1, 8, 9, 3, 12, 16, 0, 21, 19, 4, 18, 11, 14, 25, 15, 24, 20, 2]
size of encrypted text:  8000
metropolis time:  261.615154028
best density among 1000 last iterations:  -89693.449803
corresponding permutation:  [22, 24, 7, 5, 23, 13, 2, 17, 1, 19, 16, 8, 21, 25, 20, 10, 0, 4, 18, 11, 14, 9, 12, 3, 6, 15]
size of encrypted text:  16000
metropolis time:  558.243676901
best density among 1000 last iterations:  -177140.316335
corresponding permutation:  [22, 24, 7, 5, 23, 13, 2, 17, 1, 19, 16, 8, 21, 25, 20, 10, 0, 4, 18, 11, 14, 9, 12, 3, 6, 15]
size of encrypted text:  32000
metropolis time:  1014.72146583
best density among 1000 last iterations:  -349751.654302
corresponding permutation:  [22, 24, 7, 5, 23, 13, 2, 17, 1, 19, 16, 8, 21, 25, 20, 10, 0, 4, 18, 11, 14, 9, 12, 3, 6, 15]
size of encrypted text:  64000
metropolis time:  2261.47678995
best density among 1000 last iterations:  -699311.590259
corresponding permutation:  [22, 24, 7, 5, 23, 13, 2, 17, 1, 19, 16, 8, 10, 25, 20, 21, 0, 4, 18, 11, 14, 9, 12, 3, 6, 15]
size of encrypted text:  128000
metropolis time:  4571.27480602
best density among 1000 last iterations:  -1381820.9484
corresponding permutation:  [22, 24, 7, 5, 23, 13, 2, 17, 1, 19, 16, 8, 21, 25, 20, 10, 0, 4, 18, 11, 14, 9, 12, 3, 6, 15]
Out[12]:
[<matplotlib.lines.Line2D at 0xad9da02c>]

compare war and peace and shakespeare as training sets


In [ ]: