read, statistics, quality, crypt, decrypt, Metropolis, likelihood functions:


In [1]:
import numpy as np
import math
import matplotlib.pyplot as plt
%matplotlib inline
import random
from numpy.random import rand
from copy import copy
from __future__ import division
import time

def read_text_words(filename, wordsnumber):    
    with open(filename) as f:
        X = f.readlines()
        X = X[:wordsnumber]
    X = ''.join(X) 
    X = X.replace('\n', '') 
    return X

def read_text_whole(filename):
    with open(filename) as f:
        X = f.read()    
    X = X.replace('\n', '') 
    return X

def get_unigram_stats(text):
    length = len(text)
    stats = np.zeros(27)
    for i in xrange(length):
        c = ord(text[i])
        stats[c-97]+=1
    stats = stats[:26]
    return stats/length

def get_desiredPDF_unigram(permutation):
    logp = 0
    for i in xrange(len(encrypted)):
        if ord(encrypted[i]) != 123:            
            logp += math.log(stats[permutation[ord(encrypted[i])-97]])
    return logp

def quality(decrypted, original):
    l = len(decrypted)
    zipped = zip(decrypted, original)    
    return sum(1.0 for x,y in zipped if x == y)/l

def crypt(text):
    p = range(26)
    random.shuffle(p)
    output=''
    for ch in text:
            try:
                x = ord(ch) - ord('a')
                output+=(chr(p[x] + ord('a')))
            except:
                pass
    return output, p

def metropolis( desiredPDF, initValue, computableRVS, skipIterations = 1000):
    random_variable = initValue
    random_variableDensityValue = desiredPDF( random_variable )
    
    for i in xrange( skipIterations ):
        candidate = computableRVS( random_variable )
        candidateDensityValue = desiredPDF( candidate )
        acceptanceProb = min(0, candidateDensityValue - random_variableDensityValue )
        if math.log(random.random()) < acceptanceProb:
            random_variable = candidate
            random_variableDensityValue = candidateDensityValue
   
    while True:
        candidate = computableRVS( random_variable )
        candidateDensityValue = desiredPDF( candidate )
    
        acceptanceProb = min( 0, candidateDensityValue - random_variableDensityValue )
       
        if math.log(random.random()) < acceptanceProb:
            random_variable = candidate
            random_variableDensityValue = candidateDensityValue
        yield random_variable, random_variableDensityValue

def uniform( n ):
    #initialize permutation with identical
    permutation = [ i for i in xrange( n ) ]
    
    #swap ith object with random onject from i to n - 1 enclusively
    for i in xrange( n ):
        j = random.randint( i, n - 1 )
        permutation[ i ], permutation[ j ] = permutation[ j ], permutation[ i ]
        
    return permutation

def applyTransposition( basePermutation ):
    n = len( basePermutation )
    
    permutation = copy( basePermutation )
    
    #apply n random transpositions (including identical) to base permutation
#     for i in xrange( n ):
    k, l = random.randint( 0, n - 1 ), random.randint( 0, n - 1 )
    permutation[ k ], permutation[ l ] = permutation[ l ], permutation[ k ]
        
    return  permutation

def decrypt(permutation, encrypted):
    decrypted = []
    for i in encrypted:
        decrypted.append(chr(permutation[ord(i)-97]+97))
    return ''.join(decrypted)

Task 4:

crypt with given permutation:


In [2]:
def crypt_given(permutation, text):    
    output=''
    for ch in text:
            try:
                x = ord(ch) - ord('a')
                output+=(chr(permutation[x] + ord('a')))
            except:
                pass
    return output

fixed training text (War and Peace):


In [3]:
train_text = read_text_whole('main/war_and_peace.txt')
stats = get_unigram_stats(train_text)

vary size of train text (used War and Peace, total 3Mb)


In [4]:
sizes =  [100,200,400,800,1000,2000,4000,8000,16000]
qs = list()
fixed_permutation = uniform(26)
for s in sizes:   
    print 'size of encrypted text: ',s
    original = read_text_words('main/oliver_twist.txt', s)[3: ]
    encrypted = crypt_given(fixed_permutation, original)
    init_p = uniform(26)
    #Metropolis here
    st = time.time()
    computableGen = lambda t: applyTransposition(t)
    metropolisgenerator = \
        metropolis(get_desiredPDF_unigram, init_p, computableGen, 1500)
        
    y = -float("inf")
    for i in xrange( 1000 ):
        cand = metropolisgenerator.next() 
        if (cand[1] > y):
            y = cand[1]
            x = cand[0]

    et =  time.time() - st
    print 'metropolis time: ', et
        
    print 'best density among 1000 last iterations: ', y
    print 'corresponding permutation: ', x
        
    decrypted = decrypt(x, encrypted)
    qs.append(quality(decrypted, original))
    

plt.plot(sizes[:len(qs)], qs)


size of encrypted text:  100
metropolis time:  0.913999795914
best density among 1000 last iterations:  -1413.88440115
corresponding permutation:  [20, 25, 1, 15, 0, 11, 23, 16, 24, 4, 14, 21, 17, 3, 18, 19, 12, 6, 7, 13, 22, 10, 2, 9, 8, 5]
size of encrypted text:  200
metropolis time:  1.85299992561
best density among 1000 last iterations:  -2749.50046625
corresponding permutation:  [20, 9, 15, 2, 7, 3, 23, 16, 10, 4, 14, 6, 18, 11, 17, 19, 5, 21, 0, 8, 12, 1, 22, 25, 13, 24]
size of encrypted text:  400
metropolis time:  4.04399991035
best density among 1000 last iterations:  -5670.32163758
corresponding permutation:  [5, 16, 6, 22, 13, 11, 9, 23, 10, 4, 14, 15, 8, 20, 17, 19, 2, 21, 0, 18, 3, 24, 12, 25, 7, 1]
size of encrypted text:  800
metropolis time:  7.37300014496
best density among 1000 last iterations:  -11384.142993
corresponding permutation:  [21, 16, 12, 20, 18, 3, 9, 5, 23, 4, 13, 1, 14, 15, 17, 0, 22, 24, 19, 7, 11, 6, 2, 25, 8, 10]
size of encrypted text:  1000
metropolis time:  9.06599998474
best density among 1000 last iterations:  -14007.5137695
corresponding permutation:  [21, 9, 5, 20, 7, 3, 25, 1, 23, 4, 13, 6, 14, 24, 17, 0, 22, 15, 19, 18, 11, 2, 12, 16, 8, 10]
size of encrypted text:  2000
metropolis time:  17.1050000191
best density among 1000 last iterations:  -26729.7564683
corresponding permutation:  [24, 16, 21, 3, 18, 11, 25, 10, 23, 4, 13, 5, 8, 15, 17, 19, 22, 6, 0, 7, 20, 12, 2, 9, 14, 1]
size of encrypted text:  4000
metropolis time:  33.0810000896
best density among 1000 last iterations:  -52295.7330423
corresponding permutation:  [24, 9, 21, 3, 18, 11, 16, 23, 10, 4, 14, 6, 8, 15, 7, 19, 22, 5, 13, 17, 20, 2, 12, 25, 0, 1]
size of encrypted text:  8000
metropolis time:  66.6210000515
best density among 1000 last iterations:  -102989.364202
corresponding permutation:  [15, 9, 21, 3, 18, 11, 25, 23, 10, 4, 14, 5, 8, 24, 7, 19, 22, 6, 13, 17, 2, 20, 12, 16, 0, 1]
size of encrypted text:  16000
metropolis time:  131.217000008
best density among 1000 last iterations:  -203731.260509
corresponding permutation:  [6, 9, 21, 3, 18, 11, 25, 23, 10, 4, 14, 5, 8, 1, 7, 19, 12, 24, 13, 17, 22, 20, 2, 16, 0, 15]
Out[4]:
[<matplotlib.lines.Line2D at 0xa732b00>]

In [12]:
means = np.mean(Q, 0)
stds = np.std(Q, 0)
print(means)
print(stds)
plt.title('Dependence quality on size of tests')
plt.xlabel('sizes')
plt.ylabel('quality')
plt.plot(sizes[:len(qs)], qs) 
plt.savefig('task-4-unigram-daniel.png')


[ 0.36494845  0.33123028  0.40143737  0.14322514  0.22257053  0.37158768
  0.55831196  0.60930088  0.56178157]
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.]

In [9]:
qs


Out[9]:
[0.3649484536082474,
 0.3312302839116719,
 0.40143737166324434,
 0.1432251416795466,
 0.2225705329153605,
 0.37158768290019656,
 0.5583119634295908,
 0.6093008809577592,
 0.5617815682135828]

In [ ]: