In [3]:
import numpy as np
import math
import matplotlib.pyplot as plt
%matplotlib inline
import random
from numpy.random import rand
from copy import copy
from __future__ import division
def read_text_words(filename, wordsnumber):    
    with open(filename) as f:
        X = f.readlines()
        X = X[:wordsnumber]
    X = ''.join(X) 
    X = X.replace('\n', '') 
    return X

def read_text_whole(filename):
    with open(filename) as f:
        X = f.read()    
    X = X.replace('\n', '') 
    return X

def chop_text_to_size(text, size):
    return text[:1024*1024*size]

def read_text_filesize(filename, size):
    with open(filename) as f:
        X = f.read(1024*1024*size)
    X = X.replace('\n', '') 
    return X  
def get_unicount(text):
    length = len(text)
    counts = np.zeros(26)
    for i in xrange(length):
        c = ord(text[i])
        counts[c-97]+=1
        #97-122 
    return counts
def get_bigram_stats_dic(text):        
    length = len(text)
    dic = {}
    for i in 'abcdefghijklmnopqrstuvwxyz':
        for j in 'abcdefghijklmnopqrstuvwxyz':
            dic[(i,j)]=0

    for i in xrange(length-1):                   
        if (text[i], text[i+1]) in dic:
            dic[(text[i], text[i+1])] += 1
            
    for k,v in dic.items():        
        dic[k] = v/(counts[ord(k[0])-97])
    return dic
def quality(decrypted, original):
    l = len(decrypted)
    zipped = zip(decrypted, original)    
    return sum(1.0 for x,y in zipped if x == y)/l

def crypt(text):
    p = range(26)
    random.shuffle(p)
    output=''
    for ch in text:
            try:
                x = ord(ch) - ord('a')
                output+=(chr(p[x] + ord('a')))
            except:
                pass
    return output, p
def get_desiredPDF_bigram(permutation):
    logp = 0
    for i in xrange(len(encrypted)-1):         
        pr = stats[chr(permutation[ord(encrypted[i])-97]+97), 
                   chr(permutation[ord(encrypted[i+1])-97]+97)]
        if pr>0:
            logp += math.log(pr)
        else:
            logp += -9 #penalty for non existant pairs
    return logp

def uniform( n ):
    
    #initialize permutation with identical
    permutation = [ i for i in xrange( n ) ]
    
    #swap ith object with random onject from i to n - 1 enclusively
    for i in xrange( n ):
        j = random.randint( i, n - 1 )
        permutation[ i ], permutation[ j ] = permutation[ j ], permutation[ i ]
        
    return permutation

def applyedTranspostions( basePermutation ):
   
    n = len( basePermutation )
    
    
    permutation = copy( basePermutation )
    #apply n random transpositions (including identical) to base permutation
#     for i in xrange( n ):
    k, l = random.randint( 0, n - 1 ), random.randint( 0, n - 1 )
    permutation[ k ], permutation[ l ] = permutation[ l ], permutation[ k ]
        
    return  permutation

density maximization


In [4]:
def densityMaximization( desiredPDF, initValue, computableRVS, skipIterations = 200 ):
    """
    This function return a generator, which generates random variables 
    from some space S by trying to maximize givven density. 
    The algorithm is a modification of Metropolis-Hastings. 
    It rejects all objects, which decrease density.
    
    Args:
        desiredPDF (func) : PDF of desired distribution p( T ), where T from S
        initValue : an object from S to initialize the starting point 
        of iterative proccess
        computableRVS (func) : a generator of random value from space S 
        with given parameter T, which is also from S
        skipIterations (int) : number of iterations to skip 
        (skipping more iterations leads to better accuracy? 
        but greater time consuming)
        
    Returns: generator, which produce some values from S, 
    where each next value has no less density, and their denisity
    """
    
    random_variable = initValue
    random_variableDensityValue = desiredPDF( random_variable )
    """
    A state of MCMC
    """
    
    #ignore first iterations to let the iterative proccess to enter 
    #the high density regions
    for i in xrange( skipIterations ):
        candidate = computableRVS( random_variable )
        candidateDensityValue = desiredPDF( candidate )
        """
        next candidate for sample, generated by computableRVS
        """
        
        
        if random_variableDensityValue < candidateDensityValue:
            random_variable = candidate
            random_variableDensityValue = candidateDensityValue
            
    #now when the procces is in high density regions, 
    #return acceptable candidates
    while True:
        candidate = computableRVS( random_variable )
        candidateDensityValue = desiredPDF( candidate )
        """
        next candidate for sample, generated by computableRVS
        """
       
        if random_variableDensityValue < candidateDensityValue:
            random_variable = candidate
            random_variableDensityValue = candidateDensityValue
        yield random_variable, random_variableDensityValue

decrypt


In [5]:
def decrypt(permutation, encrypted):
    decrypted = []
    for i in encrypted:
        decrypted.append(chr(permutation[ord(i)-97]+97))
    return ''.join(decrypted)

Density maximization

various number of iterations


In [6]:
#TEST TEXT
fname = 'main/oliver_twist.txt'
original = read_text_words(fname, 5000)[3:]
#3 first symbols in oliver twist are unsupported by encryption
encrypted, p = crypt(original)
#TRAIN TEXT
train_text = read_text_whole('main/war_and_peace.txt')
counts = get_unicount(train_text)
stats = get_bigram_stats_dic(train_text)
# print stats
print p
bp = np.zeros(26, dtype=int)
for i in p:
    bp[p[i]] = i
q = get_desiredPDF_bigram(bp)
print 'inverse to permutation used in encryption ', bp
print 'its density ', q
ra = uniform(26)
q = get_desiredPDF_bigram(ra)
print 'random permutation density ', q


[25, 16, 4, 23, 24, 8, 17, 19, 6, 13, 22, 2, 1, 3, 7, 14, 12, 0, 20, 18, 21, 5, 15, 11, 9, 10]
inverse to permutation used in encryption  [17 12 11 13  2 21  8 14  5 24 25 23 16  9 15 22  1  6 19  7 18 20 10  3  4
  0]
its density  -56398.5046942
random permutation density  -117012.60994

In [9]:
import time
iterations = [250,500,1000,1500]
qs = list()
times =5
init_p = uniform(26)
for k in xrange(times):
    for it in iterations:
        st = time.time()
        computableGen = lambda t: applyedTranspostions(t)
        dmgenerator = \
            densityMaximization(get_desiredPDF_bigram, init_p, computableGen, it)

        for i in xrange( 500 ):
            x,y= dmgenerator.next() 
            
        #last one will be max
        et =  time.time() - st
        print 'cold iterations: ', it
        print 'dm time: ', et
        print 'best density among 500 last iterations: ', y
        print 'corresponding permutation: ', x
        decrypted = decrypt(x, encrypted)
        qs.append(quality(decrypted, original))
#         print 'quality: ',  
        
plt.plot(iterations, qs[:len(iterations)],
         iterations, qs[len(iterations):2*len(iterations)],
         iterations, qs[2*len(iterations):3*len(iterations)],
         iterations, qs[3*len(iterations):4*len(iterations)],
         iterations, qs[4*len(iterations):5*len(iterations)])


 cold iterations:  250
dm time:  40.5469999313
best density among 500 last iterations:  -71233.837043
corresponding permutation:  [7, 15, 11, 14, 20, 21, 18, 19, 5, 24, 16, 6, 23, 9, 2, 3, 1, 10, 8, 13, 0, 22, 25, 12, 4, 17]
cold iterations:  500
dm time:  51.9959998131
best density among 500 last iterations:  -57997.6960981
corresponding permutation:  [17, 15, 11, 13, 12, 21, 8, 14, 5, 24, 25, 23, 16, 9, 6, 22, 1, 2, 19, 7, 18, 20, 10, 3, 4, 0]
cold iterations:  1000
dm time:  79.8470001221
best density among 500 last iterations:  -57805.9432831
corresponding permutation:  [17, 18, 11, 13, 2, 21, 8, 14, 5, 24, 25, 23, 16, 9, 15, 22, 1, 6, 19, 7, 12, 20, 10, 3, 4, 0]
cold iterations:  1500
dm time:  106.698000193
best density among 500 last iterations:  -56398.5046942
corresponding permutation:  [17, 12, 11, 13, 2, 21, 8, 14, 5, 24, 25, 23, 16, 9, 15, 22, 1, 6, 19, 7, 18, 20, 10, 3, 4, 0]
cold iterations:  250
dm time:  41.2650001049
best density among 500 last iterations:  -67380.1067748
corresponding permutation:  [17, 1, 11, 13, 5, 21, 14, 19, 24, 20, 16, 23, 25, 9, 15, 7, 2, 6, 3, 12, 18, 8, 10, 22, 4, 0]
cold iterations:  500
dm time:  51.9799997807
best density among 500 last iterations:  -59816.7818808
corresponding permutation:  [17, 12, 11, 13, 2, 21, 0, 14, 5, 24, 25, 23, 16, 9, 15, 22, 1, 6, 18, 7, 19, 20, 10, 3, 4, 8]
cold iterations:  1000
dm time:  78.0910000801
best density among 500 last iterations:  -56398.5046942
corresponding permutation:  [17, 12, 11, 13, 2, 21, 8, 14, 5, 24, 25, 23, 16, 9, 15, 22, 1, 6, 19, 7, 18, 20, 10, 3, 4, 0]
cold iterations:  1500
dm time:  103.005999804
best density among 500 last iterations:  -56398.5046942
corresponding permutation:  [17, 12, 11, 13, 2, 21, 8, 14, 5, 24, 25, 23, 16, 9, 15, 22, 1, 6, 19, 7, 18, 20, 10, 3, 4, 0]
cold iterations:  250
dm time:  39.3270001411
best density among 500 last iterations:  -68324.5057939
corresponding permutation:  [17, 12, 15, 3, 10, 21, 8, 14, 5, 20, 16, 25, 23, 9, 24, 1, 7, 6, 18, 11, 19, 13, 2, 22, 4, 0]
cold iterations:  500
dm time:  51.4679999352
best density among 500 last iterations:  -65456.1539384
corresponding permutation:  [18, 12, 13, 11, 2, 21, 8, 14, 22, 24, 25, 23, 16, 9, 15, 7, 1, 6, 5, 17, 19, 20, 10, 3, 4, 0]
cold iterations:  1000
dm time:  81.0460000038
best density among 500 last iterations:  -63189.0943414
corresponding permutation:  [17, 1, 5, 13, 2, 21, 8, 18, 24, 20, 16, 23, 9, 25, 15, 22, 11, 6, 19, 7, 12, 14, 10, 3, 4, 0]
cold iterations:  1500
dm time:  108.279999971
best density among 500 last iterations:  -56398.5046942
corresponding permutation:  [17, 12, 11, 13, 2, 21, 8, 14, 5, 24, 25, 23, 16, 9, 15, 22, 1, 6, 19, 7, 18, 20, 10, 3, 4, 0]
cold iterations:  250
dm time:  39.0190000534
best density among 500 last iterations:  -64534.5032604
corresponding permutation:  [12, 6, 11, 17, 2, 21, 8, 14, 5, 24, 9, 23, 16, 25, 22, 7, 1, 15, 19, 18, 13, 20, 10, 3, 4, 0]
cold iterations:  500
dm time:  53.4149999619
best density among 500 last iterations:  -57604.0186169
corresponding permutation:  [17, 12, 11, 13, 2, 21, 8, 14, 6, 24, 16, 23, 9, 25, 15, 22, 1, 5, 19, 7, 18, 20, 10, 3, 4, 0]
cold iterations:  1000
dm time:  82.6840000153
best density among 500 last iterations:  -70679.5338583
corresponding permutation:  [3, 24, 12, 7, 5, 2, 19, 0, 15, 1, 16, 23, 25, 9, 6, 20, 10, 22, 14, 17, 11, 13, 21, 8, 4, 18]
cold iterations:  1500
dm time:  107.26699996
best density among 500 last iterations:  -57077.6393446
corresponding permutation:  [17, 15, 11, 13, 2, 21, 8, 14, 5, 24, 25, 23, 16, 9, 12, 22, 1, 6, 19, 7, 18, 20, 10, 3, 4, 0]
cold iterations:  250
dm time:  38.5279998779
best density among 500 last iterations:  -69405.2807661
corresponding permutation:  [3, 13, 1, 18, 12, 21, 14, 0, 2, 24, 16, 23, 10, 9, 6, 15, 7, 22, 19, 8, 5, 20, 25, 11, 4, 17]
cold iterations:  500
dm time:  50.5429999828
best density among 500 last iterations:  -66179.9101767
corresponding permutation:  [17, 7, 11, 18, 6, 21, 3, 14, 5, 24, 23, 25, 16, 9, 2, 12, 1, 15, 8, 13, 19, 20, 10, 22, 4, 0]
cold iterations:  1000
dm time:  79.0640001297
best density among 500 last iterations:  -56398.5046942
corresponding permutation:  [17, 12, 11, 13, 2, 21, 8, 14, 5, 24, 25, 23, 16, 9, 15, 22, 1, 6, 19, 7, 18, 20, 10, 3, 4, 0]
cold iterations:  1500
dm time:  107.544000149
best density among 500 last iterations:  -56398.5046942
corresponding permutation:  [17, 12, 11, 13, 2, 21, 8, 14, 5, 24, 25, 23, 16, 9, 15, 22, 1, 6, 19, 7, 18, 20, 10, 3, 4, 0]
Out[9]:
[<matplotlib.lines.Line2D at 0xa7795c0>,
 <matplotlib.lines.Line2D at 0xa7797f0>,
 <matplotlib.lines.Line2D at 0xa779dd8>,
 <matplotlib.lines.Line2D at 0xa788390>,
 <matplotlib.lines.Line2D at 0xa788908>]

In [ ]: