read, statistics, quality, crypt, decrypt, Metropolis, likelihood functions:


In [6]:
import numpy as np
import math
import matplotlib.pyplot as plt
%matplotlib inline
import random
from numpy.random import rand
from copy import copy
from __future__ import division
import time

def read_text_words(filename, wordsnumber):    
    with open(filename) as f:
        X = f.readlines()
        X = X[:wordsnumber]
    X = ''.join(X) 
    X = X.replace('\n', '') 
    return X

def read_text_filesize(filename, size):
    with open(filename) as f:
        X = f.read(int(round(1024*1024*size)))
    X = X.replace('\n', '')
    return X

def get_unigram_stats(text):
    length = len(text)
    stats = np.zeros(26)
    for i in xrange(length):
        c = ord(text[i])
        stats[c-97]+=1
        #97-122
    return stats/(length)

def get_desiredPDF_unigram(permutation):
    logp = 0
    for i in xrange(len(encrypted)):
        if ord(encrypted[i]) != 123:            
            logp += math.log(stats[permutation[ord(encrypted[i])-97]])
    return logp

def quality(decrypted, original):
    l = len(decrypted)
    zipped = zip(decrypted, original)    
    return sum(1.0 for x,y in zipped if x == y)/l

def crypt(text):
    p = range(26)
    random.shuffle(p)
    output=''
    for ch in text:
            try:
                x = ord(ch) - ord('a')
                output+=(chr(p[x] + ord('a')))
            except:
                pass
    return output, p

def metropolis( desiredPDF, initValue, computableRVS, skipIterations = 1000):
    random_variable = initValue
    random_variableDensityValue = desiredPDF( random_variable )
    
    for i in xrange( skipIterations ):
        candidate = computableRVS( random_variable )
        candidateDensityValue = desiredPDF( candidate )
        acceptanceProb = min(0, candidateDensityValue - random_variableDensityValue )
        if math.log(random.random()) < acceptanceProb:
            random_variable = candidate
            random_variableDensityValue = candidateDensityValue
   
    while True:
        candidate = computableRVS( random_variable )
        candidateDensityValue = desiredPDF( candidate )
    
        acceptanceProb = min( 0, candidateDensityValue - random_variableDensityValue )
       
        if math.log(random.random()) < acceptanceProb:
            random_variable = candidate
            random_variableDensityValue = candidateDensityValue
        yield random_variable, random_variableDensityValue

def uniform( n ):
    #initialize permutation with identical
    permutation = [ i for i in xrange( n ) ]
    
    #swap ith object with random onject from i to n - 1 enclusively
    for i in xrange( n ):
        j = random.randint( i, n - 1 )
        permutation[ i ], permutation[ j ] = permutation[ j ], permutation[ i ]
        
    return permutation

def applyTransposition( basePermutation ):
    n = len( basePermutation )
    
    permutation = copy( basePermutation )
    
    #apply n random transpositions (including identical) to base permutation
#     for i in xrange( n ):
    k, l = random.randint( 0, n - 1 ), random.randint( 0, n - 1 )
    permutation[ k ], permutation[ l ] = permutation[ l ], permutation[ k ]
        
    return  permutation


def decrypt(permutation, encrypted):
    decrypted = []
    for i in encrypted:
        decrypted.append(chr(permutation[ord(i)-97]+97))
    return ''.join(decrypted)

Task 3:

fixed test text:


In [2]:
fname = 'main/oliver_twist.txt'
original = read_text_words(fname, 5000)[3:]
encrypted,p = crypt(original)
print encrypted[:20]


uemsvybmwufaumdhmvfm

vary size of train text (used War and Peace, total 3Mb)


In [7]:
sizes =  [0.5,1.75,3]
qs=[]
times = 4
for k in xrange(times):
    for s in sizes: 
        train_text = read_text_filesize('main/war_and_peace.txt', s)        
        stats = get_unigram_stats(train_text)
        init_p = uniform(26)
        #Metropolis here
        st = time.time()
        computableGen = lambda t: applyTransposition(t)
        metropolisgenerator = \
            metropolis(get_desiredPDF_unigram, init_p, computableGen, 2500)

        y = -float("inf")
        for i in xrange( 500 ): #<=========
            cand = metropolisgenerator.next() 
            if (cand[1] > y):
                y = cand[1]
                x = cand[0]

        et =  time.time() - st
        print 'metropolis time: ', et

        print 'best density among 500 last iterations: ', y
        print 'corresponding permutation: ', x

        decrypted = decrypt(x, encrypted)
        qs.append(quality(decrypted, original))

plt.plot(sizes[:len(qs)], qs[:len(sizes)],
         sizes[:len(qs)], qs[len(sizes):2*len(sizes)],
         sizes[:len(qs)], qs[2*len(sizes):3*len(sizes)],
         sizes[:len(qs)], qs[3*len(sizes):4*len(sizes)],)


metropolis time:  57.0709998608
best density among 500 last iterations:  -64567.8837648
corresponding permutation:  [12, 25, 9, 8, 18, 24, 17, 5, 10, 2, 3, 1, 4, 23, 16, 11, 6, 21, 15, 20, 19, 7, 22, 13, 14, 0]
metropolis time:  58.367000103
best density among 500 last iterations:  -64581.9989541
corresponding permutation:  [12, 16, 9, 8, 18, 1, 17, 24, 10, 22, 3, 15, 4, 23, 25, 11, 5, 21, 6, 20, 19, 7, 2, 13, 14, 0]
metropolis time:  56.8069999218
best density among 500 last iterations:  -64581.7243843
corresponding permutation:  [20, 25, 9, 8, 18, 15, 17, 6, 10, 22, 3, 1, 4, 23, 16, 11, 5, 21, 24, 2, 19, 7, 12, 13, 14, 0]
metropolis time:  54.2109999657
best density among 500 last iterations:  -64567.4881177
corresponding permutation:  [20, 16, 9, 8, 18, 24, 17, 6, 10, 22, 3, 1, 4, 23, 25, 11, 5, 21, 15, 12, 19, 7, 2, 13, 14, 0]
metropolis time:  56.5490000248
best density among 500 last iterations:  -64580.1999932
corresponding permutation:  [20, 16, 9, 8, 18, 15, 17, 6, 10, 2, 3, 1, 4, 23, 25, 11, 5, 21, 24, 12, 19, 7, 22, 13, 14, 0]
metropolis time:  54.9630000591
best density among 500 last iterations:  -64581.9348514
corresponding permutation:  [12, 16, 9, 8, 18, 15, 17, 24, 10, 22, 3, 1, 4, 23, 25, 11, 5, 21, 6, 20, 19, 7, 2, 13, 14, 0]
metropolis time:  55.8289999962
best density among 500 last iterations:  -64566.8781372
corresponding permutation:  [20, 16, 9, 8, 18, 15, 17, 6, 10, 22, 3, 1, 4, 23, 25, 11, 5, 21, 24, 2, 19, 7, 12, 13, 14, 0]
metropolis time:  62.4309999943
best density among 500 last iterations:  -64581.9558181
corresponding permutation:  [20, 16, 9, 8, 18, 24, 17, 6, 10, 2, 3, 1, 4, 23, 25, 11, 15, 21, 5, 12, 19, 7, 22, 13, 14, 0]
metropolis time:  56.743999958
best density among 500 last iterations:  -64583.5600176
corresponding permutation:  [12, 25, 9, 8, 18, 24, 17, 15, 10, 20, 3, 1, 4, 23, 16, 11, 5, 21, 6, 2, 19, 7, 22, 13, 14, 0]
metropolis time:  56.9989998341
best density among 500 last iterations:  -64565.9485114
corresponding permutation:  [20, 25, 9, 8, 18, 15, 17, 24, 10, 22, 3, 1, 4, 23, 16, 11, 6, 21, 5, 2, 19, 7, 12, 13, 14, 0]
metropolis time:  59.5310001373
best density among 500 last iterations:  -64580.6787243
corresponding permutation:  [20, 16, 9, 8, 18, 15, 17, 6, 10, 12, 3, 1, 4, 23, 25, 11, 5, 21, 24, 2, 19, 7, 22, 13, 14, 0]
metropolis time:  58.763999939
best density among 500 last iterations:  -64582.4341347
corresponding permutation:  [20, 25, 9, 8, 7, 15, 17, 6, 10, 22, 3, 1, 4, 23, 16, 11, 5, 21, 24, 2, 19, 18, 12, 13, 14, 0]
Out[7]:
[<matplotlib.lines.Line2D at 0xaa2b470>,
 <matplotlib.lines.Line2D at 0x3ecbeb8>,
 <matplotlib.lines.Line2D at 0xaa2bc50>,
 <matplotlib.lines.Line2D at 0xaa9a208>]

we can see clearly unigram quality is not as good as bigram (which is expected). let's try with shakespeare:


In [8]:
sizes =  [4,8,16,32,64]
qs=[]
times = 4
for k in xrange(times):
    for s in sizes: 
        train_text = read_text_filesize('main/super.txt', s)        
        stats = get_unigram_stats(train_text)
        init_p = uniform(26)
        #Metropolis here
        st = time.time()
        computableGen = lambda t: applyTransposition(t)
        metropolisgenerator = \
            metropolis(get_desiredPDF_unigram, init_p, computableGen, 2500)

        y = -float("inf")
        for i in xrange( 500 ): #<=========
            cand = metropolisgenerator.next() 
            if (cand[1] > y):
                y = cand[1]
                x = cand[0]

        et =  time.time() - st 
        print 'train text size: ', s 
        print 'metropolis time: ', et

        print 'best density among 500 last iterations: ', y
        print 'corresponding permutation: ', x

        decrypted = decrypt(x, encrypted)
        qs.append(quality(decrypted, original))

plt.plot(sizes[:len(qs)], qs[:len(sizes)],
         sizes[:len(qs)], qs[len(sizes):2*len(sizes)],
         sizes[:len(qs)], qs[2*len(sizes):3*len(sizes)],
         sizes[:len(qs)], qs[3*len(sizes):4*len(sizes)],)


train text size:  4
metropolis time:  58.9590001106
best density among 500 last iterations:  -64712.7424571
corresponding permutation:  [3, 25, 16, 7, 17, 1, 13, 6, 21, 24, 11, 15, 4, 23, 9, 20, 2, 10, 5, 12, 19, 18, 22, 8, 0, 14]
train text size:  8
metropolis time:  57.4759998322
best density among 500 last iterations:  -64631.8807829
corresponding permutation:  [20, 9, 16, 13, 18, 1, 17, 6, 21, 22, 11, 15, 4, 23, 25, 3, 24, 10, 5, 12, 19, 7, 2, 8, 0, 14]
train text size:  16
metropolis time:  59.2220001221
best density among 500 last iterations:  -64610.4898416
corresponding permutation:  [20, 16, 9, 8, 18, 15, 17, 6, 10, 22, 11, 1, 4, 23, 25, 3, 5, 21, 24, 2, 19, 7, 12, 13, 14, 0]
train text size:  32
metropolis time:  56.9309999943
best density among 500 last iterations:  -64574.3335476
corresponding permutation:  [20, 16, 9, 8, 7, 15, 17, 22, 10, 5, 11, 1, 4, 23, 25, 3, 24, 21, 6, 12, 19, 18, 2, 13, 14, 0]
train text size:  64
metropolis time:  60.0520000458
best density among 500 last iterations:  -64573.8447435
corresponding permutation:  [12, 16, 9, 8, 18, 15, 17, 6, 10, 5, 3, 1, 4, 23, 25, 11, 22, 21, 24, 20, 19, 7, 2, 13, 14, 0]
train text size:  4
metropolis time:  58.4760000706
best density among 500 last iterations:  -64712.9998288
corresponding permutation:  [3, 25, 16, 7, 18, 1, 13, 5, 21, 22, 11, 15, 4, 23, 9, 20, 2, 10, 6, 12, 19, 17, 24, 8, 0, 14]
train text size:  8
metropolis time:  56.1559998989
best density among 500 last iterations:  -64632.7758466
corresponding permutation:  [20, 9, 16, 13, 18, 1, 17, 5, 21, 2, 11, 15, 4, 23, 25, 3, 24, 10, 6, 12, 19, 7, 22, 8, 0, 14]
train text size:  16
metropolis time:  59.0520000458
best density among 500 last iterations:  -64610.5917624
corresponding permutation:  [20, 9, 16, 8, 18, 1, 17, 6, 10, 22, 11, 15, 4, 23, 25, 3, 5, 21, 24, 12, 19, 7, 2, 13, 14, 0]
train text size:  32
metropolis time:  59.6920001507
best density among 500 last iterations:  -64573.0306311
corresponding permutation:  [20, 16, 9, 8, 7, 15, 17, 6, 10, 5, 11, 1, 4, 23, 25, 3, 22, 21, 24, 2, 19, 18, 12, 13, 14, 0]
train text size:  64
metropolis time:  54.6919999123
best density among 500 last iterations:  -64574.8570277
corresponding permutation:  [12, 16, 9, 8, 18, 24, 17, 15, 10, 5, 3, 1, 4, 23, 25, 11, 22, 21, 6, 20, 19, 7, 2, 13, 14, 0]
train text size:  4
metropolis time:  59.1819999218
best density among 500 last iterations:  -64713.1527229
corresponding permutation:  [3, 25, 16, 17, 18, 1, 13, 6, 21, 22, 11, 15, 4, 23, 9, 20, 2, 10, 5, 12, 19, 7, 24, 8, 0, 14]
train text size:  8
metropolis time:  58.1790001392
best density among 500 last iterations:  -64631.8829798
corresponding permutation:  [20, 9, 16, 13, 18, 15, 17, 24, 21, 22, 11, 1, 4, 23, 25, 3, 5, 10, 6, 12, 19, 7, 2, 8, 0, 14]
train text size:  16
metropolis time:  56.760999918
best density among 500 last iterations:  -64611.3177819
corresponding permutation:  [12, 16, 9, 8, 18, 15, 17, 24, 10, 22, 3, 1, 4, 23, 25, 11, 5, 21, 6, 20, 19, 7, 2, 13, 14, 0]
train text size:  32
metropolis time:  58.6629998684
best density among 500 last iterations:  -64573.7471185
corresponding permutation:  [12, 16, 9, 8, 7, 15, 17, 24, 10, 5, 3, 1, 4, 23, 25, 11, 22, 21, 6, 20, 19, 18, 2, 13, 14, 0]
train text size:  64
metropolis time:  57.3109998703
best density among 500 last iterations:  -64573.6471902
corresponding permutation:  [12, 16, 9, 8, 7, 15, 17, 24, 10, 5, 3, 1, 4, 23, 25, 11, 22, 21, 6, 20, 19, 18, 2, 13, 14, 0]
train text size:  4
metropolis time:  57.995000124
best density among 500 last iterations:  -64712.9344728
corresponding permutation:  [3, 25, 16, 7, 18, 6, 13, 1, 21, 24, 11, 15, 4, 23, 9, 20, 2, 10, 5, 12, 19, 17, 22, 8, 0, 14]
train text size:  8
metropolis time:  55.9140000343
best density among 500 last iterations:  -64633.210548
corresponding permutation:  [2, 9, 16, 13, 18, 1, 17, 6, 21, 22, 11, 15, 4, 23, 25, 3, 5, 10, 24, 20, 19, 7, 12, 8, 0, 14]
train text size:  16
metropolis time:  56.9179999828
best density among 500 last iterations:  -64610.9606155
corresponding permutation:  [20, 16, 9, 8, 18, 1, 17, 24, 10, 2, 11, 15, 4, 23, 25, 3, 5, 21, 6, 12, 19, 7, 22, 13, 14, 0]
train text size:  32
metropolis time:  56.7839999199
best density among 500 last iterations:  -64575.5593928
corresponding permutation:  [20, 16, 9, 13, 7, 24, 17, 15, 10, 5, 3, 1, 4, 23, 25, 11, 22, 21, 6, 12, 19, 18, 2, 8, 14, 0]
train text size:  64
metropolis time:  57.5930001736
best density among 500 last iterations:  -64575.1419616
corresponding permutation:  [20, 16, 9, 8, 7, 1, 17, 24, 10, 2, 3, 15, 4, 23, 25, 11, 6, 21, 22, 12, 19, 18, 5, 13, 14, 0]
Out[8]:
[<matplotlib.lines.Line2D at 0xb42ccc0>,
 <matplotlib.lines.Line2D at 0xb42cef0>,
 <matplotlib.lines.Line2D at 0xb43a4e0>,
 <matplotlib.lines.Line2D at 0xb43aa58>]

In [10]:
plt.plot(sizes[:len(qs)], qs[:len(sizes)],
         sizes[:len(qs)], qs[len(sizes):2*len(sizes)],
         sizes[:len(qs)], qs[2*len(sizes):3*len(sizes)],
         sizes[:len(qs)], qs[3*len(sizes):4*len(sizes)],)
plt.xlabel('size, MB')
plt.ylabel('quality')


Out[10]:
<matplotlib.text.Text at 0xb755d30>

In [ ]: