read, statistics, quality, crypt, decrypt, Metropolis, likelihood functions:


In [2]:
import numpy as np
import math
import matplotlib.pyplot as plt
%matplotlib inline
import random
from numpy.random import rand
from copy import copy
from __future__ import division
import time

def read_text_words(filename, wordsnumber):    
    with open(filename) as f:
        X = f.readlines()
        X = X[:wordsnumber]
    X = ''.join(X) 
    X = X.replace('\n', '') 
    return X

def read_text_filesize(filename, size):
    with open(filename) as f:
        X = f.read(int(round(1024*1024*size)))
    X = X.replace('\n', '')
    return X

def get_unicount(text):
    length = len(text)
    counts = np.zeros(26)
    for i in xrange(length):
        c = ord(text[i])
        counts[c-97]+=1
        #97-122 
    return counts

def get_bigram_stats_dic(text):        
    length = len(text)
    dic = {}
    for i in 'abcdefghijklmnopqrstuvwxyz':
        for j in 'abcdefghijklmnopqrstuvwxyz':
            dic[(i,j)]=0

    for i in xrange(length-1):                   
        if (text[i], text[i+1]) in dic:
            dic[(text[i], text[i+1])] += 1
            
    for k,v in dic.items():        
        dic[k] = v/(counts[ord(k[0])-97])
    return dic

def quality(decrypted, original):
    l = len(decrypted)
    zipped = zip(decrypted, original)    
    return sum(1.0 for x,y in zipped if x == y)/l

def crypt(text):
    p = range(26)
    random.shuffle(p)
    output=''
    for ch in text:
            try:
                x = ord(ch) - ord('a')
                output+=(chr(p[x] + ord('a')))
            except:
                pass
    return output, p

def get_desiredPDF_bigram(permutation):
    logp = 0
    for i in xrange(len(encrypted)-1):         
        pr = stats[chr(permutation[ord(encrypted[i])-97]+97), 
                   chr(permutation[ord(encrypted[i+1])-97]+97)]
        if pr>0:
            logp += math.log(pr)
        else:
            logp += -9 #penalty for non existant pairs
    return logp

def metropolis( desiredPDF, initValue, computableRVS, skipIterations = 1000):
    random_variable = initValue
    random_variableDensityValue = desiredPDF( random_variable )
    
    for i in xrange( skipIterations ):
        candidate = computableRVS( random_variable )
        candidateDensityValue = desiredPDF( candidate )
        acceptanceProb = min(0, candidateDensityValue - random_variableDensityValue )
        if math.log(random.random()) < acceptanceProb:
            random_variable = candidate
            random_variableDensityValue = candidateDensityValue
   
    while True:
        candidate = computableRVS( random_variable )
        candidateDensityValue = desiredPDF( candidate )
    
        acceptanceProb = min( 0, candidateDensityValue - random_variableDensityValue )
       
        if math.log(random.random()) < acceptanceProb:
            random_variable = candidate
            random_variableDensityValue = candidateDensityValue
        yield random_variable, random_variableDensityValue

def uniform( n ):
    #initialize permutation with identical
    permutation = [ i for i in xrange( n ) ]
    
    #swap ith object with random onject from i to n - 1 enclusively
    for i in xrange( n ):
        j = random.randint( i, n - 1 )
        permutation[ i ], permutation[ j ] = permutation[ j ], permutation[ i ]
        
    return permutation

def applyTransposition( basePermutation ):
    n = len( basePermutation )
    
    permutation = copy( basePermutation )
    
    #apply n random transpositions (including identical) to base permutation
#     for i in xrange( n ):
    k, l = random.randint( 0, n - 1 ), random.randint( 0, n - 1 )
    permutation[ k ], permutation[ l ] = permutation[ l ], permutation[ k ]
        
    return  permutation


def decrypt(permutation, encrypted):
    decrypted = []
    for i in encrypted:
        decrypted.append(chr(permutation[ord(i)-97]+97))
    return ''.join(decrypted)

Task 3:

fixed test text:


In [6]:
fname = 'main/oliver_twist.txt'
original = read_text_words(fname, 5000)[3:]
encrypted,p = crypt(original)
print encrypted[:20]


lwrocgsrtlnylrxircnr

vary size of train text (used War and Peace, total 3Mb)


In [10]:
from sys import stdout
sizes =  [0.25,0.5,0.75,1]
qs = []
times = 4
for k in xrange(times):
    for s in sizes:   
        print 'Start at', time.strftime('%H:%M:%S', time.localtime())
        stdout.flush()
        train_text = read_text_filesize('main/war_and_peace.txt', s)    
        counts = get_unicount(train_text)
        stats = get_bigram_stats_dic(train_text)
        init_p = uniform(26)
        #Metropolis here
        st = time.time()
        computableGen = lambda t: applyTransposition(t)
        metropolisgenerator = \
            metropolis(get_desiredPDF_bigram, init_p, computableGen, 1500)

        print 'Sampling...'
        stdout.flush()
        y = -float("inf")
        for i in xrange( 1000 ): #<=========
            cand = metropolisgenerator.next() 
            if (cand[1] > y):
                y = cand[1]
                x = cand[0]

        et =  time.time() - st
        print 'metropolis time: ', et
        print 'best density among 1000 last iterations: ', y
        print 'corresponding permutation: ', x

        decrypted = decrypt(x, encrypted)
        qs.append( quality(decrypted, original))
        print 'End at  ', time.strftime('%H:%M:%S', time.localtime())
        time.sleep(1.0)
  
plt.plot(sizes[:len(qs)], qs[:len(sizes)],
         sizes[:len(qs)], qs[len(sizes):2*len(sizes)],
         sizes[:len(qs)], qs[2*len(sizes):3*len(sizes)],
         sizes[:len(qs)], qs[3*len(sizes):4*len(sizes)],)


Start at 09:34:00
Sampling...
metropolis time:  167.453365088
best density among 1000 last iterations:  -56612.5920152
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   09:36:48
Start at 09:36:49
Sampling...
metropolis time:  242.96111989
best density among 1000 last iterations:  -56449.9500405
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   09:40:53
Start at 09:40:54
Sampling...
metropolis time:  248.331638813
best density among 1000 last iterations:  -56489.294617
corresponding permutation:  [8, 3, 17, 9, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 25, 2, 10, 18, 7, 13, 20, 24]
End at   09:45:04
Start at 09:45:05
Sampling...
metropolis time:  187.028258085
best density among 1000 last iterations:  -56443.5009922
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   09:48:18
Start at 09:48:19
Sampling...
metropolis time:  212.592648029
best density among 1000 last iterations:  -57907.4385259
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 2, 22, 4, 9, 10, 21, 18, 7, 13, 20, 24]
End at   09:51:52
Start at 09:51:53
Sampling...
metropolis time:  165.989115
best density among 1000 last iterations:  -56449.9500405
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   09:54:40
Start at 09:54:41
Sampling...
metropolis time:  245.037575006
best density among 1000 last iterations:  -56459.7567613
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   09:58:48
Start at 09:58:49
Sampling...
metropolis time:  236.601491928
best density among 1000 last iterations:  -69695.6823139
corresponding permutation:  [13, 18, 4, 16, 5, 23, 19, 0, 22, 25, 11, 14, 24, 10, 15, 6, 1, 17, 9, 2, 21, 3, 20, 8, 7, 12]
End at   10:02:48
Start at 10:02:49
Sampling...
metropolis time:  205.495482922
best density among 1000 last iterations:  -57606.4074425
corresponding permutation:  [8, 3, 17, 21, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 25, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   10:06:15
Start at 10:06:16
Sampling...
metropolis time:  167.041108131
best density among 1000 last iterations:  -56449.9500405
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   10:09:04
Start at 10:09:05
Sampling...
metropolis time:  165.338798046
best density among 1000 last iterations:  -56459.7567613
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   10:11:52
Start at 10:11:53
Sampling...
metropolis time:  165.96084094
best density among 1000 last iterations:  -56443.5009922
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   10:14:41
Start at 10:14:42
Sampling...
metropolis time:  165.614757061
best density among 1000 last iterations:  -56612.5920152
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   10:17:29
Start at 10:17:30
Sampling...
metropolis time:  167.244617939
best density among 1000 last iterations:  -56576.3395128
corresponding permutation:  [8, 3, 17, 16, 12, 23, 14, 11, 1, 25, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   10:20:18
Start at 10:20:19
Sampling...
metropolis time:  166.071938038
best density among 1000 last iterations:  -56459.7567613
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   10:23:07
Start at 10:23:08
Sampling...
metropolis time:  224.484329939
best density among 1000 last iterations:  -56443.5009922
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   10:26:54
Out[10]:
[<matplotlib.lines.Line2D at 0xaf02f70c>,
 <matplotlib.lines.Line2D at 0xaf02faec>,
 <matplotlib.lines.Line2D at 0xaf02f24c>,
 <matplotlib.lines.Line2D at 0xaf032b6c>]

In [29]:
#qualities_1 = np.array(qs).reshape((4, 4))
print qualities_1
I = np.array([0, 1, 2, 3]) # take into account 'explosions'
means_1 = np.mean(qualities_1[I, :], 0)
stds_1 = np.std(qualities_1[I, :], 0)
sizes = [0.25, 0.5, 0.75, 1.0]
plt.title('Figure t3b2. Dependence quality on train text size.')
plt.xlabel('size, MB')
plt.ylabel('quality')
plt.plot(sizes, means_1 + stds_1, 'r:+')
plt.plot(sizes, means_1 - stds_1, 'r:+')
plt.plot(sizes, means_1, 'b-x')
plt.savefig('task-3-bigram-1.png')


[[ 1.          1.          0.99918831  1.        ]
 [ 0.95463564  1.          1.          0.05104618]
 [ 0.98656205  1.          1.          1.        ]
 [ 1.          0.998557    1.          1.        ]]

for small train text quality graphs prove intuition that the small sized training does not give much information.


In [17]:
sizes =  [2, 3]
qs = []
times = 4
for k in xrange(times):
    print '%d-th series running...' % k
    for j, s in enumerate(sizes):
        print j, 'Start at', time.strftime('%H:%M:%S', time.localtime())
        stdout.flush()
        train_text = read_text_filesize('main/war_and_peace.txt', s)    
        counts = get_unicount(train_text)
        stats = get_bigram_stats_dic(train_text)
        init_p = uniform(26)
        #Metropolis here
        st = time.time()
        computableGen = lambda t: applyTransposition(t)
        metropolisgenerator = \
            metropolis(get_desiredPDF_bigram, init_p, computableGen, 1500)

        y = -float("inf")
        for i in xrange( 1000 ): #<=========
            cand = metropolisgenerator.next() 
            if (cand[1] > y):
                y = cand[1]
                x = cand[0]

        et =  time.time() - st
        print 'metropolis time: ', et
        print 'best density among 1000 last iterations: ', y
        print 'corresponding permutation: ', x

        decrypted = decrypt(x, encrypted)
        qs.append( quality(decrypted, original))
        print 'End at  ', time.strftime('%H:%M:%S', time.localtime())
        time.sleep(1.0)
        
qualities_2 = np.array(qs)
plt.plot(sizes[:len(qs)], qs[:len(sizes)],
         sizes[:len(qs)], qs[len(sizes):2*len(sizes)],
         sizes[:len(qs)], qs[2*len(sizes):3*len(sizes)],
         sizes[:len(qs)], qs[3*len(sizes):4*len(sizes)],)


0-th series running...
0 Start at 10:37:00
metropolis time:  165.200503111
best density among 1000 last iterations:  -67112.4596578
corresponding permutation:  [19, 3, 17, 25, 2, 23, 14, 11, 1, 16, 0, 8, 5, 20, 15, 21, 12, 4, 9, 22, 10, 7, 13, 18, 6, 24]
End at   10:39:50
1 Start at 10:39:51
metropolis time:  165.211738825
best density among 1000 last iterations:  -71611.1805524
corresponding permutation:  [13, 20, 8, 16, 6, 23, 19, 14, 1, 25, 18, 4, 24, 10, 5, 15, 22, 17, 9, 2, 21, 3, 0, 7, 11, 12]
End at   10:42:43
1-th series running...
0 Start at 10:42:44
metropolis time:  201.486129045
best density among 1000 last iterations:  -56409.8117851
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   10:46:10
1 Start at 10:46:11
metropolis time:  208.907721043
best density among 1000 last iterations:  -71104.1219204
corresponding permutation:  [11, 13, 8, 9, 24, 23, 17, 20, 1, 25, 19, 4, 7, 22, 15, 10, 2, 18, 16, 5, 21, 3, 0, 14, 12, 6]
End at   10:49:57
2-th series running...
0 Start at 10:49:58
metropolis time:  217.643985987
best density among 1000 last iterations:  -58550.850755
corresponding permutation:  [8, 3, 17, 9, 12, 23, 14, 11, 1, 16, 0, 19, 20, 6, 15, 21, 22, 4, 25, 2, 10, 18, 7, 13, 5, 24]
End at   10:53:47
1 Start at 10:53:48
metropolis time:  210.609104156
best density among 1000 last iterations:  -56398.5046942
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   10:57:25
3-th series running...
0 Start at 10:57:26
metropolis time:  163.672365189
best density among 1000 last iterations:  -56409.8117851
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   11:00:15
1 Start at 11:00:16
metropolis time:  210.031395912
best density among 1000 last iterations:  -56398.5046942
corresponding permutation:  [8, 3, 17, 25, 12, 23, 14, 11, 1, 16, 0, 19, 5, 6, 15, 21, 22, 4, 9, 2, 10, 18, 7, 13, 20, 24]
End at   11:03:52
Out[17]:
[<matplotlib.lines.Line2D at 0xaeda584c>,
 <matplotlib.lines.Line2D at 0xaeda5a6c>,
 <matplotlib.lines.Line2D at 0xaedad06c>,
 <matplotlib.lines.Line2D at 0xaedad8ec>]

In [31]:
qualities_2 = np.array(qs).reshape((4, 2))
print qualities_2
means_2 = np.mean(qualities_2, 0)
stds_2 = np.std(qualities_2, 0)
sizes = [2.0, 3.0]
plt.title('Figure t3b2. Dependence quality on train text size.')
plt.xlabel('size, MB')
plt.ylabel('quality')
plt.plot(sizes, means_2 + stds_2, 'r:+')
plt.plot(sizes, means_2 - stds_2, 'r:+')
plt.plot(sizes, means_2, 'b-x')
plt.savefig('task-3-bigram-2.png')


[[ 0.53323413  0.07593795]
 [ 1.          0.04486833]
 [ 0.95305736  1.        ]
 [ 1.          1.        ]]

In [32]:
means = np.zeros(6)
means[0:4] = means_1
means[4:6] = means_2
stds = np.zeros(6)
stds[0:4] = stds_1
stds[4:6] = stds_2
full_sizes = [0.25, 0.5, 0.75, 1.0, 2.0, 3.0]
plt.title('Figure t3b_full. Dependence quality on train text size.')
plt.xlabel('size, MB')
plt.ylabel('quality')
plt.plot(full_sizes, means + stds, 'r:+')
plt.plot(full_sizes, means - stds, 'r:+')
plt.plot(full_sizes, means, 'b-x')
plt.savefig('task-3-bigram.png')


this may be caused by too little col iterations, let's raise it up to 2500:


In [53]:
sizes =  [0.5, 1.75, 3]
qs = []
times = 3
for k in xrange(times):
    for s in sizes:   

        train_text = read_text_filesize('main/war_and_peace.txt', s)    
        counts = get_unicount(train_text)
        stats = get_bigram_stats_dic(train_text)
        init_p = uniform(26)
        #Metropolis here
        st = time.time()
        computableGen = lambda t: applyTransposition(t)
        metropolisgenerator = \
            metropolis(get_desiredPDF_bigram, init_p, computableGen, 2500)

        y = -float("inf")
        for i in xrange( 500 ): #<=========
            cand = metropolisgenerator.next() 
            if (cand[1] > y):
                y = cand[1]
                x = cand[0]

        et =  time.time() - st
        print 'metropolis time: ', et

        print 'best density among 500 last iterations: ', y
        print 'corresponding permutation: ', x

        decrypted = decrypt(x, encrypted)
        qs.append( quality(decrypted, original))
  
plt.plot(sizes[:len(qs)], qs[:len(sizes)],
         sizes[:len(qs)], qs[len(sizes):2*len(sizes)],
         sizes[:len(qs)], qs[2*len(sizes):3*len(sizes)],
         sizes[:len(qs)], qs[3*len(sizes):4*len(sizes)],)


metropolis time:  161.106999874
best density among 500 last iterations:  -56449.9500405
corresponding permutation:  [18, 21, 24, 4, 19, 15, 1, 3, 20, 16, 2, 8, 9, 23, 7, 10, 12, 0, 17, 11, 6, 25, 5, 13, 14, 22]
metropolis time:  161.350000143
best density among 500 last iterations:  -56410.2005938
corresponding permutation:  [18, 21, 24, 4, 19, 15, 1, 3, 20, 16, 2, 8, 9, 23, 7, 10, 12, 0, 17, 11, 6, 25, 5, 13, 14, 22]
metropolis time:  157.572000027
best density among 500 last iterations:  -56398.5046942
corresponding permutation:  [18, 21, 24, 4, 19, 15, 1, 3, 20, 16, 2, 8, 9, 23, 7, 10, 12, 0, 17, 11, 6, 25, 5, 13, 14, 22]
metropolis time:  156.710000038
best density among 500 last iterations:  -58108.899908
corresponding permutation:  [18, 21, 24, 4, 19, 15, 1, 3, 20, 16, 2, 0, 9, 23, 7, 10, 12, 8, 17, 11, 6, 25, 5, 13, 14, 22]
metropolis time:  156.017999887
best density among 500 last iterations:  -56410.2005938
corresponding permutation:  [18, 21, 24, 4, 19, 15, 1, 3, 20, 16, 2, 8, 9, 23, 7, 10, 12, 0, 17, 11, 6, 25, 5, 13, 14, 22]
metropolis time:  154.515999794
best density among 500 last iterations:  -56398.5046942
corresponding permutation:  [18, 21, 24, 4, 19, 15, 1, 3, 20, 16, 2, 8, 9, 23, 7, 10, 12, 0, 17, 11, 6, 25, 5, 13, 14, 22]
metropolis time:  153.356999874
best density among 500 last iterations:  -56449.9500405
corresponding permutation:  [18, 21, 24, 4, 19, 15, 1, 3, 20, 16, 2, 8, 9, 23, 7, 10, 12, 0, 17, 11, 6, 25, 5, 13, 14, 22]
metropolis time:  153.322999954
best density among 500 last iterations:  -56410.2005938
corresponding permutation:  [18, 21, 24, 4, 19, 15, 1, 3, 20, 16, 2, 8, 9, 23, 7, 10, 12, 0, 17, 11, 6, 25, 5, 13, 14, 22]
metropolis time:  157.593999863
best density among 500 last iterations:  -56398.5046942
corresponding permutation:  [18, 21, 24, 4, 19, 15, 1, 3, 20, 16, 2, 8, 9, 23, 7, 10, 12, 0, 17, 11, 6, 25, 5, 13, 14, 22]
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-53-621c4230538e> in <module>()
     33          sizes[:len(qs)], qs[len(sizes):2*len(sizes)],
     34          sizes[:len(qs)], qs[2*len(sizes):3*len(sizes)],
---> 35          sizes[:len(qs)], qs[3*len(sizes):4*len(sizes)],)  

C:\Users\Maremun\Anaconda\lib\site-packages\matplotlib\pyplot.pyc in plot(*args, **kwargs)
   2985         ax.hold(hold)
   2986     try:
-> 2987         ret = ax.plot(*args, **kwargs)
   2988         draw_if_interactive()
   2989     finally:

C:\Users\Maremun\Anaconda\lib\site-packages\matplotlib\axes.pyc in plot(self, *args, **kwargs)
   4135         lines = []
   4136 
-> 4137         for line in self._get_lines(*args, **kwargs):
   4138             self.add_line(line)
   4139             lines.append(line)

C:\Users\Maremun\Anaconda\lib\site-packages\matplotlib\axes.pyc in _grab_next_args(self, *args, **kwargs)
    315                 return
    316             if len(remaining) <= 3:
--> 317                 for seg in self._plot_args(remaining, kwargs):
    318                     yield seg
    319                 return

C:\Users\Maremun\Anaconda\lib\site-packages\matplotlib\axes.pyc in _plot_args(self, tup, kwargs)
    293             x = np.arange(y.shape[0], dtype=float)
    294 
--> 295         x, y = self._xy_from_xy(x, y)
    296 
    297         if self.command == 'plot':

C:\Users\Maremun\Anaconda\lib\site-packages\matplotlib\axes.pyc in _xy_from_xy(self, x, y)
    235         y = np.atleast_1d(y)
    236         if x.shape[0] != y.shape[0]:
--> 237             raise ValueError("x and y must have same first dimension")
    238         if x.ndim > 2 or y.ndim > 2:
    239             raise ValueError("x and y can be no greater than 2-D")

ValueError: x and y must have same first dimension

In [54]:
plt.plot(sizes[:len(qs)], qs[:len(sizes)],
         sizes[:len(qs)], qs[len(sizes):2*len(sizes)],
         sizes[:len(qs)], qs[2*len(sizes):3*len(sizes)])


Out[54]:
[<matplotlib.lines.Line2D at 0xd2b5358>,
 <matplotlib.lines.Line2D at 0xd2b5588>,
 <matplotlib.lines.Line2D at 0xd2b5b70>]

In [52]:
bp = np.zeros(26, dtype=int)
for i in p:
    bp[p[i]] = i
q = get_desiredPDF_bigram(bp)
print bp
print q


[18 21 24  4 19 15  1  3 20 16  2  8  9 23  7 10 12  0 17 11  6 25  5 13 14
 22]
-56398.5046942

super.txt


In [8]:
supertext = read_text_filesize('main/super.txt', 0.25)
print supertext[:100]
print supertext[140000:140100]


projectgutenbergsetextofshakespearesfirstfolioplaysthisisourrdeditionofmostoftheseplaysseetheindexco
owinwordsisawomansonelyvertueipraytheeoutwithtandplaceitforherchiefevertuespitemsheisproudlaoutwitht

shakespeare!