In [1]:
def read_text_words(filename, wordsnumber):
    # read up to wordsnumber lines (one word per line) into a single string
    X = list()
    with open(filename) as f:
        for i in xrange(wordsnumber):
            line = f.readline()
            if not line:
                print 'reached end of file'
                break
            X.append(line)
    X = ''.join(X)
    X = X.replace('\n', '{')  # '{' (ord 123) marks word boundaries
    return X

def read_text_whole(filename):
    with open(filename) as f:
        X = f.read()
    X = X.replace('\n', '{')  # '{' (ord 123) marks word boundaries
    return X

def chop_text_to_size(text, size):
    return text[:1024*1024*size]

def read_text_filesize(filename, size):
    # read the first `size` megabytes of the file
    with open(filename) as f:
        X = f.read(1024*1024*size)
    X = X.replace('\n', '{')  # '{' (ord 123) marks word boundaries
    return X

text = read_text_filesize('main/war_and_peace.txt', 3)
print len(text)
print text[:10]
In [2]:
test_text = read_text_words('main/oliver_twist.txt', 7000)
# for i in xrange(10):
#     print test_text[i]
print len(test_text)
print test_text[:15]
In [3]:
length = 575514
train_text = read_text_words('main/war_and_peace.txt', length)
# print train_text[575513]
print train_text[:15]
In [38]:
from __future__ import division  # must precede other statements in the cell
import numpy as np

def get_unigram_stats(text):
    # indices 0-25 hold 'a'..'z' (ord 97-122); index 26 is the '{' delimiter (ord 123)
    length = len(text)
    stats = np.zeros(27)
    for i in xrange(length):
        c = ord(text[i])
        stats[c - 97] += 1
    delimcount = stats[26]
    stats = stats[:26]
    return stats / (length - delimcount)
In [53]:
stats = get_unigram_stats(train_text)
print sum(stats)
print stats
# print (np.argmax(stats))
# print chr(4+97)
counts
In [68]:
def get_unicount(text):
    # raw letter counts; the '{' delimiter (index 26) is dropped
    length = len(text)
    counts = np.zeros(27)
    for i in xrange(length):
        c = ord(text[i])
        counts[c - 97] += 1
    return counts[:26]

counts = get_unicount(train_text)
print counts
# print chr(np.argmax(counts) + 97)
array
In [57]:
def get_bigram_stats_array(text):
    # conditional bigram frequencies P(second | first) as a 26x26 array;
    # relies on the global `counts` computed above
    length = len(text)
    stats = np.zeros((27, 27))
    for i in xrange(length - 1):
        c = ord(text[i])
        d = ord(text[i + 1])
        stats[c - 97, d - 97] += 1
    stats = stats[:26]
    for i in xrange(26):
        stats[i] = stats[i] / counts[i]
    return stats[:26, :26]

stats = get_bigram_stats_array(train_text)
# print stats
In [65]:
# print np.amax(stats, 1)
print np.amax(stats)
print np.argmax(stats)
# decode the flat argmax index (436 above) back into the two letters of the bigram
print chr(436 // 26 + 97)
print chr(436 - 26 * (436 // 26) + 97)
print np.count_nonzero(stats)
In [59]:
%timeit get_bigram_stats_array(train_text)
dict
In [69]:
def get_bigram_stats_dic(text):
    # conditional bigram frequencies as a dict {(first, second): P(second | first)};
    # pairs touching the '{' delimiter (ord 123) are skipped
    length = len(text)
    dic = {}
    for i in xrange(length - 1):
        if ord(text[i]) == 123 or ord(text[i + 1]) == 123:
            continue
        if (text[i], text[i + 1]) in dic:
            dic[(text[i], text[i + 1])] += 1
        else:
            dic[(text[i], text[i + 1])] = 1
    for k, v in dic.items():
        dic[k] = v / counts[ord(k[0]) - 97]
    return dic

d = get_bigram_stats_dic(train_text)
# print d
import operator
print max(d.iteritems(), key=operator.itemgetter(1))[0]
print len(d)
In [64]:
%timeit get_bigram_stats_dic(train_text)
TODO zip
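The TODO above presumably means counting the bigrams with zip instead of an index loop. A minimal sketch using collections.Counter (the name get_bigram_stats_zip is mine; the normalization matches the dict version, including its reliance on the global counts):
In [ ]:
from collections import Counter

def get_bigram_stats_zip(text):
    # count adjacent pairs in one pass, then normalize by the first letter's count
    pairs = Counter(zip(text, text[1:]))
    dic = {}
    for (a, b), v in pairs.iteritems():
        if a == '{' or b == '{':  # skip pairs touching the word delimiter
            continue
        dic[(a, b)] = v / counts[ord(a) - 97]
    return dic
Note that zip materializes the full list of pairs, so this trades memory for speed; running the same %timeit as above against get_bigram_stats_dic would settle whether it actually wins.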
In [ ]:
Don’t forget to average over several restarts of the Metropolis algorithm, since the finite-size samples it produces depend on the initialization.
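The sampler itself is still only a placeholder ("#Metropolis here") in the cells below, so here is a minimal sketch of what it could look like. All names (metropolis_decrypt, log_likelihood, decrypt_with) and the 1e-10 floor for unseen bigrams are my assumptions, not the original solution: the state is a candidate encryption permutation, the proposal swaps two letters, and the score is the bigram log-likelihood of the tentative decryption under bistats from get_bigram_stats_dic.
In [ ]:
import math
import random

def decrypt_with(text, perm):
    # perm maps plain index -> cipher index (like p in crypt below),
    # so decryption applies its inverse
    inv = [0] * 26
    for i in xrange(26):
        inv[perm[i]] = i
    return ''.join(chr(inv[ord(ch) - 97] + 97) for ch in text)

def log_likelihood(text, bistats):
    # log-probability under the bigram model; unseen bigrams get a small
    # floor (assumption) so the log stays finite
    return sum(math.log(bistats.get((text[i], text[i + 1]), 1e-10))
               for i in xrange(len(text) - 1))

def metropolis_decrypt(encrypted, bistats, iters=10000):
    perm = range(26)                         # random initial permutation
    random.shuffle(perm)
    cur = log_likelihood(decrypt_with(encrypted, perm), bistats)
    best, best_ll = list(perm), cur
    for _ in xrange(iters):
        a, b = random.sample(xrange(26), 2)  # proposal: swap two letters
        perm[a], perm[b] = perm[b], perm[a]
        new = log_likelihood(decrypt_with(encrypted, perm), bistats)
        if new >= cur or random.random() < math.exp(new - cur):
            cur = new                        # accept the swap
            if new > best_ll:
                best, best_ll = list(perm), new
        else:
            perm[a], perm[b] = perm[b], perm[a]  # reject: undo the swap
    return decrypt_with(encrypted, best)
Averaging over restarts is then a loop over independent runs, e.g. averaging the quality() values (defined in the next cell) of five calls to metropolis_decrypt with different random initializations.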
In [66]:
def quality(decrypted, original):
    # fraction of symbols decrypted incorrectly (0.0 means a perfect match)
    l = len(decrypted)
    zipped = zip(decrypted, original)
    return sum(1 for x, y in zipped if x != y) / l

quality('abcd', 'adrd')
Out[66]:
0.5
In [86]:
import random

def crypt(text):
    # encrypt with a random permutation of the 26 letters;
    # characters outside 'a'..'z' are dropped
    p = range(26)
    random.shuffle(p)
    output = ''
    for ch in text:
        if 'a' <= ch <= 'z':
            x = ord(ch) - ord('a')
            output += chr(p[x] + ord('a'))
        # else: unsupported character, skip it
    for i in xrange(len(p)):
        print '{} -> {}'.format(chr(ord('a') + i), chr(ord('a') + p[i]))
    return output
Fix a large encrypted text (e.g. 5000 or more words) and explore how the ratio of correctly decrypted symbols depends on the size of the training text (using the same number of MCMC iterations).
In [87]:
fname = 'main/oliver_twist.txt'
test_text = read_text_words(fname, 7000)
# NB: remove word delimiters { before encryption!
original = test_text.replace('{', '')
# encryption
encrypted = crypt(original)
print encrypted[:20]
In [90]:
import matplotlib.pyplot as plt

sizes = [2, 4, 8, 16]
qs = np.zeros(len(sizes))  # initialized once, outside the loop
i = 0
for s in sizes:
    train_text = read_text_filesize('main/super.txt', s)
    unistats = get_unigram_stats(train_text)
    counts = get_unicount(train_text)
    bistats = get_bigram_stats_dic(train_text)
    # print chr(np.argmax(counts) + 97)
    # print max(bistats.iteritems(), key=operator.itemgetter(1))[0]
    # Metropolis here
    # decryption
    # output - decrypted text
    # qs[i] = quality(decrypted, original)
    # i += 1
    print train_text[:1000]
# plt.plot(sizes, qs)
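With the metropolis_decrypt sketch from above, the placeholder inside this loop could be filled in roughly as follows (hypothetical, not the original solution); the same three lines fit the loop in the next experiment:
    # Metropolis here (hypothetical call, see the sketch above)
    decrypted = metropolis_decrypt(encrypted, bistats, iters=10000)
    qs[i] = quality(decrypted, original)  # fraction of wrongly decrypted symbols
    i += 1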
Fix a large training text (e.g. 3 MB of raw text) and explore how the ratio of correctly decrypted symbols depends on the size of the observed encrypted text (using the same number of MCMC iterations as in step 2).
In [89]:
fname = 'main/oliver_twist.txt'
sizes = [1000, 10000, 100000]
qs = np.zeros(len(sizes))
i = 0
# the training model does not depend on s, so build it once outside the loop
train_text = read_text_whole('main/war_and_peace.txt')
unistats = get_unigram_stats(train_text)
counts = get_unicount(train_text)
bistats = get_bigram_stats_dic(train_text)
# print chr(np.argmax(counts) + 97)
# print max(bistats.iteritems(), key=operator.itemgetter(1))[0]
for s in sizes:
    test_text = read_text_words(fname, s)
    # NB: remove word delimiters { before encryption!
    original = test_text.replace('{', '')
    # encryption
    encrypted = crypt(original)
    # Metropolis here
    # decryption
    # output - decrypted text
    # qs[i] = quality(decrypted, original)
    # i += 1
print train_text[:500]
# plt.plot(sizes, qs)
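Once the placeholders are filled in, the final comparison could be plotted like this (the axis labels are my choice, not from the source):
In [ ]:
plt.plot(sizes, qs, marker='o')
plt.xscale('log')  # sizes span three orders of magnitude
plt.xlabel('encrypted text size (words)')
plt.ylabel('fraction of incorrectly decrypted symbols')
plt.show()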
In [ ]: