read text

In [1]:
def read_text_words(filename, wordsnumber):
    # read the first `wordsnumber` lines (the files are assumed to be
    # preprocessed to one word per line)
    X = list()
    with open(filename) as f:
        for i in xrange(wordsnumber):
            line = f.readline()
            if not line:
                print 'reached end of file'
                break
            X.append(line)
    X = ''.join(X)
    X = X.replace('\n', '{')  # '{' (ord 123) marks word boundaries
    return X

def read_text_whole(filename):
    with open(filename) as f:
        X = f.read()
    X = X.replace('\n', '{')  # '{' (ord 123) marks word boundaries
    return X

def chop_text_to_size(text, size):
    return text[:1024*1024*size]

def read_text_filesize(filename, size):
    # read the first `size` megabytes of the file
    with open(filename) as f:
        X = f.read(1024*1024*size)
    X = X.replace('\n', '{')  # '{' (ord 123) marks word boundaries
    return X

text = read_text_filesize('main\war_and_peace.txt', 3)
print len(text)
print text[:10]


3111644
the{projec

In [2]:
test_text = read_text_words('main\oliver_twist.txt', 7000)
print len(test_text)
print test_text[:15]


38141
the{project{

In [3]:
length = 575514  # number of words (lines) to read
train_text = read_text_words('main\war_and_peace.txt', length)
print train_text[:15]


the{project{gut
unigram statistics

In [38]:
from __future__ import division  # must come before the other statements in the cell
import numpy as np

def get_unigram_stats(text):
    # letter frequencies over 'a'..'z' (ord 97..122), excluding the
    # '{' word delimiter (ord 123), which lands in slot 26
    length = len(text)
    stats = np.zeros(27)
    for i in xrange(length):
        c = ord(text[i])
        stats[c-97] += 1
    delimcount = stats[26]
    stats = stats[:26]
    return stats/(length - delimcount)

In [53]:
stats = get_unigram_stats(train_text)
print sum(stats)
print stats
print np.argmax(stats)
print chr(np.argmax(stats) + 97)  # the most frequent letter


1.0
[  7.39113459e-02   1.81305456e-02   2.06821066e-02   3.73550425e-02
   1.35326174e-01   2.11409947e-02   1.72996665e-02   6.27944681e-02
   6.73202599e-02   4.64244817e-05   9.12895769e-03   4.52049462e-02
   2.86462859e-02   6.20385823e-02   8.21028862e-02   1.52540133e-02
   9.76104486e-04   6.32206210e-02   6.18939522e-02   8.58275580e-02
   3.83424555e-02   3.96334132e-03   2.41907261e-02   1.02371934e-03
   2.38104024e-02   3.68419925e-04]
4
e

counts


In [68]:
def get_unicount(text):
    # raw letter counts; slot 26 (the '{' delimiter) is dropped
    length = len(text)
    counts = np.zeros(27)
    for i in xrange(length):
        c = ord(text[i])
        counts[c-97] += 1
    return counts[:26]

counts = get_unicount(train_text)
print counts


[ 518305.  113684.  165626.  271591.  886827.  152173.  125299.  429434.
  456822.    5105.   55261.  286239.  185267.  446899.  533060.  111999.
    7130.  415610.  427365.  609987.  225422.   46685.  157447.    9380.
  142848.    2767.]
bigram statistics

array


In [57]:
def get_bigram_stats_array(text):
    # conditional frequencies of (second letter | first letter) in a
    # 26x26 array; relies on the global `counts` from get_unicount
    length = len(text)
    stats = np.zeros((27,27))
    for i in xrange(length-1):
        c = ord(text[i])
        d = ord(text[i+1])
        stats[c-97, d-97] += 1
    stats = stats[:26]
    for i in xrange(26):
        stats[i] = stats[i]/counts[i]
    return stats[:26,:26]

stats = get_bigram_stats_array(train_text)

In [65]:
print np.amax(stats)
print np.argmax(stats)
# decode the flat argmax index 436 into (first letter, second letter)
print chr(436//26 + 97)
print chr(436 - 26*(436//26) + 97)
print np.count_nonzero(stats)


0.965243902439
436
q
u
519

In [59]:
%timeit get_bigram_stats_array(train_text)


1 loops, best of 3: 3.02 s per loop

dict


In [69]:
def get_bigram_stats_dic(text):
    # same conditional frequencies as the array version, stored sparsely
    # as a dict keyed by (first, second); pairs touching '{' are skipped
    length = len(text)
    dic = {}
    for i in xrange(length-1):
        if text[i] == '{' or text[i+1] == '{':
            continue
        if (text[i], text[i+1]) in dic:
            dic[(text[i], text[i+1])] += 1
        else:
            dic[(text[i], text[i+1])] = 1

    for k, v in dic.items():
        dic[k] = v/counts[ord(k[0])-97]
    return dic

d = get_bigram_stats_dic(train_text)
import operator
print max(d.iteritems(), key=operator.itemgetter(1))[0]
print len(d)


('q', 'u')
591

In [64]:
%timeit get_bigram_stats_dic(train_text)


1 loops, best of 3: 1.19 s per loop

TODO: zip-based bigram counting


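One way to discharge the TODO: count adjacent pairs with zip and collections.Counter instead of an explicit index loop. The sketch below is a guess at what was intended (the name get_bigram_stats_zip is mine); it should agree with get_bigram_stats_dic, given the same global counts.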
In [ ]:
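from collections import Counter

def get_bigram_stats_zip(text):
    # zip(text, text[1:]) yields every consecutive character pair;
    # pairs touching the '{' delimiter are skipped as before
    pair_counts = Counter(p for p in zip(text, text[1:]) if '{' not in p)
    dic = {}
    for (a, b), v in pair_counts.items():
        dic[(a, b)] = v/counts[ord(a)-97]
    return dic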

Quality measurement: $$ r(x, y, f) = \frac{\sum_{i=1}^{|x|} I[f(x_i) = y_i]}{|x|}$$

Don't forget to average over several restarts of the Metropolis algorithm, since the finite-size samples it produces depend on the initialization.
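The sampler itself is left as a placeholder ("#Metropolis here") in the cells below. What follows is only a minimal sketch of that step, using the random-transposition proposal that is standard for substitution-cipher decipherment; the names (log_likelihood, metropolis, apply_perm) and the 1e-6 floor for unseen bigrams are my assumptions, not part of the original notebook.

In [ ]:
import math
import random

def log_likelihood(text, bistats):
    # score a candidate plaintext under the trained bigram model;
    # unseen bigrams get a small floor probability to avoid log(0)
    ll = 0.0
    for pair in zip(text, text[1:]):
        ll += math.log(bistats.get(pair, 1e-6))
    return ll

def metropolis(encrypted, bistats, iters=10000):
    # state: a permutation of the alphabet mapping cipher letters
    # to plaintext guesses
    alphabet = [chr(ord('a') + i) for i in xrange(26)]
    perm = list(alphabet)
    random.shuffle(perm)

    def apply_perm(p):
        table = dict(zip(alphabet, p))
        return ''.join(table[ch] for ch in encrypted)

    cur_ll = log_likelihood(apply_perm(perm), bistats)
    best, best_ll = list(perm), cur_ll
    for _ in xrange(iters):
        # proposal: transpose two letters of the current permutation
        i, j = random.sample(xrange(26), 2)
        cand = list(perm)
        cand[i], cand[j] = cand[j], cand[i]
        cand_ll = log_likelihood(apply_perm(cand), bistats)
        # accept with probability min(1, exp(cand_ll - cur_ll))
        if cand_ll > cur_ll or random.random() < math.exp(cand_ll - cur_ll):
            perm, cur_ll = cand, cand_ll
            if cur_ll > best_ll:
                best, best_ll = list(perm), cur_ll
    return apply_perm(best)

As the note above says, one run is not enough: the sampler should be restarted several times from fresh random permutations and the quality scores averaged.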


In [66]:
def quality(decrypted, original):
    # fraction of symbols decrypted correctly, matching the formula above
    l = len(decrypted)
    zipped = zip(decrypted, original)
    return sum(1 for x, y in zipped if x == y)/l

quality('abcd','adrd')


Out[66]:
0.5
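Following the restart note above, the per-text score would be averaged over independent runs; a small wrapper (averaged_quality is my name, and it assumes the metropolis sketch above):

In [ ]:
def averaged_quality(encrypted, original, bistats, restarts=5):
    # average decryption quality over independent Metropolis restarts
    scores = [quality(metropolis(encrypted, bistats), original)
              for _ in xrange(restarts)]
    return sum(scores)/len(scores)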
crypt

In [86]:
import random

def crypt(text):
    # encrypt with a random substitution cipher; prints the key mapping
    p = range(26)
    random.shuffle(p)
    output = ''
    for ch in text:
        if 'a' <= ch <= 'z':
            x = ord(ch) - ord('a')
            output += chr(p[x] + ord('a'))
        # characters outside a-z are silently dropped
    for i in xrange(len(p)):
        print '{} -> {}'.format(chr(ord('a') + i), chr(ord('a') + p[i]))

    return output

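Note that crypt prints the key but does not return it, and the decryption step is left as a placeholder in the loops below. For reference, a candidate key (the true one or one proposed by the sampler) would be applied the same way crypt applies p; the helper name apply_permutation is mine:

In [ ]:
def apply_permutation(text, p):
    # map each letter through the permutation p (a list of 26 ints,
    # using the same convention as the p built inside crypt)
    return ''.join(chr(p[ord(ch) - ord('a')] + ord('a')) for ch in text)

Decrypting with a known key p means applying its inverse q, where q[p[i]] = i for every i.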
Varying size of training text

Fix a large encrypted text (e.g. 5000 or more words) and explore how the ratio of correctly decrypted symbols depends on the size of the training text (using the same number of MCMC iterations).

Encryption

In [87]:
fname = 'main\oliver_twist.txt'
test_text = read_text_words(fname, 7000)
#NB remove word delimiters { before encryption!
original = test_text.replace('{','')
#encryption
encrypted = crypt(original)
print encrypted[:20]


a -> r
b -> e
c -> c
d -> z
e -> j
f -> m
g -> f
h -> w
i -> b
j -> o
k -> s
l -> u
m -> y
n -> g
o -> d
p -> h
q -> a
r -> q
s -> i
t -> l
u -> t
v -> n
w -> p
x -> v
y -> k
z -> x
lwjhqdojclftljgejqfj
stats, metropolis, decrypt, quality, plot

In [90]:
import matplotlib.pyplot as plt

sizes = [2, 4, 8, 16]  # training-text size in MB
# qs = np.zeros(len(sizes))
for i, s in enumerate(sizes):
    train_text = read_text_filesize('main\super.txt', s)
    unistats = get_unigram_stats(train_text)
    counts = get_unicount(train_text)
    bistats = get_bigram_stats_dic(train_text)

    #Metropolis here
    #decryption
    #output - decrypted text
#     qs[i] = quality(decrypted, original)

print train_text[:1000]
# plt.plot(sizes, qs)


project{gutenberg{s{etext{of{shakespeare{s{first{folio{plays{this{is{our{rd{edition{of{most{of{these{plays{see{the{index{copyright{laws{are{changing{all{over{the{world{be{sure{to{check{the{copyright{laws{for{your{country{before{posting{these{files{please{take{a{look{at{the{important{information{in{this{header{we{encourage{you{to{keep{this{file{on{your{own{disk{keeping{an{electronic{path{open{for{the{next{readers{do{not{remove{this{welcome{to{the{world{of{free{plain{vanilla{electronic{texts{etexts{readable{by{both{humans{and{by{computers{since{these{etexts{prepared{by{hundreds{of{volunteers{and{donations{information{on{contacting{project{gutenberg{to{get{etexts{and{further{information{is{included{below{we{need{your{donations{the{first{folio{lsb{plays{rsb{by{william{shakespeare{july{lsb{etext{rsb{project{gutenberg{s{etext{of{shakespeare{s{first{folio{plays{this{file{should{be{named{wstxt{or{ws{zip{corrected{editions{of{our{etexts{get{a{new{number{wstxt{versions{based{on{separate{sources{

Varying size of test text

Fix a large training text (e.g. 3 MB of raw text) and explore how the ratio of correctly decrypted symbols depends on the size of the observed encrypted text (using the same number of MCMC iterations as in step 2).


In [89]:
fname = 'main\oliver_twist.txt'
sizes = [1000, 10000, 100000]  # test-text size in words
# qs = np.zeros(len(sizes))
for i, s in enumerate(sizes):
    test_text = read_text_words(fname, s)
    # NB: remove word delimiters { before encryption!
    original = test_text.replace('{', '')
    encrypted = crypt(original)

    # NB: the training text is fixed, so these statistics could be
    # computed once outside the loop
    train_text = read_text_whole('main\war_and_peace.txt')
    unistats = get_unigram_stats(train_text)
    counts = get_unicount(train_text)
    bistats = get_bigram_stats_dic(train_text)
    print chr(np.argmax(counts)+97)
    print max(bistats.iteritems(), key=operator.itemgetter(1))[0]

    #Metropolis here
    #decryption
    #output - decrypted text
#     qs[i] = quality(decrypted, original)

print train_text[:500]
# plt.plot(sizes, qs)


a -> n
b -> z
c -> a
d -> q
e -> o
f -> h
g -> g
h -> u
i -> p
j -> f
k -> i
l -> w
m -> r
n -> b
o -> v
p -> l
q -> m
r -> j
s -> d
t -> c
u -> e
v -> x
w -> k
x -> t
y -> s
z -> y
e
('q', 'u')
a -> h
b -> k
c -> r
d -> d
e -> y
f -> v
g -> u
h -> x
i -> c
j -> s
k -> g
l -> f
m -> o
n -> j
o -> i
p -> n
q -> m
r -> a
s -> p
t -> l
u -> z
v -> b
w -> t
x -> w
y -> q
z -> e
e
('q', 'u')
a -> b
b -> m
c -> j
d -> i
e -> c
f -> k
g -> u
h -> y
i -> l
j -> x
k -> w
l -> f
m -> d
n -> a
o -> z
p -> v
q -> p
r -> o
s -> t
t -> g
u -> s
v -> q
w -> n
x -> h
y -> e
z -> r
e
('q', 'u')
the{project{gutenberg{ebook{of{war{and{peace{by{leo{tolstoy{this{ebook{is{for{the{use{of{anyone{anywhere{at{no{cost{and{with{almost{no{restrictions{whatsoever{you{may{copy{it{give{it{away{or{reuse{it{under{the{terms{of{the{project{gutenberg{license{included{with{this{ebook{or{online{at{wwwgutenbergorg{title{war{and{peace{author{leo{tolstoy{translators{louise{and{aylmer{maude{posting{date{january{lsb{ebook{rsb{last{updated{march{language{english{start{of{this{project{gutenberg{ebook{war{and{peace
