In [1]:
def read_text_words(filename, wordsnumber):
    # read up to wordsnumber lines (one word per line) into a single string
    X = list()
    with open(filename) as f:
        for i in xrange(wordsnumber):
            line = f.readline()
            if not line:
                print 'reached end of file'
                break
            X.append(line)
    X = ''.join(X)
    X = X.replace('\n', '{')  # '{' (ord 123) marks word boundaries
    return X

def read_text_whole(filename):
    with open(filename) as f:
        X = f.read()
    X = X.replace('\n', '{')  # '{' (ord 123) marks word boundaries
    return X

def chop_text_to_size(text, size):
    return text[:1024*1024*size]

def read_text_filesize(filename, size):
    # read the first `size` megabytes of the file
    with open(filename) as f:
        X = f.read(1024*1024*size)
    X = X.replace('\n', '{')  # '{' (ord 123) marks word boundaries
    return X

text = read_text_filesize('main/war_and_peace.txt', 3)
print len(text)
print text[:10]
In [2]:
test_text = read_text_words('main/oliver_twist.txt', 7000)
# for i in xrange(10):
#     print test_text[i]
print len(test_text)
print test_text[:15]
In [3]:
length = 575514
train_text = read_text_words('main/war_and_peace.txt', length)
# print train_text[575513]
print train_text[:15]
In [38]:
from __future__ import division  # must precede other statements in the cell
import numpy as np

def get_unigram_stats(text):
    # indices 0-25 hold 'a'..'z' (ord 97-122); index 26 is the '{' delimiter (ord 123)
    length = len(text)
    stats = np.zeros(27)
    for i in xrange(length):
        c = ord(text[i])
        stats[c - 97] += 1
    delimcount = stats[26]
    stats = stats[:26]
    return stats / (length - delimcount)
In [53]:
stats = get_unigram_stats(train_text)
print sum(stats)
print stats
# print (np.argmax(stats))
# print chr(4+97)
counts
In [68]:
def get_unicount(text):
    # raw letter counts; the '{' delimiter (index 26) is dropped
    length = len(text)
    counts = np.zeros(27)
    for i in xrange(length):
        c = ord(text[i])
        counts[c - 97] += 1
    return counts[:26]

counts = get_unicount(train_text)
print counts
# print chr(np.argmax(counts) + 97)
array
In [57]:
def get_bigram_stats_array(text):
    # conditional bigram frequencies P(second | first) as a 26x26 array;
    # relies on the global `counts` computed above
    length = len(text)
    stats = np.zeros((27, 27))
    for i in xrange(length - 1):
        c = ord(text[i])
        d = ord(text[i + 1])
        stats[c - 97, d - 97] += 1
    stats = stats[:26]
    for i in xrange(26):
        stats[i] = stats[i] / counts[i]
    return stats[:26, :26]

stats = get_bigram_stats_array(train_text)
# print stats
In [65]:
# print np.amax(stats, 1)
print np.amax(stats)
print np.argmax(stats)
# decode the flat argmax index (436 above) back into the two letters of the bigram
print chr(436 // 26 + 97)
print chr(436 - 26 * (436 // 26) + 97)
print np.count_nonzero(stats)
In [59]:
%timeit get_bigram_stats_array(train_text)
dict
In [69]:
def get_bigram_stats_dic(text):
    # conditional bigram frequencies as a dict {(first, second): P(second | first)};
    # pairs touching the '{' delimiter (ord 123) are skipped
    length = len(text)
    dic = {}
    for i in xrange(length - 1):
        if ord(text[i]) == 123 or ord(text[i + 1]) == 123:
            continue
        if (text[i], text[i + 1]) in dic:
            dic[(text[i], text[i + 1])] += 1
        else:
            dic[(text[i], text[i + 1])] = 1
    for k, v in dic.items():
        dic[k] = v / counts[ord(k[0]) - 97]
    return dic

d = get_bigram_stats_dic(train_text)
# print d
import operator
print max(d.iteritems(), key=operator.itemgetter(1))[0]
print len(d)
In [64]:
%timeit get_bigram_stats_dic(train_text)
TODO zip
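The TODO above presumably means counting the bigrams with zip instead of an index loop. A minimal sketch using collections.Counter (the name get_bigram_stats_zip is mine; the normalization matches the dict version, including its reliance on the global counts):
In [ ]:
from collections import Counter

def get_bigram_stats_zip(text):
    # count adjacent pairs in one pass, then normalize by the first letter's count
    pairs = Counter(zip(text, text[1:]))
    dic = {}
    for (a, b), v in pairs.iteritems():
        if a == '{' or b == '{':  # skip pairs touching the word delimiter
            continue
        dic[(a, b)] = v / counts[ord(a) - 97]
    return dic
Note that zip materializes the full list of pairs, so this trades memory for speed; running the same %timeit as above against get_bigram_stats_dic would settle whether it actually wins.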
In [ ]:
Don’t forget to average over several restarts of the Metropolis algorithm, since the finite-size samples it produces depend on the initialization.
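The sampler itself is still only a placeholder ("#Metropolis here") in the cells below, so here is a minimal sketch of what it could look like. All names (metropolis_decrypt, log_likelihood, decrypt_with) and the 1e-10 floor for unseen bigrams are my assumptions, not the original solution: the state is a candidate encryption permutation, the proposal swaps two letters, and the score is the bigram log-likelihood of the tentative decryption under bistats from get_bigram_stats_dic.
In [ ]:
import math
import random

def decrypt_with(text, perm):
    # perm maps plain index -> cipher index (like p in crypt below),
    # so decryption applies its inverse
    inv = [0] * 26
    for i in xrange(26):
        inv[perm[i]] = i
    return ''.join(chr(inv[ord(ch) - 97] + 97) for ch in text)

def log_likelihood(text, bistats):
    # log-probability under the bigram model; unseen bigrams get a small
    # floor (assumption) so the log stays finite
    return sum(math.log(bistats.get((text[i], text[i + 1]), 1e-10))
               for i in xrange(len(text) - 1))

def metropolis_decrypt(encrypted, bistats, iters=10000):
    perm = range(26)                         # random initial permutation
    random.shuffle(perm)
    cur = log_likelihood(decrypt_with(encrypted, perm), bistats)
    best, best_ll = list(perm), cur
    for _ in xrange(iters):
        a, b = random.sample(xrange(26), 2)  # proposal: swap two letters
        perm[a], perm[b] = perm[b], perm[a]
        new = log_likelihood(decrypt_with(encrypted, perm), bistats)
        if new >= cur or random.random() < math.exp(new - cur):
            cur = new                        # accept the swap
            if new > best_ll:
                best, best_ll = list(perm), new
        else:
            perm[a], perm[b] = perm[b], perm[a]  # reject: undo the swap
    return decrypt_with(encrypted, best)
Averaging over restarts is then a loop over independent runs, e.g. averaging the quality() values (defined in the next cell) of five calls to metropolis_decrypt with different random initializations.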
In [66]:
def quality(decrypted, original):
    # fraction of symbols decrypted incorrectly (0.0 means a perfect match)
    l = len(decrypted)
    zipped = zip(decrypted, original)
    return sum(1 for x, y in zipped if x != y) / l

quality('abcd', 'adrd')
Out[66]:
0.5
In [86]:
import random

def crypt(text):
    # encrypt with a random permutation of the 26 letters;
    # characters outside 'a'..'z' are dropped
    p = range(26)
    random.shuffle(p)
    output = ''
    for ch in text:
        if 'a' <= ch <= 'z':
            x = ord(ch) - ord('a')
            output += chr(p[x] + ord('a'))
        # else: unsupported character, skip it
    for i in xrange(len(p)):
        print '{} -> {}'.format(chr(ord('a') + i), chr(ord('a') + p[i]))
    return output
Fix a large encrypted text (e.g. 5000 or more words) and explore how the ratio of correctly decrypted symbols depends on the size of the training text (using the same number of MCMC iterations).
In [87]:
fname = 'main/oliver_twist.txt'
test_text = read_text_words(fname, 7000)
# NB: remove word delimiters { before encryption!
original = test_text.replace('{', '')
# encryption
encrypted = crypt(original)
print encrypted[:20]
In [90]:
import matplotlib.pyplot as plt

sizes = [2, 4, 8, 16]
qs = np.zeros(len(sizes))  # initialized once, outside the loop
i = 0
for s in sizes:
    train_text = read_text_filesize('main/super.txt', s)
    unistats = get_unigram_stats(train_text)
    counts = get_unicount(train_text)
    bistats = get_bigram_stats_dic(train_text)
    # print chr(np.argmax(counts) + 97)
    # print max(bistats.iteritems(), key=operator.itemgetter(1))[0]
    # Metropolis here
    # decryption
    # output - decrypted text
    # qs[i] = quality(decrypted, original)
    # i += 1
    print train_text[:1000]
# plt.plot(sizes, qs)
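With the metropolis_decrypt sketch from above, the placeholder inside this loop could be filled in roughly as follows (hypothetical, not the original solution); the same three lines fit the loop in the next experiment:
    # Metropolis here (hypothetical call, see the sketch above)
    decrypted = metropolis_decrypt(encrypted, bistats, iters=10000)
    qs[i] = quality(decrypted, original)  # fraction of wrongly decrypted symbols
    i += 1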
Fix a large training text (e.g. 3 MB of raw text) and explore how the ratio of correctly decrypted symbols depends on the size of the observed encrypted text (using the same number of MCMC iterations as in step 2).
In [89]:
fname = 'main/oliver_twist.txt'
sizes = [1000, 10000, 100000]
qs = np.zeros(len(sizes))
i = 0
# the training model does not depend on s, so build it once outside the loop
train_text = read_text_whole('main/war_and_peace.txt')
unistats = get_unigram_stats(train_text)
counts = get_unicount(train_text)
bistats = get_bigram_stats_dic(train_text)
# print chr(np.argmax(counts) + 97)
# print max(bistats.iteritems(), key=operator.itemgetter(1))[0]
for s in sizes:
    test_text = read_text_words(fname, s)
    # NB: remove word delimiters { before encryption!
    original = test_text.replace('{', '')
    # encryption
    encrypted = crypt(original)
    # Metropolis here
    # decryption
    # output - decrypted text
    # qs[i] = quality(decrypted, original)
    # i += 1
print train_text[:500]
# plt.plot(sizes, qs)
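Once the placeholders are filled in, the final comparison could be plotted like this (the axis labels are my choice, not from the source):
In [ ]:
plt.plot(sizes, qs, marker='o')
plt.xscale('log')  # sizes span three orders of magnitude
plt.xlabel('encrypted text size (words)')
plt.ylabel('fraction of incorrectly decrypted symbols')
plt.show()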
In [ ]: