In [27]:
%load_ext autoreload
%autoreload 2
In [36]:
from data.text import samples as sam
from latinpigsay import generalfunctions as gfunc
from latinpigsay.tmp.experiments import expfunctions as expfunc
from latinpigsay import latinpig as lp
from latinpigsay import piggyprint as pp
from latinpigsay.tmp.experiments import exp
from latinpigsay.contractions import contractions_parallel as contspara
from latinpigsay.contractions import find_contractions as findconts
import piglatin as pl
import requests
from itertools import islice, permutations, count, izip, imap, product, chain
import itertools
import re
import json
from xml.dom import minidom
import os
from os import path
import operator
from multiprocessing import Pool
In [29]:
import numpy
from data.text import samples as sam
from textblob import TextBlob as textb
import nltk
In [30]:
import time
import arrow
class Timer:
    def __init__(self):
        self.interval = 0
    def __enter__(self):
        self.start = arrow.now()
        return self
    def __exit__(self, *args):
        self.end = arrow.now()
        self.interval = self.end - self.start
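A quick sanity check of this context manager (a sketch; two arrow timestamps subtract to a datetime.timedelta, which is what interval ends up holding):

with Timer() as t:
    sum(xrange(1000000))  # stand-in workload
print t.interval  # a timedelta such as 0:00:00.045678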
In [5]:
print sam.acidtest
In [6]:
text = sam.paragraphs_og
In [7]:
tokens = nltk.word_tokenize(text)
In [8]:
#print tokens
In [9]:
#print ' '.join(tokens)
In [10]:
sentences = nltk.data.load('grammars/large_grammars/atis_sentences.txt')
sentences = nltk.parse.util.extract_test_sentences(sentences)
In [11]:
len(sentences)
Out[11]:
In [12]:
print sentences[12]
In [10]:
files = {1 : {'file1' : 'data/text/phrases_english.txt',
              'file2' : 'data/text/phrases_piglatin.txt',
              },
         2 : {'file1' : 'data/text/contractions.txt',
              'file2' : 'data/text/contractions-un.txt'
              },
         3 : {'file1' : 'data/contractions.txt',
              },
         }
print files[1]['file1']
In [11]:
class regexpreplacer(object):
    def __init__(self, patterns):
        #self.patternlist = patterns
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in
                         patterns]
    def replace(self, text):
        s = text
        for (pattern, repl) in self.patterns:
            (s, count) = re.subn(pattern, repl, s)
        return s
    #def replace2(self, text):
    #    s = text
    #    for (pattern, repl) in self.patternlist:
    #        (s, count) = re.subn(pattern, repl, s, flags=re.IGNORECASE)
    #    return s
In [12]:
contractions = [(r"won't", "will not"),
                (r"can't", "cannot"),
                (r"i'm", "i am"),
                (r"I'm", "I am"),
                (r"ma'am", "madam"),
                (r"ain't", "is not"),
                (r"let's", "let us"),
                (r"Let's", "Let us"),
                (r"shan't", "shall not"),
                (r"where'd", "where did"),
                (r"y'all", "you all"),
                (r"o'clock", "of the clock"),  # hackish: keeps it from translating as "oway ' ockclay"
                (r"(\w+)'ll", r"\g<1> will"),
                (r"(\w+)n't", r"\g<1> not"),
                (r"(\w+)'ve", r"\g<1> have"),
                (r"(\w+)'s", r"\g<1> is"),
                (r"(\w+)'re", r"\g<1> are"),
                (r"(\w+)'d", r"\g<1> would"),
                ]
expander = regexpreplacer(contractions)
expander = regexpreplacer(contractions)
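Pattern order matters here: the one-off rules have to fire before the generic suffix rules, otherwise "won't" would come out as "wo not" via (\w+)n't. A quick check of the expander:

print expander.replace("I can't believe it's not butter")
# expected: I cannot believe it is not butter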
In [13]:
with open(files[2]['file1']) as f:
    conts = f.read()
with open(files[2]['file2']) as f:
    notconts = f.read()
In [14]:
contslist = conts.splitlines()
fixedcontslist = expander.replace(conts).splitlines()
notcontslist = notconts.splitlines()
In [15]:
padding = 20
n = 0
for cont, fcont, ncont in izip(contslist, fixedcontslist, notcontslist):
    print n, cont.ljust(padding), fcont.ljust(padding), ncont
    n += 1
In [16]:
padding = 20
n = 0
for cont, fcont, ncont in izip(contslist, fixedcontslist, notcontslist):
    print cont + '\t' + fcont
    n += 1
In [17]:
def testexpander():
    padding = 20
    n = 0
    for cont, fcont, ncont in izip(contslist, fixedcontslist, notcontslist):
        if fcont not in ncont:
            print n, cont.ljust(padding), fcont.ljust(padding), ncont
        n += 1
testexpander()
In [18]:
padding = 20
n = 0
print
for cont, fcont, ncont in izip(contslist, fixedcontslist, notcontslist):
    print n, exp.translator(fcont).returnstr.ljust(padding), \
        exp.translator(cont).returnstr.ljust(padding), \
        lp.translator(cont).returnstr.ljust(padding), \
        pl.translate(cont).ljust(padding), ncont
    n += 1
In [19]:
def testexpander():
    padding = 20
    n = 0
    for cont, fcont, ncont in izip(contslist, fixedcontslist, notcontslist):
        prefixed = exp.translator(fcont).returnstr
        testing = exp.translator(cont).returnstr
        if testing != prefixed:
            print n, prefixed, testing
        n += 1
testexpander()
In [20]:
n = 0
for word in contslist:
    m = re.match(r"[n]'[\w]+|[\w]+(?!')(?:[A-Za-mo-z](?='))?|(?<=\s)[\w](?=')|[^\s\w']", word)
    if m is None:
        print n, word
    n += 1
In [21]:
n = 0
for word in contslist:
    m = re.match(r"[n]'[\w]+|[\w]+(?!')(?:[A-Za-mo-z](?='))?|(?<=\s)[\w](?=')|[^\s\w']", word)
    if m is not None:
        print n, word
    n += 1
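For reference, here is what that alternation actually captures on a few inputs (a sketch; the optional class [A-Za-mo-z] deliberately omits 'n' so that "n't" stays attached, and "o'clock" matches nothing at all, which the m is None loop above surfaces):

pattern = r"[n]'[\w]+|[\w]+(?!')(?:[A-Za-mo-z](?='))?|(?<=\s)[\w](?=')|[^\s\w']"
for w in ("can't", "she'll", "o'clock"):
    m = re.match(pattern, w)
    print w, '->', m.group() if m else None
# can't -> ca, she'll -> she, o'clock -> None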
In [22]:
def fileline_gen(file_):
    with open(file_) as f:
        for line in f.read().splitlines():
            yield line
def fileline(file_):
    with open(file_) as f:
        return f.read().splitlines()
def fileword_gen(file_):
    with open(file_) as f:
        for word in re.findall(r'(?:\S+)|(?:\s+)', f.read()):
            yield word
def urlword_gen(url):
    f = requests.get(url, stream=True)
    for line in f.iter_lines(delimiter='\n'):
        #yield json.loads(line)
        yield line
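Note that fileword_gen yields the whitespace runs between words as well as the words themselves, so joining the stream reproduces the original text. A quick illustration of the pattern:

print re.findall(r'(?:\S+)|(?:\s+)', "don't  stop")
# -> ["don't", '  ', 'stop']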
In [23]:
contdatabase = [(line[0], ' '.join(line[1:]))
                for line in (line.split()
                             for line in fileline(files[3]['file1'])
                             )
                ]
justconts = [cont[0] for cont in contdatabase]
In [26]:
#justconts
In [24]:
padding = 20
n = 0
# NOTE: cleanlist is assumed to have been defined earlier in the session;
# it is not created anywhere in this notebook.
for cont, clean, db in izip(contslist, cleanlist, contdatabase):
    db1 = db[0]
    db2 = db[1]
    columns = ' '.join([str(n).rjust(2),
                        cont.ljust(13, "."),
                        clean.ljust(13, "."),
                        db1.ljust(13, "."),
                        db2.ljust(13, " "),
                        ])
    print columns
    n += 1
In [28]:
def countfreq(word, listtocount):
    assert type(listtocount) in (list, tuple)
    if type(listtocount[0]) is not list:
        countlist = [[w, 0] for w in listtocount]
    elif type(listtocount[0]) is list:
        countlist = listtocount
    for thing in countlist:
        if thing[0] in word:
            thing[1] += 1
    return countlist
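One caveat: thing[0] in word is a substring test, so nested contractions double-count, e.g. the "he'll" counter also fires on "she'll". A minimal demonstration:

print countfreq("she'll", [["he'll", 0], ["she'll", 0]])
# -> [["he'll", 1], ["she'll", 1]]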
In [29]:
def countfromlist(wordgen, listtocount):
    assert type(listtocount) in (list, tuple)
    if type(listtocount[0]) is not list:
        countlist = [[w, 0] for w in listtocount]
    elif type(listtocount[0]) is list:
        countlist = listtocount
    for word in wordgen:
        words = word.split()
        for w in words:
            for thing in countlist:
                if thing[0] in w:
                    thing[1] += 1
    return countlist
In [30]:
def countfromgenlist(genlist, listtocount):
    countlist = listtocount
    try:
        # .next() raises StopIteration once genlist is exhausted; the bare
        # try/finally lets the return below swallow that exception.
        gen = genlist.next()
        gengen = genlist
        countlist = countfromlist(gen, listtocount)
        countfromgenlist(gengen, countlist)
    finally:
        return countlist
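The recursion above leans on that return-inside-finally trick. An iterative equivalent (a sketch with the same in-place mutation semantics) avoids it:

def countfromgenlist_iter(genlist, listtocount):
    countlist = listtocount
    for gen in genlist:
        countlist = countfromlist(gen, countlist)
    return countlist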
In [31]:
genlist = [fileword_gen("data/text/largetext/alltext.txt"),
           fileword_gen("data/text/largetext/warandpeace.txt"),
           fileword_gen("data/text/largetext/misctext.txt"),
           fileword_gen("data/text/largetext/trustingthewatercure.txt"),
           ]
ggg = lambda: itertools.imap(fileword_gen,
                             (''.join(['data/text/largetext/', file])
                              for file in os.listdir('data/text/largetext/')
                              if file.endswith('.txt')))
derp = ggg()
fff = lambda: itertools.imap(fileline,
                             (''.join(['data/text/largetext/', file])
                              for file in os.listdir('data/text/largetext/')
                              if file.endswith('.txt')))
gengen = (gen for gen in genlist)
In [32]:
with Timer() as t:
    totals = countfromgenlist(fff(), justconts)
print t.interval
totals
Out[32]:
In [33]:
d = {}
for i in totals:
    d[i[0]] = i[1]
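Equivalently, since dict() accepts any iterable of two-item sequences:

d = dict(totals)  # same mapping, built in one pass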
In [40]:
#d
In [21]:
sorted_x = reversed(sorted(d.items(), key=operator.itemgetter(1)))
sortedgen = lambda : reversed(sorted(d.items(), key=operator.itemgetter(1)))
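reversed(sorted(...)) is equivalent to sorting in descending order in one call:

sorted_desc = sorted(d.items(), key=operator.itemgetter(1), reverse=True)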
In [8]:
#for i in sortedgen():
#    print i
In [34]:
def checkifin_iter(item, iterlist):
    for i in iterlist:
        if i[0] == item:
            return i[1] + ' --- !'
    return item
In [23]:
#for word in sortedgen():
#    print str(checkifin_iter(word[0], contdatabase)) + ','
In [24]:
#for word in fileword_gen("data/text/largetext/alltext.txt"):
#    print str(checkifin_iter(word, contdatabase)), '---', word
In [25]:
#for word in fileword_gen("data/text/largetext/alltext.txt"):
#    print word
In [2]:
contgen = lambda:(line.split('\t')[0] for line in fileline_gen('data/contractions.txt'))
contlist = [line.split('\t') for line in fileline_gen('data/contractions.txt')]
In [36]:
gen1 = contgen()
gen2 = contgen()
In [37]:
wordlist = [line for line in fileword_gen('data/contractions.txt')]
In [38]:
gencont_factory = lambda:islice(wordlist, 0, len(wordlist), 1)
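islice over the whole of wordlist is just a fresh iterator over the list; iter does the same thing with less ceremony (an equivalent definition):

gencont_factory = lambda: iter(wordlist)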
In [39]:
gencont = gencont_factory()
In [40]:
gencont.next()
Out[40]:
In [41]:
testl = []
for i in xrange(21):
    testl.append(i)
In [42]:
testl
Out[42]:
In [43]:
testl[0:10]
Out[43]:
In [44]:
filelist = [''.join(['data/text/testbatch/', file]) for file in os.listdir('data/text/testbatch/') if file.endswith('.txt')]
In [45]:
len(filelist)
Out[45]:
In [46]:
numberoffiles = len(filelist)
cores = 4
filespercore = numberoffiles / cores
[filespercore, filespercore*cores]
Out[46]:
In [47]:
def formbatches(listof, batches):
    numberof = len(listof)
    perbatch = numberof / batches
    offset = numberof - perbatch*batches
    # Give the remainder to the first batch so every item lands somewhere.
    start = 0
    end = perbatch + offset
    batchlist = []
    for batch in xrange(batches):
        batchlist.append(listof[start:end])
        start = end
        end += perbatch
    return batchlist
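A quick check of the corrected batching arithmetic (the remainder lands in the first batch):

print formbatches(range(10), 4)
# -> [[0, 1, 2, 3], [4, 5], [6, 7], [8, 9]]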
In [48]:
filebatches = formbatches(filelist, cores)
len(filebatches)
Out[48]:
In [49]:
l1 = len(filebatches[0])
l2 = len(filebatches[1])
l3 = len(filebatches[2])
l4 = len(filebatches[3])
print l1, l2, l3, l4
print l1 + l2 + l3 + l4
In [50]:
filegen_factory = lambda filelist: itertools.imap(fileline, filelist)
filebatch_gen = lambda filebatches: map(filegen_factory, filebatches)
In [51]:
#with Timer() as t:
#    totals = countfromgenlist(fff(), justconts)
#print t.interval
#totals
In [52]:
hyh = filebatch_gen(filebatches)
In [53]:
def countfromgenlist_parallel(filegens):
    return countfromgenlist(filegens, justconts)
In [63]:
def contsinparallel(filebatch_list):
    pool = Pool(processes=cores)
    results = pool.map(countfromgenlist_parallel, tuple(filebatch_list))
    product = results
    return product
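A caveat that may explain why the next cell never finished: pool.map pickles each task, and the imap objects produced by filebatch_gen are not picklable. A sketch that ships plain path lists to the workers instead (assuming the fork-based Unix start method, so justconts and cores are inherited by the children):

def countbatch_from_paths(pathbatch):
    # pathbatch is a plain list of file paths, which pickles fine
    return countfromgenlist((fileline(p) for p in pathbatch), justconts)

def contsinparallel_paths(pathbatches):
    pool = Pool(processes=cores)
    return pool.map(countbatch_from_paths, pathbatches)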
In [ ]:
with Timer() as t:
    listofresults = contsinparallel(filebatch_gen(filebatches))
print t.interval
In [56]:
print listofresults
In [57]:
countfromgenlist_parallel(filebatch_gen(filebatches)[0])
Out[57]:
In [64]:
filebatch_gen(filebatches)[0]
Out[64]:
In [65]:
print tuple(filebatch_gen(filebatches))
In [ ]:
map(countfromgenlist_parallel, filebatch_gen(filebatches))
In [35]:
contspara.contspara('data/text/testbatch/', 4)