In [27]:
    
%load_ext autoreload
%autoreload 2
    
    
In [36]:
    
from data.text import samples as sam
from latinpigsay import generalfunctions as gfunc
from latinpigsay.tmp.experiments import expfunctions as expfunc
from latinpigsay import latinpig as lp
from latinpigsay import piggyprint as pp
from latinpigsay.tmp.experiments import exp
from latinpigsay.contractions import contractions_parallel as contspara
from latinpigsay.contractions import find_contractions as findconts
import piglatin as pl
import requests
from itertools import islice, permutations, count, izip, imap, product, chain
import itertools
import re
import json
from xml.dom import minidom
import os
from os import path
import operator
from multiprocessing import Pool
    
In [29]:
    
import numpy
from data.text import samples as sam
from textblob import TextBlob as textb
import nltk
    
In [30]:
    
import time
import arrow
class Timer:
    """Context manager that measures the wall-clock time of its block."""
    def __init__(self):
        self.interval = 0
    def __enter__(self):
        self.start = arrow.now()
        return self
    def __exit__(self, *args):
        self.end = arrow.now()
        self.interval = self.end - self.start  # a datetime.timedelta
    
In [5]:
    
print sam.acidtest
    
    
In [6]:
    
text = sam.paragraphs_og
    
In [7]:
    
tokens = nltk.word_tokenize(text)
    
In [8]:
    
#print tokens
    
In [9]:
    
#print ' '.join(tokens)
    
In [10]:
    
sentences = nltk.data.load('grammars/large_grammars/atis_sentences.txt')
sentences = nltk.parse.util.extract_test_sentences(sentences)
    
In [11]:
    
len(sentences)
    
    Out[11]:
In [12]:
    
print sentences[12]
    
    
In [10]:
    
files = {1 : {'file1' : 'data/text/phrases_english.txt',
              'file2' : 'data/text/phrases_piglatin.txt',
             },
         2 : {'file1' : 'data/text/contractions.txt',
              'file2' : 'data/text/contractions-un.txt'
             },
         3 : {'file1' : 'data/contractions.txt',
             },
        }
print files[1]['file1']
    
    
In [11]:
    
class regexpreplacer(object):
    def __init__(self, patterns):
        #self.patternlist = patterns
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in
                         patterns]
    def replace(self, text):
        s = text
        for (pattern, repl) in self.patterns:
            (s, count) = re.subn(pattern, repl, s)
        return s
    #def replace2(self, text):
    #    s = text
    #    for (pattern, repl) in self.patternlist:
    #        (s, count) = re.subn(pattern, repl, s, flags=re.IGNORECASE)
    #    return s
    
In [12]:
    
contractions = [(r"won't", "will not"),
                (r"can't", "cannot"),
                (r"i'm", "i am"),
                (r"I'm", "I am"),
                (r"ma'am", "madam"),
                (r"ain't", "is not"),
                (r"let's", "let us"),
                (r"Let's", "Let us"),
                (r"shan't", "shall not"),
                (r"where'd", "where did"),
                (r"y'all", "you all"),
                (r"o'clock", "of the clock"), # A "Hackish way to to get oway ' ockclay
                (r"(\w+)'ll", "\g<1> will"),
                (r"(\w+)n't", "\g<1> not"),
                (r"(\w+)'ve", "\g<1> have"),
                (r"(\w+)'s", "\g<1> is"),
                (r"(\w+)'re", "\g<1> are"),
                (r"(\w+)'d", "\g<1> would"),
                ]
expander = regexpreplacer(contractions)
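    
A quick sanity check of the expander on a made-up sentence (not from the project's data files):
    
In [ ]:
    
print expander.replace("I can't believe it's not butter, y'all")
# -> I cannot believe it is not butter, you all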
    
In [13]:
    
with open(files[2]['file1']) as f:
    conts = f.read()
with open(files[2]['file2']) as f:
    notconts = f.read()
    
In [14]:
    
contslist = conts.splitlines()
fixedcontslist = expander.replace(conts).splitlines()
notcontslist = notconts.splitlines()
    
In [15]:
    
padding = 20
n = 0
for cont, fcont, ncont in izip(contslist, fixedcontslist, notcontslist):
    print n, cont.ljust(padding), fcont.ljust(padding), ncont
    n += 1
    
    
In [16]:
    
padding = 20
n = 0
for cont, fcont, ncont in izip(contslist, fixedcontslist, notcontslist):
    print cont + '\t' + fcont
    n += 1
    
    
In [17]:
    
def testexpander():
    padding = 20
    n = 0
    for cont, fcont, ncont in izip(contslist, fixedcontslist, notcontslist):
        if fcont not in ncont:
            print n, cont.ljust(padding), fcont.ljust(padding), ncont
        n += 1
testexpander()
    
In [18]:
    
padding = 20
n = 0
print 
for cont, fcont, ncont in izip(contslist, fixedcontslist, notcontslist):
    print n, \
        exp.translator(fcont).returnstr.ljust(padding), \
        exp.translator(cont).returnstr.ljust(padding), \
        lp.translator(cont).returnstr.ljust(padding), \
        pl.translate(cont).ljust(padding), \
        ncont
    n += 1
    
    
    
In [19]:
    
def testexpander():
    padding = 20
    n = 0
    for cont, fcont, ncont in izip(contslist, fixedcontslist, notcontslist):
        prefixed = exp.translator(fcont).returnstr
        testing = exp.translator(cont).returnstr
        if testing != prefixed:
            print n, prefixed, testing
        n += 1
testexpander()
    
    
In [20]:
    
n = 0
for word in contslist:
    m = re.match(r"[n]'[\w]+|[\w]+(?!')(?:[A-Za-mo-z](?='))?|(?<=\s)[\w](?=')|[^\s\w']", word)
    if m is None:
        print n, word
    n += 1
    
    
In [21]:
    
n = 0
for word in contslist:
    m = re.match(r"[n]'[\w]+|[\w]+(?!')(?:[A-Za-mo-z](?='))?|(?<=\s)[\w](?=')|[^\s\w']", word)
    if m is not None:
        print n, word
    n += 1
    
    
In [22]:
    
def fileline_gen(file_):
    # Yield a file's lines one at a time.
    with open(file_) as f:
        for line in f.read().splitlines():
            yield line
def fileline(file_):
    # Return all of a file's lines as a list.
    with open(file_) as f:
        return f.read().splitlines()
def fileword_gen(file_):
    # Yield alternating word and whitespace tokens (whitespace runs included).
    with open(file_) as f:
        for word in re.findall(r'(?:\S+)|(?:\s+)', f.read()):
            yield word
def urlword_gen(url):
    # Stream a URL's response line by line.
    f = requests.get(url, stream=True)
    for line in f.iter_lines(delimiter='\n'):
        #yield json.loads(line)
        yield line
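    
The fileword_gen pattern yields whitespace runs as their own tokens, so ''.join() can rebuild the original text; a small check on a literal string:
    
In [ ]:
    
print re.findall(r'(?:\S+)|(?:\s+)', "two  words\n")
# -> ['two', '  ', 'words', '\n']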
    
In [23]:
    
contdatabase = [(line[0], ' '.join(line[1:]))
                for line in (line.split()
                             for line in fileline(files[3]['file1']))]
justconts = [cont[0] for cont in contdatabase]
    
In [26]:
    
#justconts
    
In [24]:
    
padding = 20
n = 0
for cont, fixed, db in izip(contslist, fixedcontslist, contdatabase):
    db1 = db[0]
    db2 = db[1]
    columns = ' '.join([str(n).rjust(2),
                        cont.ljust(13, "."),
                        fixed.ljust(13, "."),
                        db1.ljust(13, "."),
                        db2.ljust(13, " "),
                       ])
    print columns
    n += 1
    
    
In [28]:
    
def countfreq(word, listtocount):
    assert type(listtocount) in (list, tuple)
    if type(listtocount[0]) is not list:
        countlist = [[w, 0] for w in listtocount]
    elif type(listtocount[0]) is list:
        countlist = listtocount
    for thing in countlist:
        if thing[0] in word:
            thing[1] += 1
    return countlist
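    
Note that countfreq tests substring presence in the single string it is given (at most one hit per entry); a hypothetical spot check:
    
In [ ]:
    
print countfreq("can't won't walk", ["n't", "'ll"])
# -> [["n't", 1], ["'ll", 0]]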
    
In [29]:
    
def countfromlist(wordgen, listtocount):
    assert type(listtocount) in (list, tuple)
    if type(listtocount[0]) is not list:
        countlist = [[w, 0] for w in listtocount]
    elif type(listtocount[0]) is list:
        countlist = listtocount
    for word in wordgen:
        words = word.split()
        for w in words:
            for thing in countlist:
                if thing[0] in w:
                    thing[1] += 1
    return countlist
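    
Unlike countfreq, countfromlist splits each yielded chunk into words and counts a hit per matching word; a hypothetical check:
    
In [ ]:
    
print countfromlist(["I can't", "you won't stop"], ["n't"])
# -> [["n't", 2]]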
    
In [30]:
    
def countfromgenlist(genlist, listtocount):
    countlist = listtocount
    try:
        gen = genlist.next()
    except StopIteration:
        # Generator of generators exhausted: every source has been counted.
        return countlist
    countlist = countfromlist(gen, countlist)
    return countfromgenlist(genlist, countlist)
    
In [31]:
    
genlist = [fileword_gen("data/text/largetext/alltext.txt"),
           fileword_gen("data/text/largetext/warandpeace.txt"),
           fileword_gen("data/text/largetext/misctext.txt"),
           fileword_gen("data/text/largetext/trustingthewatercure.txt"),
           ]
ggg = lambda: itertools.imap(fileword_gen,
                             (''.join(['data/text/largetext/', fname])
                              for fname in os.listdir('data/text/largetext/')
                              if fname.endswith('.txt')))
derp = ggg()
fff = lambda: itertools.imap(fileline,
                             (''.join(['data/text/largetext/', fname])
                              for fname in os.listdir('data/text/largetext/')
                              if fname.endswith('.txt')))
gengen = (gen for gen in genlist)
    
In [32]:
    
with Timer() as t:
    totals = countfromgenlist(fff(), justconts)
print t.interval
totals
    
    
    Out[32]:
In [33]:
    
d = {}
for i in totals:
    d[i[0]] = i[1]
    
In [40]:
    
#d
    
    Out[40]:
In [21]:
    
sorted_x = reversed(sorted(d.items(), key=operator.itemgetter(1)))
sortedgen = lambda : reversed(sorted(d.items(), key=operator.itemgetter(1)))
    
    
In [8]:
    
#for i in sortedgen():
    #print i
    
In [34]:
    
def checkifin_iter(item, iterlist):
    for i in iterlist:
        if i[0] == item:
            return i[1] + ' --- !'
    return item
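    
checkifin_iter returns the paired value flagged with ' --- !' on a hit, or the item unchanged on a miss; a check against a hypothetical one-entry database:
    
In [ ]:
    
print checkifin_iter("can't", [("can't", "cannot")])  # -> cannot --- !
print checkifin_iter("hello", [("can't", "cannot")])  # -> hello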
    
In [23]:
    
#for word in sortedgen():
    #print str(checkifin_iter(word[0], contdatabase)) + ','
    
In [24]:
    
#for word in fileword_gen("data/text/largetext/alltext.txt"):
    #print str(checkifin_iter(word, contdatabase)), '---', word
    
In [25]:
    
#for word in fileword_gen("data/text/largetext/alltext.txt"):
    #print word
    
In [2]:
    
contgen = lambda:(line.split('\t')[0] for line in fileline_gen('data/contractions.txt'))
contlist = [line.split('\t') for line in fileline_gen('data/contractions.txt')]
    
    
In [36]:
    
gen1 = contgen()
gen2 = contgen()
    
In [37]:
    
wordlist = [line for line in fileword_gen('data/contractions.txt')]
    
In [38]:
    
gencont_factory = lambda:islice(wordlist, 0, len(wordlist), 1)
    
In [39]:
    
gencont = gencont_factory()
    
In [40]:
    
gencont.next()
    
    Out[40]:
In [41]:
    
testl = []
for i in xrange(21):
    testl.append(i)
    
In [42]:
    
testl
    
    Out[42]:
In [43]:
    
testl[0:10]
    
    Out[43]:
In [44]:
    
filelist = [''.join(['data/text/testbatch/', file]) for file in os.listdir('data/text/testbatch/') if file.endswith('.txt')]
    
In [45]:
    
len(filelist)
    
    Out[45]:
In [46]:
    
numberoffiles = len(filelist)
cores = 4
filespercore = numberoffiles / cores
[filespercore, filespercore*cores]
    
    Out[46]:
In [47]:
    
def formbatches(listof, batches):
    numberof = len(listof)
    perbatch = numberof / batches            # integer division: items per full batch
    offset = numberof - perbatch * batches   # leftover items, absorbed by the first batch
    start = 0
    end = perbatch + offset
    result = []
    for batch in xrange(batches):
        result.append(listof[start:end])
        start = end
        end += perbatch
    return result
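    
A quick check of the batching on a throwaway list (the first batch absorbs the remainder):
    
In [ ]:
    
print formbatches(range(10), 4)
# -> [[0, 1, 2, 3], [4, 5], [6, 7], [8, 9]]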
    
In [48]:
    
filebatches = formbatches(filelist, cores)
len(filebatches)
    
    Out[48]:
In [49]:
    
l1 = len(filebatches[0])
l2 = len(filebatches[1])
l3 = len(filebatches[2])
l4 = len(filebatches[3])
print l1, l2, l3, l4
print l1 + l2 + l3 + l4
    
    
In [50]:
    
filegen_factory = lambda filelist: itertools.imap(fileline, filelist)
filebatch_gen = lambda filebatches: map(filegen_factory, filebatches)
    
In [51]:
    
#with Timer() as t:
#    totals = countfromgenlist(fff(), justconts)
#print t.interval
#totals
    
In [52]:
    
hyh = filebatch_gen(filebatches)
    
In [53]:
    
def countfromgenlist_parallel(filegens):
    return countfromgenlist(filegens, justconts)
    
In [63]:
    
def contsinparallel(filebatch_list):
    pool = Pool(processes=cores)
    # NOTE: pool.map pickles its arguments; the imap objects produced by
    # filebatch_gen are not picklable, so this call fails on generator
    # batches (see the picklable sketch below).
    results = pool.map(countfromgenlist_parallel, tuple(filebatch_list))
    return results
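    
A minimal sketch of a picklable variant: ship plain lists of file paths to the workers and build the line generators inside each worker. Function names here are hypothetical, and it assumes forked worker processes inherit justconts and cores from the session.
    
In [ ]:
    
def countbatch_from_paths(pathbatch):
    # Only the (picklable) list of file paths crosses the process boundary;
    # the line generators are rebuilt inside the worker.
    return countfromgenlist(itertools.imap(fileline, pathbatch), justconts)

def contsinparallel_paths(pathbatches):
    pool = Pool(processes=cores)
    return pool.map(countbatch_from_paths, pathbatches)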
    
In [ ]:
    
with Timer() as t:
    listofresults = contsinparallel(filebatch_gen(filebatches))
print t.interval
    
In [56]:
    
print listofresults
    
    
In [57]:
    
countfromgenlist_parallel(filebatch_gen(filebatches)[0])
    
    Out[57]:
In [64]:
    
filebatch_gen(filebatches)[0]
    
    Out[64]:
In [65]:
    
print tuple(filebatch_gen(filebatches))
    
    
In [ ]:
    
map(countfromgenlist_parallel, filebatch_gen(filebatches))
    
In [35]:
    
contspara.contspara('data/text/testbatch/', 4)
    
    