In [3]:
from collections import Counter
from itertools import chain, repeat
from language_models.ngrams import ngrams

In [4]:
sentences = [
    ["Angie", "ist", "doof", "."],
    ["Kevin", "ist", "schlau", "."],
    ["Dies", "ist", "ein", "Test", "."]
]

In [5]:
print list(ngrams(sentences[0], 1, pad='<eos>'))
print list(ngrams(sentences[0], 2, pad='<eos>'))
print list(ngrams(sentences[0], 3, pad='<eos>'))


[('Angie',), ('ist',), ('doof',), ('.',)]
[('Angie', 'ist'), ('ist', 'doof'), ('doof', '.'), ('.', '<eos>')]
[('Angie', 'ist', 'doof'), ('ist', 'doof', '.'), ('doof', '.', '<eos>'), ('.', '<eos>', '<eos>')]
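The ngrams helper comes from the local language_models.ngrams module, whose source is not shown here. Judging from the output above, the sentence is padded at the end with n-1 copies of the pad symbol before the n-grams are taken; a minimal sketch of that behaviour (the name ngrams_sketch is purely illustrative, not the actual implementation) could look like this:

def ngrams_sketch(words, n, pad=None):
    # append n-1 pad symbols so the last real token still starts an n-gram
    seq = list(words)
    if pad is not None and n > 1:
        seq.extend([pad] * (n - 1))
    # slide a window of width n over the padded sequence
    return (tuple(seq[i:i + n]) for i in xrange(len(seq) - n + 1))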

In [6]:
n = 2
nc = Counter()
for sentence in sentences:
    nc.update(ngrams(sentence, n, pad='$'))

In [7]:
n = 2
nc = Counter()
for sentence in sentences:
    nc.update(ngrams(sentence, n, pad='<eos>'))
    
print nc  # TODO: the pad marker should always be treated as a single symbol


Counter({('.', '<eos>'): 3, ('Kevin', 'ist'): 1, ('ist', 'schlau'): 1, ('Dies', 'ist'): 1, ('Angie', 'ist'): 1, ('ist', 'ein'): 1, ('Test', '.'): 1, ('ein', 'Test'): 1, ('doof', '.'): 1, ('schlau', '.'): 1, ('ist', 'doof'): 1})

In [8]:
def build_ngram_model(sentences, n, pad='<eos>'):
    nc = Counter()
    for sentence in sentences:
        nc.update(ngrams(sentence, n, pad))
    return nc

In [9]:
build_ngram_model(sentences, 3)


Out[9]:
Counter({('.', '<eos>', '<eos>'): 3, ('ist', 'schlau', '.'): 1, ('Test', '.', '<eos>'): 1, ('schlau', '.', '<eos>'): 1, ('ist', 'ein', 'Test'): 1, ('doof', '.', '<eos>'): 1, ('Dies', 'ist', 'ein'): 1, ('Kevin', 'ist', 'schlau'): 1, ('Angie', 'ist', 'doof'): 1, ('ein', 'Test', '.'): 1, ('ist', 'doof', '.'): 1})
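Since the model is just a Counter, individual n-gram counts can be looked up directly, and unseen n-grams default to 0:

model = build_ngram_model(sentences, 3)
model[('ist', 'doof', '.')]     # 1
model[('Kevin', 'ist', 'doof')] # 0 -- Counter returns 0 for missing keys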

In [10]:
import tarfile

tar = tarfile.open('/home/arne/corpora/tiger_release_dec05.txt.tar.gz')


In [11]:
%%timeit
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))
tiger1 = build_ngram_model(tiger_sentences, 1)


1 loops, best of 3: 1.7 s per loop

In [12]:
%%timeit
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))
tiger5 = build_ngram_model(tiger_sentences, 5)


1 loops, best of 3: 1.83 s per loop
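The n=1 and n=5 timings are very close, so presumably most of the time is spent just reading and splitting the corpus rather than counting. A rough baseline for that I/O cost could be measured with a cell like the following (not run here):

%%timeit
for line in tar.extractfile('tiger_release_dec05.txt'):
    line.split()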

pandas.Series


In [13]:
import pandas

def build_ngram_model_pandas(sentences, n, pad='<eos>'):
    nc = pandas.Series( list(chain.from_iterable(ngrams(sent, n, pad) for sent in sentences)) )
    return nc

In [14]:
%%timeit
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))
tiger1 = build_ngram_model_pandas(tiger_sentences, 1)


1 loops, best of 3: 916 ms per loop

In [15]:
%%timeit
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))
tiger5 = build_ngram_model_pandas(tiger_sentences, 5)


1 loops, best of 3: 965 ms per loop

In [16]:
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))
tiger5 = build_ngram_model_pandas(tiger_sentences, 5)
tiger5


Out[16]:
0                (``, Ross, Perot, wäre, vielleicht)
1               (Ross, Perot, wäre, vielleicht, ein)
2         (Perot, wäre, vielleicht, ein, prächtiger)
3      (wäre, vielleicht, ein, prächtiger, Diktator)
4        (vielleicht, ein, prächtiger, Diktator, '')
5             (ein, prächtiger, Diktator, '', <eos>)
6           (prächtiger, Diktator, '', <eos>, <eos>)
7                (Diktator, '', <eos>, <eos>, <eos>)
8                   ('', <eos>, <eos>, <eos>, <eos>)
9       (Konzernchefs, lehnen, den, Milliardär, als)
10    (lehnen, den, Milliardär, als, US-Präsidenten)
11        (den, Milliardär, als, US-Präsidenten, ab)
12          (Milliardär, als, US-Präsidenten, ab, /)
13               (als, US-Präsidenten, ab, /, <eos>)
14             (US-Präsidenten, ab, /, <eos>, <eos>)
...
888563                    (Erhalt, der, Nato, ., <eos>)
888564                     (der, Nato, ., <eos>, <eos>)
888565                   (Nato, ., <eos>, <eos>, <eos>)
888566                  (., <eos>, <eos>, <eos>, <eos>)
888567        (Der, allein, ist, kein, Zukunftskonzept)
888568        (allein, ist, kein, Zukunftskonzept, für)
888569           (ist, kein, Zukunftskonzept, für, die)
888570    (kein, Zukunftskonzept, für, die, Sicherheit)
888571      (Zukunftskonzept, für, die, Sicherheit, in)
888572               (für, die, Sicherheit, in, Europa)
888573                 (die, Sicherheit, in, Europa, .)
888574               (Sicherheit, in, Europa, ., <eos>)
888575                    (in, Europa, ., <eos>, <eos>)
888576                 (Europa, ., <eos>, <eos>, <eos>)
888577                  (., <eos>, <eos>, <eos>, <eos>)
Length: 888578, dtype: object
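Note that the pandas variant only collects the n-grams into a Series; to get counts comparable to the Counter-based models one would still have to aggregate, e.g. with value_counts() (output not shown):

tiger5_counts = tiger5.value_counts()
tiger5_counts.head(10)  # roughly the pandas equivalent of Counter.most_common(10)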

Counter with a generator expression


In [17]:
def build_ngram_model_comprehension(sentences, n, pad='<eos>'):
    return Counter( chain.from_iterable(ngrams(sent, n, pad) for sent in sentences) )

In [18]:
%%timeit
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))
tiger1 = build_ngram_model_comprehension(tiger_sentences, 1)


1 loops, best of 3: 1.55 s per loop

In [19]:
%%timeit
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))
tiger5 = build_ngram_model_comprehension(tiger_sentences, 5)


1 loops, best of 3: 1.58 s per loop

In [20]:
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))
tiger5 = build_ngram_model_comprehension(tiger_sentences, 5)
tiger5.most_common(10)


Out[20]:
[(('.', '<eos>', '<eos>', '<eos>', '<eos>'), 40091),
 ((':', '<eos>', '<eos>', '<eos>', '<eos>'), 1439),
 (("''", '.', '<eos>', '<eos>', '<eos>'), 1348),
 ((')', '.', '<eos>', '<eos>', '<eos>'), 1335),
 (("''", '<eos>', '<eos>', '<eos>', '<eos>'), 1157),
 (('werden', '.', '<eos>', '<eos>', '<eos>'), 1032),
 (('.', "''", '<eos>', '<eos>', '<eos>'), 913),
 (('?', '<eos>', '<eos>', '<eos>', '<eos>'), 628),
 (('worden', '.', '<eos>', '<eos>', '<eos>'), 497),
 ((';', '<eos>', '<eos>', '<eos>', '<eos>'), 477)]

cytoolz.frequencies


In [21]:
from cytoolz import frequencies

def build_ngram_model_cytoolz(sentences, n, pad='<eos>'):
    return frequencies( chain.from_iterable(ngrams(sent, n, pad) for sent in sentences) )

In [22]:
%%timeit
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))
tiger1 = build_ngram_model_cytoolz(tiger_sentences, 1)


1 loops, best of 3: 1.2 s per loop

In [23]:
%%timeit
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))
tiger5 = build_ngram_model_cytoolz(tiger_sentences, 5)


1 loops, best of 3: 1.34 s per loop

In [24]:
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))
tiger5 = build_ngram_model_cytoolz(tiger_sentences, 5)
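cytoolz.frequencies returns a plain dict rather than a Counter, so there is no most_common(); the top n-grams would have to be extracted by sorting, e.g.:

sorted(tiger5.items(), key=lambda kv: kv[1], reverse=True)[:10]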

In [25]:
from cytoolz import frequencies, concat

def build_ngram_model_cytoolz_concat(sentences, n, pad='<eos>'):
    return frequencies( concat(ngrams(sent, n, pad) for sent in sentences) )

In [26]:
%%timeit
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))
tiger1 = build_ngram_model_cytoolz_concat(tiger_sentences, 1)


1 loops, best of 3: 1.19 s per loop

In [27]:
from language_models.ngrams import ngrams

tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))

In [28]:
tiger_sentences.next()


Out[28]:
['``',
 'Ross',
 'Perot',
 'w\xc3\xa4re',
 'vielleicht',
 'ein',
 'pr\xc3\xa4chtiger',
 'Diktator',
 "''"]

In [29]:
n = 5
print range(1, n+1)


[1, 2, 3, 4, 5]

In [53]:
def upto_ngrams(words, n, pad=None):
    """build [1, ..., n]-gram tuples of the given words"""
    return concat(ngrams(words, i, pad) for i in xrange(1, n+1))

In [54]:
list(upto_ngrams(sentences[0], 3))


Out[54]:
[('Angie',),
 ('ist',),
 ('doof',),
 ('.',),
 ('Angie', 'ist'),
 ('ist', 'doof'),
 ('doof', '.'),
 ('Angie', 'ist', 'doof'),
 ('ist', 'doof', '.')]

In [55]:
def build_upto_ngram_model(sentences, n, pad=None):
    return frequencies( concat(upto_ngrams(sent, n, pad) for sent in sentences) )

In [66]:
#%%timeit
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))
tiger1 = build_upto_ngram_model(tiger_sentences, 1)

In [56]:
# %%timeit
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))
tiger5 = build_upto_ngram_model(tiger_sentences, 5)

In [58]:
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))

In [64]:
import cytoolz
cytoolz.countby(len, tiger5)


Out[64]:
{1: 89416, 2: 440474, 3: 675317, 4: 709910, 5: 680124}
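countby(len, tiger5) iterates over the keys of the frequency dict, so these are the numbers of distinct n-gram types per order; summing the stored frequencies instead would give token counts, e.g. for unigrams:

sum(count for ngram, count in tiger5.items() if len(ngram) == 1)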

In [67]:
cytoolz.countby(len, tiger1)


Out[67]:
{1: 89416}

In [ ]: