In [3]:
from collections import Counter
from itertools import chain, repeat
from language_models.ngrams import ngrams
In [4]:
# Toy corpus: three tokenized German sentences used to sanity-check the n-gram helpers.
sentences = [
    ["Angie", "ist", "doof", "."],
    ["Kevin", "ist", "schlau", "."],
    ["Dies", "ist", "ein", "Test", "."]
]
In [5]:
# Demo: uni-, bi-, and trigrams of the first toy sentence, padded with '<eos>'
# (Python 2 print statements).
print list(ngrams(sentences[0], 1, pad='<eos>'))
print list(ngrams(sentences[0], 2, pad='<eos>'))
print list(ngrams(sentences[0], 3, pad='<eos>'))
In [6]:
# Count bigrams over the toy corpus, padding each sentence with '$'.
n = 2
nc = Counter()
for sentence in sentences:
    nc.update(ngrams(sentence, n, pad='$'))
In [7]:
# Same bigram count with '<eos>' padding; the Counter is printed for inspection.
n = 2
nc = Counter()
for sentence in sentences:
    nc.update(ngrams(sentence, n, pad='<eos>'))
print nc # TODO: pad shall always be treated as one symbol
In [8]:
def build_ngram_model(sentences, n, pad='<eos>'):
    """Count n-gram occurrences across an iterable of tokenized sentences.

    Returns a Counter mapping each n-gram tuple to its frequency.
    """
    model = Counter()
    for sent in sentences:
        for gram in ngrams(sent, n, pad):
            model[gram] += 1
    return model
In [9]:
build_ngram_model(sentences, 3)
Out[9]:
In [10]:
import tarfile
# NOTE(review): hardcoded absolute local path — breaks on any other machine; the
# archive handle is kept open for the rest of the notebook and never closed.
tar = tarfile.open('/home/arne/corpora/tiger_release_dec05.txt.tar.gz')
In [11]:
%%timeit
# Benchmark the Counter-based unigram model on the TIGER corpus.
# The generator is recreated inside the cell because %%timeit re-runs the body.
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))
tiger1 = build_ngram_model(tiger_sentences, 1)
In [12]:
%%timeit
# Benchmark the Counter-based 5-gram model on the TIGER corpus.
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))
tiger5 = build_ngram_model(tiger_sentences, 5)
In [13]:
import pandas
def build_ngram_model_pandas(sentences, n, pad='<eos>'):
    """Collect all n-grams of the corpus into a pandas Series.

    Note: unlike the Counter-based variants this does NOT aggregate —
    the Series holds one entry per n-gram occurrence, in corpus order.
    """
    all_grams = []
    for sent in sentences:
        all_grams.extend(ngrams(sent, n, pad))
    return pandas.Series(all_grams)
In [14]:
%%timeit
# Benchmark the pandas variant on unigrams.
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))
tiger1 = build_ngram_model_pandas(tiger_sentences, 1)
In [15]:
%%timeit
# Benchmark the pandas variant on 5-grams.
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))
tiger5 = build_ngram_model_pandas(tiger_sentences, 5)
In [16]:
# Rebuild (untimed) so the result stays in the namespace, then display the Series.
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))
tiger5 = build_ngram_model_pandas(tiger_sentences, 5)
tiger5
Out[16]:
In [17]:
def build_ngram_model_comprehension(sentences, n, pad='<eos>'):
    """Count n-grams of all sentences via a single Counter construction."""
    per_sentence = (ngrams(sent, n, pad) for sent in sentences)
    return Counter(chain.from_iterable(per_sentence))
In [18]:
%%timeit
# Benchmark the chained-Counter variant on unigrams.
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))
tiger1 = build_ngram_model_comprehension(tiger_sentences, 1)
In [19]:
%%timeit
# Benchmark the chained-Counter variant on 5-grams.
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))
tiger5 = build_ngram_model_comprehension(tiger_sentences, 5)
In [20]:
# Rebuild (untimed) and show the ten most frequent 5-grams.
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))
tiger5 = build_ngram_model_comprehension(tiger_sentences, 5)
tiger5.most_common(10)
Out[20]:
In [21]:
from cytoolz import frequencies
def build_ngram_model_cytoolz(sentences, n, pad='<eos>'):
    """Count n-grams of all sentences using cytoolz.frequencies."""
    per_sentence = (ngrams(sent, n, pad) for sent in sentences)
    return frequencies(chain.from_iterable(per_sentence))
In [22]:
%%timeit
# Benchmark the cytoolz.frequencies variant on unigrams.
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))
tiger1 = build_ngram_model_cytoolz(tiger_sentences, 1)
In [23]:
%%timeit
# Benchmark the cytoolz.frequencies variant on 5-grams.
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))
tiger5 = build_ngram_model_cytoolz(tiger_sentences, 5)
In [24]:
# Build the 5-gram cytoolz model outside %%timeit so it persists in the namespace.
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))
tiger5 = build_ngram_model_cytoolz(tiger_sentences, 5)
In [25]:
from cytoolz import frequencies, concat
def build_ngram_model_cytoolz_concat(sentences, n, pad='<eos>'):
    """Count n-grams of all sentences; flattens with cytoolz.concat."""
    flattened = concat(ngrams(sent, n, pad) for sent in sentences)
    return frequencies(flattened)
In [26]:
%%timeit
# Benchmark the concat-based cytoolz variant on unigrams.
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))
tiger1 = build_ngram_model_cytoolz_concat(tiger_sentences, 1)
In [27]:
# Re-import ngrams (already imported at the top) and reset the sentence generator.
from language_models.ngrams import ngrams
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))
In [28]:
tiger_sentences.next()
Out[28]:
In [29]:
# Show the gram orders 1..n that upto_ngrams (below) will iterate over.
n = 5
print range(1, n+1)
In [53]:
def upto_ngrams(words, n, pad=None):
    """Chain together the k-gram tuples of ``words`` for every k from 1 to n."""
    orders = xrange(1, n + 1)
    return concat(ngrams(words, k, pad) for k in orders)
In [54]:
list(upto_ngrams(sentences[0], 3))
Out[54]:
In [55]:
def build_upto_ngram_model(sentences, n, pad=None):
    """Frequency table over every k-gram (k = 1..n) of every sentence."""
    all_grams = concat(upto_ngrams(sent, n, pad) for sent in sentences)
    return frequencies(all_grams)
In [66]:
#%%timeit
# Up-to-1 model (unigrams only); timing is commented out so tiger1 persists
# for the countby inspection below.
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))
tiger1 = build_upto_ngram_model(tiger_sentences, 1)
In [56]:
# %%timeit
# Up-to-5 model (all 1- through 5-grams); timing commented out so tiger5 persists.
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))
tiger5 = build_upto_ngram_model(tiger_sentences, 5)
In [58]:
tiger_sentences = (line.split() for line in tar.extractfile('tiger_release_dec05.txt'))
In [64]:
import cytoolz
# Distribution of gram orders (tuple lengths) among the keys of the up-to-5 model.
cytoolz.countby(len, tiger5)
Out[64]:
In [67]:
cytoolz.countby(len, tiger1)
Out[67]:
In [ ]: