In [22]:
# Counter is imported here, in the cell that first uses it, so the cell runs on
# a fresh kernel instead of relying on an import made by a later cell.
from collections import Counter

from pug.nlp.classifier import get_words

# Three tiny example documents.
docs = ['Explicit is better than implicit.',
        'Simple is better than complex.',
        'Flat is better than nested.',
       ]

# One sparse bag-of-words per document: a Counter mapping each token returned
# by get_words() to the number of times it occurs in that document.
O_sparse = [Counter(get_words(d)) for d in docs]
print(O_sparse)


[Counter({'better': 1, 'explicit': 1, 'than': 1, 'implicit': 1}), Counter({'simple': 1, 'better': 1, 'complex': 1, 'than': 1}), Counter({'better': 1, 'flat': 1, 'than': 1, 'nested': 1})]

In [23]:
from collections import Counter

# Corpus-wide word counts: fold the per-document Counters together with `+`,
# starting from an empty Counter.
total = sum(O_sparse, Counter())
print(total)


Counter({'than': 3, 'better': 3, 'flat': 1, 'simple': 1, 'explicit': 1, 'complex': 1, 'nested': 1, 'implicit': 1})

In [21]:
from tabulate import tabulate

# Freeze the vocabulary order once; it defines the columns of the dense matrix.
words = list(total)
# Word -> column index, so each token is placed with a dict lookup instead of
# rescanning `words` with list.index() for every token.
column_of = {word: j for j, word in enumerate(words)}

# Dense occurrence matrix: one row per document, one column per vocabulary word.
O = []
for counts in O_sparse:
    row = [0] * len(words)
    # .items() works on both Python 2 and 3 (iteritems is Python 2-only).
    for word, count in counts.items():
        row[column_of[word]] += count
    O.append(row)
print(tabulate(O, words))


  flat    simple    explicit    than    better    complex    nested    implicit
------  --------  ----------  ------  --------  ---------  --------  ----------
     0         0           1       1         1          0         0           1
     0         1           0       1         1          1         0           0
     1         0           0       1         1          0         1           0

In [28]:
from nltk import download, ConditionalFreqDist
# Fetch the inaugural-address corpus (a no-op when it is already installed).
download('inaugural')
from nltk.corpus import inaugural

# Conditional frequency distribution keyed by target prefix ('america' or
# 'citizen'); each sample is the first four characters of the file id
# (presumably the year of the address -- confirm against inaugural.fileids()).
# A word is counted when its lowercased form starts with the target prefix.
#
# The trailing "[1]" that used to follow this expression was a stray callout
# marker: it indexed the distribution with the key 1 and replaced `cfd` with
# an empty FreqDist, so the plot below had nothing to draw.
cfd = ConditionalFreqDist(
        (target, fileid[:4])
        for fileid in inaugural.fileids()
        for w in inaugural.words(fileid)
        for target in ['america', 'citizen']
        if w.lower().startswith(target))
cfd.plot()


[nltk_data] Downloading package 'inaugural' to
[nltk_data]     /home/Hobson/nltk_data...
[nltk_data]   Package inaugural is already up-to-date!

In [29]:
# Both names are imported here so this cell does not depend on an earlier cell.
from nltk import ConditionalFreqDist
from nltk.corpus import inaugural

# Conditional frequency distribution keyed by target prefix ('america' or
# 'citizen'); each sample is the first four characters of the file id.
# A word is counted when its lowercased form starts with the target prefix.
#
# The stray "[1]" subscript that used to follow the constructor call has been
# removed: it replaced the whole distribution with the (empty) FreqDist stored
# under the key 1.
cfd = ConditionalFreqDist(
        (target, fileid[:4])
        for fileid in inaugural.fileids()
        for w in inaugural.words(fileid)
        for target in ['america', 'citizen']
        if w.lower().startswith(target))

In [12]:
import pug.nlp.inaugural as nlp
from tabulate import tabulate
import numpy as np

# Five short example documents for the occurrence-matrix / SVD demonstration.
data = [
    'Romeo and Juliet.',
    'Juliet: O happy dagger!',
    'Romeo died by dagger.',
    '"Live free or die”, that’s the New-Hampshire’s motto.',
    'Did you know, New-Hampshire is in New-England.',
    ]


# NOTE(review): the recorded run of this cell failed on the next line with
# "AttributeError: 'module' object has no attribute 'get_occurrence_matrix'",
# i.e. pug.nlp.inaugural did not expose this function at that time. Confirm
# which module provides it (and the orientation of the matrix it returns)
# before relying on this cell.
O, row_labels, col_labels = nlp.get_occurrence_matrix(data)
print(tabulate(O, col_labels))

# Singular value decomposition of the occurrence matrix; `s` is a 1-D array
# holding the singular values.
U, s, V = np.linalg.svd(O)

# ndarray has no `as_lists` attribute; tolist() is the conversion to plain
# Python floats. tabulate expects an iterable of rows, so each singular value
# gets its own one-cell row. The singular values do not correspond to rows of
# O, so they are shown under a neutral header rather than under row_labels.
print(tabulate([[value] for value in s.tolist()], ['singular value']))


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-12-6c75ba3d1692> in <module>()
     12 
     13 
---> 14 O, row_labels, col_labels = nlp.get_occurrence_matrix(data)
     15 print tabulate(O, col_labels)
     16 

AttributeError: 'module' object has no attribute 'get_occurrence_matrix'

In [ ]: