In [22]:
from pug.nlp.classifier import get_words

docs = ['Explicit is better than implicit.',
        'Simple is better than complex.',
        'Flat is better than nested.',
       ]
O_sparse = [Counter(get_words(d)) for d in docs]
print O_sparse


[Counter({'better': 1, 'explicit': 1, 'than': 1, 'implicit': 1}), Counter({'simple': 1, 'better': 1, 'complex': 1, 'than': 1}), Counter({'better': 1, 'flat': 1, 'than': 1, 'nested': 1})]

In [23]:
from collections import Counter

total = Counter()
for c in O_sparse:
    total += c
print total


Counter({'than': 3, 'better': 3, 'flat': 1, 'simple': 1, 'explicit': 1, 'complex': 1, 'nested': 1, 'implicit': 1})

In [21]:
from tabulate import tabulate
words, O = list(total), []
for counts in O_sparse:
    O += [[0] * len(words)]
    for word, count in counts.iteritems():
        j = words.index(word)
        O[-1][j] += count
print tabulate(O, words)


  flat    simple    explicit    than    better    complex    nested    implicit
------  --------  ----------  ------  --------  ---------  --------  ----------
     0         0           1       1         1          0         0           1
     0         1           0       1         1          1         0           0
     1         0           0       1         1          0         1           0

In [28]:
from nltk import download, ConditionalFreqDist
download('inaugural')
from nltk.corpus import inaugural

cfd = ConditionalFreqDist(
        (target, fileid[:4])
        for fileid in inaugural.fileids()
        for w in inaugural.words(fileid)
        for target in ['america', 'citizen']
        if w.lower().startswith(target)) [1]
cfd.plot()


[nltk_data] Downloading package 'inaugural' to
[nltk_data]     /home/Hobson/nltk_data...
[nltk_data]   Package inaugural is already up-to-date!

In [29]:
from nltk.corpus import inaugural
cfd = ConditionalFreqDist(
        (target, fileid[:4])
        for fileid in inaugural.fileids()
        for w in inaugural.words(fileid)
        for target in ['america', 'citizen']
        if w.lower().startswith(target)) [1]

In [5]:
import pug.nlp.inaugural as nlp
from tabulate import tabulate
import numpy as np

data = [
    'Romeo and Juliet.',
    'Juliet: O happy dagger!',
    'Romeo died by dagger.',
    '"Live free or die”, that’s the New-Hampshire’s motto.',
    'Did you know, New-Hampshire is in New-England.',
    ]


O, row_labels, col_labels = nlp.get_occurrence_matrix(data)
print tabulate(O, col_labels)

U, s, V = np.linalg.svd(np.matrix(O).transpose())
print tabulate(U.tolists())
print tabulate(s, col_labels)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-5-11741139640a> in <module>()
     16 
     17 U, s, V = np.linalg.svd(O)
---> 18 print tabulate(U, row_labels)
     19 print tabulate(s, col_labels)

/usr/local/share/.virtualenvs/pug/lib/python2.7/site-packages/tabulate.pyc in tabulate(tabular_data, headers, tablefmt, floatfmt, numalign, stralign, missingval)
    757     # align columns
    758     aligns = [numalign if ct in [int,float] else stralign for ct in coltypes]
--> 759     minwidths = [width_fn(h)+2 for h in headers] if headers else [0]*len(cols)
    760     cols = [_align_column(c, a, minw, has_invisible)
    761             for c, a, minw in zip(cols, aligns, minwidths)]

TypeError: object of type 'int' has no len()
  and    dagger    know    that    did    die    free    romeo    live    hampshire    you    new    the    happy    england    motto    died    juliet
-----  --------  ------  ------  -----  -----  ------  -------  ------  -----------  -----  -----  -----  -------  ---------  -------  ------  --------
    1         0       0       0      0      0       0        1       0            0      0      0      0        0          0        0       0         1
    0         1       0       0      0      0       0        0       0            0      0      0      0        1          0        0       0         1
    0         1       0       0      0      0       0        1       0            0      0      0      0        0          0        0       1         0
    0         0       0       1      0      1       1        0       1            1      0      1      1        0          0        1       0         0
    0         0       1       0      1      0       0        0       0            1      1      2      0        0          1        0       0         0

In [ ]: