In [22]:
# Build one sparse bag-of-words (word -> occurrence count) per document.
# Counter is imported in this cell because this is its first use; relying on
# an import that appears in a later cell breaks "Restart Kernel -> Run All"
# with a NameError.
from collections import Counter

from pug.nlp.classifier import get_words

docs = ['Explicit is better than implicit.',
        'Simple is better than complex.',
        'Flat is better than nested.',
        ]
# get_words is a project helper; presumably it splits a string into word
# tokens -- TODO confirm against pug.nlp.classifier.
O_sparse = [Counter(get_words(d)) for d in docs]
# A parenthesized single-argument print behaves the same on Python 2 and 3.
print(O_sparse)
In [23]:
from collections import Counter

# Corpus-wide word counts: fold every per-document Counter into a single one.
# Adding two Counters sums the counts of the keys they share.
total = sum(O_sparse, Counter())
print(total)
In [21]:
from tabulate import tabulate

# Expand the sparse per-document counts into a dense matrix: one row per
# entry of O_sparse, one column per distinct word in `total`.
words = list(total)
# Each entry of O_sparse is a Counter, and a Counter returns 0 for a key it
# does not hold, so every cell can be read directly. This avoids scanning
# `words` with list.index() for every word and drops the Python-2-only
# iteritems() call.
O = [[counts[word] for word in words] for counts in O_sparse]
print(tabulate(O, words))
In [28]:
from nltk import download, ConditionalFreqDist

# Fetch the 'inaugural' data package before importing its corpus reader.
download('inaugural')
from nltk.corpus import inaugural

# Condition = target prefix, sample = first four characters of the file id
# (presumably the year of the address -- confirm against the corpus file ids).
# One (condition, sample) pair is produced for every corpus word whose
# lower-cased form starts with the target prefix.
#
# The generator expression used to be followed by a stray "[1]" subscript,
# which bound `cfd` to the entry for a condition `1` that is never produced
# instead of to the whole distribution, leaving nothing to plot.
cfd = ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
cfd.plot()
In [29]:
from nltk.corpus import inaugural

# Rebuild the conditional frequency distribution without plotting it.
# Condition = target prefix, sample = first four characters of the file id
# (presumably the year of the address -- confirm against the corpus file ids).
#
# A stray "[1]" subscript after the closing parenthesis has been removed: it
# indexed the distribution by a condition `1` that no pair ever uses, so
# `cfd` did not hold the full distribution.
cfd = ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
In [12]:
import pug.nlp.inaugural as nlp
from tabulate import tabulate
import numpy as np

# Small example corpus: one string per document.
data = [
    'Romeo and Juliet.',
    'Juliet: O happy dagger!',
    'Romeo died by dagger.',
    '"Live free or die”, that’s the New-Hampshire’s motto.',
    'Did you know, New-Hampshire is in New-England.',
    ]
# get_occurrence_matrix is a project helper returning the matrix together
# with labels for its rows and columns; which axis holds documents and which
# holds words is not visible here -- TODO confirm.
O, row_labels, col_labels = nlp.get_occurrence_matrix(data)
print(tabulate(O, col_labels))

# Singular value decomposition of the occurrence matrix. numpy returns the
# third factor already transposed (Vh), so `V` here is V^H, not V.
U, s, V = np.linalg.svd(O)
# `s` is a 1-D ndarray, which has no `as_lists` attribute; convert it with
# tolist() and wrap it in a list so tabulate sees a single row of values.
# NOTE(review): row_labels is kept as the header row as before -- confirm it
# really has one label per singular value.
print(tabulate([s.tolist()], row_labels))
In [ ]: