In progress...
*By [Diego Marinho de Oliveira](mailto:dmztheone@gmail.com)
Last Update: 08-18-2015*
This notebook demonstrates simple, quick, and useful examples of what we can do with NLTK. It can also be used as a reference guide. It is not intended to explore the NLTK package exhaustively. Many examples were extracted directly from the NLTK Book written by Steven Bird, Ewan Klein, and Edward Loper, distributed under the terms of the Creative Commons Attribution Noncommercial No-Derivative-Works 3.0 US License.
In [1]:
import nltk
from __future__ import division
import matplotlib as mpl
from matplotlib import pyplot as plt
from nltk.book import *
from nltk.corpus import brown
from nltk.corpus import udhr
from nltk.corpus import wordnet as wn
from numpy import arange
import networkx as nx
%matplotlib inline
In [6]:
# Print the words of text1 that occur in the same contexts as "monstrous".
# text1 is presumably one of the sample texts provided by `from nltk.book import *`.
text1.similar("monstrous")
In [5]:
# Print the contexts in text2 that are shared by both "monstrous" and "very".
text2.common_contexts(["monstrous", "very"])
In [10]:
# Dispersion plot: shows where each of the listed words occurs across text4.
# (A stray ':' before "democracy" made the original line a SyntaxError.)
text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])
In [11]:
# Lexical diversity of text4: number of distinct tokens divided by total tokens.
# Under Python 2 the true division relies on `from __future__ import division`.
# (The original line was missing its closing parenthesis.)
format(len(set(text4)) / len(text4))
Out[11]:
In [20]:
# Tally every token of text1, then show the five most frequent with their counts.
token_freq = nltk.FreqDist(text1)
token_freq.most_common(5)
Out[20]:
In [22]:
# Cumulative frequency plot limited to the first 50 samples of text1's
# token frequency distribution.
token_freq = nltk.FreqDist(text1)
token_freq.plot(50, cumulative=True)
In [27]:
# Collect the tokens of text1 longer than 15 characters and preview the first five.
long_words = [w for w in text1 if len(w) > 15]
long_words[:5]
Out[27]:
In [34]:
# Print the collocations (word pairs that occur together unusually often) found in text4.
text4.collocations()
In [7]:
# Pair every Brown-corpus word with the genre (category) it appears in,
# then count words per genre.
genre_word_pairs = (
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre)
)
cfd = nltk.ConditionalFreqDist(genre_word_pairs)

# Tabulate how often each modal verb occurs in a selection of genres.
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=genres, samples=modals)
In [2]:
# For each target stem ('america', 'citizen'), count per inaugural-address file
# the words that start with that stem (case-insensitively).
# The original expression ended in a pasted footnote marker "[1]", which indexed
# the freshly built ConditionalFreqDist and left `cfd` bound to an empty
# distribution, so the plot showed nothing.
cfd = nltk.ConditionalFreqDist(
    (target, fileid)
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
cfd.plot()
In [3]:
languages = [
    'Chickasaw', 'English', 'German_Deutsch',
    'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik',
]

# One (language, word length) pair per word of that language's Latin-1
# encoded Universal Declaration of Human Rights text.
length_pairs = (
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1')
)
cfd = nltk.ConditionalFreqDist(length_pairs)

# Cumulative distribution of word lengths, one curve per language.
cfd.plot(cumulative=True)
In [4]:
names = nltk.corpus.names
# The Names corpus is read from two files: 'female.txt' and 'male.txt'.
male_names = names.words('male.txt')
female_names = names.words('female.txt')

# Names that appear in both lists. Membership is tested against a set so the
# scan is linear rather than re-reading the female list for every male name,
# and the result is kept in a variable instead of being discarded.
female_name_set = set(female_names)
ambiguous_names = [w for w in male_names if w in female_name_set]

# Count the final letter of every name, conditioned on the file it came from,
# and plot one curve per file.
cfd = nltk.ConditionalFreqDist(
    (fileid, name[-1])
    for fileid in names.fileids()
    for name in names.words(fileid))
cfd.plot()
In [5]:
# The doctest continuation prompt ("...") that was pasted into the middle of
# this list literal is not valid Python and has been removed.
sent = ['In', 'the', 'beginning', 'God', 'created', 'the', 'heaven',
        'and', 'the', 'earth', '.']
# Materialise the bigrams in a list so the adjacent word pairs are displayed.
list(nltk.bigrams(sent))
Out[5]:
In [6]:
def generate_model(cfdist, word, num=15):
    """Print a word sequence built by always following the most likely successor.

    Starting from ``word``, each step emits the current word and then moves to
    ``cfdist[word].max()``.

    Args:
        cfdist: Mapping from a word to a distribution of the words that follow
            it; each distribution must provide ``max()`` and be falsy when
            empty (as an ``nltk.ConditionalFreqDist`` does).
        word: The seed word that starts the sequence.
        num: Maximum number of words to emit.

    Returns:
        None. The space-separated sequence is printed on a single line.
    """
    generated = []
    for _ in range(num):
        generated.append(word)
        successors = cfdist[word]
        if not successors:
            # Nothing ever followed this word; calling max() on an empty
            # distribution would raise, so end the sequence here.
            break
        word = successors.max()
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(' '.join(generated).strip())
# Build a bigram model of the King James Genesis text: for every word, the
# frequency distribution of the words that immediately follow it.
text = nltk.corpus.genesis.words('english-kjv.txt')
bigrams = nltk.bigrams(text)
cfd = nltk.ConditionalFreqDist(bigrams)

# Show which words follow "living" and how often. The original cell had a bare
# `cfd['living']` (never displayed, since it was not the last expression)
# followed by a pasted copy of its expected output that ran as code.
print(cfd['living'].most_common())

generate_model(cfd, 'living')
In [10]:
colors = 'rgbcmyk' # red, green, blue, cyan, magenta, yellow, black

def bar_chart(categories, words, counts,
              title='Frequency of Six Modal Verbs by Genre'):
    """Plot a grouped bar chart showing counts for each word by category.

    Args:
        categories: Sequence of category labels; each gets one bar per word
            and one legend entry.
        words: Sequence of words; each gets one group of bars and one x tick.
        counts: Mapping from category to a sequence of counts aligned with
            ``words``.
        title: Chart title. Defaults to the title of the modal-verb example
            this function was written for.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    ind = arange(len(words))
    # Float literal keeps this a true division even if
    # `from __future__ import division` is not active under Python 2.
    width = 1.0 / (len(categories) + 1)
    bar_groups = []
    for c, category in enumerate(categories):
        # Shift each category's bars by one bar width; cycle through the
        # colour codes when there are more categories than colours.
        bars = plt.bar(ind + c * width, counts[category], width,
                       color=colors[c % len(colors)])
        bar_groups.append(bars)
    plt.xticks(ind + width, words)
    plt.legend([b[0] for b in bar_groups], categories, loc='upper left')
    plt.ylabel('Frequency')
    plt.title(title)
    plt.show()
genres = ['news', 'religion', 'hobbies', 'government', 'adventure']
modals = ['can', 'could', 'may', 'might', 'must', 'will']

# Keep only the modal verbs from each selected Brown genre, keyed by genre.
modal_pairs = (
    (genre, word)
    for genre in genres
    for word in nltk.corpus.brown.words(categories=genre)
    if word in modals
)
cfdist = nltk.ConditionalFreqDist(modal_pairs)

# For every genre, list the modal counts in the same order as `modals`.
counts = {}
for genre in genres:
    genre_dist = cfdist[genre]
    counts[genre] = [genre_dist[word] for word in modals]

bar_chart(genres, modals, counts)
In [12]:
def _synset_name(synset):
    """Return the synset's name string whether ``name`` is a method or an attribute."""
    # NLTK 3 exposes Synset.name as a method, while older releases used a plain
    # attribute. Without the call, bound methods end up as graph nodes.
    name = synset.name
    return name() if callable(name) else name


def traverse(graph, start, node):
    """Recursively add ``node`` and everything below it in the hyponym tree to ``graph``.

    Args:
        graph: Object with a ``depth`` dict and an ``add_edge(u, v)`` method;
            both are updated in place.
        start: The synset the traversal began at; each node's depth is its
            ``shortest_path_distance`` to this synset.
        node: The synset currently being visited.

    Returns:
        None.
    """
    node_name = _synset_name(node)
    graph.depth[node_name] = node.shortest_path_distance(start)
    for child in node.hyponyms():
        graph.add_edge(node_name, _synset_name(child))
        traverse(graph, start, child)
def hyponym_graph(start):
    """Build an undirected graph of ``start`` and the synsets reachable through its hyponyms.

    Args:
        start: The synset at which the traversal begins.

    Returns:
        A ``networkx.Graph`` carrying an extra ``depth`` dict that
        ``traverse`` fills in while it walks the hierarchy.
    """
    hierarchy = nx.Graph()
    # Ad-hoc attribute on the graph object; traverse() writes one entry per node.
    hierarchy.depth = dict()
    traverse(hierarchy, start, start)
    return hierarchy
def graph_draw(graph):
    """Draw ``graph`` with a Graphviz layout and show the figure.

    Node size is proportional to the node's degree and node colour is taken
    from ``graph.depth``; labels are not drawn.

    Args:
        graph: A networkx graph that also carries a ``depth`` dict with one
            entry per node.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    draw_options = dict(
        node_size=[16 * graph.degree(n) for n in graph],
        node_color=[graph.depth[n] for n in graph],
        with_labels=False)
    if hasattr(nx, 'draw_graphviz'):
        # NetworkX 1.x convenience wrapper (Graphviz layout + draw).
        nx.draw_graphviz(graph, **draw_options)
    else:
        # draw_graphviz was removed in NetworkX 2.0; compute the Graphviz
        # layout explicitly and hand it to the generic drawing function.
        positions = nx.nx_agraph.graphviz_layout(graph)
        nx.draw(graph, pos=positions, **draw_options)
    plt.show()
# Look up the WordNet synset 'dog.n.01', build the graph of its hyponyms,
# and draw it.
dog = wn.synset('dog.n.01')
graph = hyponym_graph(dog)
graph_draw(graph)