NLTK notebook

I work through the code from the Python NLTK library, "a leading platform for building Python programs to work with human language data." The library comes with data and an online book. The code below consists of problems and examples from the book.

Written by David Cai, NYU, July 2015. A product of the NYU Stern Python Factory. For more along similar lines, see the Data Bootcamp repo.



In [1]:
1 + 5 * 2 - 3


Out[1]:
8

In [ ]:
import nltk
nltk.download()

In [ ]:
from nltk.book import *

In [ ]:
text1

In [ ]:
text1.concordance("monstrous")

In [ ]:
text1.similar("monstrous")
text2.similar("monstrous")

In [ ]:
text2.common_contexts(["monstrous", "very"])

In [ ]:
%matplotlib inline
text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])

In [ ]:
# Note: The generate() method was removed in NLTK 3.0; it was reinstated in NLTK 3.1, but with different behavior (it now trains an n-gram language model on the text), so this cell may error or produce different output depending on your NLTK version.
text3.generate()

In [ ]:
len(text3)

In [ ]:
sorted(set(text3))

In [ ]:
len(set(text3))

In [ ]:
len(set(text3)) / len(text3)

In [ ]:
text3.count("smote")

In [ ]:
100 * text4.count('a') / len(text4)

In [ ]:
def lexical_diversity(text):
    return len(set(text)) / len(text)
def percentage(count, total):
    return 100 * count / total

In [ ]:
lexical_diversity(text3)

In [ ]:
lexical_diversity(text5)

In [ ]:
percentage(4, 5)

In [ ]:
percentage(text4.count('a'), len(text4))

In [ ]:
sent1 = ['Call', 'me', 'Ishmael', '.']

In [ ]:
sent1

In [ ]:
len(sent1)

In [ ]:
lexical_diversity(sent1)

In [ ]:
sent2

In [ ]:
sent3

In [ ]:
['Monty', 'Python'] + ['and', 'the', 'Holy', 'Grail']

In [ ]:
sent4 + sent1

In [ ]:
sent1.append("Some")
sent1

In [ ]:
text4[173]

In [ ]:
text4.index('awaken')

In [ ]:
text5[16715:16735]

In [ ]:
text6[1600:1625]

In [ ]:
sent = ['word1', 'word2', 'word3', 'word4', 'word5',
...         'word6', 'word7', 'word8', 'word9', 'word10']
sent[0]

In [ ]:
sent[9]

In [ ]:
sent[10]

In [ ]:
sent[5:8]

In [ ]:
sent[5]

In [ ]:
sent[6]

In [ ]:
sent[7]

In [ ]:
sent[:3]

In [ ]:
text2[141525:]

In [ ]:
sent[0] = 'First'
sent[9] = 'Last'
len(sent)

In [ ]:
sent[1:9] = ['Second', 'Third']
sent

In [ ]:
sent[9]

In [ ]:
sent1 = ['Call', 'me', 'Ishmael', '.']

In [ ]:
my_sent = ['Bravely', 'bold', 'Sir', 'Robin', ',', 'rode',
... 'forth', 'from', 'Camelot', '.']
noun_phrase = my_sent[1:4]
noun_phrase

In [ ]:
wOrDs = sorted(noun_phrase)
wOrDs

In [ ]:
not = 'Camelot'

In [ ]:
>>> vocab = set(text1)
>>> vocab_size = len(vocab)
>>> vocab_size

In [ ]:
>>> name = 'Monty'
>>> name[0]

In [ ]:
>>> name[:4]

In [ ]:
>>> name * 2

In [ ]:
>>> name + '!'

In [ ]:
>>> ' '.join(['Monty', 'Python'])

In [ ]:
>>> 'Monty Python'.split()

In [ ]:
>>> saying = ['After', 'all', 'is', 'said', 'and', 'done',
...           'more', 'is', 'said', 'than', 'done']
>>> tokens = set(saying)
>>> tokens = sorted(tokens)
>>> tokens[-2:]

In [ ]:
>>> fdist1 = FreqDist(text1)
>>> print(fdist1)

In [ ]:
>>> fdist1.most_common(50)

In [ ]:
>>> fdist1['whale']

In [ ]:
>>> V = set(text1)
>>> long_words = [w for w in V if len(w) > 15]
>>> sorted(long_words)

In [ ]:
>>> fdist5 = FreqDist(text5)
>>> sorted(w for w in set(text5) if len(w) > 7 and fdist5[w] > 7)

In [ ]:
list(bigrams(['more', 'is', 'said', 'than', 'done']))

In [ ]:
>>> text4.collocations()

In [ ]:
>>> text8.collocations()

In [ ]:
>>> [len(w) for w in text1]

In [ ]:
>>> fdist = FreqDist(len(w) for w in text1)
>>> print(fdist)

In [ ]:
>>> fdist

In [ ]:
>>> fdist.most_common()

In [ ]:
>>> fdist.max()

In [ ]:
>>> fdist[3]

In [ ]:
>>> fdist.freq(3)

In [ ]:
>>> sent7

In [ ]:
>>> [w for w in sent7 if len(w) < 4]

In [ ]:
>>> [w for w in sent7 if len(w) <= 4]

In [ ]:
>>> [w for w in sent7 if len(w) == 4]

In [ ]:
>>> [w for w in sent7 if len(w) != 4]

In [ ]:
>>> sorted(w for w in set(text1) if w.endswith('ableness'))

In [ ]:
>>> sorted(term for term in set(text4) if 'gnt' in term)

In [ ]:
>>> sorted(item for item in set(text6) if item.istitle())

In [ ]:
>>> sorted(item for item in set(sent7) if item.isdigit())

In [ ]:
>>> sorted(w for w in set(text7) if '-' in w and 'index' in w)
>>> sorted(wd for wd in set(text3) if wd.istitle() and len(wd) > 10)
>>> sorted(w for w in set(sent7) if not w.islower())
>>> sorted(t for t in set(text2) if 'cie' in t or 'cei' in t)

In [ ]:
>>> [len(w) for w in text1]

In [ ]:
>>> [w.upper() for w in text1]

In [ ]:
len(text1)

In [ ]:
len(set(text1))

In [ ]:
len(set(word.lower() for word in text1))

In [ ]:
len(set(word.lower() for word in text1 if word.isalpha()))

In [ ]:
>>> word = 'cat'
>>> if len(word) < 5:
...     print('word length is less than 5')

In [ ]:
>>> if len(word) >= 5:
...   print('word length is greater than or equal to 5')

In [ ]:
>>> for word in ['Call', 'me', 'Ishmael', '.']:
...     print(word)

In [ ]:
>>> sent1 = ['Call', 'me', 'Ishmael', '.']
>>> for xyzzy in sent1:
...     if xyzzy.endswith('l'):
...         print(xyzzy)

In [ ]:
>>> for token in sent1:
...     if token.islower():
...         print(token, 'is a lowercase word')
...     elif token.istitle():
...         print(token, 'is a titlecase word')
...     else:
...         print(token, 'is punctuation')

In [ ]:
>>> tricky = sorted(w for w in set(text2) if 'cie' in w or 'cei' in w)
>>> for word in tricky:
...     print(word, end=' ')