NLTK notebook

I work through the code from the Python NLTK library, "a leading platform for building Python programs to work with human language data." The library comes with data and an online book. The code below consists of problems and examples from the book.

Written by David Cai, NYU, July 2015. A product of the NYU Stern Python Factory. For more along similar lines, see the Data Bootcamp repo.



In [1]:
1 + 5 * 2 - 3


Out[1]:
8

In [ ]:
import nltk
nltk.download()

In [ ]:
from nltk.book import *

In [ ]:
text1

In [ ]:
text1.concordance("monstrous")

In [ ]:
text1.similar("monstrous")
text2.similar("monstrous")

In [ ]:
text2.common_contexts(["monstrous", "very"])

In [ ]:
%matplotlib inline
text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])

In [ ]:
# Note: The generate() method was removed in NLTK 3.0; it was reinstated in NLTK 3.1, but with different behavior (it now trains an n-gram language model on the text), so this cell may error or produce different output depending on your NLTK version.
text3.generate()

In [ ]:
len(text3)

In [ ]:
sorted(set(text3))

In [ ]:
len(set(text3))

In [ ]:
len(set(text3)) / len(text3)

In [ ]:
text3.count("smote")

In [ ]:
100 * text4.count('a') / len(text4)

In [ ]:
def lexical_diversity(text):
    return len(set(text)) / len(text)
def percentage(count, total):
    return 100 * count / total

In [ ]:
lexical_diversity(text3)

In [ ]:
lexical_diversity(text5)

In [ ]:
percentage(4, 5)

In [ ]:
percentage(text4.count('a'), len(text4))

In [ ]:
sent1 = ['Call', 'me', 'Ishmael', '.']

In [ ]:
sent1

In [ ]:
len(sent1)

In [ ]:
lexical_diversity(sent1)

In [ ]:
sent2

In [ ]:
sent3

In [ ]:
['Monty', 'Python'] + ['and', 'the', 'Holy', 'Grail']

In [ ]:
sent4 + sent1

In [ ]:
sent1.append("Some")
sent1

In [ ]:
text4[173]

In [ ]:
text4.index('awaken')

In [ ]:
text5[16715:16735]

In [ ]:
text6[1600:1625]

In [ ]:
sent = ['word1', 'word2', 'word3', 'word4', 'word5',
...         'word6', 'word7', 'word8', 'word9', 'word10']
sent[0]

In [ ]:
sent[9]

In [ ]:
sent[10]

In [ ]:
sent[5:8]

In [ ]:
sent[5]

In [ ]:
sent[6]

In [ ]:
sent[7]

In [ ]:
sent[:3]

In [ ]:
text2[141525:]

In [ ]:
sent[0] = 'First'
sent[9] = 'Last'
len(sent)

In [ ]:
sent[1:9] = ['Second', 'Third']
sent

In [ ]:
sent[9]

In [ ]:
sent1 = ['Call', 'me', 'Ishmael', '.']

In [ ]:
my_sent = ['Bravely', 'bold', 'Sir', 'Robin', ',', 'rode',
... 'forth', 'from', 'Camelot', '.']
noun_phrase = my_sent[1:4]
noun_phrase

In [ ]:
wOrDs = sorted(noun_phrase)
wOrDs

In [ ]:
not = 'Camelot'

In [ ]:
>>> vocab = set(text1)
>>> vocab_size = len(vocab)
>>> vocab_size

In [ ]:
>>> name = 'Monty'
>>> name[0]

In [ ]:
>>> name[:4]

In [ ]:
>>> name * 2

In [ ]:
>>> name + '!'

In [ ]:
>>> ' '.join(['Monty', 'Python'])

In [ ]:
>>> 'Monty Python'.split()

In [ ]:
>>> saying = ['After', 'all', 'is', 'said', 'and', 'done',
...           'more', 'is', 'said', 'than', 'done']
>>> tokens = set(saying)
>>> tokens = sorted(tokens)
>>> tokens[-2:]

In [ ]:
>>> fdist1 = FreqDist(text1)
>>> print(fdist1)

In [ ]:
>>> fdist1.most_common(50)

In [ ]:
>>> fdist1['whale']

In [ ]:
>>> V = set(text1)
>>> long_words = [w for w in V if len(w) > 15]
>>> sorted(long_words)

In [ ]:
>>> fdist5 = FreqDist(text5)
>>> sorted(w for w in set(text5) if len(w) > 7 and fdist5[w] > 7)

In [ ]:
list(bigrams(['more', 'is', 'said', 'than', 'done']))

In [ ]:
>>> text4.collocations()

In [ ]:
>>> text8.collocations()

In [ ]:
>>> [len(w) for w in text1]

In [ ]:
>>> fdist = FreqDist(len(w) for w in text1)
>>> print(fdist)

In [ ]:
>>> fdist

In [ ]:
>>> fdist.most_common()

In [ ]:
>>> fdist.max()

In [ ]:
>>> fdist[3]

In [ ]:
>>> fdist.freq(3)

In [ ]:
>>> sent7

In [ ]:
>>> [w for w in sent7 if len(w) < 4]

In [ ]:
>>> [w for w in sent7 if len(w) <= 4]

In [ ]:
>>> [w for w in sent7 if len(w) == 4]

In [ ]:
>>> [w for w in sent7 if len(w) != 4]

In [ ]:
>>> sorted(w for w in set(text1) if w.endswith('ableness'))

In [ ]:
>>> sorted(term for term in set(text4) if 'gnt' in term)

In [ ]:
>>> sorted(item for item in set(text6) if item.istitle())

In [ ]:
>>> sorted(item for item in set(sent7) if item.isdigit())

In [ ]:
>>> sorted(w for w in set(text7) if '-' in w and 'index' in w)
>>> sorted(wd for wd in set(text3) if wd.istitle() and len(wd) > 10)
>>> sorted(w for w in set(sent7) if not w.islower())
>>> sorted(t for t in set(text2) if 'cie' in t or 'cei' in t)

In [ ]:
>>> [len(w) for w in text1]

In [ ]:
>>> [w.upper() for w in text1]

In [ ]:
len(text1)

In [ ]:
len(set(text1))

In [ ]:
len(set(word.lower() for word in text1))

In [ ]:
len(set(word.lower() for word in text1 if word.isalpha()))

In [ ]:
>>> word = 'cat'
>>> if len(word) < 5:
...     print('word length is less than 5')

In [ ]:
>>> if len(word) >= 5:
...   print('word length is greater than or equal to 5')

In [ ]:
>>> for word in ['Call', 'me', 'Ishmael', '.']:
...     print(word)

In [ ]:
>>> sent1 = ['Call', 'me', 'Ishmael', '.']
>>> for xyzzy in sent1:
...     if xyzzy.endswith('l'):
...         print(xyzzy)

In [ ]:
>>> for token in sent1:
...     if token.islower():
...         print(token, 'is a lowercase word')
...     elif token.istitle():
...         print(token, 'is a titlecase word')
...     else:
...         print(token, 'is punctuation')

In [ ]:
>>> tricky = sorted(w for w in set(text2) if 'cie' in w or 'cei' in w)
>>> for word in tricky:
...     print(word, end=' ')