This notebook works through code from the Python NLTK library, "a leading platform for building Python programs to work with human language data." The library comes with data and an online book; the code below consists of problems and examples from that book.
Written by David Cai, NYU, July 2015. A product of the NYU Stern Python Factory. For more along similar lines, see the Data Bootcamp repo.
In [1]:
# Warm-up: Python as a calculator (operator precedence gives 1 + 10 - 3 = 8).
1 + 5 * 2 - 3
Out[1]:
In [ ]:
# download() opens NLTK's interactive downloader to fetch the book corpora.
import nltk
nltk.download()
In [ ]:
# Loads the example texts (text1..text9) and sentences (sent1..sent9) into scope.
from nltk.book import *
In [ ]:
# Echo the text object to see its title.
text1
In [ ]:
# Every occurrence of "monstrous" shown with surrounding context.
text1.concordance("monstrous")
In [ ]:
# Words that appear in contexts similar to "monstrous" in each text.
text1.similar("monstrous")
text2.similar("monstrous")
In [ ]:
# Contexts shared by both words in text2.
text2.common_contexts(["monstrous", "very"])
In [ ]:
# Plot where in the text each word occurs (requires matplotlib).
%matplotlib inline
text4.dispersion_plot(["citizens", "democracy", "freedom", "duties", "America"])
In [ ]:
# Note: The generate() method is not available in NLTK 3.0 but will be reinstated in a subsequent version.
text3.generate()
In [ ]:
# Total number of tokens (words and punctuation) in text3.
len(text3)
In [ ]:
# The sorted vocabulary: each distinct token once.
sorted(set(text3))
In [ ]:
# Vocabulary size.
len(set(text3))
In [ ]:
# Lexical diversity: fraction of tokens that are distinct.
len(set(text3)) / len(text3)
In [ ]:
# How many times "smote" occurs in text3.
text3.count("smote")
In [ ]:
# Percentage of text4's tokens that are the word 'a'.
100 * text4.count('a') / len(text4)
In [ ]:
def lexical_diversity(text):
    """Return the ratio of distinct tokens to total tokens in *text*.

    *text* is any sized iterable of hashable tokens (e.g. an NLTK Text
    or a plain list of word strings).  Raises ZeroDivisionError if
    *text* is empty.
    """
    # NOTE(review): the pasted original had the body flush-left, which is a
    # SyntaxError in Python; indentation restored here.
    return len(set(text)) / len(text)


def percentage(count, total):
    """Return *count* as a percentage of *total*, e.g. percentage(1, 4) -> 25.0."""
    return 100 * count / total
In [ ]:
# Apply the helper defined above to whole texts.
lexical_diversity(text3)
In [ ]:
lexical_diversity(text5)
In [ ]:
# percentage(4, 5) evaluates to 80.0.
percentage(4, 5)
In [ ]:
percentage(text4.count('a'), len(text4))
In [ ]:
# A sentence is represented as a plain list of token strings.
sent1 = ['Call', 'me', 'Ishmael', '.']
In [ ]:
sent1
In [ ]:
len(sent1)
In [ ]:
# All four tokens are distinct, so diversity is 1.0.
lexical_diversity(sent1)
In [ ]:
sent2
In [ ]:
sent3
In [ ]:
# Lists concatenate with +.
['Monty', 'Python'] + ['and', 'the', 'Holy', 'Grail']
In [ ]:
sent4 + sent1
In [ ]:
# append() mutates sent1 in place.
sent1.append("Some")
sent1
In [ ]:
# Indexing: the token at position 173.
text4[173]
In [ ]:
# index() returns the first position of a token.
text4.index('awaken')
In [ ]:
# Slicing: tokens 16715 through 16734.
text5[16715:16735]
In [ ]:
text6[1600:1625]
In [ ]:
sent = ['word1', 'word2', 'word3', 'word4', 'word5',
... 'word6', 'word7', 'word8', 'word9', 'word10']
sent[0]
In [ ]:
# Last valid index of a 10-element list is 9.
sent[9]
In [ ]:
# Deliberate error demo: index 10 is out of range (IndexError).
sent[10]
In [ ]:
# A [5:8] slice yields the elements at indices 5, 6, and 7.
sent[5:8]
In [ ]:
sent[5]
In [ ]:
sent[6]
In [ ]:
sent[7]
In [ ]:
# An omitted start index defaults to 0.
sent[:3]
In [ ]:
# Everything from index 141525 to the end.
text2[141525:]
In [ ]:
# Lists are mutable: assign by index.
sent[0] = 'First'
sent[9] = 'Last'
len(sent)
In [ ]:
# Slice assignment replaces eight elements with two, shrinking the list to 4.
sent[1:9] = ['Second', 'Third']
sent
In [ ]:
# Deliberate error demo: index 9 now exceeds the shrunken list (IndexError).
sent[9]
In [ ]:
sent1 = ['Call', 'me', 'Ishmael', '.']
In [ ]:
my_sent = ['Bravely', 'bold', 'Sir', 'Robin', ',', 'rode',
... 'forth', 'from', 'Camelot', '.']
# Slice out tokens 1-3 as a sub-phrase.
noun_phrase = my_sent[1:4]
noun_phrase
In [ ]:
# Variable names may mix case, though this style is discouraged (PEP 8).
wOrDs = sorted(noun_phrase)
wOrDs
In [ ]:
# Deliberate error demo: 'not' is a reserved keyword, so this is a SyntaxError.
not = 'Camelot'
In [ ]:
# Vocabulary of text1 and its size.
>>> vocab = set(text1)
>>> vocab_size = len(vocab)
>>> vocab_size
In [ ]:
# Strings support indexing and slicing just like lists.
>>> name = 'Monty'
>>> name[0]
In [ ]:
>>> name[:4]
In [ ]:
# String repetition with *.
>>> name * 2
In [ ]:
# String concatenation with +.
>>> name + '!'
In [ ]:
# join() turns a word list into one string; split() does the reverse.
>>> ' '.join(['Monty', 'Python'])
In [ ]:
>>> 'Monty Python'.split()
In [ ]:
# Deduplicate, sort, then take the last two tokens alphabetically.
>>> saying = ['After', 'all', 'is', 'said', 'and', 'done',
... 'more', 'is', 'said', 'than', 'done']
>>> tokens = set(saying)
>>> tokens = sorted(tokens)
>>> tokens[-2:]
In [ ]:
# Frequency distribution over every token of text1.
>>> fdist1 = FreqDist(text1)
>>> print(fdist1)
In [ ]:
# The 50 most frequent tokens with their counts.
>>> fdist1.most_common(50)
In [ ]:
# Count of a single word.
>>> fdist1['whale']
In [ ]:
# Words longer than 15 characters.
>>> V = set(text1)
>>> long_words = [w for w in V if len(w) > 15]
>>> sorted(long_words)
In [ ]:
# Words of text5 longer than 7 characters that occur more than 7 times.
>>> fdist5 = FreqDist(text5)
>>> sorted(w for w in set(text5) if len(w) > 7 and fdist5[w] > 7)
In [ ]:
# Adjacent word pairs.
list(bigrams(['more', 'is', 'said', 'than', 'done']))
In [ ]:
# Word pairs that occur together unusually often.
>>> text4.collocations()
In [ ]:
>>> text8.collocations()
In [ ]:
# The length of every token in text1.
>>> [len(w) for w in text1]
In [ ]:
# A distribution over word lengths rather than over words.
>>> fdist = FreqDist(len(w) for w in text1)
>>> print(fdist)
In [ ]:
>>> fdist
In [ ]:
>>> fdist.most_common()
In [ ]:
# The single most frequent word length.
>>> fdist.max()
In [ ]:
# Count of 3-letter tokens; then their relative frequency.
>>> fdist[3]
In [ ]:
>>> fdist.freq(3)
In [ ]:
>>> sent7
In [ ]:
# Filter a sentence by word-length comparisons (<, <=, ==, !=).
>>> [w for w in sent7 if len(w) < 4]
In [ ]:
>>> [w for w in sent7 if len(w) <= 4]
In [ ]:
>>> [w for w in sent7 if len(w) == 4]
In [ ]:
>>> [w for w in sent7 if len(w) != 4]
In [ ]:
# String-method predicates inside comprehensions.
>>> sorted(w for w in set(text1) if w.endswith('ableness'))
In [ ]:
>>> sorted(term for term in set(text4) if 'gnt' in term)
In [ ]:
>>> sorted(item for item in set(text6) if item.istitle())
In [ ]:
>>> sorted(item for item in set(sent7) if item.isdigit())
In [ ]:
# Compound conditions built with and / or / not.
>>> sorted(w for w in set(text7) if '-' in w and 'index' in w)
>>> sorted(wd for wd in set(text3) if wd.istitle() and len(wd) > 10)
>>> sorted(w for w in set(sent7) if not w.islower())
>>> sorted(t for t in set(text2) if 'cie' in t or 'cei' in t)
In [ ]:
# Mapping an operation over every token.
>>> [len(w) for w in text1]
In [ ]:
>>> [w.upper() for w in text1]
In [ ]:
# Vocabulary size shrinks as case is normalized and non-alphabetic tokens dropped.
len(text1)
In [ ]:
len(set(text1))
In [ ]:
len(set(word.lower() for word in text1))
In [ ]:
len(set(word.lower() for word in text1 if word.isalpha()))
In [ ]:
# Conditionals: this branch fires because len('cat') < 5.
>>> word = 'cat'
>>> if len(word) < 5:
... print('word length is less than 5')
In [ ]:
# This branch prints nothing for 'cat'.
>>> if len(word) >= 5:
... print('word length is greater than or equal to 5')
In [ ]:
# Iterate over a list of tokens.
>>> for word in ['Call', 'me', 'Ishmael', '.']:
... print(word)
In [ ]:
# Loop plus condition: print only tokens ending in 'l'.
>>> sent1 = ['Call', 'me', 'Ishmael', '.']
>>> for xyzzy in sent1:
... if xyzzy.endswith('l'):
... print(xyzzy)
In [ ]:
# if / elif / else classifies each token in turn.
>>> for token in sent1:
... if token.islower():
... print(token, 'is a lowercase word')
... elif token.istitle():
... print(token, 'is a titlecase word')
... else:
... print(token, 'is punctuation')
In [ ]:
# Print all 'cie'/'cei' words on one line, space-separated (end=' ').
>>> tricky = sorted(w for w in set(text2) if 'cie' in w or 'cei' in w)
>>> for word in tricky:
... print(word, end=' ')