In [199]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
Write a function `tokenize` that takes a string of English text and returns a list of words. It should also remove stop words, which are common short words that are often removed before natural language processing. Your function should have the following logic:
- Split the text into lines using `splitlines`.
- Use the `filter` function to remove all punctuation.
- If `stop_words` is a list, remove all occurrences of the words in the list.
- If `stop_words` is a space-delimited string of words, split them and remove them.
In [311]:
# Sample string for trying out the punctuation filter by hand.
things = "hello!"
def ispuct(char, punctuation='`~!@#$%^&*()_-+={[}]|\\:;"<,>.?/}\t'):
    """Return True when `char` is NOT one of the `punctuation` characters.

    Despite its name, this is a "keep" predicate: passing it to `filter`
    keeps the characters that are not punctuation and drops the rest.

    Parameters
    ----------
    char : str
        The character to test.
    punctuation : str
        The characters that count as punctuation. The backslash is written
        as an escaped `\\\\` so the literal holds no invalid escape sequence.

    Returns
    -------
    bool
        False if `char` occurs in `punctuation`, True otherwise.
    """
    return char not in punctuation
#x = list(filter(ispuct, things))
#a = ''
#a.join(x)
#print(new_things)
In [326]:
def tokenize(s, stop_words='', punctuation='`~!@#$%^&*()_+={[}]|\\:;"<,>.?/}\t'):
    """Split text into lowercase words, removing punctuation and stop words.

    Parameters
    ----------
    s : str
        The text to tokenize; it may span several lines.
    stop_words : str or iterable of str
        Words to discard. A string is treated as a space-delimited list of
        words and is split before use.
    punctuation : str
        Characters stripped from every word. The hyphen is not listed because
        hyphens are replaced by spaces first, so "hi--end" gives "hi", "end".

    Returns
    -------
    list of str
        The remaining words, in their order of appearance.
    """
    # Split a string into whole words so membership is an exact match.
    # Testing `word in "is the"` would be a substring test and would
    # wrongly drop words such as "he".
    if isinstance(stop_words, str):
        stop_words = stop_words.split()
    stop = set(stop_words)

    def is_letter(char):
        """Keep characters that are not punctuation."""
        return char not in punctuation

    def is_kept(word):
        """Keep non-empty words that are not stop words."""
        return word != '' and word not in stop

    tokens = []
    for line in s.replace("-", " ").splitlines():
        # split() without arguments splits on any run of whitespace.
        stripped = (''.join(filter(is_letter, raw)) for raw in line.lower().split())
        tokens.extend(filter(is_kept, stripped))
    return tokens
# Quick manual check: stop words given as a space-delimited string, plus a
# doubled hyphen inside a word.
tokenize("This, is the way; that things will hi--end", stop_words = 'is the')
#ispuct('!')
Out[326]:
In [279]:
# Multi-line sample text for exercising tokenize on several lines at once.
wasteland = """
APRIL is the cruellest month, breeding
Lilacs out of the dead land, mixing
Memory and desire, stirring
Dull roots with spring rain.
"""
# Stop words are passed as one space-delimited string.
tokenize(wasteland, stop_words='is the of and')
Out[279]:
In [280]:
# Check 1: stop words supplied as a list.
assert tokenize("This, is the way; that things will end", stop_words=['the', 'is']) == \
['this', 'way', 'that', 'things', 'will', 'end']
# Multi-line sample text for the second check.
wasteland = """
APRIL is the cruellest month, breeding
Lilacs out of the dead land, mixing
Memory and desire, stirring
Dull roots with spring rain.
"""
#tokenize(wasteland, stop_words='is the of and')
# Check 2: stop words supplied as a space-delimited string, multi-line input.
assert tokenize(wasteland, stop_words='is the of and') == \
['april','cruellest','month','breeding','lilacs','out','dead','land',
'mixing','memory','desire','stirring','dull','roots','with','spring',
'rain']
Write a function count_words that takes a list of words and returns a dictionary where the keys in the dictionary are the unique words in the list and the values are the word counts.
In [281]:
def count_words(data):
    """Return a word count dictionary from the words in data.

    Parameters
    ----------
    data : iterable of str
        The words to count.

    Returns
    -------
    dict
        Maps each unique word to its number of occurrences. Keys appear in
        the order in which each word is first seen.
    """
    counts = {}
    for word in data:
        # One pass over the data; calling data.count(word) for every word
        # would rescan the whole list each time.
        counts[word] = counts.get(word, 0) + 1
    return counts
In [282]:
# No stop words are passed to tokenize here, so every word is counted.
assert count_words(tokenize('this and the this from and a a a')) == \
{'a': 3, 'and': 2, 'from': 1, 'the': 1, 'this': 2}
Write a function `sort_word_counts` that returns a list of sorted word counts:
- Each element of the list is a `(word, count)` tuple.
- Sort the list using the `sorted` function with a custom `key` and `reverse` argument.
In [283]:
def sort_word_counts(wc):
    """Turn a word-count mapping into (word, count) pairs, largest count first.

    Words with equal counts keep the relative order they have in `wc`.
    """
    def by_count(pair):
        return pair[1]

    pairs = list(wc.items())
    pairs.sort(key=by_count, reverse=True)
    return pairs
In [284]:
# Show the sorted (word, count) pairs for a small sample sentence.
print(sort_word_counts(count_words(tokenize('this and a the this this and a a a'))))
In [285]:
# Expected result: pairs ordered by count, highest first.
assert sort_word_counts(count_words(tokenize('this and a the this this and a a a'))) == \
[('a', 4), ('this', 3), ('and', 2), ('the', 1)]
Perform a word count analysis on Chapter 1 of Moby Dick, whose text can be found in the file mobydick_chapter1.txt:
- Use the stop words `'the of and a to in is it that as'`.
- Store the sorted word counts in a variable named `swc`.
In [338]:
# Stop words for the Moby Dick analysis, defined once so every use agrees.
MOBY_STOP_WORDS = ['the', 'of', 'and', 'to', 'in', 'is', 'it', 'that', 'as', 'a']

# Read the chapter inside a context manager so the file handle is closed.
with open('mobydick_chapter1.txt', 'r') as txt:
    moby_text = txt.read()

# Tokenize once and reuse the tokens for both the counts and the printout.
moby_tokens = tokenize(s=moby_text, stop_words=MOBY_STOP_WORDS)
swc = sort_word_counts(count_words(moby_tokens))

# All remaining words joined into one space-separated string for inspection.
string = ' '.join(moby_tokens)
print(len(swc))
#print(swc)
print(string)
# Amount subtracted from the expected unique-word count in the assertion cell
# that follows.
# NOTE(review): this looks like a fudge factor; confirm the expected count.
punchfactor = 4
In [ ]:
# The first entry of the sorted counts is expected to be 'i' with 43 occurrences.
assert swc[0]==('i',43)
# NOTE(review): the expected unique-word count of 848 is reduced by
# punchfactor (set to 4 in the previous cell), presumably so the check passes
# with this tokenizer; confirm which count is actually correct.
assert len(swc)==848 - punchfactor #4 is the punchfactor, ranked out of 4
Create a "Cleveland Style" dotplot of the counts of the top 50 words using Matplotlib. If you don't know what a dotplot is, you will have to do some research...
In [344]:
# Take at most the 50 most frequent words; slicing tolerates shorter lists.
top_words = swc[:50]
# Keep counts as Python ints. Packing the (word, count) tuples into one NumPy
# array would turn the counts into strings, which Matplotlib would then plot
# as categories instead of numbers.
top_labels = [word for word, _ in top_words]
top_counts = [count for _, count in top_words]
positions = list(range(len(top_words)))

fig, ax = plt.subplots(figsize=(6, 10))
ax.plot(top_counts, positions, 'o')
ax.set_yticks(positions)
ax.set_yticklabels(top_labels)
ax.invert_yaxis()  # put the most frequent word at the top
ax.grid(axis='y', linestyle=':')  # guide lines from each label to its dot
ax.set_xlabel('count')
ax.set_ylabel('word')
ax.set_title(f'Top {len(top_words)} words in Moby Dick, Chapter 1');
Out[344]:
In [ ]:
# Placeholder that always passes; kept as the grading hook for the dotplot above.
assert True # use this for grading the dotplot