In [3]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
Write a function tokenize that takes a string of English text and returns a list of words. It should also remove stop words, which are common short words that are often removed before natural language processing. Your function should have the following logic:
- Split the text into lines using `splitlines`.
- Use the `filter` function to remove all punctuation.
- If `stop_words` is a list, remove all occurrences of the words in the list.
- If `stop_words` is a space-delimited string of words, split them and remove them.
In [52]:
def tokenize(s, stop_words=None, punctuation='`~!@#$%^&*()_-+={[}]|\\:;"<,>.?/}\t'):
    """Split a string into a list of lowercase words, removing punctuation and stop words.

    Parameters
    ----------
    s : str
        Text to tokenize; it may span several lines.
    stop_words : None, str, or iterable of str
        Words to drop from the result. ``None`` (the default) drops nothing.
        A string is treated as a space-delimited list of words. Matching is
        done against the lowercased tokens, so stop words are lowercased too.
    punctuation : str
        Characters that are deleted from every word.

    Returns
    -------
    list of str
        The remaining words, lowercased, in their original order. Words that
        become empty once punctuation is stripped are discarded.
    """
    # Normalise the three accepted forms of stop_words into one set so the
    # membership test below is uniform (and never runs against None).
    if stop_words is None:
        excluded = set()
    elif isinstance(stop_words, str):
        excluded = set(stop_words.lower().split())
    else:
        excluded = {word.lower() for word in stop_words}

    tokens = []
    for line in s.splitlines():
        for raw_word in line.split():
            # filter() keeps only the characters that are not punctuation.
            word = ''.join(filter(lambda ch: ch not in punctuation, raw_word)).lower()
            # A token made only of punctuation collapses to '' and is skipped.
            if word and word not in excluded:
                tokens.append(word)
    return tokens
In [53]:
?filter
In [54]:
# Ad-hoc check: call tokenize without passing stop_words.
tokenize("This, is the way; that things will end")
In [55]:
# NOTE(review): `s` is not assigned in any cell visible here; on a fresh
# kernel this would raise NameError -- confirm where `s` is meant to come from.
tokenize(s)
In [56]:
?remove
In [57]:
#>>> squares = map(lambda x: x**2, range(10))
#>>> special_squares = list(filter(lambda x: x > 5 and x < 50, squares))
In [58]:
# Scratch comparison of tokenize's output against the expected word list.
# The boolean is not asserted, and because it is not the last statement of
# the cell its value is not displayed either.
tokenize("This, is the way; that things will end", stop_words=['the', 'is']) == \
['this', 'way', 'that', 'things', 'will', 'end']
# Multi-line sample text (starts and ends with a newline).
wasteland = """
APRIL is the cruellest month, breeding
Lilacs out of the dead land, mixing
Memory and desire, stirring
Dull roots with spring rain.
"""
In [59]:
# Stop words given as a list: expects punctuation stripped, words lowercased,
# and 'the' / 'is' removed.
assert tokenize("This, is the way; that things will end", stop_words=['the', 'is']) == \
['this', 'way', 'that', 'things', 'will', 'end']
# Multi-line sample text (starts and ends with a newline).
wasteland = """
APRIL is the cruellest month, breeding
Lilacs out of the dead land, mixing
Memory and desire, stirring
Dull roots with spring rain.
"""
# Stop words given as one space-delimited string; the expected result is a
# single flat list covering every line of the sample.
assert tokenize(wasteland, stop_words='is the of and') == \
['april','cruellest','month','breeding','lilacs','out','dead','land',
'mixing','memory','desire','stirring','dull','roots','with','spring',
'rain']
Write a function count_words that takes a list of words and returns a dictionary where the keys in the dictionary are the unique words in the list and the values are the word counts.
In [61]:
def count_words(a_string):
    """Return a word count dictionary from a list of words.

    Parameters
    ----------
    a_string : list of str, or str
        The words to count. A plain string is also accepted and is split on
        whitespace first, so earlier string-based calls keep working.

    Returns
    -------
    dict
        Maps each distinct word to the number of times it occurs. Keys
        appear in order of first occurrence.
    """
    # Only strings have .split(); a list of words is iterated as-is.
    words = a_string.split() if isinstance(a_string, str) else a_string
    counts = {}
    for word in words:
        # Single pass: start at 0 the first time a word is seen.
        counts[word] = counts.get(word, 0) + 1
    return counts
# Ad-hoc check: call count_words on a raw space-separated string.
count_words('this and the this from and a a a')
Out[61]:
In [17]:
#def word_count(a_string):
# split_string = a_string.split()
# string_dict = {}
# #first populate the dictionary with the keys being each word in the string, all having zero for their values.
# for item in split_string:
# string_dict[item] = 0
# #Then cycle through the split string again and if the word is one of the keys in the dictionary add 1 each time.
# for item in split_string:
# if item in string_dict.keys():
# string_dict[item] += 1
#
# return string_dict
#
#word_count('this and the this from and a a a')
Out[17]:
In [97]:
# Chain check: tokenize is called without stop words and its return value is
# passed straight to count_words.
assert count_words(tokenize('this and the this from and a a a')) == \
{'a': 3, 'and': 2, 'from': 1, 'the': 1, 'this': 2}
Write a function sort_word_counts that returns a list of sorted word counts:

- Each element of the list should be a `(word, count)` tuple.
- Use the `sorted` function with a custom `key` and `reverse` argument.
In [ ]:
def sort_word_counts(wc):
    """Return a list of 2-tuples of (word, count), sorted by count descending.

    Parameters
    ----------
    wc : dict
        Maps each word to its count.

    Returns
    -------
    list of tuple
        ``(word, count)`` pairs ordered from the highest count to the lowest.
        ``sorted`` is stable even with ``reverse=True``, so words that share
        a count keep the order in which they appear in ``wc``.
    """
    # Sort on the count (second item of each pair), largest first.
    return sorted(wc.items(), key=lambda pair: pair[1], reverse=True)
In [94]:
# End-to-end check: tokenize -> count_words -> sort_word_counts, expecting
# the pairs ordered from the highest count to the lowest.
assert sort_word_counts(count_words(tokenize('this and a the this this and a a a'))) == \
[('a', 4), ('this', 3), ('and', 2), ('the', 1)]
Perform a word count analysis on Chapter 1 of Moby Dick, whose text can be found in the file mobydick_chapter1.txt:

- Use the stop words `'the of and a to in is it that as'`.
- Save the sorted word counts in a variable named `swc`.
In [ ]:
# YOUR CODE HERE
# Placeholder: this cell raises until it is implemented. The assertions in the
# following cell read a variable named `swc`, so that name has to be bound here.
raise NotImplementedError()
In [ ]:
# `swc` is not defined in this cell; it must already exist in the kernel.
# Its first entry is expected to be the pair ('i', 43).
assert swc[0]==('i',43)
# It is expected to hold 848 entries in total.
assert len(swc)==848
Create a "Cleveland Style" dotplot of the counts of the top 50 words using Matplotlib. If you don't know what a dotplot is, you will have to do some research...
In [ ]:
# YOUR CODE HERE
# Placeholder: the dotplot is not implemented yet, so this cell raises when run.
raise NotImplementedError()
In [ ]:
assert True # use this for grading the dotplot