In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
Write a function tokenize that takes a string of English text returns a list of words. It should also remove stop words, which are common short words that are often removed before natural language processing. Your function should have the following logic:
splitlines.filter function to remove all punctuation.stop_words is a list, remove all occurences of the words in the list.stop_words is a space delimeted string of words, split them and remove them.
In [ ]:
In [19]:
s="""
APRIL--this is the cruellest month, breeding
Lilacs out of the dead land, mixing
Memory and desire, stirring
Dull roots with spring rain.
"""
stop_words='the is'
s=s.splitlines()
y=[]
for i in s:
c=i.split()
y.append(c)
y
z=[]
for j in range(len(y)):
z=z+y[j]
b=' '.join(z)
u=list(filter(punctuation_split, b))
v=''.join(u)
if isinstance(stop_words, str)== True:
stop_words=stop_words.split()
for i in range(len(stop_words)):
v=v.replace(' '+stop_words[i],'')
v=v.replace(' ','')
else:
for i in range(len(stop_words)):
v=v.replace(stop_words[i],'')
v=v.replace(' ','')
v=v.lower()
u
Out[19]:
In [ ]:
In [3]:
def punctuation_split(x):
if x == "'" or x == '`' or x == '~' or x == '!' or x == '@' or x == '#' or x == '$' or x == '%' or x == '^' or x == '&' or x == '*' or x == '(' or x == ')' or x == '-' or x == '_' or x == '=' or x == '+' or x == '[' or x == ']' or x == '{' or x == '}' or x == '|' or x == '\\' or x == '"' or x == ':' or x == ';' or x == '<' or x == '>' or x == ',' or x == '.' or x == '?' or x == '/':
return False
return True
u=list(filter(punctuation_split, b))
''.join(u)
Out[3]:
In [4]:
def tokenize(s, stop_words=None, punctuation='`~!@#$%^&*()_-+={[}]|\:;"<,>.?/}\\'):
"""Split a string into a list of words, removing punctuation and stop words."""
s=s.replace('-',' ')
s=s.replace('--',' ')
s=s.splitlines() #Collaborated with Kevin Phung
y=[]
for i in s:
c=i.split()
y.append(c)
z=[]
for j in range(len(y)):
z=z+y[j]
b=' '.join(z)
u=list(filter(punctuation_split, b))
v=''.join(u)
if stop_words==None:
v=v.replace(' ','')
elif isinstance(stop_words, str)== True:
stop_words=stop_words.split()
for i in range(len(stop_words)):
v=v.replace(' '+stop_words[i]+' ',' ')
else:
for i in range(len(stop_words)):
v=v.replace(' '+stop_words[i],'')
v=v.replace(' ','')
v=v.lower()
return(v.split())
In [5]:
wasteland = """
APRIL is the cruellest month, breeding
Lilacs out of the dead land, mixing
Memory and desire, stirring
Dull roots with spring rain.
"""
tokenize(wasteland, stop_words='is the of and')
Out[5]:
In [6]:
assert tokenize("This, is the way; that things will end", stop_words=['the', 'is']) == \
['this', 'way', 'that', 'things', 'will', 'end']
wasteland = """
APRIL is the cruellest month, breeding
Lilacs out of the dead land, mixing
Memory and desire, stirring
Dull roots with spring rain.
"""
assert tokenize(wasteland, stop_words='is the of and') == \
['april','cruellest','month','breeding','lilacs','out','dead','land',
'mixing','memory','desire','stirring','dull','roots','with','spring',
'rain']
In [7]:
tokenize(wasteland, stop_words='is the of and')
Out[7]:
In [8]:
tokenize('this and the this from and a a a')
Out[8]:
Write a function count_words that takes a list of words and returns a dictionary where the keys in the dictionary are the unique words in the list and the values are the word counts.
In [9]:
def count_words(data):
"""Return a word count dictionary from the list of words in data."""
word_dictionary={}
for i in data:
if i not in word_dictionary:
word_dictionary[i]=1
else:
word_dictionary[i]=word_dictionary[i]+1
return word_dictionary
In [10]:
assert count_words(tokenize('this and the this from and a a a')) == \
{'a': 3, 'and': 2, 'from': 1, 'the': 1, 'this': 2}
In [11]:
sorted
Out[11]:
Write a function sort_word_counts that return a list of sorted word counts:
(word, count) tuple.sorted function with a custom key and reverse
argument.
In [12]:
def sort_word_counts(wc):
"""Return a list of 2-tuples of (word, count), sorted by count descending."""
x=sorted(wc, key=wc.get, reverse=True)
y=sorted(wc.values(), reverse=True)
return list(zip(x,y))
In [13]:
sort_word_counts(count_words(tokenize('this and a the this this and a a a')))
Out[13]:
In [14]:
assert sort_word_counts(count_words(tokenize('this and a the this this and a a a'))) == \
[('a', 4), ('this', 3), ('and', 2), ('the', 1)]
Perform a word count analysis on Chapter 1 of Moby Dick, whose text can be found in the file mobydick_chapter1.txt:
'the of and a to in is it that as'.swc.
In [15]:
nnn=open('mobydick_chapter1.txt')
mobypenis=nnn.read()
swc=sort_word_counts(count_words(tokenize(mobypenis, 'the of and a to in is it that as')))
swc
Out[15]:
In [16]:
assert swc[0]==('i',43)
assert len(swc)==848
Create a "Cleveland Style" dotplot of the counts of the top 50 words using Matplotlib. If you don't know what a dotplot is, you will have to do some research...
In [17]:
ff=np.array(swc)
dd=ff[range(50),0]
dd
cc=ff[range(50),1]
cc
plt.figure(figsize=(10,10))
plt.scatter(cc, range(50))
plt.yticks(range(50), dd)
plt.title('Most Common Words in Moby Dick First Chapter')
plt.xlabel('Number of times word appears')
plt.tight_layout()
ff
Out[17]:
In [18]:
assert True # use this for grading the dotplot
In [ ]:
In [ ]:
In [ ]: