Algorithms Exercise 1

Imports



In [1]:

    
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np

Word counting

Write a function tokenize that takes a string of English text returns a list of words. It should also remove stop words, which are common short words that are often removed before natural language processing. Your function should have the following logic:

Split the string into lines using splitlines.
Split each line into a list of words and merge the lists for each line.
Use Python's builtin filter function to remove all punctuation.
If stop_words is a list, remove all occurences of the words in the list.
If stop_words is a space delimeted string of words, split them and remove them.
Remove any remaining empty words.
Make all words lowercase.



In [ ]:



In [19]:

    
s="""
APRIL--this is the cruellest month, breeding
Lilacs out of the dead land, mixing
Memory and desire, stirring
Dull roots with spring rain.
"""
stop_words='the is'
s=s.splitlines()
y=[]
for i in s:
    c=i.split()
    y.append(c)
y
z=[]
for j in range(len(y)):
    z=z+y[j]
b=' '.join(z)
u=list(filter(punctuation_split, b))
v=''.join(u)
if isinstance(stop_words, str)== True:
    stop_words=stop_words.split()
    for i in range(len(stop_words)):
        v=v.replace(' '+stop_words[i],'')
    v=v.replace('  ','')
else:
    for i in range(len(stop_words)):
        v=v.replace(stop_words[i],'')
    v=v.replace('  ','')
v=v.lower()
u









    Out[19]:





['A',
 'P',
 'R',
 'I',
 'L',
 't',
 'h',
 'i',
 's',
 ' ',
 'i',
 's',
 ' ',
 't',
 'h',
 'e',
 ' ',
 'c',
 'r',
 'u',
 'e',
 'l',
 'l',
 'e',
 's',
 't',
 ' ',
 'm',
 'o',
 'n',
 't',
 'h',
 ' ',
 'b',
 'r',
 'e',
 'e',
 'd',
 'i',
 'n',
 'g',
 ' ',
 'L',
 'i',
 'l',
 'a',
 'c',
 's',
 ' ',
 'o',
 'u',
 't',
 ' ',
 'o',
 'f',
 ' ',
 't',
 'h',
 'e',
 ' ',
 'd',
 'e',
 'a',
 'd',
 ' ',
 'l',
 'a',
 'n',
 'd',
 ' ',
 'm',
 'i',
 'x',
 'i',
 'n',
 'g',
 ' ',
 'M',
 'e',
 'm',
 'o',
 'r',
 'y',
 ' ',
 'a',
 'n',
 'd',
 ' ',
 'd',
 'e',
 's',
 'i',
 'r',
 'e',
 ' ',
 's',
 't',
 'i',
 'r',
 'r',
 'i',
 'n',
 'g',
 ' ',
 'D',
 'u',
 'l',
 'l',
 ' ',
 'r',
 'o',
 'o',
 't',
 's',
 ' ',
 'w',
 'i',
 't',
 'h',
 ' ',
 's',
 'p',
 'r',
 'i',
 'n',
 'g',
 ' ',
 'r',
 'a',
 'i',
 'n']



In [ ]:



In [3]:

    
def punctuation_split(x):
    if x == "'" or x == '`' or x == '~' or x == '!' or x == '@' or x == '#' or x == '$' or x == '%' or x == '^' or x == '&' or x == '*' or x == '(' or x == ')' or x == '-' or x == '_' or x == '=' or x == '+' or x == '[' or x == ']' or x == '{' or x == '}' or x == '|' or x == '\\' or x == '"' or x == ':' or x == ';' or x == '<' or x == '>' or x == ',' or x == '.' or x == '?' or x == '/':
        return False
    return True
u=list(filter(punctuation_split, b))
''.join(u)









    Out[3]:





'APRILthis is the cruellest month breeding Lilacs out of the dead land mixing Memory and desire stirring Dull roots with spring rain'



In [4]:

    
def tokenize(s, stop_words=None, punctuation='`~!@#$%^&*()_-+={[}]|\:;"<,>.?/}\\'):
    """Split a string into a list of words, removing punctuation and stop words."""
    s=s.replace('-',' ')
    s=s.replace('--',' ')
    s=s.splitlines() #Collaborated with Kevin Phung
    y=[]
    for i in s:
        c=i.split()
        y.append(c)
    z=[]
    for j in range(len(y)):
        z=z+y[j]
    b=' '.join(z)
    u=list(filter(punctuation_split, b))
    v=''.join(u)
    if stop_words==None:
        v=v.replace('  ','')
    elif isinstance(stop_words, str)== True:
        stop_words=stop_words.split()
        for i in range(len(stop_words)):
            v=v.replace(' '+stop_words[i]+' ',' ')
        
    else:
        for i in range(len(stop_words)):
            v=v.replace(' '+stop_words[i],'')
        v=v.replace('  ','')
    v=v.lower()
    return(v.split())



In [5]:

    
wasteland = """
APRIL is the cruellest month, breeding
Lilacs out of the dead land, mixing
Memory and desire, stirring
Dull roots with spring rain.
"""
tokenize(wasteland, stop_words='is the of and')









    Out[5]:





['april',
 'cruellest',
 'month',
 'breeding',
 'lilacs',
 'out',
 'dead',
 'land',
 'mixing',
 'memory',
 'desire',
 'stirring',
 'dull',
 'roots',
 'with',
 'spring',
 'rain']



In [6]:

    
assert tokenize("This, is the way; that things will end", stop_words=['the', 'is']) == \
    ['this', 'way', 'that', 'things', 'will', 'end']
wasteland = """
APRIL is the cruellest month, breeding
Lilacs out of the dead land, mixing
Memory and desire, stirring
Dull roots with spring rain.
"""

assert tokenize(wasteland, stop_words='is the of and') == \
    ['april','cruellest','month','breeding','lilacs','out','dead','land',
     'mixing','memory','desire','stirring','dull','roots','with','spring',
     'rain']



In [7]:

    
tokenize(wasteland, stop_words='is the of and')









    Out[7]:





['april',
 'cruellest',
 'month',
 'breeding',
 'lilacs',
 'out',
 'dead',
 'land',
 'mixing',
 'memory',
 'desire',
 'stirring',
 'dull',
 'roots',
 'with',
 'spring',
 'rain']



In [8]:

    
tokenize('this and the this from and a a a')









    Out[8]:





['this', 'and', 'the', 'this', 'from', 'and', 'a', 'a', 'a']

Write a function count_words that takes a list of words and returns a dictionary where the keys in the dictionary are the unique words in the list and the values are the word counts.



In [9]:

    
def count_words(data):
    """Return a word count dictionary from the list of words in data."""
    word_dictionary={}
    for i in data:
        if i not in word_dictionary:
            word_dictionary[i]=1
        else:
            word_dictionary[i]=word_dictionary[i]+1
    return word_dictionary



In [10]:

    
assert count_words(tokenize('this and the this from and a a a')) == \
    {'a': 3, 'and': 2, 'from': 1, 'the': 1, 'this': 2}



In [11]:

    
sorted









    Out[11]:





<function sorted>

Write a function sort_word_counts that return a list of sorted word counts:

Each element of the list should be a (word, count) tuple.
The list should be sorted by the word counts, with the higest counts coming first.
To perform this sort, look at using the sorted function with a custom key and reverse argument.



In [12]:

    
def sort_word_counts(wc):
    """Return a list of 2-tuples of (word, count), sorted by count descending."""
    x=sorted(wc, key=wc.get, reverse=True)
    y=sorted(wc.values(), reverse=True)
    return list(zip(x,y))



In [13]:

    
sort_word_counts(count_words(tokenize('this and a the this this and a a a')))









    Out[13]:





[('a', 4), ('this', 3), ('and', 2), ('the', 1)]



In [14]:

    
assert sort_word_counts(count_words(tokenize('this and a the this this and a a a'))) == \
    [('a', 4), ('this', 3), ('and', 2), ('the', 1)]

Perform a word count analysis on Chapter 1 of Moby Dick, whose text can be found in the file mobydick_chapter1.txt:

Read the file into a string.
Tokenize with stop words of 'the of and a to in is it that as'.
Perform a word count, the sort and save the result in a variable named swc.



In [15]:

    
nnn=open('mobydick_chapter1.txt')
mobypenis=nnn.read()

swc=sort_word_counts(count_words(tokenize(mobypenis, 'the of and a to in is it that as')))
swc









    Out[15]:





[('i', 43),
 ('me', 25),
 ('all', 23),
 ('you', 23),
 ('this', 17),
 ('there', 16),
 ('for', 16),
 ('but', 15),
 ('my', 14),
 ('with', 13),
 ('sea', 13),
 ('on', 12),
 ('go', 12),
 ('they', 12),
 ('not', 11),
 ('some', 11),
 ('from', 11),
 ('his', 10),
 ('or', 10),
 ('he', 10),
 ('one', 10),
 ('into', 9),
 ('be', 9),
 ('what', 9),
 ('if', 9),
 ('upon', 9),
 ('was', 8),
 ('by', 8),
 ('do', 8),
 ('water', 8),
 ('have', 8),
 ('were', 7),
 ('about', 7),
 ('and', 7),
 ('part', 7),
 ('though', 7),
 ('why', 7),
 ('way', 6),
 ('take', 6),
 ('no', 6),
 ('old', 6),
 ('it', 6),
 ('can', 6),
 ('your', 6),
 ('will', 6),
 ('land', 6),
 ('more', 6),
 ('see', 6),
 ('time', 6),
 ('like', 6),
 ('down', 6),
 ('get', 6),
 ('voyage', 6),
 ('the', 5),
 ('such', 5),
 ('same', 5),
 ('most', 5),
 ('sailor', 5),
 ('then', 5),
 ('never', 5),
 ('at', 5),
 ('here', 5),
 ('when', 5),
 ('whenever', 5),
 ('other', 5),
 ('who', 5),
 ('whaling', 5),
 ('them', 5),
 ('are', 5),
 ('stand', 5),
 ('now', 5),
 ('did', 5),
 ('ever', 5),
 ('these', 4),
 ('up', 4),
 ('head', 4),
 ('being', 4),
 ('every', 4),
 ('world', 4),
 ('tell', 4),
 ('money', 4),
 ('high', 4),
 ('first', 4),
 ('things', 4),
 ('than', 4),
 ('which', 4),
 ('yet', 4),
 ('must', 4),
 ('little', 4),
 ('would', 4),
 ('much', 4),
 ('so', 4),
 ('those', 4),
 ('passenger', 4),
 ('am', 4),
 ('an', 4),
 ('their', 4),
 ('two', 4),
 ('men', 4),
 ('going', 4),
 ('because', 4),
 ('still', 3),
 ('image', 3),
 ('how', 3),
 ('set', 3),
 ('passengers', 3),
 ('paying', 3),
 ('grand', 3),
 ('say', 3),
 ('purse', 3),
 ('should', 3),
 ('we', 3),
 ('ships', 3),
 ('broiled', 3),
 ('man', 3),
 ('without', 3),
 ('come', 3),
 ('sort', 3),
 ('cook', 3),
 ('winds', 3),
 ('out', 3),
 ('nothing', 3),
 ('almost', 3),
 ('whale', 3),
 ('ship', 3),
 ('hand', 3),
 ('great', 3),
 ('parts', 3),
 ('may', 3),
 ('miles', 3),
 ('before', 3),
 ('myself', 3),
 ('off', 3),
 ('him', 3),
 ('soul', 3),
 ('something', 3),
 ('right', 3),
 ('hill', 2),
 ('under', 2),
 ('metaphysical', 2),
 ('once', 2),
 ('does', 2),
 ('let', 2),
 ('mast', 2),
 ('account', 2),
 ('city', 2),
 ('ocean', 2),
 ('order', 2),
 ('wild', 2),
 ('strong', 2),
 ('own', 2),
 ('deep', 2),
 ('shepherds', 2),
 ('think', 2),
 ('thing', 2),
 ('its', 2),
 ('else', 2),
 ('aloft', 2),
 ('over', 2),
 ('thousands', 2),
 ('better', 2),
 ('yonder', 2),
 ('motives', 2),
 ('long', 2),
 ('thump', 2),
 ('phantom', 2),
 ('ourselves', 2),
 ('fixed', 2),
 ('cannot', 2),
 ('crowds', 2),
 ('thinks', 2),
 ('streets', 2),
 ('hunks', 2),
 ('in', 2),
 ('sail', 2),
 ('each', 2),
 ('unless', 2),
 ('meaning', 2),
 ('well', 2),
 ('distant', 2),
 ('make', 2),
 ('however', 2),
 ('grow', 2),
 ('pay', 2),
 ('schoolmaster', 2),
 ('deck', 2),
 ('captain', 2),
 ('leaders', 2),
 ('commodore', 2),
 ('robust', 2),
 ('having', 2),
 ('perhaps', 2),
 ('air', 2),
 ('where', 2),
 ('sight', 2),
 ('seas', 2),
 ('magic', 2),
 ('round', 2),
 ('particular', 2),
 ('plunged', 2),
 ('just', 2),
 ('spar', 2),
 ('among', 2),
 ('requires', 2),
 ('point', 2),
 ('look', 2),
 ('chief', 2),
 ('always', 2),
 ('ago', 2),
 ('others', 2),
 ('sleep', 2),
 ('meadow', 2),
 ('forecastle', 2),
 ('glory', 2),
 ('exactly', 2),
 ('thousand', 2),
 ('officer', 2),
 ('content', 2),
 ('healthy', 2),
 ('himself', 2),
 ('mean', 2),
 ('any', 2),
 ('could', 2),
 ('besides', 2),
 ('leaves', 2),
 ('stream', 2),
 ('between', 2),
 ('reveries', 2),
 ('ishmael', 2),
 ('fates', 2),
 ('paid', 2),
 ('care', 2),
 ('themselves', 2),
 ('find', 2),
 ('country', 2),
 ('scores', 2),
 ('begin', 2),
 ('warehouses', 2),
 ('previous', 2),
 ('been', 2),
 ('lead', 2),
 ('respectfully', 2),
 ('east', 1),
 ('established', 1),
 ('knocking', 1),
 ('prevalent', 1),
 ('cataract', 1),
 ('story', 1),
 ('floated', 1),
 ('judgment', 1),
 ('precisely', 1),
 ('rensselaers', 1),
 ('region', 1),
 ('cottage', 1),
 ('patagonian', 1),
 ('trouble', 1),
 ('roused', 1),
 ('curiosity', 1),
 ('interest', 1),
 ('invest', 1),
 ('confess', 1),
 ('quarter', 1),
 ('states', 1),
 ('has', 1),
 ('ungraspable', 1),
 ('oceans', 1),
 ('considerable', 1),
 ('wonder', 1),
 ('brigs', 1),
 ('extremest', 1),
 ('social', 1),
 ('easy', 1),
 ('desert', 1),
 ('noble', 1),
 ('within', 1),
 ('dogs', 1),
 ('bringing', 1),
 ('side', 1),
 ('inducements', 1),
 ('west', 1),
 ('fowls', 1),
 ('bulwarks', 1),
 ('run', 1),
 ('merchant', 1),
 ('infliction', 1),
 ('lakes', 1),
 ('needs', 1),
 ('coral', 1),
 ('even', 1),
 ('unbiased', 1),
 ('rockaway', 1),
 ('contested', 1),
 ('rigging', 1),
 ('downtown', 1),
 ('vain', 1),
 ('scales', 1),
 ('managers', 1),
 ('presented', 1),
 ('green', 1),
 ('universal', 1),
 ('nameless', 1),
 ('funeral', 1),
 ('anything', 1),
 ('athirst', 1),
 ('earnestly', 1),
 ('mild', 1),
 ('receiving', 1),
 ('astern', 1),
 ('smoke', 1),
 ('caravan', 1),
 ('blue', 1),
 ('unaccountable', 1),
 ('quarrelsome', 1),
 ('flourish', 1),
 ('thence', 1),
 ('reverentially', 1),
 ('compasses', 1),
 ('poet', 1),
 ('surveillance', 1),
 ('served', 1),
 ('simple', 1),
 ('leaning', 1),
 ('friendly', 1),
 ('springs', 1),
 ('satisfaction', 1),
 ('insular', 1),
 ('gone', 1),
 ('difference', 1),
 ('entailed', 1),
 ('sway', 1),
 ('surf', 1),
 ('pistol', 1),
 ('mortal', 1),
 ('path', 1),
 ('travel', 1),
 ('prairies', 1),
 ('shabby', 1),
 ('feet', 1),
 ('goes', 1),
 ('habit', 1),
 ('yes', 1),
 ('dale', 1),
 ('believe', 1),
 ('commonalty', 1),
 ('coasts', 1),
 ('farces', 1),
 ('inlanders', 1),
 ('randolphs', 1),
 ('valley', 1),
 ('affghanistan', 1),
 ('fields', 1),
 ('snow', 1),
 ('growing', 1),
 ('judiciously', 1),
 ('lording', 1),
 ('narcissus', 1),
 ('magnetic', 1),
 ('fore', 1),
 ('whether', 1),
 ('monster', 1),
 ('rather', 1),
 ('deeper', 1),
 ('plumb', 1),
 ('mid', 1),
 ('marvels', 1),
 ('quietly', 1),
 ('heaven', 1),
 ('philosophical', 1),
 ('against', 1),
 ('happen', 1),
 ('shakes', 1),
 ('june', 1),
 ('portentous', 1),
 ('try', 1),
 ('surprising', 1),
 ('crucifix', 1),
 ('strange', 1),
 ('life', 1),
 ('hook', 1),
 ('of', 1),
 ('short', 1),
 ('corlears', 1),
 ('breezes', 1),
 ('waves', 1),
 ('clinched', 1),
 ('nights', 1),
 ('resulting', 1),
 ('gets', 1),
 ('wherefore', 1),
 ('limit', 1),
 ('quick', 1),
 ('drop', 1),
 ('punch', 1),
 ('avenues', 1),
 ('gazers', 1),
 ('sabbath', 1),
 ('lungs', 1),
 ('salt', 1),
 ('brother', 1),
 ('trials', 1),
 ('transition', 1),
 ('mystical', 1),
 ('houses', 1),
 ('rag', 1),
 ('pot', 1),
 ('love', 1),
 ('wedded', 1),
 ('whereas', 1),
 ('lilies', 1),
 ('pacing', 1),
 ('new', 1),
 ('reefs', 1),
 ('deliberately', 1),
 ('unite', 1),
 ('toils', 1),
 ('suddenly', 1),
 ('hardicanutes', 1),
 ('single', 1),
 ('presidency', 1),
 ('respectable', 1),
 ('manhattoes', 1),
 ('undeliverable', 1),
 ('compare', 1),
 ('lath', 1),
 ('honourable', 1),
 ('put', 1),
 ('striving', 1),
 ('leagues', 1),
 ('cajoling', 1),
 ('knew', 1),
 ('really', 1),
 ('boys', 1),
 ('peep', 1),
 ('penny', 1),
 ('possibly', 1),
 ('inferred', 1),
 ('hold', 1),
 ('employs', 1),
 ('minded', 1),
 ('regulating', 1),
 ('various', 1),
 ('second', 1),
 ('passed', 1),
 ('performing', 1),
 ('sick', 1),
 ('extreme', 1),
 ('salted', 1),
 ('vibration', 1),
 ('silent', 1),
 ('less', 1),
 ('awe', 1),
 ('jump', 1),
 ('sense', 1),
 ('belted', 1),
 ('feel', 1),
 ('tree', 1),
 ('silver', 1),
 ('thieves', 1),
 ('bathed', 1),
 ('shore', 1),
 ('programme', 1),
 ('eye', 1),
 ('jove', 1),
 ('tormenting', 1),
 ('coffin', 1),
 ('bit', 1),
 ('pool', 1),
 ('board', 1),
 ('orchard', 1),
 ('stepping', 1),
 ('island', 1),
 ('american', 1),
 ('spurs', 1),
 ('circumambulate', 1),
 ('knows', 1),
 ('choice', 1),
 ('rub', 1),
 ('amount', 1),
 ('ills', 1),
 ('cattle', 1),
 ('bound', 1),
 ('tallest', 1),
 ('grasp', 1),
 ('mysterious', 1),
 ('earthly', 1),
 ('hooded', 1),
 ('buttered', 1),
 ('abominate', 1),
 ('northward', 1),
 ('open', 1),
 ('hypos', 1),
 ('tiger', 1),
 ('week', 1),
 ('discriminating', 1),
 ('shady', 1),
 ('buy', 1),
 ('wish', 1),
 ('gates', 1),
 ('welcome', 1),
 ('substitute', 1),
 ('decks', 1),
 ('damp', 1),
 ('marvellous', 1),
 ('again', 1),
 ('root', 1),
 ('quite', 1),
 ('watery', 1),
 ('weighed', 1),
 ('sadly', 1),
 ('uncomfortable', 1),
 ('place', 1),
 ('performances', 1),
 ('particularly', 1),
 ('wanting', 1),
 ('towards', 1),
 ('mouth', 1),
 ('terms', 1),
 ('somehow', 1),
 ('commerce', 1),
 ('secretly', 1),
 ('remote', 1),
 ('genteel', 1),
 ('since', 1),
 ('answer', 1),
 ('invisible', 1),
 ('feelings', 1),
 ('niagara', 1),
 ('swung', 1),
 ('washed', 1),
 ('family', 1),
 ('huge', 1),
 ('quietest', 1),
 ('bear', 1),
 ('hermit', 1),
 ('hazy', 1),
 ('schooners', 1),
 ('distinction', 1),
 ('bake', 1),
 ('dont', 1),
 ('battle', 1),
 ('sounds', 1),
 ('wholesome', 1),
 ('seaward', 1),
 ('obey', 1),
 ('tennessee', 1),
 ('conceits', 1),
 ('abandon', 1),
 ('artist', 1),
 ('broiling', 1),
 ('solo', 1),
 ('shadiest', 1),
 ('tranced', 1),
 ('us', 1),
 ('around', 1),
 ('mole', 1),
 ('attending', 1),
 ('circulation', 1),
 ('river', 1),
 ('receives', 1),
 ('thither', 1),
 ('legs', 1),
 ('seemingly', 1),
 ('horse', 1),
 ('trees', 1),
 ('police', 1),
 ('view', 1),
 ('grin', 1),
 ('considering', 1),
 ('stage', 1),
 ('landscape', 1),
 ('sailors', 1),
 ('endless', 1),
 ('heard', 1),
 ('dreamiest', 1),
 ('street', 1),
 ('sword', 1),
 ('orders', 1),
 ('trunk', 1),
 ('everlasting', 1),
 ('needles', 1),
 ('urbane', 1),
 ('tied', 1),
 ('dive', 1),
 ('meet', 1),
 ('greeks', 1),
 ('supplied', 1),
 ('days', 1),
 ('afternoon', 1),
 ('coat', 1),
 ('many', 1),
 ('interlude', 1),
 ('dreamy', 1),
 ('forbidden', 1),
 ('professor', 1),
 ('either', 1),
 ('saco', 1),
 ('ball', 1),
 ('offices', 1),
 ('ten', 1),
 ('inmost', 1),
 ('helped', 1),
 ('election', 1),
 ('poor', 1),
 ('wade', 1),
 ('swayed', 1),
 ('carries', 1),
 ('castle', 1),
 ('key', 1),
 ('battery', 1),
 ('november', 1),
 ('itch', 1),
 ('overlapping', 1),
 ('isles', 1),
 ('promptly', 1),
 ('nigh', 1),
 ('involuntarily', 1),
 ('fowl', 1),
 ('violate', 1),
 ('putting', 1),
 ('magnificent', 1),
 ('rivers', 1),
 ('romantic', 1),
 ('hours', 1),
 ('lanes', 1),
 ('trip', 1),
 ('knowing', 1),
 ('atmosphere', 1),
 ('tribulations', 1),
 ('honour', 1),
 ('monied', 1),
 ('blades', 1),
 ('flood', 1),
 ('taking', 1),
 ('pyramids', 1),
 ('van', 1),
 ('alleys', 1),
 ('falling', 1),
 ('lee', 1),
 ('soon', 1),
 ('handfuls', 1),
 ('visit', 1),
 ('sleeps', 1),
 ('pure', 1),
 ('spleen', 1),
 ('kind', 1),
 ('thought', 1),
 ('formed', 1),
 ('general', 1),
 ('whitehall', 1),
 ('seated', 1),
 ('pausing', 1),
 ('please', 1),
 ('ignoring', 1),
 ('recall', 1),
 ('came', 1),
 ('mountains', 1),
 ('mummies', 1),
 ('enjoy', 1),
 ('pent', 1),
 ('peoples', 1),
 ('landsmen', 1),
 ('looking', 1),
 ('upper', 1),
 ('activity', 1),
 ('seneca', 1),
 ('deepest', 1),
 ('roasted', 1),
 ('enable', 1),
 ('drizzly', 1),
 ('prevent', 1),
 ('sand', 1),
 ('indian', 1),
 ('gabriel', 1),
 ('beach', 1),
 ('south', 1),
 ('lies', 1),
 ('perceive', 1),
 ('lodges', 1),
 ('making', 1),
 ('moral', 1),
 ('nor', 1),
 ('give', 1),
 ('archangel', 1),
 ('induced', 1),
 ('hollow', 1),
 ('after', 1),
 ('desires', 1),
 ('united', 1),
 ('enter', 1),
 ('benches', 1),
 ('element', 1),
 ('charm', 1),
 ('cooled', 1),
 ('suffice', 1),
 ('slip', 1),
 ('counters', 1),
 ('thus', 1),
 ('heads', 1),
 ('coenties', 1),
 ('touches', 1),
 ('enough', 1),
 ('mazy', 1),
 ('deity', 1),
 ('reaching', 1),
 ('grim', 1),
 ('doubtless', 1),
 ('ah', 1),
 ('cunningly', 1),
 ('instance', 1),
 ('straight', 1),
 ('picture', 1),
 ('virtue', 1),
 ('far', 1),
 ('aint', 1),
 ('royal', 1),
 ('indignity', 1),
 ('bill', 1),
 ('brief', 1),
 ('enchanting', 1),
 ('pier', 1),
 ('idolatrous', 1),
 ('slave', 1),
 ('her', 1),
 ('rear', 1),
 ('repeatedly', 1),
 ('comedies', 1),
 ('plaster', 1),
 ('north', 1),
 ('pythagorean', 1),
 ('experiment', 1),
 ('needed', 1),
 ('years', 1),
 ('deliberate', 1),
 ('posted', 1),
 ('principle', 1),
 ('cherish', 1),
 ('sleepy', 1),
 ('spiles', 1),
 ('dotings', 1),
 ('rolled', 1),
 ('persians', 1),
 ('true', 1),
 ('providence', 1),
 ('very', 1),
 ('few', 1),
 ('jolly', 1),
 ('overwhelming', 1),
 ('constant', 1),
 ('disguises', 1),
 ('sentinels', 1),
 ('influences', 1),
 ('surrounds', 1),
 ('holy', 1),
 ('especially', 1),
 ('woodlands', 1),
 ('conscious', 1),
 ('exercise', 1),
 ('assure', 1),
 ('separate', 1),
 ('nailed', 1),
 ('cheerfully', 1),
 ('drowned', 1),
 ('meditation', 1),
 ('idea', 1),
 ('unpleasant', 1),
 ('ibis', 1),
 ('eyes', 1),
 ('maxim', 1),
 ('sights', 1),
 ('judgmatically', 1),
 ('hats', 1),
 ('stoics', 1),
 ('tormented', 1),
 ('extensive', 1),
 ('attract', 1),
 ('tragedies', 1),
 ('fountain', 1),
 ('nearly', 1),
 ('wharves', 1),
 ('breathes', 1),
 ('ones', 1),
 ('sighs', 1),
 ('freewill', 1),
 ('processions', 1),
 ('absent', 1),
 ('saw', 1),
 ('bulk', 1),
 ('delusion', 1),
 ('bloody', 1),
 ('desks', 1),
 ('everybody', 1),
 ('fancied', 1),
 ('contrary', 1),
 ('drawn', 1),
 ('loitering', 1),
 ('horror', 1),
 ('broom', 1),
 ('waterward', 1),
 ('barbarous', 1),
 ('smelt', 1),
 ('yourself', 1),
 ('pine', 1),
 ('knee', 1),
 ('paint', 1),
 ('cato', 1),
 ('call', 1),
 ('grasshopper', 1),
 ('consign', 1),
 ('peppered', 1),
 ('act', 1),
 ('keen', 1),
 ('egyptians', 1),
 ('surely', 1),
 ('reason', 1),
 ('good', 1),
 ('captains', 1),
 ('china', 1),
 ('finally', 1),
 ('circumstances', 1),
 ('degree', 1),
 ('testament', 1),
 ('mind', 1),
 ('sweep', 1),
 ('decoction', 1),
 ('purpose', 1),
 ('tar', 1),
 ('suspect', 1),
 ('speak', 1),
 ('perils', 1),
 ('perdition', 1),
 ('boy', 1),
 ('pedestrian', 1),
 ('told', 1),
 ('wears', 1),
 ('shoulder', 1),
 ('town', 1),
 ('inmates', 1),
 ('methodically', 1),
 ('driving', 1),
 ('hands', 1),
 ('infallibly', 1),
 ('left', 1),
 ('creatures', 1),
 ('physical', 1),
 ('crazy', 1),
 ('throws', 1),
 ('barques', 1),
 ('whatsoever', 1)]



In [16]:

    
assert swc[0]==('i',43)
assert len(swc)==848

Create a "Cleveland Style" dotplot of the counts of the top 50 words using Matplotlib. If you don't know what a dotplot is, you will have to do some research...



In [17]:

    
ff=np.array(swc)
dd=ff[range(50),0]
dd
cc=ff[range(50),1]
cc

plt.figure(figsize=(10,10))
plt.scatter(cc, range(50))
plt.yticks(range(50), dd)
plt.title('Most Common Words in Moby Dick First Chapter')
plt.xlabel('Number of times word appears')
plt.tight_layout()

ff









    Out[17]:





array([['i', '43'],
       ['me', '25'],
       ['all', '23'],
       ..., 
       ['throws', '1'],
       ['barques', '1'],
       ['whatsoever', '1']], 
      dtype='<U14')



In [18]:

    
assert True # use this for grading the dotplot



In [ ]:



In [ ]:



In [ ]: