In [2]:
# Sample sentence used to demonstrate basic whitespace tokenisation.
s = "the quick brown fox jumps over the lazy dog"
print(s)


the quick brown fox jumps over the lazy dog

In [4]:
s.split()


Out[4]:
['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

In [0]:
import urllib.request

# Source text: "Alice's Adventures in Wonderland" (plain-text copy mirrored
# in the demo repository).
ALICE_URL = "https://raw.githubusercontent.com/theleadio/datascience_demo/master/alice.txt"

# Use the response as a context manager so the HTTP connection is closed
# deterministically instead of leaking until garbage collection.
with urllib.request.urlopen(ALICE_URL) as response:
    content = response.read().decode('UTF-8')

In [10]:
print(content[:300])


[Alice's Adventures in Wonderland by Lewis Carroll 1865]

CHAPTER I. Down the Rabbit-Hole

Alice was beginning to get very tired of sitting by her sister on the
bank, and of having nothing to do: once or twice she had peeped into the
book her sister was reading, but it had no pictures or conversatio

In [0]:
tokens = content.split(' ')

In [15]:
print(tokens[:50])


["[Alice's", 'Adventures', 'in', 'Wonderland', 'by', 'Lewis', 'Carroll', '1865]\n\nCHAPTER', 'I.', 'Down', 'the', 'Rabbit-Hole\n\nAlice', 'was', 'beginning', 'to', 'get', 'very', 'tired', 'of', 'sitting', 'by', 'her', 'sister', 'on', 'the\nbank,', 'and', 'of', 'having', 'nothing', 'to', 'do:', 'once', 'or', 'twice', 'she', 'had', 'peeped', 'into', 'the\nbook', 'her', 'sister', 'was', 'reading,', 'but', 'it', 'had', 'no', 'pictures', 'or', 'conversations']

In [18]:
from collections import Counter

# Tally token frequencies and print the 50 most common entries.  Note the
# 889 empty-string tokens produced by the naive single-space split.
token_counts = Counter(tokens)
print(token_counts.most_common(50))


[('the', 1340), ('', 889), ('to', 634), ('and', 630), ('a', 546), ('of', 433), ('she', 423), ('said', 395), ('it', 316), ('in', 302), ('was', 291), ('I', 241), ('you', 222), ('as', 220), ('that', 182), ('at', 181), ('her', 171), ('had', 157), ('Alice', 155), ('with', 144), ('all', 134), ('on', 129), ('be', 119), ('very', 112), ('for', 111), ('little', 99), ('so', 97), ('they', 97), ('not', 92), ('but', 91), ('he', 89), ('out', 86), ('his', 81), ('up', 75), ('were', 74), ('went', 73), ('this', 73), ('what', 72), ('Alice,', 72), ('down', 69), ('have', 67), ('about', 67), ('one', 67), ('if', 67), ('like', 64), ('would', 61), ('when', 61), ('or', 60), ('into', 60), ('is', 59)]

In [20]:
import re

# Better tokenisation: lowercase the text and extract alphanumeric runs,
# which drops punctuation, newlines, and empty tokens.  Caveat (visible in
# the counts below): apostrophes split contractions, so "don't" becomes
# "don" + "t" — hence the stray "t" and "s" tokens.
tokens = re.findall(r'\w+', content.lower())
# (Removed a no-op `tokens = [token for token in tokens]` that merely
# copied the list.)
print(tokens[:50])

print(Counter(tokens).most_common(50))


['alice', 's', 'adventures', 'in', 'wonderland', 'by', 'lewis', 'carroll', '1865', 'chapter', 'i', 'down', 'the', 'rabbit', 'hole', 'alice', 'was', 'beginning', 'to', 'get', 'very', 'tired', 'of', 'sitting', 'by', 'her', 'sister', 'on', 'the', 'bank', 'and', 'of', 'having', 'nothing', 'to', 'do', 'once', 'or', 'twice', 'she', 'had', 'peeped', 'into', 'the', 'book', 'her', 'sister', 'was', 'reading', 'but']
[('the', 1642), ('and', 872), ('to', 729), ('a', 632), ('it', 595), ('she', 553), ('i', 543), ('of', 514), ('said', 462), ('you', 411), ('alice', 398), ('in', 369), ('was', 357), ('that', 315), ('as', 263), ('her', 248), ('t', 218), ('at', 212), ('s', 201), ('on', 193), ('all', 182), ('with', 180), ('had', 178), ('but', 170), ('for', 153), ('they', 152), ('so', 151), ('be', 148), ('not', 145), ('very', 144), ('what', 141), ('this', 134), ('little', 128), ('he', 125), ('out', 117), ('is', 108), ('one', 104), ('down', 102), ('up', 100), ('there', 99), ('if', 96), ('his', 96), ('then', 94), ('about', 94), ('no', 90), ('them', 88), ('know', 88), ('like', 85), ('were', 85), ('would', 83)]

In [0]:
import urllib.request

# Stopword list: one word per line, mirrored in the demo repository.
STOPWORDS_URL = "https://raw.githubusercontent.com/theleadio/datascience_demo/master/stopwords.txt"

# Context manager closes the HTTP connection promptly (the original left
# the response object unclosed).
with urllib.request.urlopen(STOPWORDS_URL) as response:
    stopwords = response.read().decode('UTF-8').splitlines()

In [0]:
final_tokens = [token for token in tokens if token not in stopwords]

In [28]:
print(Counter(final_tokens).most_common(50))


[('alice', 398), ('queen', 75), ('time', 71), ('king', 63), ('don', 61), ('turtle', 59), ('ll', 57), ('hatter', 56), ('mock', 56), ('gryphon', 55), ('rabbit', 51), ('head', 50), ('voice', 48), ('looked', 45), ('ve', 44), ('mouse', 44), ('duchess', 42), ('round', 41), ('tone', 40), ('dormouse', 40), ('cat', 37), ('march', 34), ('found', 32), ('moment', 31), ('hare', 31), ('white', 30), ('door', 30), ('heard', 30), ('day', 29), ('eyes', 29), ('dear', 29), ('replied', 29), ('caterpillar', 28), ('poor', 27), ('won', 26), ('half', 23), ('added', 23), ('jury', 22), ('words', 21), ('hand', 21), ('minute', 21), ('till', 21), ('sort', 20), ('cried', 20), ('feet', 19), ('tea', 19), ('curious', 19), ('house', 18), ('eat', 18), ('table', 18)]

In [0]: