In [62]:
# The story is stored in the file "story.txt".
f = open("story.txt", "r")
story = f.read()
print(story)
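As a side note (not part of the original exercise), a with statement is a common way to make sure the file is closed automatically once you're done reading it; a minimal sketch:
# A minimal sketch using a context manager, assuming "story.txt" exists in the working directory.
with open("story.txt", "r") as f:
    story = f.read()
# The file is closed automatically when the with block ends.
print(story[:100])  # preview the first 100 characters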
In [63]:
# We can split strings into lists with the .split() method.
# If we use a space as the input to .split(), it will split based on the space.
text = "Bears are probably better than sharks, but I can't get close enough to one to be sure."
tokenized_text = text.split(" ")
tokenized_story = story.split(" ")
print(tokenized_story)
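One detail worth knowing (an aside, not required for the exercise): calling .split() with no argument splits on any run of whitespace, including newlines, while .split(" ") only splits on single spaces. A small sketch:
# Splitting on a single space keeps newline characters attached to tokens,
# while .split() with no argument splits on any whitespace.
sample = "Bears are\nprobably better"
print(sample.split(" "))   # ['Bears', 'are\nprobably', 'better']
print(sample.split())      # ['Bears', 'are', 'probably', 'better']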
The story has been split into tokens and stored in tokenized_story.
Replace all of the punctuation in each of the tokens.
You'll need to loop through tokenized_story to do so.
You'll need to use multiple replace statements, one for each punctuation character to replace.
Append the token to no_punctuation_tokens once you are done replacing characters.
Don't forget to remove newlines!
Print out no_punctuation_tokens if you want to see which types of punctuation are still in the data.
In [64]:
# We can use the .replace function to replace punctuation in a string.
text = "Who really shot John F. Kennedy?"
text = text.replace("?", "?!")
# The question mark has been replaced with ?!.
##print(text)
# We can replace strings with blank spaces, meaning that they are just removed.
text = text.replace("?", "")
# The question mark is gone now.
##print(text)
no_punctuation_tokens = []
for token in tokenized_story:
    for p in [".", ",", "\n", "'", ";", "?", "!", "-", ":"]:
        token = token.replace(p, "")
    no_punctuation_tokens.append(token)
print(no_punctuation_tokens)
In [65]:
# We can make strings all lowercase using the .lower() method.
text = "MY CAPS LOCK IS STUCK"
text = text.lower()
# The text is much nicer to read now.
print(text)
lowercase_tokens = []
for token in no_punctuation_tokens:
    lowercase_tokens.append(token.lower())
print(lowercase_tokens)
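The same loop can also be written as a list comprehension; this sketch is equivalent to the loop above:
# List comprehension version of the lowercasing loop.
lowercase_tokens = [token.lower() for token in no_punctuation_tokens]
print(lowercase_tokens)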
Define a function that takes degrees Fahrenheit as an input and returns degrees Celsius.
Use it to convert 100 degrees Fahrenheit to Celsius. Assign the result to celsius_100.
Use it to convert 150 degrees Fahrenheit to Celsius. Assign the result to celsius_150.
In [66]:
# A simple function that takes in a number of miles, and turns it into kilometers
# The input at position 0 will be put into the miles variable.
def miles_to_km(miles):
    # return is a special keyword that indicates that the function will output whatever comes after it.
    return miles/0.62137
# Returns the number of kilometers equivalent to one mile
print(miles_to_km(1))
# Convert a (10 miles) to kilometers, storing the result back in a
a = 10
a = miles_to_km(a)
# We can convert and assign to a different variable
b = 50
c = miles_to_km(b)
fahrenheit = 80
celsius = (fahrenheit - 32)/1.8
def f2c(f):
    c = (f - 32)/1.8
    return c
celsius_100 = f2c(100)
celsius_150 = f2c(150)
print(celsius_100, celsius_150)
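As a quick sanity check (an addition, not part of the original exercise), the conversion formula can be verified against two well-known reference points:
# 32 degrees Fahrenheit is the freezing point of water (0 degrees Celsius),
# and 212 degrees Fahrenheit is the boiling point (100 degrees Celsius).
print(f2c(32))    # 0.0
print(f2c(212))   # 100.0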
In [67]:
def split_string(text):
    return text.split(" ")
sally = "Sally sells seashells by the seashore."
# This splits the string into a list.
print(split_string(sally))
# We can assign the output of a function to a variable.
sally_tokens = split_string(sally)
lowercase_me = "I wish I was in ALL lowercase"
def to_lowercase(text):
    return text.lower()
lowercased_string = to_lowercase(lowercase_me)
print(lowercased_string)
In [68]:
# Sometimes, you will have problems with your code that cause python to throw an exception.
# Don't worry, it happens to all of us many times a day.
# An exception means that the program can't run, so you'll get an error in the results view instead of the normal output.
# There are a few different types of exceptions.
# The first we'll look at is a SyntaxError.
# This means that something is typed incorrectly (statements misspelled, quotes missing, and so on)
a = ["Errors are no fun!", "But they can be fixed", "Just fix the syntax and everything will be fine"]
b = 5
for item in a:
    if b == 5:
        print(item)
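For reference, here is a small sketch (not from the original exercise) of the kinds of typos that trigger a SyntaxError, kept in comments so the cell still runs:
# Uncommenting either of these lines would raise a SyntaxError:
# if b == 5                   <- missing the colon at the end of the if statement
# print("no closing quote)    <- the string literal is never terminated
print("Syntax errors are reported before any of the code runs")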
In [69]:
a = 5
if a == 6:
    print("6 is obviously the best number")
    print("What's going on, guys?")
else:
    print("I never liked that 6")
In [70]:
# An IndexError occurs when we try to access a nonexistent index in a list.
the_list = ["Harrison Ford", "Mark Hamill"]
print(the_list[-1])
another_list = ["Jabba"]
print(another_list[-1])
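Here's a hypothetical sketch (not part of the original exercise) showing what actually raises an IndexError, and one way to guard against it:
# Accessing an index that doesn't exist raises an IndexError.
short_list = ["Jabba"]
try:
    print(short_list[1])   # there is no element at index 1
except IndexError:
    print("Index 1 is out of range for a one-element list")
# Checking the length first avoids the exception entirely.
if len(short_list) > 1:
    print(short_list[1])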
All the tokens from Julius's story are in the tokenized_story variable.
Write a function that removes all punctuation from an input string.
Then loop over tokenized_story and call the function to remove the punctuation from each token.
Append the tokens to no_punctuation_tokens.
In [71]:
# Functions can have multiple lines in the function body.
def do_math(number):
    # Multiply the number by 10
    number = number * 10
    # Add 20 to the number
    number = number + 20
    return number
print(do_math(20))
a = do_math(10)
no_punctuation_tokens = []
def remove_punctuation(text):
    return text.replace("\n", "").replace(".", "").replace(",", "") \
        .replace(":", "").replace(";", "") \
        .replace("!", "").replace("?", "").replace("'", "")
for token in tokenized_story:
    no_punctuation_tokens.append(remove_punctuation(token))
print(no_punctuation_tokens)
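As an alternative approach (not used in this exercise), the standard library's string.punctuation constant combined with str.translate can strip every ASCII punctuation character at once; a minimal sketch:
import string

# str.maketrans builds a translation table that deletes each punctuation
# character (and the newline) from the string.
def remove_punctuation_translate(text):
    return text.translate(str.maketrans("", "", string.punctuation + "\n"))

print(remove_punctuation_translate("Well, isn't this nice?\n"))  # Well isnt this nice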
We've written the remove_punctuation function for you. Can you add to it so that it also makes the output lowercase?
Then loop over the tokens in tokenized_story and normalize them with the function.
Append the tokens to normalized_tokens when you're done.
In [72]:
# This is our function to remove punctuation.
def remove_punctuation(token):
    token = token.replace(".", "")
    token = token.replace(",", "")
    token = token.replace("'", "")
    token = token.replace(";", "")
    token = token.replace("\n", "")
    return token.lower()
# We've read the tokens from Julius's story into the tokenized_story variable.
# Can you add to the remove_punctuation function so it also lowercases the tokens?
# Then loop over the tokens in tokenized_story, normalize them with the function, and append them to normalized_tokens.
normalized_tokens = []
for token in tokenized_story:
    normalized_tokens.append(remove_punctuation(token))
print(normalized_tokens)
In [73]:
# This function takes two arguments, at positions 0 and 1
def divide(x, y):
    return x/y
# 5 is assigned to x, and 1 is assigned to y based on positions
print(divide(5,1))
# 1 is assigned to x, and 5 is assigned to y based on positions.
print(divide(1,5))
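Arguments can also be passed by name instead of by position; a short sketch (an aside, not part of the exercise) using the divide function defined above:
# Keyword arguments are matched by name, not by position.
print(divide(x=5, y=1))   # 5 / 1
print(divide(y=5, x=1))   # 1 / 5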
In [74]:
def multiply(x, y, z):
    return x*y*z
a, b = multiply(10, 3, 5), multiply(20, -1, 3)
In [75]:
print(a,b)
The dictionary is stored in dictionary.txt
We'll need to read it in and normalize it.
Read in the dictionary from the "dictionary.txt" file.
Split it into tokens based on the space character.
Normalize each token using the normalize function.
Append the normalized tokens to normalized_dictionary_tokens.
In [76]:
def normalize(token):
    token = token.replace(".", "")
    token = token.replace(",", "")
    token = token.replace("'", "")
    token = token.replace(";", "")
    token = token.replace("\n", "")
    token = token.lower()
    return token
normalized_dictionary_tokens = []
In [77]:
f = open("dictionary.txt", "r")
text = f.read()
print(text)
tokens = text.split(" ")
for token in tokens:
    normalized_dictionary_tokens.append(normalize(token))
print(normalized_dictionary_tokens)
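One practical aside (not part of the original exercise): membership tests against a list scan every element, so for a large dictionary it can be much faster to convert the tokens to a set first. A minimal sketch:
# Membership tests against a set are roughly constant time,
# whereas "token in some_list" scans the whole list.
dictionary_set = set(normalized_dictionary_tokens)
print("the" in dictionary_set)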
A solution is in sight! All we need to do is loop through the normalized story tokens, and check if they are in the dictionary. If they aren't, they are potential misspellings.
The normalized story tokens are in normalized_story_tokens, and the normalized dictionary tokens are in normalized_dictionary_tokens.
Loop through the story tokens, and check if each token is in the dictionary.
If the token is in normalized_dictionary_tokens, append it to correctly_spelled.
If it isn't, append it to potential_misspellings.
In [78]:
potential_misspellings = []
correctly_spelled = []
In [79]:
normalized_story_tokens = normalized_tokens
In [80]:
for token in normalized_story_tokens:
    if token in normalized_dictionary_tokens:
        correctly_spelled.append(token)
    else:
        potential_misspellings.append(token)
print(potential_misspellings)
print(correctly_spelled)
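A small follow-up sketch (not part of the original exercise): the same misspelled word can appear many times, so collapsing potential_misspellings into a set shows each unique candidate only once:
# set() removes duplicate tokens; sorted() just makes the output easier to scan.
unique_misspellings = sorted(set(potential_misspellings))
print(unique_misspellings)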
In [ ]: