In [1]:
##Some code to run at the beginning of the file, to be able to show images in the notebook
##Don't worry about this cell
#Print the plots in this screen
%matplotlib inline
#Be able to plot images saved in the hard drive
from IPython.display import Image
#Make the notebook wider
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
#Usual imports
import pandas as pd
import numpy as np
import pylab as plt
Python uses variables and code.
Variables tell the computer to save something (a number, a string, a spreadsheet) with a name. For instance, if you write variable_name = 3, the computer knows that variable_name is 3.
In [3]:
#Dictionary: every key (a name) is linked to a value (an e-mail address)
this_is_a_dict = {
    "Javier": "garcia@uva.nl",
    "Friend1": "f1@uva.nl",
    "Friend2": "f2@uva.nl",
}
print(this_is_a_dict)
print(type(this_is_a_dict))
OPERATIONS IN DICT
Get
In [4]:
#Get an element: either with square brackets...
print(this_is_a_dict["Friend2"])
#...or with the .get() method
print(this_is_a_dict.get("Friend2"))
In [6]:
#The difference between the two is that this_is_a_dict[key] gives an error if the key
#is not part of the dictionary, while this_is_a_dict.get(key) answers with None
print(this_is_a_dict.get("Friend5")) #not enough friends
Add
In [7]:
#Create an element: assign a value to a new key
this_is_a_dict["Friend3"] = "f3@uva.nl"
In [12]:
#A bare expression on the last line of a cell is displayed as the output of the cell
this_is_a_dict
Out[12]:
In [9]:
#Print the keys (here: the names)
print(this_is_a_dict.keys())
In [10]:
#Print the values (here: the e-mail addresses)
print(this_is_a_dict.values())
Remove
In [13]:
#Remove the key "Friend3" (and its value) from the dictionary, then show what is left
del this_is_a_dict["Friend3"]
print(this_is_a_dict)
Creating a dictionary from two lists: ZIP
In [15]:
#Creating a dictionary using two lists of the same length
list_names = [
    "Javier",
    "Friend1",
    "Friend2",
]
#Despite its name, list_numbers holds the e-mail address of each person in list_names
list_numbers = [
    "garcia@uva.nl",
    "f1@uva.nl",
    "f2@uva.nl",
]
#zip pairs the first name with the first address, the second with the second, etc.
#dict turns those pairs into key/value entries
this_is_a_dict = dict(zip(list_names, list_numbers))
print(this_is_a_dict)
In [16]:
#Printing a zip object does not show the pairs it contains (the same happens with range)
print(zip(list_names,list_numbers))
In [17]:
#But we can convert it to a list to see the pairs (the same works for range)
print(list(zip(list_names,list_numbers)))
Why use a dict? Because finding an element by its key is much, much faster than searching a list: in a dict it takes (on average) the same time no matter how many elements there are, which is not the case in a list.
Useful to assign values to words, for instance.
In [18]:
## Our own functions
##Mind the INDENTATION of the body, and the colon (":") at the end of the "def" line
def mean_ours(list_numbers):  #list_numbers is the argument
    """
    Calculate the mean of a list of numbers.

    (This text is called the docstring: a comment describing the function.)

    input
        list_numbers: a list of numbers
    output: the mean of the input
    """
    total = sum(list_numbers)
    how_many = len(list_numbers)
    #what the function gives back
    return total / how_many
In [19]:
#In IPython/Jupyter a question mark after a name shows its documentation (the docstring)
mean_ours?
In [20]:
aList = [2,3,4]
print(mean_ours(aList)) #this is how you call the function
How the arguments of a function work
If there are many arguments, the first value that you pass is matched to the first argument of the function, the second to the second, etc.
For instance, these are the arguments of the function pd.read_csv()
`pd.read_csv(filepath_or_buffer, sep=',', delimiter=None, header='infer',...)`
Writing
`pd.read_csv("data/ams_green.csv","\t",None,0)`
matches
`filepath_or_buffer TO "data/ams_green.csv",
sep TO "\t",
delimiter TO None,
header TO 0`
You can also pass the arguments by name. For instance
`pd.read_csv("data/ams_green.csv",header= 0, sep="\t",delimiter=None)`
is identical to the line before. In this case the values you pass do not have to be in the same order as the arguments.
In [21]:
def f():
    """Multiply two variables created inside the function and print the product."""
    local_var1, local_var2 = 2, 3
    local_var = local_var1 * local_var2
    print(local_var)

#Call the function
f()
Variables created inside functions are only seen within the function
In [22]:
def f():
    """Multiply two local variables; nothing is printed and nothing is returned."""
    local_var1 = 2
    local_var2 = 2
    local_var = local_var1*local_var2  #only exists while f() is running

#Call the function
f()

#We haven't created local_var outside of f(), so using it here gives a NameError.
#The error is caught and displayed, so that the cells below can still run.
try:
    print(local_var)
except NameError as error:
    print("NameError:", error)
In [23]:
def f():
    """Multiply two local variables and give the product back to the caller."""
    local_var1 = local_var2 = 2
    return local_var1 * local_var2

#Call the function and save what it returns in a variable outside of the function
gvar = f()
#Now the value is available here (generally it is not a good idea to reuse the name local_var for it)
print(gvar)
Variables created outside functions are seen by all the code (be careful!)
In [24]:
local_var = "python"

def f():
    """Print the variable local_var that was created outside of the function."""
    #The function can READ a variable created outside. Assigning to the name in here would
    #create a new variable inside the function instead of changing the one outside;
    #methods that modify an object in place (e.g. .append() or .pop() on a list) do change it.
    print(local_var)
    #It's okay for functions not to return anything, by default they return None

#Call the function
f()

#The variable is also visible outside of the function
print(local_var)
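```python
for element in [1,2,3,4,5]:
    print(element)
```

The computer:
- Reads the first line (`for element in [1,2,3,4,5]:`) and realizes it is a for loop.
- Gives `element` the first value of the list and runs the indented code (`print(element)`).
- Repeats this for every other value of the list, and then continues with the code after the loop.

You can write anything instead of `element` (`for i in range(10)` for instance).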
The indentation and the colon are important, you get SyntaxError without them.
In [9]:
#x takes the value of each element of the list, one after the other
for x in ["Adam", "Utercht"]:
    print(x)
In [12]:
#enumerate gives the position (i) together with the element (x)
for i, x in enumerate(["Adam", "Utercht"]):
    print(i, x)
In [13]:
#The same by hand: keep our own counter and increase it at the end of every round
i = 0
for x in ["Adam", "Utercht"]:
    print(i, x)
    i += 1
In [ ]:
In [ ]:
In [28]:
#list_articles is needed here, before the cell below creates it,
#so it is defined in this cell as well (otherwise a freshly started notebook fails here)
list_articles = ["article 1: blah python",
                 "article 2: blah Trump",
                 "article 3: blah Trump",
                 "article 4: blah Trump"]
print("python" in list_articles[1])
In [27]:
#Imagine we want to find what some articles are talking about. We could check them one by one,
#but that is unfeasible when you have more than a dozen articles
list_articles = [
    "article 1: blah python",
    "article 2: blah Trump",
    "article 3: blah Trump",
    "article 4: blah Trump",
]  #many articles

print("python" in list_articles[0])
print("python" in list_articles[1])
print("python" in list_articles[2])
print("python" in list_articles[3])
#...
In [30]:
#but we can use for loops: the same check runs for every article, however many there are
for a in list_articles:
    print("python" in a)
In [31]:
#this is very common as well (especially in other programming languages):
#loop over the positions 0, 1, 2, ... and use them to get each article.
#range(len(list_articles)) always has one position per article, so the loop
#keeps working when articles are added to or removed from the list
for index in range(len(list_articles)):
    print("python" in list_articles[index])
In [34]:
#enumerate pairs every article with its index; list() makes the pairs visible
list(enumerate(list_articles))
Out[34]:
In [32]:
#enumerate is useful when we want both the article and its index
for index, article in enumerate(list_articles):
    print(index, "python" in article)
what if we want to stop a loop?
Then we can use break
In [68]:
for index, article in enumerate(list_articles):
    #break leaves the loop completely: the articles from index 2 onwards are never checked
    if index == 2:
        break
    print(index, "python" in article)
what if we want to skip some rounds? Then we use continue
In [69]:
for index, article in enumerate(list_articles):
    if index % 2 == 0:
        #the index is even: skip the rest of this round and go on with the next article
        continue
    print(index, "python" in article)
```python
article = "Trump is going to make America great"
if "python" in article:
    print("python",article)
elif "climate change" in article:
    print("climate change",article)
else:
    print("no python", article)
```
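The computer:
- Reads the first line (`if "python" in article:`) and realizes it is an if-else statement.
- Checks if `"python" in article` is True.
    - If it is True, it runs the code inside the if (`print("python",article)`) and goes to the end of all the if-elif-else.
    - If it is False, it checks if the next condition (`elif "climate change" in article`) is True, and so on. If none of the conditions is True, it runs the code inside the `else`.

You only need the `if`; the `elif` and `else` are optional. For instance, without `else` the code above wouldn't print anything.
You can have as many `elif`s as you want.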
The indentation and the colon are important, you get SyntaxError without them.
Let's write code that tells us if an article is about python or Trump
In [2]:
article = "article 2: blah Trump python"

#Two separate ifs: both conditions are always checked, so both messages can be printed
if "python" in article:
    print("Article refering to Python")

if "Trump" in article:
    print("Article refering to Trump")
We can wrap it into a function
In [39]:
def python_or_trump(article):
    """
    Print whether an article is related to python or to Trump.

    Only the first matching case is reported: "python" is checked first,
    "Trump" is only checked when "python" is not found (the check is case-sensitive).

    input
        article: string with words
    output: None (the message is printed)
    """
    if "python" in article:
        message = "Article refering to Python"
    elif "Trump" in article:
        message = "Article refering to Trump"
    else:
        message = "Article not refering to Python or Trump"
    print(message)
In [40]:
#An article that mentions Trump but not python
article = "article 2: blah Trump"
print(article)
#this is how you call the function
python_or_trump(article)
In [41]:
#the function stops as soon as "python" is found, so it never checks for "Trump"
article = "article 2: blah Trump python"
print(article)
python_or_trump(article)
In [42]:
#neither word is in this article, so the code inside the else runs
article = "article 2: blah blah"
print(article)
python_or_trump(article)
Now we do it for many articles
In [43]:
list_articles = [
    "article 1: blah python",
    "article 2: blah Trump",
    "article 3: blah Trump",
    "article 4: blah Trump",
]  #many articles

#Call the function once per article
for article in list_articles:
    python_or_trump(article)
In [44]:
def count_words(list_articles):
    """
    Count how many articles contain the word trump and how many the word python.

    Upper and lower case are treated as the same ("Trump" and "trump" both count).

    input: list of articles (strings)
    output: number of articles with the word trump and with the word python,
            in that order (trump first, python second)
    """
    count_trump = 0
    count_python = 0
    for article in list_articles:
        #convert to lower case only once per article
        article_lower = article.lower()
        if "python" in article_lower:
            count_python += 1  #same as count_python = count_python + 1
        if "trump" in article_lower:
            count_trump += 1  #same as count_trump = count_trump + 1
    return count_trump, count_python
In [47]:
list_articles = ["article 1: blah python",
                 "article 2: blah Trump",
                 "article 3: blah Trump",
                 "article 4: blah Trump"]#many articles

#The function gives back two values; they are saved in two variables, in the same order
g_count_trump,g_count_python = count_words(list_articles)
print(g_count_python)
print(g_count_trump)
print("python articles: ", g_count_python)
print("trump_articles: ", g_count_trump)
In [50]:
#Multiplying a list by a number repeats it: here a list with ten zeros
[0]*10
Out[50]:
Let's make it a bit more flexible
In [53]:
#Let's use a list of numbers instead of two separate variables for the counter
list_articles = [
    "article 1: blah python",
    "article 2: blah Trump",
    "article 3: blah Trump",
    "article 4: blah Trump",
]  #many articles

def count_words(list_articles):
    """
    Count the articles containing "python" and the articles containing "Trump".

    input: list of articles (strings)
    output: list with two numbers, [python articles, Trump articles]
    """
    counters = [0, 0]  #same as [0]*2
    for article in list_articles:
        if "python" in article:
            counters[0] += 1  #same as counters[0] = counters[0] + 1
        if "Trump" in article:
            counters[1] += 1  #the second counter is the one for Trump
    return counters

counters = count_words(list_articles)
print("python articles: ")
print(counters[0])
print("trump_articles: ")
print(counters[1])
In [55]:
# And allow for any two words, not just python or Trump
list_articles = [
    "article 1: blah python",
    "article 2: blah Trump",
    "article 3: blah Trump",
    "article 4: blah Trump",
]  #many articles

def count_words(list_articles,words):
    """
    Count the articles containing the first word and the articles containing the second word.

    input
        list_articles: list of articles (strings)
        words: list with the two words to look for
    output: list with two numbers, one counter per word (in the order of words)
    """
    counters = [0, 0]
    for article in list_articles:
        if words[0] in article:
            counters[0] += 1  #counter of the first word
        if words[1] in article:
            counters[1] += 1  #counter of the second word
    return counters

counters = count_words(list_articles, words=["python", "blah"])
print("python articles: ", counters[0])
print("blah_articles: ", counters[1])
In [67]:
words = ["python","Trump","blah"]
#enumerate pairs every word with its position in the list
list(enumerate(words))
Out[67]:
In [63]:
#The list of positions, next to the list of words
list(range(len(words))),words
Out[63]:
In [66]:
#enumerate(words) pairs positions and words, like zipping range(len(words)) with words.
#Only the last expression of a cell is displayed, so only the zip object is shown
enumerate(words)
zip(range(len(words)),words)
Out[66]:
In [59]:
# And allow for any number of words, not just two
list_articles = [
    "article 1: blah python",
    "article 2: blah Trump",
    "article 3: blah Trump",
    "article 4: blah Trump",
]  #many articles

def count_words(list_articles,words):
    """
    Count, for every word, the number of articles that contain it.

    input
        list_articles: list of articles (strings)
        words: list of words to look for
    output: list with one counter per word (in the order of words)
    """
    counters = [0] * len(words)
    for article in list_articles:
        #position tells us which counter belongs to the word
        for position, word in enumerate(words):
            if word in article:
                counters[position] += 1
    return counters

words = ["python", "Trump", "blah"]
counters = count_words(list_articles, words)
print(words)
print(counters)
In [60]:
#We can make a dictionary out of it: every word is a key, its counter is the value
d_word2counter = dict(zip(words,counters))
d_word2counter["Trump"]
Out[60]:
what if we want a loop but we don't know when we need to stop?
Then we can use the while loop:
```python
while condition:
    do something
    update condition #otherwise the loop is infinite
```

However, in Python the while loop is not used very often.
In [74]:
#For instance, asking this dictionary for "Friend5" fails, because we don't have more than 2 friends
this_is_a_dict = {"Javier": "garcia@uva.nl", "Friend1": "f1@uva.nl", "Friend2": "f2@uva.nl"}
In [75]:
#This line gives a KeyError on purpose: "Friend5" is not a key of the dictionary
print(this_is_a_dict["Friend5"])
In [77]:
#.get() answers None when the key is missing, so we can check for None afterwards
f5 = this_is_a_dict.get("Friend5")
if f5 is None:  #similar to f5 == None
    print("Not enough friends")
In [72]:
#Example of how to fix it: try to run the code, and say what to do if a KeyError happens
#(the indentation is important, as well as the colons)
try:
    print(this_is_a_dict["Friend5"])
except KeyError:
    print("Not enough friends")
In [71]:
#but this case is very common, and .get() does it for us: it answers None instead of giving an error
print(this_is_a_dict.get("Friend5"))
"r": read "w": write "w+": write and if doesn't exist, create it
In [78]:
#Write five pieces of text to the file; nothing separates them, so they end up on one single line
with open("data/file_to_write.csv","w+") as out_file:
    out_file.write("I'm line number {}".format(0))
    out_file.write("I'm line number {}".format(1))
    out_file.write("I'm line number {}".format(2))
    out_file.write("I'm line number {}".format(3))
    out_file.write("I'm line number {}".format(4))
But remember to add a "return character" (\n)
In [79]:
#Same as before, but every piece of text ends with "\n", so each one gets its own line
with open("data/file_to_write.csv","w+") as out_file:
    out_file.write("I'm line number {}\n".format(0))
    out_file.write("I'm line number {}\n".format(1))
    out_file.write("I'm line number {}\n".format(2))
    out_file.write("I'm line number {}\n".format(3))
    out_file.write("I'm line number {}\n".format(4))
There are 3 ways to read a file
We won't be reading the files like this too often, but sometimes you need to read them line by line (instead of loading all the files like we do with pandas)
In [80]:
#Ways to read files
#way 1: read the whole file into one string
with open("data/file_to_write.csv","r") as in_file:
    all_file = in_file.read()
print(all_file)
In [81]:
#way 2: read the file into a list with one string per line
with open("data/file_to_write.csv") as in_file:
    all_file_by_line = in_file.readlines()
print(all_file_by_line)
In [82]:
#way 3: loop over the file, one line at a time
with open("data/file_to_write.csv") as in_file:
    for line in in_file:
        print(line)
In [83]:
#Two prints in a row: the second text is shown on a new line
print("Hi")
print("Hi again")
you can delete the "\n" at the end of the string with .rstrip()
In [ ]:
#way 3 again, removing the "\n" at the end of every line before printing it
with open("data/file_to_write.csv") as in_file:
    for line in in_file:
        print(line.rstrip())
In [ ]:
#Write five lines to the file, one write per line (the exercises below improve on this)
with open("data/file_to_write.csv","w+") as out_file:
    out_file.write("I'm line number {}\n".format(0))
    out_file.write("I'm line number {}\n".format(1))
    out_file.write("I'm line number {}\n".format(2))
    out_file.write("I'm line number {}\n".format(3))
    out_file.write("I'm line number {}\n".format(4))
1. Use a loop to do the same as above (write 5 lines to a file)
In [87]:
#The loop writes one line for each of the numbers 0, 1, 2, 3 and 4
with open("data/file_to_write.csv","w+") as out_file:
    for number in range(5):
        out_file.write("I'm line number {}\n".format(number))
2. Use an if-else statement to write only if the number is larger than 3
In [95]:
#Only the numbers larger than 3 are written (of 0..4 that is just the number 4)
with open("data/file_to_write.csv","w+") as out_file:
    for number in range(5):
        if number > 3:
            out_file.write("I'm line number {}\n".format(number))
In [96]:
#Read the file back to check what was written
with open("data/file_to_write.csv","r") as in_file:
    print(in_file.read())
3. Encapsulate everything in a function, and call the function
In [97]:
def makesomethingup(path="data/file_to_write.csv", n_lines=5, larger_than=3):
    """
    Write the line "I'm line number <i>" for every number i in 0 .. n_lines-1 that is larger than larger_than.

    The file is created if it does not exist and overwritten if it does.
    With the default arguments only "I'm line number 4" is written to data/file_to_write.csv.

    input
        path: the file to write to
        n_lines: the numbers 0 .. n_lines-1 are considered
        larger_than: only numbers strictly larger than this value are written
    output: None
    """
    with open(path, "w+") as out_file:
        for number in range(n_lines):
            if number > larger_than:
                out_file.write("I'm line number {}\n".format(number))
    return None
#Call the function (no arguments are passed)
makesomethingup()
In [84]:
#A character is stored as a number: ord() gives the number behind a character
ord("b")
Out[84]:
In [85]:
#A string is very similar to a list of characters: you can get a character by its position
"abdc"[3]
Out[85]:
In [86]:
#A boolean is a number: True is equal to 1
print(True == 1)
In [ ]:
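#A numpy array is similar to a list (a special type of list, made for calculations)
#A pandas dataframe is, roughly, a collection of columns that work like numpy arrays
#A set is like a dictionary without values: {"d":1,"e":3} (dictionary) vs {"d","e"} (set)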
In [ ]:
In [ ]: