In [12]:
##Some code to run at the beginning of the file, to be able to show images in the notebook
##Don't worry about this cell
#Print the plots in this screen
%matplotlib inline
#Be able to plot images saved in the hard drive
from IPython.display import Image
#Make the notebook wider
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
CSV = comma separated values file
TSV = tab separated values file
person year score
1 2000 8
2 2000 1
3 2000 3
1 2010 7
2 2010 3
We use pandas to read them and save them in a data structure called a dataframe
In [13]:
#First import required library
import pandas as pd
In [4]:
## Read excel
# sheet_name=0 selects the first sheet of the workbook. The keyword is spelled
# "sheet_name"; the older spelling "sheetname" was deprecated and later removed from pandas.
excelFrame = pd.read_excel("data/class1_test_excel.xlsx", sheet_name=0)
#Print the first 5 lines
print(excelFrame.head(5))
In [7]:
#However jupyter notebooks show you what is inside your last command, so you can skip the print, and it looks nicer
excelFrame.head(5)
Out[7]:
In [8]:
## Read stata
stataFrame = pd.read_stata("data/class1_test_stata.dta")
stataFrame.head(5)
Out[8]:
In [10]:
## Read csv
csvFrame = pd.read_csv("data/class1_test_csv.csv",
sep="\t",skiprows=4,na_values=["-9"])
csvFrame.head()
Out[10]:
1 "a" 2000 2 "b" 3000
In [ ]:
We'll focus on CSVs because they are universal: you can read them with any text editor, and you can export your data as CSV from almost any program
outsheet id gender race read write science using outfile.csv , comma
Save as -> csv (or text to use tabs)
Other Examples
Argument of a function
Arguments of pd.read_csv()
filepath_or_buffer: This is required: the name of the file. If the file is inside a folder you need to write the name of the folder too. For instance, if the file "example.csv" is inside the folder "data", you need to write "data/example.csv"
sep: "\t" for tab, "," for comma, ";" for semicolon, etc
header: 0 if the first line has column names. None if the first line already has data.
skiprows: number of lines to skip at the beginning
skipfooter: number of lines to skip at the end
usecols: what columns do you want to read? The default is all, but you can say [0,3,4] or ["column_x","column_y"]
na_values: what other values should be considered missing (e.g. ["n.a.","-9","-999"])
thousands: what is the thousands separator; usually there is none (None)
decimal: what is the decimal separator. Americans use "."; Europeans use ","; in science we use ".".
In [11]:
#To find more about the function use
pd.read_csv?
In [12]:
#To find a lot more about the function use
pd.read_csv??
We will be using a very small dataset (data/class1_test_csv.csv)
sep="\t"
index_col=None
skiprows=4
na_values=["-9"]
In [14]:
#First we read our data: a tab-separated file, skipping the first 4 lines
#and treating the value "-9" as missing
import pandas as pd
df = pd.read_csv("data/class1_test_csv.csv",sep="\t",skiprows=4,na_values=["-9"])
In [15]:
#And print it
df
Out[15]:
In [16]:
#Describe the data
#use df.describe? (outside a comment) to get help
df.describe()
Out[16]:
You can calculate the mean with df.mean() (or the median, std, etc)
In [19]:
## Calculate mean by columns
## axis is a very common argument. The computer by default gets the mean by column
#df.mean() === df.mean(axis=0)
df.mean(axis=0)
Out[19]:
In [20]:
df.mean(axis=1) #By rows
Out[20]:
In [18]:
df.head()
Out[18]:
In [19]:
## Keep ONE column
df["treatment"]
Out[19]:
In [22]:
## Keep SEVERAL columns (pass a list of column names, hence the double brackets)
df[["year","treatment"]]
Out[22]:
In [21]:
df.iloc[:5]
Out[21]:
In [23]:
df
Out[23]:
In [49]:
df["year"].isin([2000,2010])
Out[49]:
In [53]:
cond = df["year"].isin( [2000,2010] )
cond
Out[53]:
In [54]:
df.loc[cond]
Out[54]:
In [59]:
df.loc[df["year"] == 2000]
Out[59]:
In [ ]:
()
mean
list
sorted
.isin
[]
df.loc[]
df["Year"]
["A","b","c"]
In [26]:
df_2000 = df[df["year"] == 2000]
Out[26]:
In [23]:
#For example we want to keep the rows with the year 2000
#We can create the condition
condition = df["year"] == 2000
print(condition)
In [68]:
df.loc[df["year"] == 2000 , ["score","events"] ]
Out[68]:
In [26]:
#And then filter. In a numpy array you could do np_array[condition]. Here you do df[condition]
condition = df["year"] == 2000
df[condition] # df[df["year"]==2000]
Out[26]:
In [30]:
df[df["year"].isin([2000,2010])]
Out[30]:
In [27]:
#If they meet more than one condition
condition = df["year"].isin([2000,2010])
df[condition]
Out[27]:
In [30]:
df_treat_and_year = df[["treatment","year"]]
In [34]:
df_treat_and_year.head()
Out[34]:
In [31]:
x = [1,2,3]
In [ ]:
df.loc[df["year"]==2000, ["year","treatment"]]
In [35]:
#Keeping the columns year and treatment for the year 2000
condition = df["year"] == 2000
df.loc[condition,["year","treatment"]]
Out[35]:
In [37]:
#df[["year","treatment"]]
df.loc[:,["year","treatment"]].head()
Out[37]:
In [69]:
df
Out[69]:
In [71]:
df["test"] = df["score"]/2
df
Out[71]:
In [37]:
df["score_sq"] = df["score"]**2
df
Out[37]:
In [38]:
#Add two example columns numbered 1..N, where N is the number of rows in df.
#Deriving N from len(df) (instead of typing out twelve values by hand) keeps this cell
#working if the data has a different number of rows, or if the cell is re-run after
#rows were added or dropped
row_numbers = list(range(1, len(df) + 1))
df["happiness"] = row_numbers
df["events"] = row_numbers
In [40]:
df
Out[40]:
In [41]:
#create new rows: assigning a list to a new index label (12) appends a row.
#The list needs one value per column of df, in column order.
#NOTE(review): other cells in this notebook add "test" and "score_sq" columns to df;
#if those have been run, df has more than six columns - confirm this six-value row still fits
df.loc[12] = [2,2017,2,9.,10,23]
In [43]:
df.sort_values(by=["treatment","score"],ascending=[True,False])
Out[43]:
In [46]:
df_no_nan = df.dropna(subset=["score"])
In [47]:
df.columns
Out[47]:
In [48]:
df.columns = ["ID","year","treatment","score","happiness","events"]
df.head()
Out[48]:
In [49]:
#this tells the computer to plot everything here
%matplotlib inline
#importing this library makes the default colors be beautiful
import seaborn as sns
#this import matplotlib
import pylab as plt
In [ ]:
#Template with the most common commands to customize a figure.
#create a figure with a size (measured in inches!)
plt.figure(figsize=(4,3))
#add a title to the figure
plt.title("Title")
#add a label in the x and y axis
plt.xlabel("X axis label")
plt.ylabel("Y axis label",fontsize=14) #we can add the font size to all the functions where we pass text
#add a legend (it shows the plotted elements that were given a label)
plt.legend()
#use log scale in the x and y axis
plt.xscale("log")
plt.yscale("log")
#trim the x axis between 1 and 100 (to make it look like you want, it depends on your specific values)
plt.xlim((1,100))
#add grid lines (vertical/horizontal) at the minor ticks with transparency 50%
plt.grid(which='minor',alpha=0.5)
#take out the grid
plt.grid(False)
#save the figure (I CAN'T STRESS ENOUGH: SAVE AS PDF FOR ANY PAPER YOU WRITE!)
plt.savefig("plots/name_of_figure.pdf")
#show the figure (not required in jupyter notebooks but still good to write it)
plt.show()
Used to plot two quantitative variables against each other. We can add one extra quant. variable if we use bubble size and one qualitative if we use bubble color.
Important arguments:
In [50]:
plt.scatter?
In [51]:
df.head()
Out[51]:
In [39]:
df = df.dropna()
In [40]:
x = df["score"] #x values
y = df["happiness"] #y values
c = df["treatment"] #color
s = df["events"] #size
#to convert the pandas column into a np.array you need to write "values".
#It's not needed to plot but it is to print it nicely
print(x.values)
print(y.values)
print(c.values)
print(s.values)
#Create a figure
plt.figure(figsize=(6,4))
#Make the scatter plot, using treatment as color, the number of events times 20
#as size of the marker, no edgecolor and a Red-Yellow-Blue colormap
plt.scatter(x,y,c=c,s=s*20,edgecolor="none",cmap="RdYlBu")
plt.xlabel("Score",fontsize=12)
plt.ylabel("Happiness",fontsize=12)
plt.show()
In [42]:
plt.scatter(df["score"],df["happiness"])
Out[42]:
Used to plot two quantitative variables against each other. We can add one qualitative if we use several lines. Importantly, the x variable must be ordered.
Important arguments:
In [ ]:
plt.plot?
In [58]:
#Data of person 1,2,3 and 4: keep the rows of each ID and only the columns year and score
df_1 = df.loc[df["ID"]==1,["year","score"]]
df_2 = df.loc[df["ID"]==2,["year","score"]]
df_3 = df.loc[df["ID"]==3,["year","score"]]
df_4 = df.loc[df["ID"]==4,["year","score"]]
#let's use the default matplotlib colors (instead of the seaborn colors)
sns.reset_orig() #sns.set() to bring back the seaborn colors
#create plot
plt.figure(figsize=(6,4))
#plot the score for all years for the people.
#Persons 1 and 2 share one color and persons 3 and 4 another; only the first line of
#each color gets a label, so the legend has one entry per color
plt.plot(df_1["year"],df_1["score"],marker="o",color="#0D4F8B",linewidth=2,label="Treatment 1")
plt.plot(df_2["year"],df_2["score"],marker="o",color="#0D4F8B",linewidth=2,label="") #no legend for this guy
plt.plot(df_3["year"],df_3["score"],marker="o",color="#e60000",linewidth=2,label="Treatment 2")
plt.plot(df_4["year"],df_4["score"],marker="o",color="#e60000",linewidth=2,label="") #no legend for this guy
#Make a legend. loc=0 means "best" (matplotlib chooses the position itself);
#other values of the "loc" argument place it in a specific corner
plt.legend(loc=0)
#Labels
plt.xlabel("Year",fontsize=14)
plt.ylabel("Score",fontsize=14)
#Add some more space so the markers are not cut
plt.xlim(1999.8,2010.2)
plt.ylim(2.8,9.2)
plt.show()
Used to plot the ranges of one quantitative variable for different categories. We can add another qualitative variable splitting into colors (hue).
Important arguments:
In [ ]:
sns.boxplot?
In [5]:
#import matplotlib's plotting interface (preferred over the discouraged "pylab" namespace)
import matplotlib.pyplot as plt
In [11]:
sns.set(font_scale=1.2) #20% larger fonts
#Create figure
plt.figure(figsize=(6,4))
#Make box plot: one box per treatment, split by year (hue), in shades of blue
sns.boxplot(x="treatment", y="score",hue='year', data=df,palette="Blues")
#Remove the top and right borders (spines) of the plot and trim the remaining ones
sns.despine(trim=True)
#Save the figure as PDF
plt.savefig("plots/d.pdf")
plt.show()
But is this the most appropriate plot?
A point plot represents an estimate of central tendency for a numeric variable by the position of scatter plot points and provides some indication of the uncertainty around that estimate using error bars.
Important arguments:
In [ ]:
sns.pointplot?
In [60]:
sns.set(font_scale=1.) #font scale 1 = fonts at their normal size
#Create figure
plt.figure(figsize=(6,4))
#Make point plot: score by year, one line per treatment (hue)
sns.pointplot(x="year", y="score", hue="treatment",data=df)
#Remove the top and right borders (spines) of the plot and trim the remaining ones
sns.despine(trim=True)
plt.show()
In [61]:
Image("http://i.imgur.com/WRuJV6r.png")
Out[61]:
In [62]:
#we have a list
this_is_a_list = [1,2,3,4,5]
#this is the length
len_list = len(this_is_a_list)
print(len_list)
#we try to get the element at index len_list (5). It doesn't exist: indices start at 0,
#so the last valid index is 4 (the fifth element). This raises an IndexError on purpose
this_is_a_list[len_list]
In [63]:
this_is_a_list = [1,2,3,4,5]
#we try to add the fourth element to a variable, but sum_all is not defined in this cell;
#unless it was created somewhere else before, this raises a NameError on purpose
sum_all = sum_all + this_is_a_list[3]
In [64]:
#missing parenthesis
sum([1,2,3]
In [65]:
#you cannot tell that 3 is 5, the computer is smarter than that
3 = 5
In [66]:
#Careful with this one
"A" == "a"
Out[66]:
In [67]:
open("non_existing_file","r")
In [ ]:
#The mistake from earlier
d = dict({"Him": 0, "Her": 1})
d["You"]
In [68]:
this_is_a_list = [0,1,2,3,4]
this_is_a_list + 8
In [ ]:
this_is_a_list = [0,1,2,3,4]
this_is_a_list + [8]
In [69]:
this_is_a_list = [0,1,2,3,4]
this_is_a_list.add(8)
In [ ]:
#THIS DOES NOT WORK IN PLACE
this_is_a_list = [4,3,2,1,0]
print(this_is_a_list)
print(sorted(this_is_a_list))
print(this_is_a_list)
In [ ]:
#THIS WORKS IN PLACE
this_is_a_list = [4,3,2,1,0]
print(this_is_a_list)
print(this_is_a_list.sort())
print(this_is_a_list)
In [ ]:
#So usually you would do
this_is_a_list = [4,3,2,1,0]
sorted_list = sorted(this_is_a_list)
print(this_is_a_list)
print(sorted_list)
In [ ]:
#But if you do that with a function that works in place you may not get what you expect:
#.sort() sorts the list itself and returns None, so sorted_list ends up being None
this_is_a_list = [4,3,2,1,0]
sorted_list = this_is_a_list.sort()
print(this_is_a_list)
print(sorted_list)
In [ ]:
#Some functions that you will use and work in place: APPEND to list
this_is_a_list = [4,3,2,1,0]
this_is_a_list.append(3) #add a 3 to the end
print(this_is_a_list)
In [ ]:
#Some functions that you will use and work in place: POP to list
this_is_a_list = [4,3,2,1,0]
this_is_a_list.pop(-1) #remove last element
print(this_is_a_list)