Javier Garcia-Bernardo garcia@uva.nl
In [77]:
##Some code to run at the beginning of the file, to be able to show images in the notebook
##Don't worry about this cell
#Print the plots in this screen
%matplotlib inline
#Be able to plot images saved in the hard drive
from IPython.display import Image
#Make the notebook wider
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
import seaborn as sns
import pylab as plt
import pandas as pd
import numpy as np
def read_our_csv(path="../class2/data/CITIES_19122016195113034.csv"):
    """Read the raw OECD cities export and pivot it into one column per variable.

    The file is read as tab-separated text. Each raw row carries one
    measurement ("Value") of one variable ("VAR"); the result has one row per
    (METRO_ID, Metropolitan areas, Year) and one column per distinct VAR.
    Repeated measurements for the same row/variable are aggregated with
    pivot_table's default (the mean).

    Parameters
    ----------
    path : str, optional
        Location of the tab-separated file. Defaults to the class data file,
        so existing calls without arguments keep working.

    Returns
    -------
    pandas.DataFrame
        The pivoted table with the index columns restored as normal columns.
    """
    # reading the raw data from oecd
    df = pd.read_csv(path, sep="\t")

    # fixing the columns (the first one is ""METRO_ID"" instead of "METRO_ID")
    cols = list(df.columns)
    cols[0] = "METRO_ID"
    df.columns = cols

    # pivot the table: spread the values of "VAR" into columns
    column_with_values = "Value"
    column_to_split = ["VAR"]
    variables_already_present = ["METRO_ID", "Metropolitan areas", "Year"]
    df_fixed = df.pivot_table(values=column_with_values,
                              index=variables_already_present,
                              columns=column_to_split).reset_index()
    return df_fixed
In [3]:
#Figure from Tufte, "The Visual Display of Quantitative Information"
#(shown from the local images/ folder)
Image(url="images/tufle1.png")
Out[3]:
In [125]:
# Anscombe's quartet, using the example dataset that seaborn can load by name
sns.set(style="ticks")
df = sns.load_dataset("anscombe")
# One panel per dataset (4 panels per row), each with its scatter and a linear fit;
# ci=None leaves out the confidence band.
# NOTE(review): newer seaborn releases call the `size` argument `height` - confirm the installed version.
point_style = {"s": 50, "alpha": 1}
sns.lmplot(data=df, x="x", y="y",
           col="dataset", hue="dataset", col_wrap=4,
           palette="muted", ci=None, size=4,
           scatter_kws=point_style)
Out[125]:
In [19]:
#Figure source: http://www.cs171.org/2015/assets/slides/05-marks_channels.pdf
#Shown from the local images/ folder with width=1000
Image(url="images/channels.png",width=1000)
Out[19]:
In [20]:
#Figure source: http://www.cs171.org/2015/assets/slides/05-marks_channels.pdf
#Shown from the local images/ folder with width=1000
Image(url="images/cleveland.png",width=1000)
Out[20]:
In [16]:
#Background reading: https://en.wikipedia.org/wiki/Stevens'_power_law
#Figure source: http://www.cs171.org/2015/assets/slides/05-marks_channels.pdf
#Shown from the local images/ folder with width=500
Image(url="images/steven.png",width=500)
Out[16]:
In [126]:
#Two thin bars of height 1 and 3.5: comparing lengths
plt.figure(figsize=(4,3))
plt.bar([1,2],[1,3.5],width=0.3)
plt.yticks([1,2,3,4])
plt.xticks([])
#Hide every tick mark and tick label so only the grid lines remain.
#Real booleans are needed here: the string 'off' is truthy, so recent matplotlib would SHOW the ticks.
plt.tick_params(axis='both', left=False, top=False, right=False, bottom=False, labelleft=False, labeltop=False, labelright=False, labelbottom=False)
plt.grid(True)
plt.title("How much bigger?")
Out[126]:
In [49]:
#IPython help lookup for plt.scatter (interactive leftover; NOTE(review): consider removing from the final notebook)
plt.scatter?
In [127]:
#Two markers with s=500 and s=1250: comparing marker sizes
plt.figure(figsize=(4,3))
plt.scatter([1,1.1],[1,1],s=[500,1250])
#Hide every tick mark and tick label.
#Real booleans are needed here: the string 'off' is truthy, so recent matplotlib would SHOW the ticks.
plt.tick_params(axis='both', left=False, top=False, right=False, bottom=False, labelleft=False, labeltop=False, labelright=False, labelbottom=False)
plt.grid(True)
plt.title("How much bigger?")
Out[127]:
In [133]:
#Two bars that differ in both height and width: comparing areas
plt.figure(figsize=(4,3))
plt.bar([1,2],[2*np.sqrt(0.5),3.5],width=[np.sqrt(0.5),1])
plt.yticks([1,2,3,4])
plt.xticks([0.5,1,2,3,3.5])
#Hide every tick mark and tick label so only the grid lines remain.
#Real booleans are needed here: the string 'off' is truthy, so recent matplotlib would SHOW the ticks.
plt.tick_params(axis='both', left=False, top=False, right=False, bottom=False, labelleft=False, labeltop=False, labelright=False, labelbottom=False)
plt.grid(True)
plt.title("How much bigger?")
Out[133]:
In [13]:
#Two equal bars filled with different greys (RGB given as 0-255 values scaled to 0-1)
plt.figure(figsize=(4,3))
plt.bar([1,2],[2,2],width=[1,1],color=[(20/255,20/255,20/255),(100/255,100/255,100/255)])
plt.yticks([1,2,3,4])
plt.xticks([0.5,1,2,3,3.5])
#Hide every tick mark and tick label so only the grid lines remain.
#Real booleans are needed here: the string 'off' is truthy, so recent matplotlib would SHOW the ticks.
plt.tick_params(axis='both', left=False, top=False, right=False, bottom=False, labelleft=False, labeltop=False, labelright=False, labelbottom=False)
plt.grid(True)
plt.title("How much darker?")
Out[13]:
In [15]:
#Two equal bars filled with two different blues (RGB given as 0-255 values scaled to 0-1)
plt.figure(figsize=(4,3))
plt.bar([1,2],[2,2],width=[1,1],color=[(0/255,74/255,235/255),(59/255,96/255,176/255)])
plt.yticks([1,2,3,4])
plt.xticks([0.5,1,2,3,3.5])
#Hide every tick mark and tick label so only the grid lines remain.
#Real booleans are needed here: the string 'off' is truthy, so recent matplotlib would SHOW the ticks.
plt.tick_params(axis='both', left=False, top=False, right=False, bottom=False, labelleft=False, labeltop=False, labelright=False, labelbottom=False)
plt.grid(True)
plt.title("How much more saturation?")
Out[15]:
When we do this, the area below the curve is equal to 1.
When are we interested in this:
In [18]:
#Data example: if you roll two dice, you will get a lot of 7s, many 6s and 8s, some 5s and 9s,
#a few 4s and 10s, very few 3s and 11s, and almost no 2s and 12s.
#This data is discrete
from collections import Counter
#Seeded generator: the simulated rolls (and every plot built on them) are the same on each run
rng = np.random.RandomState(42)
#Roll two dice 10000 times (the upper bound 7 is exclusive, so each die gives 1..6)
dice_rolls = rng.randint(1,7,10000) + rng.randint(1,7,10000)
#Count the number of each element to create the distribution
Counter(dice_rolls)
Out[18]:
In [30]:
from scipy.stats import norm,lognorm,expon
#Back to the seaborn default theme
sns.set()
#Normalised histogram of the dice sums with a fitted normal curve drawn on top;
#the bin edges run over the integers 2..13
bin_edges = range(2,14)
plt.figure(figsize=(4,3))
sns.distplot(dice_rolls, bins=bin_edges, norm_hist=True,
             kde=False, rug=False, fit=norm)
Out[30]:
In [112]:
#Shell command: updates pandas via conda without asking for confirmation (-y).
#NOTE(review): this changes the environment in the middle of the notebook; the kernel presumably needs a restart to pick up the new version - confirm.
!conda update pandas -y
In [116]:
#And we can visualize the same dice sums with a horizontal box plot
plt.figure(figsize=(4,2))
sns.boxplot(dice_rolls,orient="h")
Out[116]:
In [119]:
#And we can visualize the same dice sums with a horizontal violin plot (quartile lines drawn inside)
plt.figure(figsize=(4,2))
sns.violinplot(dice_rolls,orient="h",inner="quartiles")
Out[119]:
When are we interested in this:
This plot only makes sense if we have many categories
In [141]:
#Bar plot of the dice sums: the bar shows the estimator applied to the values
plt.figure(figsize=(4,2))
#mean is the default
sns.barplot(dice_rolls,estimator=np.mean)
Out[141]:
In [78]:
# Load the pivoted table and keep only the rows whose METRO_ID starts with "IT"
# (the two-character prefix is stored in column "C"; presumably a country code - confirm).
df = read_our_csv()
df["C"] = df["METRO_ID"].str[:2]
is_it = df["C"] == "IT"
df = df.loc[is_it]
df.head()
Out[78]:
In [171]:
# Box plot of GDP_PC, one box per metropolitan area
plt.figure(figsize=(6, 4))
sns.boxplot(data=df, x="Metropolitan areas", y="GDP_PC", color="gray")
# Rotate the area names on the x axis
plt.xticks(rotation=45)
plt.show()
In [170]:
# Violin plot of GDP_PC, one violin per metropolitan area, with quartile lines inside
plt.figure(figsize=(6, 4))
sns.violinplot(data=df, x="Metropolitan areas", y="GDP_PC",
               inner="quartiles", color="gray")
# Rotate the area names on the x axis
plt.xticks(rotation=45)
plt.show()
In [169]:
# Bar plot of GDP_PC, one bar per metropolitan area
plt.figure(figsize=(6, 4))
sns.barplot(data=df, x="Metropolitan areas", y="GDP_PC", color="gray")
# Rotate the area names on the x axis
plt.xticks(rotation=45)
plt.show()
In [79]:
# GDP_PC per metropolitan area, with one bar per selected year (hue="Year").
# NOTE(review): the variable is named df_2010_15, but the filter keeps the years 2005 and 2010.
in_selected_years = df["Year"].isin([2005,2010])
df_2010_15 = df.loc[in_selected_years, :]
plt.figure(figsize=(6, 4))
sns.barplot(data=df_2010_15, x="Metropolitan areas", y="GDP_PC", hue="Year")
plt.xticks(rotation=45)
plt.show()
In [88]:
# Same two-year comparison, with the rows sorted by GDP_PC before plotting.
# NOTE(review): the variable is named df_2010_15, but the filter keeps the years 2005 and 2010.
in_selected_years = df["Year"].isin([2005,2010])
df_2010_15 = df.loc[in_selected_years, :].sort_values(by="GDP_PC")
plt.figure(figsize=(6, 4))
sns.barplot(data=df_2010_15, x="Metropolitan areas", y="GDP_PC", hue="Year")
plt.xticks(rotation=45)
plt.show()
In [202]:
# Plain scatter of UNEMP_R against GDP_PC: fit_reg=False turns the regression line off
sns.lmplot(data=df, x="GDP_PC", y="UNEMP_R",
           fit_reg=False, size=4, aspect=1.4)
plt.show()
And we can add a trendline
In [217]:
# Scatter of UNEMP_R against GDP_PC with a trend line fitted on log(x) (logx=True).
# lmplot is a figure-level function: it always builds its own figure, so calling
# plt.figure(figsize=(6,4)) before it only produces an extra empty figure.
# The 6x4 size is requested through size/aspect instead (width = size * aspect).
sns.lmplot(x="GDP_PC",y="UNEMP_R",data=df, logx=True, size=4, aspect=1.5)
plt.show()
In [38]:
# Scatter of UNEMP_R against GDP_PC with a lowess (locally weighted) trend line.
# lmplot is a figure-level function: it always builds its own figure, so calling
# plt.figure(figsize=(6,4)) before it only produces an extra empty figure.
# The 6x4 size is requested through size/aspect instead (width = size * aspect).
sns.lmplot(x="GDP_PC",y="UNEMP_R",data=df, lowess=True, size=4, aspect=1.5)
plt.show()
And we can add the marginal distributions
In [220]:
# Joint scatter of GDP_PC and UNEMP_R; the options below are passed to the marginal plots
marginal_options = dict(bins=20, rug=False, kde=True, kde_kws={"cut":0})
sns.jointplot(data=df, x="GDP_PC", y="UNEMP_R",
              marginal_kws=marginal_options,
              size=6, alpha=0.5)
Out[220]:
And we can bin the data
In [99]:
# Same joint plot with the points binned into hexagons (kind="hex")
marginal_options = dict(bins=20, rug=False, kde=True, kde_kws={"cut":0})
sns.jointplot(data=df, x="GDP_PC", y="UNEMP_R", kind="hex",
              marginal_kws=marginal_options,
              gridsize = 15, size=6)
# Leave room above the axes, then write the figure-level title there
plt.subplots_adjust(top=0.9)
plt.suptitle('THIS IS A TITLE, YOU BET') # can also get the figure from plt.gcf()
Out[99]:
In [93]:
#Time series of UNEMP_R over Year: every metropolitan area is one unit, aggregated with np.mean
sns.tsplot(time="Year",unit="Metropolitan areas",value="UNEMP_R",data=df.reset_index(),estimator=np.mean)
Out[93]:
In [89]:
#IPython help lookup for sns.tsplot (interactive leftover; NOTE(review): consider removing from the final notebook)
sns.tsplot?
In [224]:
# Scatter of UNEMP_R against GDP_PC, coloured by metropolitan area, without a regression line
sns.lmplot(data=df, x="GDP_PC", y="UNEMP_R", hue="Metropolitan areas",
           fit_reg=False, size=4, aspect=1.4)
plt.show()
In [73]:
#One UNEMP_R line per metropolitan area (condition=...), which also adds a legend entry for each area
sns.tsplot(time="Year",unit="Metropolitan areas",value="UNEMP_R",condition="Metropolitan areas",data=df.reset_index())
#Save the current figure to a PDF in the working directory
plt.savefig("annoying_legend.pdf")
In [74]:
#How to move the legend out
sns.tsplot(time="Year",unit="Metropolitan areas",value="UNEMP_R",condition="Metropolitan areas",data=df.reset_index())
sns.plt.legend(loc='center left',bbox_to_anchor=(1,0.5))
Out[74]:
In [100]:
# Keep the area name plus five variables, and drop every row that has a missing value
subset_columns = ["Metropolitan areas","CO2_PC","GDP_PC","GREEN_AREA_PC","POP_DENS","UNEMP_R"]
df_subset = df.loc[:, subset_columns].dropna()
In [76]:
#Pairwise plots of every variable in the subset, coloured by metropolitan area (rows with missing values dropped)
sns.pairplot(df_subset.dropna(),hue="Metropolitan areas")
Out[76]:
In [102]:
#Show the first rows of the subset
df_subset.head()
Out[102]:
In [105]:
#Pairwise correlation between the numeric variables.
#The text column "Metropolitan areas" is excluded explicitly: recent pandas raises
#on non-numeric columns instead of silently dropping them.
corr = df_subset.select_dtypes(include=[np.number]).corr()
corr
Out[105]:
In [106]:
#Element-wise square of the correlation matrix
corr**2
Out[106]:
In [264]:
# Compute the correlation matrix of the numeric variables.
# The text column "Metropolitan areas" is excluded explicitly: recent pandas raises
# on non-numeric columns instead of silently dropping them.
corr = df_subset.select_dtypes(include=[np.number]).corr()
# Generate a mask for the upper triangle (hide the upper triangle, diagonal included).
# Use the builtin bool: the np.bool alias has been removed from NumPy.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, square=True,linewidths=.5)
plt.show()
Use gephi: https://gephi.org/
This is what we do at the corpnet group (corpnet.uva.nl)
In [189]:
#Network figure shown from the local images/ folder
Image(url="images/newtork.png")
Out[189]:
In [111]:
# Two bars (heights 5 and 0.2) and a horizontal line at y=1, drawn on a logarithmic y axis
bar_positions = [1,2]
bar_heights = [5,0.2]
plt.bar(bar_positions, bar_heights)
plt.plot([1,3],[1,1])
plt.yscale("log")