In [330]:
#Standard imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import Image, display, HTML
#Make the notebook wider
display(HTML("<style>.container { width:90% !important; }</style>"))
In [328]:
#Create a toy dataframe
df = pd.DataFrame([[1,2,3],[1,2,3]],columns=["a","b","c"])
df
Out[328]:
In [263]:
#Saving the toy dataframe above
df.to_csv("data/toy_df.csv",sep="\t",index=False)
A variable is any characteristic, number, or quantity that can be measured or counted: e.g. income, height, happiness
Numeric: the values are numbers (e.g. income, height)
Categorical: the values are categories (e.g. education level)
An observation is a single recorded value of a variable: e.g. 800, 1.93, 8, "middle"
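For instance, a minimal sketch (with made-up values), where each column is a variable and each row is an observation:
In [ ]:
#Toy survey (hypothetical values): "height" is numeric, "income_band" is categorical
survey = pd.DataFrame({"height": [1.93, 1.70, 1.65],
                       "income_band": ["high", "middle", "low"]})
survey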
In [264]:
Image(url="http://www.abs.gov.au/websitedbs/a3121120.nsf/4a256353001af3ed4b2562bb00121564/5869990518bbb7feca257b55002261e0/Body/3.109A!OpenElement&FieldElemFormat=gif")
Out[264]:
Based on: http://vita.had.co.nz/papers/tidy-data.pdf
Tidy data is a standard way of mapping the meaning of a dataset to its structure. [...] In tidy data:
1. Each variable forms a column.
2. Each observation forms a row.
3. Each type of observational unit forms a table.
In [265]:
%%html
<h3>There are three variables in this data set. What are they? </h3>
<table>
<tr> <th></th> <th>2010</th> <th>2015</th> </tr>
<tr> <td>Male</td> <td>0</td> <td>5</td> </tr>
<tr> <td>Female</td> <td>1</td> <td>4</td> </tr>
</table>
In [268]:
%%html
<h3>Tidy data</h3>
<table>
<tr> <th>year</th> <th>sex</th> <th>count</th> </tr>
<tr> <td>2010</td> <td>female</td> <td>1</td> </tr>
<tr> <td>2010</td> <td>male</td> <td>0</td> </tr>
<tr> <td>2015</td> <td>female</td> <td>4</td> </tr>
<tr> <td>2015</td> <td>male</td> <td>5</td> </tr>
</table>
Real datasets can, and often do, violate the three precepts of tidy data in almost every way imaginable. While occasionally you do get a dataset that you can start analysing immediately, this is the exception, not the rule:
In [269]:
#Example of melting (columns to rows)
print("Melting")
Image("figures/melt.png",width=400)
Out[269]:
In [270]:
#Example of pivoting (rows to columns)
print("Pivoting")
Image("figures/pivot.png",width=800)
Out[270]:
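As a minimal sketch (using our own column names), this is how melt turns the small sex-by-year table above into its tidy version:
In [ ]:
#Rebuild the messy table and melt it: columns to rows
wide = pd.DataFrame({"sex": ["Male", "Female"], "2010": [0, 1], "2015": [5, 4]})
pd.melt(wide, id_vars=["sex"], var_name="year", value_name="count")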
Data to study: variations of https://stats.oecd.org/Index.aspx?DataSetCode=CITIES (whose raw export stores multiple variables in one column)
In [331]:
#Data on population density by year
df = pd.read_csv("data/columns_headers.csv")
df.head()
Out[331]:
In [332]:
### Fix
#Variables already present as columns
variables_already_present = ['METRO_ID', 'Metropolitan areas']
#Columns to combine into one variable
columns_combine = ['2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014']
#Fix
fixed = pd.melt(df, id_vars=variables_already_present,
                value_vars=columns_combine,
                var_name="Year",
                value_name="POP_DENS")
fixed.head(10)
Out[332]:
In [334]:
#Actual data from OECD
df = pd.read_csv("data/multiple_variables.csv")
df.head(2)
Out[334]:
In [340]:
### Fix
#Columns already present
variables_already_present = ["METRO_ID","Metropolitan areas","Year"]
#Column with the variables
column_to_split = ["VAR"]
#Column with the values
column_with_values = "Value"
#Fix
df_fixed = df.pivot_table(column_with_values,
                          variables_already_present,
                          column_to_split)
df_fixed.head()
Out[340]:
In [341]:
#use .reset_index() to convert index to columns
df_fixed.reset_index()
Out[341]:
In [297]:
df = pd.read_csv("data/multiple_variables_and_headers.csv")
df.head()
Out[297]:
In [298]:
print(list(df.columns)[4:])
In [299]:
cols = []
for column in df.columns:
    try:
        int(column)
        cols.append(column)
    except ValueError:
        pass
print(cols)
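An equivalent sketch in one line, assuming all the column names are strings (they are, since they come from the CSV header):
In [ ]:
#Keep only the column names made entirely of digits
cols = [column for column in df.columns if column.isdigit()]
print(cols)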
In [300]:
### Fix step 1
variables_already_present = ['METRO_ID', 'Metropolitan areas', 'VAR']
columns_combine = cols
df = pd.melt(df,
             id_vars=variables_already_present,
             value_vars=columns_combine,
             var_name="Year",
             value_name="Value")
df.head()
Out[300]:
In [301]:
### Fix step 2
column_with_values = "Value"
column_to_split = ["VAR"]
variables_already_present = ["METRO_ID","Metropolitan areas","Year"]
df.pivot_table(column_with_values,
               variables_already_present,
               column_to_split).reset_index().head()
Out[301]:
In [302]:
#Add more observations
Image(url="http://pandas.pydata.org/pandas-docs/stable/_images/merging_concat_basic.png")
Out[302]:
In [304]:
madrid = pd.read_csv("data/mad_twovariables.csv").head(2)
amsterdam = pd.read_csv("data/ams_twovariables.csv").head(2)
display(madrid)
display(amsterdam)
In [305]:
combined = pd.concat([madrid,amsterdam])
combined
Out[305]:
Note that it's not a problem if the two dataframes don't have exactly the same columns.
But in that case you'll end up with missing values.
In [306]:
a = pd.DataFrame([[1,2],[3,4]],columns=["a","b"])
b = pd.DataFrame([[1,2],[3,4]],columns=["c","b"])
display(a)
display(b)
In [307]:
pd.concat([a,b])
Out[307]:
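A side note: by default concat keeps the original row indices, so they repeat. A minimal sketch of getting a fresh 0..n-1 index instead:
In [ ]:
#ignore_index=True renumbers the rows of the combined dataframe
pd.concat([a, b], ignore_index=True)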
In [308]:
Image(url="http://pandas.pydata.org/pandas-docs/stable/_images/merging_merge_on_key.png",width=700)
Out[308]:
In [309]:
ams_green = pd.read_csv("data/ams_green.csv")
ams_pop = pd.read_csv("data/ams_popdens.csv")
display(ams_green.head(2))
display(ams_pop.head(2))
In [343]:
combined = pd.merge(ams_green, ams_pop,
                    left_on=["METRO_ID","Metropolitan areas","Year"],
                    right_on=["METRO_ID","Metropolitan areas","Year"],
                    how="inner")
how="inner")
combined.head()
Out[343]:
In [311]:
print("Inner join pd.merge(left_df,right_df,how=\"inner\")")
display(Image(url="http://pandas.pydata.org/pandas-docs/stable/_images/merging_merge_on_key_inner.png"))
print("Right join pd.merge(left_df,right_df,how=\"right\")", "The same with left")
display(Image(url="http://pandas.pydata.org/pandas-docs/stable/_images/merging_merge_on_key_right.png"))
print("Outer join pd.merge(left_df,right_df,how=\"outer\")")
display(Image(url="http://pandas.pydata.org/pandas-docs/stable/_images/merging_merge_on_key_outer.png"))
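A toy sketch (with made-up frames) to see the difference between the join types in actual output:
In [ ]:
#"K1" is the only key present in both frames
left = pd.DataFrame({"key": ["K0", "K1"], "x": [1, 2]})
right = pd.DataFrame({"key": ["K1", "K2"], "y": [3, 4]})
display(pd.merge(left, right, on="key", how="inner")) #only K1 survives
display(pd.merge(left, right, on="key", how="outer")) #K0, K1 and K2, with NaNs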
In [350]:
def mean_(some_numbers):
    return np.mean(some_numbers)
In [351]:
a = mean_([1,2,3])
print(a)
In [ ]:
np.mean([1,2,3])
In [357]:
def read_our_csv():
    #Read the raw data from the OECD
    df = pd.read_csv("data/CITIES_19122016195113034.csv", sep="\t")
    #Fix the columns (the first header comes in as '"METRO_ID"', with literal quotes, instead of METRO_ID)
    cols = list(df.columns)
    cols[0] = "METRO_ID"
    df.columns = cols
    #Pivot the table
    column_with_values = "Value"
    column_to_split = ["VAR"]
    variables_already_present = ["METRO_ID", "Metropolitan areas", "Year"]
    df_fixed = df.pivot_table(column_with_values,
                              variables_already_present,
                              column_to_split).reset_index()
    return df_fixed
In [352]:
df_fixed = read_our_csv()
df_fixed.head()
Out[352]:
In [316]:
#Read and fix the data
df_fixed = read_our_csv()
df_fixed = df_fixed.loc[df_fixed["Year"]==2000,["GREEN_AREA_PC","POP_DENS"]]
df_fixed.head()
Out[316]:
In [317]:
plt.figure(figsize=(6,4))
plt.scatter(df_fixed["GREEN_AREA_PC"],df_fixed["POP_DENS"],edgecolor="none",color=(70/255,140/255,210/255))
plt.yscale("log")
plt.xscale("log")
plt.ylim(10,10000)
plt.xlim(10,10000)
plt.xlabel("Green area (m^2/million people)")
plt.ylabel("Density population (people/km^s)")
Out[317]:
Imagine we want to calculate the fraction of green area
$$\frac{10^{6} m^2_{green}}{people} \cdot \frac{people}{km^2_{city}} = \frac{10^{6} m^2_{green}}{km^2_{city}}$$
In [318]:
#fraction of green area
df_fixed["fraction_green"] = df_fixed["GREEN_AREA_PC"]*df_fixed["POP_DENS"]/1E6
In [319]:
#make the figure
plt.figure(figsize=(6,4))
plt.scatter(df_fixed["fraction_green"],df_fixed["POP_DENS"],edgecolor="none",color=(70/255,140/255,210/255))
plt.yscale("log")
plt.xscale("log")
plt.ylim(10,10000)
plt.xlim(0.0001,1)
plt.xlabel("Percentage green (m^2 green / km^2 land)")
plt.ylabel("Density population (population/km^2)")
Out[319]:
In [358]:
#Imagine we want to convert a categorical income into a number
df = pd.DataFrame({"kids": [2,3,3,1,3,1,4,4,2], "income": [">100",">100","<40","<40","40-100","<40","40-100",">100","40-100"]})
df
Out[358]:
In [359]:
d_cat2income = {">100": 100,
"40-100": 70,
"<40": 40}
d_cat2income
Out[359]:
In [376]:
d_cat2income.get(">90")
In [374]:
df["income_number"] = df["income"].apply(lambda x:
x[0])
In [375]:
df.head()
Out[375]:
In [321]:
#First we need a dictionary
d_cat2income = {">100": 100, "40-100": 70, "<40": 40}
In [322]:
def category2income(value):
    return d_cat2income.get(value)
df = pd.DataFrame({"kids": [2,3,3,1,3,1,4,4,2], "income": [">100",">100","40-100","<40","40-100","<40","40-100",">100","40-100"]})
df["income"] = df["income"].apply(category2income)
df.head()
Out[322]:
In [323]:
#Or without creating a new function
df = pd.DataFrame({"kids": [2,3,3,1,3,1,4,4,2], "income": [">100",">100","40-100","<40","40-100","<40","40-100",">100","40-100"]})
df["income"] = df["income"].apply(lambda x: d_cat2income.get(x))
df.head()
Out[323]:
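A side note, as a sketch: a pandas Series also has .map, which accepts the dictionary directly and gives the same result as the apply above:
In [ ]:
#Same fix with .map instead of .apply
df = pd.DataFrame({"kids": [2,3,3,1,3,1,4,4,2], "income": [">100",">100","40-100","<40","40-100","<40","40-100",">100","40-100"]})
df["income"] = df["income"].map(d_cat2income)
df.head()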
In [324]:
df_fixed = read_our_csv().loc[:,["METRO_ID","Year"]]
df_fixed.head()
Out[324]:
In [325]:
def keep_2_letters(value):
    return value[:2]
df_fixed["Country"] = df_fixed["METRO_ID"].apply(keep_2_letters)
df_fixed.head()
Out[325]:
In [326]:
#or in one line
df_fixed["Country"] = df_fixed["METRO_ID"].apply(lambda x: x[:2])
df_fixed.head()
Out[326]:
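Yet another equivalent sketch, using the vectorized .str accessor instead of apply:
In [ ]:
#.str lets you slice every string in the column at once
df_fixed["Country"] = df_fixed["METRO_ID"].str[:2]
df_fixed.head()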
In [327]:
#For example, taking the log of a column. This is useful when the distribution is lognormal (such as income or GDP).
#A possibly confusing distinction: the distribution of the incomes of many people IS LOGNORMAL,
#but the distribution of the mean income across many cities IS NOT.
df_fixed = read_our_csv().loc[:,["METRO_ID","Year","GDP_PC"]]
df_fixed["GDP_PC_log"] = np.log10(df_fixed["GDP_PC"])
df_fixed.head()
Out[327]:
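One caveat worth a quick check (a sketch): the log is only defined for positive values, so zeros or negatives would produce -inf or NaN:
In [ ]:
#Count non-positive values before taking logs
print((df_fixed["GDP_PC"] <= 0).sum(), "non-positive values of GDP_PC")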
Let's imagine we have data from a survey, with age, income, education, political ideas, location, and whether the respondent will vote for Trump or Hillary.
We may have some missing values. These missing values can be:
Missing completely at random (MCAR): the missingness does not depend on any variable (e.g. a random subset of people skipped the question).
Missing at random (MAR): the missingness depends on the observed variables (e.g. one group is less likely to answer).
Missing not at random (MNAR): the missingness depends on the missing value itself (e.g. high earners hide their income).
To deal with them we can either drop the affected rows or impute (fill in) the missing values.
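Before choosing a strategy, it helps to count the missing values; a minimal sketch on our OECD data:
In [ ]:
#Number of missing values per column
read_our_csv().isnull().sum()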
In [ ]:
#Read and fix the data
df_fixed = read_our_csv()
#Remove rows with missing values
cols = ["LABOUR_PRODUCTIVITY","UNEMP_R","GDP_PC"]
df_fixed = df_fixed.dropna(subset=cols)
#Creating a column for country
df_fixed["C"] = df_fixed["METRO_ID"].apply(lambda x: x[:2])
#Keep Italy
df_fixed = df_fixed.loc[df_fixed["C"]=="IT",["C","METRO_ID","Metropolitan areas"] + cols]
#Normalize the values by dividing by the mean (so the new values have a mean of 1)
df_fixed.loc[:,cols] = df_fixed[cols]/np.nanmean(df_fixed[cols],0)
#Make a copy of the data
df_original = df_fixed.copy()
#Take a random sample of 20 values of productivity
sample = set(df_fixed.loc[:,"LABOUR_PRODUCTIVITY"].sample(20))
#Delete those values (set them to np.NaN, i.e. missing)
df_fixed.loc[df_fixed["LABOUR_PRODUCTIVITY"].isin(sample),"LABOUR_PRODUCTIVITY"] = np.NaN
df_fixed.head()
In [ ]:
#How to fix by ignoring the rows
ignoring = df_fixed.dropna(subset=["LABOUR_PRODUCTIVITY"])
In [ ]:
#How to fix by imputing with the mean (the median or the mode work the same way)
mean_imputed = df_fixed.fillna(df_fixed.mean())
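The same idea with the median, which is more robust to outliers (a sketch; numeric_only skips the text columns):
In [ ]:
#How to fix by imputing with the median instead
median_imputed = df_fixed.fillna(df_fixed.median(numeric_only=True))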
In [ ]:
#Based on this: points with similar unemployment tend to have similar productivity
sns.lmplot(x="LABOUR_PRODUCTIVITY",y="UNEMP_R",data=df_fixed,fit_reg=False)
In [ ]:
Image("figures/labels.png")
In [ ]:
print("Using a random sample => MCAR DATA")
Image("figures/kmeans_vs_mean.png")
In [ ]:
print("Using a biasad sample => MAR DATA")
Image("figures/kmeans_vs_mean_worst_case.png")
In short: use robust statistics, which are not so affected by outliers. More on this at another time, but read this: http://www.theanalysisfactor.com/outliers-to-drop-or-not-to-drop/
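A toy sketch (made-up values) of why the median is robust while the mean is not:
In [ ]:
#One outlier drags the mean but barely moves the median
values = np.array([1, 2, 3, 4, 100])
print("mean:", np.mean(values)) #22.0, dominated by the outlier
print("median:", np.median(values)) #3.0, robust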
In [ ]:
#Read and fix the data; also drop missing values
df = read_our_csv().loc[:,["METRO_ID","Year","Metropolitan areas","LABOUR_PRODUCTIVITY","UNEMP_R"]]
#Drop rows with missing values
df = df.dropna()
#Keep years 2000 and 2012
df = df.loc[df["Year"].isin([2000,2012])]
#Add a column for country
df["C"] = df["METRO_ID"].apply(lambda x: x[:2] if isinstance(x,str) else "")
df.head()
Use it to plot the relationship between two quantitative variables (plus one qualitative variable encoded as color)
In [ ]:
sns.lmplot?
In [ ]:
keep_countries = ['IT', 'NL', 'DE', 'ES', 'GR']
In [ ]:
sns.lmplot(x="LABOUR_PRODUCTIVITY", y="UNEMP_R", hue="C", col="Year",
           data=df.loc[df["C"].isin(keep_countries)], fit_reg=False,
           size=3, aspect=1.4, sharex=True, sharey=True)
plt.show()
Creates a histogram. Histograms are important to see the distribution of your data, to find outliers, etc.
In [ ]:
#Roll two dice 1000 times
dice_rolls = np.random.randint(1,7,1000) + np.random.randint(1,7,1000)
dice_rolls
In [ ]:
#Count the number of each element to create the distribution
from collections import Counter
Counter(dice_rolls)
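The same counts with numpy, as a sketch:
In [ ]:
#np.unique with return_counts gives the distinct values and how often they appear
values, counts = np.unique(dice_rolls, return_counts=True)
print(dict(zip(values, counts)))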
In [ ]:
#And we can visualize it with a histogram
sns.distplot(dice_rolls, kde=False, rug=False)
In [ ]:
#Read and fix the data; also drop missing values
df = read_our_csv().loc[:,["METRO_ID","Year","Metropolitan areas","LABOUR_PRODUCTIVITY","UNEMP_R"]]
#Drop rows with missing values
df = df.dropna()
#Keep years 2000 and 2012
df = df.loc[df["Year"].isin([2000,2012])]
#Add a column for country
df["C"] = df["METRO_ID"].apply(lambda x: x[:2] if isinstance(x,str) else "")
df.head()
In [ ]:
#Keep the x values
x = df["LABOUR_PRODUCTIVITY"]
#Make plot
plt.figure(figsize=(6,4))
sns.distplot(x, kde=True, rug=False,hist_kws={"edgecolor":"none"},kde_kws={"cut":0})
plt.ylabel("Frequency")
plt.xlabel("Labour productivity")
In [ ]:
#fitting a distribution
from scipy.stats import norm,lognorm,expon
#Keep the x values
x = df["LABOUR_PRODUCTIVITY"]
#Make plot
plt.figure(figsize=(6,4))
sns.distplot(x, kde=False, fit=norm, rug=False,
             hist_kws={"edgecolor":"none","normed":True}, fit_kws={"cut":0,"color":"gray"})
plt.ylabel("Density")
plt.xlabel("Labour productivity")
Creates two histograms, one plotted against the other. Important to see the distribution of your data, to find outliers, etc. Also important when plotting events on a map.
In [ ]:
sns.jointplot?
In [ ]:
#Make plot
sns.jointplot(x="LABOUR_PRODUCTIVITY", y="UNEMP_R", data=df,size=6)
plt.ylabel("Frequency")
plt.xlabel("Labour productivity")
plt.show()
In [ ]:
sns.jointplot(x="LABOUR_PRODUCTIVITY", y="UNEMP_R", data=df,kind="hex",
marginal_kws=dict(bins=20, rug=False, kde=True, kde_kws={"cut":0}), gridsize = 15,size=6)
In [ ]:
cols_get = ["METRO_ID","Metropolitan areas","Year","UNEMP_R",'GDP_PC',"LABOUR_PRODUCTIVITY"]
df_fixed = read_our_csv().loc[:,cols_get]
df_fixed["C"] = df_fixed["METRO_ID"].apply(lambda x: x[:2] if isinstance(x,str) else "")
south = df_fixed.loc[df_fixed["C"].isin(["IT","ES","GR","DE"])]
south.head()
plt.figure(figsize=(6,4))
sns.violinplot(x="C", y="LABOUR_PRODUCTIVITY", data=south,width=1,inner="quartiles")
In [ ]:
#Note: in pandas >= 0.20 this import moved to pandas.plotting
from pandas.tools.plotting import parallel_coordinates
In [ ]:
cols_get = ["METRO_ID","Metropolitan areas","Year","UNEMP_R",'GDP_PC',"LABOUR_PRODUCTIVITY"]
df_fixed = read_our_csv().loc[:,cols_get]
df_fixed["C"] = df_fixed["METRO_ID"].apply(lambda x: x[:2] if isinstance(x,str) else "")
italy = df_fixed.loc[df_fixed["C"].isin(["IT"])]
italy = italy.loc[italy["Year"].isin([2001])]
italy.head()
We are going to normalize values using standard scores (https://en.wikipedia.org/wiki/Standard_score).
If your values have a mean $m$ and a standard deviation $d$, you subtract $m$ and divide by $d$ ($\frac{x - mean(x)}{std(x)}$): the new values have a mean of 0 and a standard deviation of 1.
In [ ]:
#Here we normalize the variables so they are all on the same scale (mean 0, std 1)
cols = ["UNEMP_R",'GDP_PC',"LABOUR_PRODUCTIVITY"]
for c in cols:
    italy[c] = (italy[c] - np.mean(italy[c]))/np.std(italy[c])
parallel_coordinates(italy.loc[(italy["Metropolitan areas"] !="Italy")],"Year",cols=cols,colormap="Paired")
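As a sketch, the loop above can be replaced by scipy's zscore (which, like np.std, uses the population standard deviation, and propagates NaNs):
In [ ]:
#Equivalent normalization in one call per column
from scipy.stats import zscore
italy[cols] = italy[cols].apply(zscore)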
In [ ]:
d_city2regio = {"Bari": "S",
"Bologna": "C",
"Catania": "S",
"Florence": "C",
"Genova": "N",
"Milan": "N",
"Naples": "S",
"Palermo": "S",
"Rome": "C",
"Turin": "N",
"Venice": "N"}
italy["Area"] = italy["Metropolitan areas"].apply(lambda x: d_city2regio.get(x))
parallel_coordinates(italy.loc[(italy["Metropolitan areas"] !="Italy")],"Area",cols=cols,colormap="Paired")
factorplot (http://seaborn.pydata.org/generated/seaborn.factorplot.html): to visualize interactions between variables, as well as many distributions.
ci = size of the confidence intervals (0 for no confidence intervals).
See the documentation for examples.
In [ ]:
cols_get = ["METRO_ID","Metropolitan areas","Year","UNEMP_R",'GDP_PC',"LABOUR_PRODUCTIVITY"]
df_fixed = read_our_csv().loc[:,cols_get]
df_fixed["C"] = df_fixed["METRO_ID"].apply(lambda x: x[:2] if isinstance(x,str) else "")
south = df_fixed.loc[df_fixed["C"].isin(["IT","ES","GR","DE"])]
south.head()
In [ ]:
sns.factorplot(x="Year", y="UNEMP_R", hue="C", size=4,aspect=1.8,data=south,ci=95)
In [ ]:
sns.factorplot(x="Year", y="UNEMP_R", hue="C", size=4,aspect=3,
data=south.loc[south["Year"].isin([2000,2010])],kind="violin",width=0.9,inner="quartiles")