Javier Garcia-Bernardo garcia@uva.nl
In [54]:
##Some code to run at the beginning of the file, to be able to show images in the notebook
##Don't worry about this cell
#Print the plots in this screen
%matplotlib inline
#Be able to plot images saved in the hard drive
from IPython.display import Image
#Make the notebook wider
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
import seaborn as sns
import pylab as plt
import pandas as pd
import numpy as np
def read_our_csv():
#reading the raw data from oecd
df = pd.read_csv("../class2/data/CITIES_19122016195113034.csv",sep="\t")
#fixing the columns (the first one is ""METRO_ID"" instead of "METRO_ID")
cols = list(df.columns)
cols[0] = "METRO_ID"
df.columns = cols
#pivot the table
column_with_values = "Value"
column_to_split = ["VAR"]
variables_already_present = ["METRO_ID","Metropolitan areas","Year"]
df_fixed = df.pivot_table(column_with_values,
variables_already_present,
column_to_split).reset_index()
return df_fixed
Let's imagine we have a data of a survey, with age, income, education, political ideas, location, and if you will vote for Trump or Hillary.
We may have some missing values. This missing values can be:
impute
missing values.
In [20]:
Out[20]:
In [22]:
Out[22]:
In [6]:
#Read and fix the data
df_fixed = read_our_csv()
#Remove rows with missing values
cols = ["LABOUR_PRODUCTIVITY","UNEMP_R","GDP_PC"]
df_fixed = df_fixed.dropna(subset=cols)
#Creating a column for country
df_fixed["C"] = df_fixed["METRO_ID"].apply(lambda x: x[:2])
#Keeping italy
df_fixed = df_fixed.loc[df_fixed["C"]=="IT",["C","METRO_ID","Metropolitan areas"] +cols]
#We are going to normalize values dividing by the mean (so new values have a mean of 1)
df_fixed.loc[:,cols] = df_fixed[cols]/np.nanmean(df_fixed[cols],0)
#Make a copy of the data
df_original = df_fixed.copy()
#Take a random sample of 20 values of productivity
sample = set(df_fixed.loc[:,"LABOUR_PRODUCTIVITY"].sample(20))
#Deleting those values (saying that they are np.NaN (missing))
df_fixed.loc[df_fixed["LABOUR_PRODUCTIVITY"].isin(sample),"LABOUR_PRODUCTIVITY"] = np.NaN
df_fixed.head(10)
Out[6]:
In [7]:
#How to fix by ignoring the rows
ignoring = df_fixed.dropna(subset=["LABOUR_PRODUCTIVITY"])
ignoring.head(10)
Out[7]:
In [8]:
#How to fix by imputing with mean/median/mode
mean_inputed = df_fixed.fillna(df_fixed.mean())
mean_inputed.head(10)
Out[8]:
In [9]:
#Based on this. Similar points for unemployment have similar points for productivity
sns.lmplot(x="LABOUR_PRODUCTIVITY",y="UNEMP_R",data=df_fixed,fit_reg=False)
Out[9]:
In [11]:
Image("figures/labels.png")
Out[11]:
In [12]:
print("Using a random sample => MCAR DATA")
Image("figures/kmeans_vs_mean.png")
Out[12]:
In [13]:
print("Using a biasad sample => MAR DATA")
Image("figures/kmeans_vs_mean_worst_case.png")
Out[13]:
In [9]:
#What's our current directory
import os
os.getcwd()
Out[9]:
In [ ]:
pd.read_stata("data/colombia.dta")
In [15]:
#Read data and print the head to see how it looks like
df = pd.read_csv("data/world_bank/data.csv",na_values="..")
df.head()
Out[15]:
In [ ]:
df.columns = ["Country Name","Country Code","Series Name","Series Code",1967,1968,1969,...]
In [ ]:
df.to_csv("data/new_columns.csv",sep="\t")
In [51]:
## 4.1b Fix the year of the column (make it numbers)
df = pd.read_csv("data/world_bank/data.csv",na_values="..")
old_columns = list(df.columns)
new_columns = []
for index,column_name in enumerate(old_columns):
if index < 4:
new_columns.append(column_name)
else:
year_column = int(column_name[:4])
new_columns.append(year_column)
df.columns = new_columns
df.head()
Out[51]:
Remember, this was the code that we use to fix the file of the `
### Fix setp 1: Melt
variables_already_presents = ['METRO_ID', 'Metropolitan areas','VAR']
columns_combine = cols
df = pd.melt(df,
id_vars=variables_already_presents,
value_vars=columns_combine,
var_name="Year",
value_name="Value")
df.head()
### Fix step 2: Pivot
column_with_values = "Value"
column_to_split = ["VAR"]
variables_already_present = ["METRO_ID","Metropolitan areas","Year"]
df.pivot_table(column_with_values,
variables_already_present,
column_to_split).reset_index().head()
`
In [ ]:
#code
In [ ]:
In [ ]:
In [ ]:
In [ ]:
For instance, if we have data on temperatures by country and month and we want to calculate the mean temperature for each country:
Luckily, python can make this easier (in one line).
In [55]:
df = pd.read_csv("data/world_bank/data.csv",na_values="..")
In [56]:
df
Out[56]:
In [58]:
df.groupby(["Country Code"]).describe()
Out[58]:
In [33]:
df.groupby(["Country Code"]).mean()
Out[33]:
A note on keeping only non-missing values from an array
In [33]:
import numpy as np
#This creates 11 equally spaced numbers between 0 and 10
x = np.linspace(0,10,11)
#The fourth element is NaN
x[3] = np.NaN
x
Out[33]:
In [34]:
np.isfinite(x)
Out[34]:
In [35]:
#keep only finite values (no missing, no infinite)
x = x[np.isfinite(x)]
x
Out[35]:
In [38]:
def my_function(x):
return np.median(x[np.isfinite(x)])
In [39]:
df.groupby(["Country Code"]).agg(my_function)
Out[39]:
In [37]:
df.groupby(["Country Code"]).agg(lambda x: np.median(x[np.isfinite(x)]))
Out[37]:
In [35]:
for country,data in df.groupby(["Country Code"]):
print(country)
display(data.head())
In [43]:
df.groupby(["Country Code"]).max()
Out[43]: