Javier Garcia-Bernardo garcia@uva.nl
In [1]:
##Some code to run at the beginning of the file, to be able to show images in the notebook
##Don't worry about this cell
#Print the plots in this screen
%matplotlib inline
#Be able to plot images saved in the hard drive
from IPython.display import Image
#Make the notebook wider
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
import seaborn as sns
import pylab as plt
import pandas as pd
import numpy as np
def read_our_csv(path="../class2/data/CITIES_19122016195113034.csv"):
    """Read the raw OECD cities file and return it with one column per variable.

    The raw file is in "long" format: each row holds a single value of one
    variable (column ``VAR``) for one metropolitan area and year. The table is
    pivoted so every distinct ``VAR`` becomes its own column.

    Parameters
    ----------
    path : str, optional
        Location of the tab-separated raw file. Defaults to the file used in
        class 2, so calling ``read_our_csv()`` with no arguments still works.

    Returns
    -------
    pandas.DataFrame
        One row per ("METRO_ID", "Metropolitan areas", "Year") combination and
        one column per value of ``VAR``. Repeated entries for the same
        combination are averaged (the default aggregation of ``pivot_table``).
    """
    # Reading the raw data from the OECD (tab-separated)
    df = pd.read_csv(path, sep="\t")

    # Fixing the columns: the first header is read as ""METRO_ID"" instead of "METRO_ID",
    # so it is replaced by position and every other header is left untouched
    cols = list(df.columns)
    cols[0] = "METRO_ID"
    df.columns = cols

    # Pivot the table: spread the values of "VAR" into separate columns
    df_fixed = df.pivot_table(
        values="Value",
        index=["METRO_ID", "Metropolitan areas", "Year"],
        columns=["VAR"],
    ).reset_index()
    return df_fixed
In [5]:
import pandas as pd
import numpy as np
import pylab as plt
import seaborn as sns
from scipy.stats import chi2_contingency,ttest_ind
#This allows us to use R
%load_ext rpy2.ipython
#Visualize in line
%matplotlib inline
#Be able to plot images saved in the hard drive
from IPython.display import Image,display
#Make the notebook wider
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
In [6]:
# Load the tab-separated table (first column becomes the row index) and
# multiply every value by 100
data = pd.read_csv("data/random.csv", sep="\t", index_col=0).mul(100)
data.head()
Out[6]:
In [2]:
import seaborn as sns
# A trailing "?" is IPython help syntax: it shows the documentation of sns.heatmap
sns.heatmap?
In [7]:
# Draw the heatmap and keep the returned Axes so it can be customised directly
ax = sns.heatmap(data, cmap="YlOrRd", cbar_kws={"label": "Body temperature"})
# Flip the vertical axis so the first row is drawn at the bottom
ax.invert_yaxis()
ax.set_ylabel("Pizzas eaten")
ax.set_xlabel("Outside temperature")
plt.show()
In [8]:
# Same heatmap, now with a readable label on every column
sns.heatmap(data, cbar_kws={"label": "Body temperature"}, cmap="YlOrRd")
plt.ylabel("Pizzas eaten")
plt.xlabel("Outside temperature")

# One label per column: consecutive 10-wide bins "10-20", "20-30", ..., "100-110".
# Generating them from a range guarantees a gap-free sequence (a hand-typed list
# easily skips a bin, which silently mislabels every column after the gap).
bin_labels = ["{}-{}".format(low, low + 10) for low in range(10, 110, 10)]
# Ticks at 0.5, 1.5, ... -- presumably the centre of each heatmap column
plt.xticks(0.5 + np.arange(len(bin_labels)), bin_labels, rotation=90)
plt.show()
Conclusion: Pizzas make you lekker warm
Lesson of the day: Eat more pizza
In [14]:
# Read the data and show the first rows to see what it looks like
# (".." entries in the file are read as missing values)
df = pd.read_csv("../class3/data/world_bank/data.csv", na_values=[".."])
df.head()
Out[14]:
In [ ]:
#We could fix the column names with: df.columns = ["Country Name","Country Code","Series Name","Series Code",1967,1968,1969,...]
In [15]:
## 4.1b Fix the year of the column (make it numbers)
df = pd.read_csv("../class3/data/world_bank/data.csv", na_values="..")

# The first four headers are kept as they are; for every later header the first
# four characters are parsed as an integer year and used as the new name
old_columns = list(df.columns)
new_columns = [
    column_name if index < 4 else int(column_name[:4])
    for index, column_name in enumerate(old_columns)
]
df.columns = new_columns

# We could save our data with: df.to_csv("data/new_columns.csv",sep="\t")
df.head()
Out[15]:
Remember, this was the code that we used to fix the OECD cities file: `
### Fix step 1: Melt
variables_already_presents = ['METRO_ID', 'Metropolitan areas','VAR']
columns_combine = cols
df = pd.melt(df,
id_vars=variables_already_presents,
value_vars=columns_combine,
var_name="Year",
value_name="Value")
df.head()
### Fix step 2: Pivot
column_with_values = "Value"
column_to_split = ["VAR"]
variables_already_present = ["METRO_ID","Metropolitan areas","Year"]
df.pivot_table(column_with_values,
variables_already_present,
column_to_split).reset_index().head()
`
In [19]:
### Fix step 1: Melt
cols = list(df.columns)
# The first four columns identify each row; all remaining columns are stacked
# into "Year" / "Value" pairs
variables_already_presents, columns_combine = cols[:4], cols[4:]
df_1 = df.melt(
    id_vars=variables_already_presents,
    value_vars=columns_combine,
    var_name="Year",
    value_name="Value",
)
df_1.head()
Out[19]:
In [20]:
### Fix step 2: Pivot
column_with_values = "Value"
column_to_split = ["Series Name"]
variables_already_present = ["Country Name", "Country Code", "Year"]
# Spread each "Series Name" into its own column, then turn the index back into
# ordinary columns and show the first rows
(
    df_1.pivot_table(
        values=column_with_values,
        index=variables_already_present,
        columns=column_to_split,
    )
    .reset_index()
    .head()
)
Out[20]:
In [ ]:
#code
df_NL =
df_CO =
In [ ]:
In [ ]:
df_pri =
df_pu =
In [ ]:
In [ ]:
In [31]:
import scipy.stats #you need to import scipy.stats
Out[31]: