This Jupyter notebook shows how to use the jsonstat.py Python library to explore Istat data. Istat is the Italian National Institute of Statistics. It publishes a REST API for browsing Italian statistics. This API can return results in JSON-stat format.
In [1]:
from __future__ import print_function
import os
import pandas as pd
from IPython.core.display import HTML
import matplotlib.pyplot as plt
%matplotlib inline
import istat
In [2]:
# Build an absolute path for ../tmp/istat_cached, resolved against the current working directory.
cache_dir = os.path.abspath(os.path.join("..", "tmp", "istat_cached")) # any other directory (e.g. /tmp) could be chosen instead
# Hand the path to the istat module; presumably this is where it caches downloaded data -- confirm in the istat docs.
istat.cache_dir(cache_dir)
# Called without arguments, cache_dir() is used here to read the configured path back for display.
print("cache_dir is '{}'".format(istat.cache_dir()))
List all Istat areas
In [3]:
# Ask the istat module for its areas; as the last expression of the cell the result is rendered as output.
istat.areas()
Out[3]:
List all datasets contained in area LAB
(Labour)
In [4]:
# Look up the area identified by the code 'LAB' and keep it for the following cells.
istat_area_lab = istat.area('LAB')
# Bare name as last expression: the notebook renders the area object as the cell output.
istat_area_lab
Out[4]:
List all dimensions of dataset DCCV_TAXDISOCCU
(Unemployment rate)
In [5]:
# Get the dataset with code 'DCCV_TAXDISOCCU' from the LAB area object created earlier.
istat_dataset_taxdisoccu = istat_area_lab.dataset('DCCV_TAXDISOCCU')
# Bare name as last expression: the notebook renders the dataset object as the cell output.
istat_dataset_taxdisoccu
Out[5]:
Extract data from dataset DCCV_TAXDISOCCU
In [6]:
# Selection passed to the Istat dataset: one entry per dimension, keyed by the
# dimension label, with the numeric code of the wanted category as value.
spec = {
    "Territory": 0,  # presumably 0 = all territories (code 1 would be Italy) -- TODO confirm
    "Data type": 6,  # 6: 'unemployment rate'
    "Measure": 1,  # 1: 'percentage values'
    "Gender": 3,  # 3: 'total'
    "Age class": 31,  # 31: '15-74 years'
    "Highest level of education attained": 12,  # 12: 'total'
    "Citizenship": 3,  # 3: 'total'
    "Duration of unemployment": 3,  # 3: 'total'
    "Time and frequency": 0,  # 0: all
}

# Query the Istat dataset with this selection; the returned jsonstat collection
# is shown as the cell output.
collection = istat_dataset_taxdisoccu.getvalues(spec)
collection
Out[6]:
Print some info about the only dataset contained in the above JSON-stat collection
In [7]:
# Take the dataset at position 0 of the collection (the only one it contains, per the text above).
jsonstat_dataset = collection.dataset(0)
# Bare name as last expression: the notebook renders the dataset as the cell output.
jsonstat_dataset
Out[7]:
In [8]:
# Convert the jsonstat dataset into a table; rtype=pd.DataFrame presumably selects
# a pandas DataFrame as the return type (the result is used as one below).
df_all = jsonstat_dataset.to_table(rtype=pd.DataFrame)
# Preview only the first rows instead of dumping the whole frame.
df_all.head()
Out[8]:
In [9]:
# Reshape the long table into a wide one: one row per 'Territory', one column per
# 'Time and frequency' label, cells filled from 'Value'; preview the first rows.
# The arguments are passed by keyword because DataFrame.pivot() no longer accepts
# positional arguments in pandas >= 2.0 (keywords also work on older versions).
df_all.pivot(index='Territory', columns='Time and frequency', values='Value').head()
Out[9]:
In [10]:
# Second selection on the same dataset: a single territory (Italy) but every
# age class, again keyed by dimension label with category codes as values.
spec = {
    "Territory": 1,  # 1: Italy
    "Data type": 6,  # 6: 'unemployment rate'
    "Measure": 1,  # 1: 'percentage values'
    "Gender": 3,  # 3: 'total'
    "Age class": 0,  # 0: all classes
    "Highest level of education attained": 12,  # 12: 'total'
    "Citizenship": 3,  # 3: 'total'
    "Duration of unemployment": 3,  # 3: 'total'
    "Time and frequency": 0,  # 0: all
}

# Query the Istat dataset with this selection; the returned jsonstat collection
# is shown as the cell output.
collection_2 = istat_dataset_taxdisoccu.getvalues(spec)
collection_2
Out[10]:
In [11]:
# Turn the first dataset of collection_2 into a DataFrame. blocked_dims pins the
# dimension 'IDCLASETA28' to category '31' -- presumably the age-class dimension
# fixed to '15-74 years' (code 31 in the earlier query); TODO confirm.
df = collection_2.dataset(0).to_table(rtype=pd.DataFrame, blocked_dims={'IDCLASETA28':'31'})
# Preview the first six rows.
df.head(6)
Out[11]:
In [12]:
# Discard rows containing missing values, then keep only the rows whose
# 'Time and frequency' label starts with "Q" (presumably the quarterly periods).
df = (
    df.dropna()
    .loc[lambda frame: frame['Time and frequency'].str.contains(r'^Q.*')]
)
# Optional, left disabled: use the period label as the index.
# df = df.set_index('Time and frequency')
df.head(6)
Out[12]:
In [13]:
# Plot 'Value' against 'Time and frequency' with pandas' default plot kind on an 18x4 figure.
df.plot(x='Time and frequency',y='Value', figsize=(18,4))
Out[13]:
In [14]:
# Draw 'Value' against 'Time and frequency' on an explicitly created 18x6 figure
# with a single axes, so the grid can be switched on before plotting.
fig, ax = plt.subplots(figsize=(18, 6))
ax.grid(True)
df.plot(x='Time and frequency', y='Value', ax=ax, grid=True)
Out[14]:
In [15]:
# plt.figure(figsize=(7,4))
# plt.plot(df['Time and frequency'],df['Value'], lw=1.5, label='1st')
# plt.plot(y[:,1], lw=1.5, label='2st')
# plt.plot(y,'ro')
# plt.grid(True)
# plt.legend(loc=0)
# plt.axis('tight')
# plt.xlabel('index')
# plt.ylabel('value')
# plt.title('a simple plot')
In [16]:
# Labour force (Italian: "forza lavoro"): fetch dataset DCCV_FORZLV from area LAB.
istat_forzlv = istat.dataset('LAB', 'DCCV_FORZLV')

# Selection by category label instead of numeric code; 0 for
# 'Time and frequency' is used as "all", as in the earlier queries.
spec = {
    "Territory": "Italy",
    "Data type": "number of labour force 15 years and more (thousands)",
    "Measure": "absolute values",
    "Gender": "total",
    "Age class": "15 years and over",
    "Highest level of education attained": "total",
    "Citizenship": "total",
    "Time and frequency": 0,
}

# Query, convert the first returned dataset to a DataFrame and drop rows with
# missing values in one chain.
df_forzlv = (
    istat_forzlv.getvalues(spec)
    .dataset(0)
    .to_table(rtype=pd.DataFrame)
    .dropna()
)
# Keep only rows whose period label starts with "Q" (presumably quarterly data).
df_forzlv = df_forzlv[df_forzlv['Time and frequency'].str.contains(r'^Q.*')]
df_forzlv.tail(6)
Out[16]:
In [17]:
# Fetch dataset 'DCCV_INATTIV' of area 'LAB' directly through istat.dataset(area, dataset).
istat_inattiv = istat.dataset('LAB', 'DCCV_INATTIV')
# Left disabled: would render the dataset's dimension info as HTML in the notebook.
# HTML(istat_inattiv.info_dimensions_as_html())
In [18]:
# Selection for the inactive-persons dataset, by category label; 0 for
# 'Time and frequency' is used as "all", as in the earlier queries.
spec = {
    "Territory": "Italy",
    "Data type": "number of inactive persons",
    "Measure": "absolute values",
    "Gender": "total",
    "Age class": "15 years and over",
    "Highest level of education attained": "total",
    "Time and frequency": 0,
}

# Query, convert the first returned dataset to a DataFrame and drop rows with
# missing values in one chain.
df_inattiv = (
    istat_inattiv.getvalues(spec)
    .dataset(0)
    .to_table(rtype=pd.DataFrame)
    .dropna()
)
# Keep only rows whose period label starts with "Q" (presumably quarterly data).
df_inattiv = df_inattiv[df_inattiv['Time and frequency'].str.contains(r'^Q.*')]
df_inattiv.tail(6)
Out[18]: