In [168]:
#import sys
#reload(sys)
#sys.setdefaultencoding("latin-1")
In [1]:
# import libraries
import pandas as pd
import numpy as np
import HTMLParser
import datetime
import os
In [3]:
# Retrieve the list of Open Data datasets (package_list action of the dati.gov.it API)
url_list = 'http://www.dati.gov.it/api/3/action/package_list'
df_list = pd.read_json(url_list)
# Number of datasets (count of non-null entries in the 'result' column)
df_list['result'].count()
Out[3]:
In [26]:
# Query the package_show action for one fixed dataset id to inspect the response
url_test = 'http://www.dati.gov.it/api/3/action/package_show?id=organico-comune-bari'
df_meta = pd.read_json(url_test)
# Show the 'metadata_modified' entry of the 'result' column
df_meta['result']['metadata_modified']
Out[26]:
In [29]:
# Keep only the first 10 characters of 'metadata_modified'
# (presumably the YYYY-MM-DD date part; the output is not shown here, TODO confirm)
df_meta['result']['metadata_modified'][0:10]
Out[29]:
In [30]:
def crea_df_metadato(df_name):
    """Fetch the metadata of one dataset and flatten it into a dict.

    Calls the ``package_show`` action of the dati.gov.it API for ``df_name``
    and extracts a fixed set of fields from the ``result`` entry of the
    response.

    Parameters
    ----------
    df_name : str
        Dataset name, appended as the ``id`` query parameter.

    Returns
    -------
    dict
        Always contains the keys ``ds_name``, ``ds_title``, ``ds_id``,
        ``ds_license``, ``_catalog_parent_name``, ``ultima_modifica``,
        ``gruppo``, ``note``, ``url`` and ``mymtype``.  ``ds_name`` is always
        ``df_name``.  If the request fails, or one of the mandatory fields
        (``title``, ``id``, ``_catalog_parent_name``, ``metadata_modified``)
        cannot be read, every other value is ``np.nan``.  Optional fields
        that cannot be read are individually ``np.nan``; ``url`` and
        ``mymtype`` are either both filled or both ``np.nan``.

    Notes
    -----
    Errors are caught with ``except Exception`` instead of a bare ``except``
    so that ``KeyboardInterrupt`` and ``SystemExit`` still propagate and a
    long loop over many datasets can be interrupted.
    """
    url_meta = 'http://www.dati.gov.it/api/3/action/package_show?id=' + df_name

    # Start from the "nothing could be read" record; fields are filled in below.
    d = dict.fromkeys(
        ('ds_title', 'ds_id', 'ds_license', '_catalog_parent_name',
         'ultima_modifica', 'gruppo', 'note', 'url', 'mymtype'),
        np.nan)
    d['ds_name'] = df_name

    # Mandatory part: download plus the fields without which the record is
    # considered unusable. Any failure here yields the all-NaN record.
    try:
        result = pd.read_json(url_meta)['result']
        mandatory = {
            'ds_title': result['title'],
            'ds_id': result['id'],
            '_catalog_parent_name': result['_catalog_parent_name'],
            # first 10 characters of the modification stamp
            'ultima_modifica': result['metadata_modified'][0:10],
        }
    except Exception:
        return d
    d.update(mandatory)

    def _optional(extract):
        # Best-effort read of a single optional field.
        try:
            return extract()
        except Exception:
            return np.nan

    d['ds_license'] = _optional(lambda: result['license_id'])
    # Only the first group is kept.
    d['gruppo'] = _optional(lambda: result['groups'][0]['display_name'])
    d['note'] = _optional(lambda: result['notes'])

    # url and mimetype are taken from the last resource; the right-hand side
    # is evaluated completely before assigning, so a failure leaves both NaN.
    try:
        resource = result['resources'][-1]
        d['url'], d['mymtype'] = resource['url'], resource['mimetype']
    except Exception:
        d['url'] = d['mymtype'] = np.nan

    return d
In [31]:
def crea_df_metadati_all(df_list):
    """Collect the metadata of every dataset listed in ``df_list``.

    The ``'result'`` value of each row is handed to ``crea_df_metadato``;
    the dicts it returns become the rows of the resulting DataFrame.
    """
    records = [crea_df_metadato(row['result'])
               for _, row in df_list.iterrows()]
    return pd.DataFrame.from_dict(records)
In [38]:
# Alternative test subset (disabled): the first 10 rows of the dataset list
#df_list_test = df_list[0:10]
# Test subset in use: only the rows whose 'result' equals 'anticorruzione-2013'
df_list_test = df_list[df_list['result']=='anticorruzione-2013']
In [39]:
# Column order of the metadata table
cols = [
    'ds_title', '_catalog_parent_name', 'gruppo', 'note', 'ultima_modifica',
    'ds_name', 'ds_id', 'ds_license', 'url', 'mymtype',
]
# Build the metadata table for the test subset and apply the column order
df = crea_df_metadati_all(df_list_test)[cols]
In [40]:
# Preview the first two rows of the metadata table
df.head(2)
Out[40]:
In [2]:
# Output file: <current working dir>/input/<YYYY-MM-DD>_df_metadati_pa.csv
now = datetime.datetime.now()
dt = now.date().isoformat()
name_file = '{0}_df_metadati_pa.csv'.format(dt)
dir_in = os.path.join(os.getcwd(), 'input')
df_file = os.path.join(dir_in, name_file)
In [3]:
# Create the target directory on first use; to_csv does not create it.
dir_out = os.path.dirname(df_file)
if dir_out and not os.path.isdir(dir_out):
    os.makedirs(dir_out)
# Write the metadata table as a ';'-separated file. The encoding is explicit
# because titles and notes can contain non-ASCII characters, which the
# Python 2 default (ASCII) cannot encode.
df.to_csv(path_or_buf=df_file, sep=';', encoding='utf-8')
In [158]:
# Load the CSV found at r['url'] into `df`. This rebinds `df`, which the
# earlier cells use for the metadata table.
# NOTE(review): `r` is not defined in any visible cell; presumably it is one
# row/record of the metadata table. Define it explicitly before this cell,
# otherwise the cell fails on a fresh kernel. TODO confirm
df = pd.read_csv(r['url'])
In [159]:
# Preview the first two rows of the frame currently bound to `df`
df.head(2)
Out[159]: