In [168]:
#import sys
#reload(sys)
#sys.setdefaultencoding("latin-1")

API dati.gov.it


In [1]:
# import libraries
import pandas as pd
import numpy as np
import HTMLParser
import datetime
import os


/usr/local/lib/python2.7/dist-packages/pandas/core/computation/__init__.py:18: UserWarning: The installed version of numexpr 2.4 is not supported in pandas and will be not be used
The minimum supported version is 2.4.6

  ver=ver, min_ver=_MIN_NUMEXPR_VERSION), UserWarning)

Lista dei Dataset Pubblicati sul sito dati.gov.it


In [3]:
# Retrieve the list of Open Data datasets from the dati.gov.it package_list endpoint
url_list = 'http://www.dati.gov.it/api/3/action/package_list'
df_list = pd.read_json(url_list)
# Number of datasets (non-null entries in the 'result' column)
df_list['result'].count()


Out[3]:
18284

Dataset dei Metadati


In [26]:
# Fetch the package_show metadata of one sample dataset to inspect its structure
url_test = 'http://www.dati.gov.it/api/3/action/package_show?id=organico-comune-bari'
df_meta = pd.read_json(url_test)
# Display the 'metadata_modified' field of the response's 'result' entry
df_meta['result']['metadata_modified']


Out[26]:
u'2014-12-17T11:47:19+01:00'

In [29]:
# Keep only the first 10 characters of the timestamp, i.e. the YYYY-MM-DD part
df_meta['result']['metadata_modified'][0:10]


Out[29]:
u'2014-12-17'

In [30]:
def _campo_opzionale(estrai):
    """Return ``estrai()``, or NaN when the lookup fails with an ordinary error."""
    try:
        return estrai()
    except Exception:
        return np.nan


def crea_df_metadato(df_name):
    """Build a flat metadata record for a single dati.gov.it dataset.

    The dataset is looked up through the ``package_show`` API action and the
    interesting fields of its ``result`` entry are copied into a plain dict.

    Parameters
    ----------
    df_name : str
        Dataset name (the ``id`` query parameter of ``package_show``).

    Returns
    -------
    dict
        Always contains the keys ``ds_name``, ``ds_title``, ``ds_id``,
        ``ds_license``, ``_catalog_parent_name``, ``ultima_modifica``,
        ``gruppo``, ``note``, ``url`` and ``mymtype``.

        * If the request fails, or one of the mandatory fields (``title``,
          ``id``, ``_catalog_parent_name``, ``metadata_modified``) cannot be
          read, every value except ``ds_name`` is NaN.
        * Optional fields that cannot be read are individually set to NaN.
        * ``url`` and ``mymtype`` are taken from the last listed resource and
          are looked up independently, so a missing ``mimetype`` does not
          discard a valid ``url``.

    Notes
    -----
    Only ``Exception`` subclasses are absorbed; ``KeyboardInterrupt`` and
    ``SystemExit`` propagate so that a long crawl can still be stopped.
    """
    url_meta = 'http://www.dati.gov.it/api/3/action/package_show?id='+df_name

    # Record returned when nothing usable could be retrieved.
    campi = ['ds_title', 'ds_id', 'ds_license', '_catalog_parent_name',
             'ultima_modifica', 'gruppo', 'note', 'url', 'mymtype']
    vuoto = dict.fromkeys(campi, np.nan)
    vuoto['ds_name'] = df_name

    # Mandatory part: any failure here invalidates the whole record.
    try:
        result = pd.read_json(url_meta)['result']
        d = dict(vuoto)
        d['ds_title'] = result['title']
        d['ds_id'] = result['id']
        d['_catalog_parent_name'] = result['_catalog_parent_name']
        # Keep only the date part of the timestamp string.
        d['ultima_modifica'] = result['metadata_modified'][0:10]
    except Exception:
        return vuoto

    # Optional part: each field falls back to NaN on its own.
    d['ds_license'] = _campo_opzionale(lambda: result['license_id'])
    d['gruppo'] = _campo_opzionale(lambda: result['groups'][0]['display_name'])
    d['note'] = _campo_opzionale(lambda: result['notes'])
    # If there is no resource at all, `risorsa` is NaN and both lookups below
    # fail cleanly into NaN as well.
    risorsa = _campo_opzionale(lambda: result['resources'][-1])
    d['url'] = _campo_opzionale(lambda: risorsa['url'])
    d['mymtype'] = _campo_opzionale(lambda: risorsa['mimetype'])
    return d

In [31]:
def crea_df_metadati_all(df_list):
    """Collect the metadata records of every dataset listed in ``df_list``.

    ``df_list`` is walked row by row; the value found in its 'result' column
    is handed to ``crea_df_metadato`` and the returned dicts become the rows
    of the resulting DataFrame.
    """
    records = [crea_df_metadato(riga['result']) for _, riga in df_list.iterrows()]
    return pd.DataFrame(records)

In [38]:
# Development subset: a single known dataset.
# (Alternative for a small batch: df_list_test = df_list[0:10])
df_list_test = df_list.loc[df_list['result'] == 'anticorruzione-2013']

In [39]:
# Display order of the metadata columns.
cols = [
    'ds_title', '_catalog_parent_name', 'gruppo', 'note', 'ultima_modifica',
    'ds_name', 'ds_id', 'ds_license', 'url', 'mymtype',
]
# Build the metadata table for the selected datasets, then arrange its columns.
df = crea_df_metadati_all(df_list_test)
df = df[cols]

In [40]:
# Preview the first two rows of the metadata table
df.head(2)


Out[40]:
ds_title _catalog_parent_name gruppo note ultima_modifica ds_name ds_id ds_license url mymtype
0 Anticorruzione 2013 Comune di Bari NaN <p>Pubblicazione Dataset informazioni anticorr... 2014-12-17 anticorruzione-2013 6c00bf33-1fa2-47b5-90de-14f400da7e87 CC0 1.0 http://opendata.comune.bari.it/dataset/6c00bf3... csv

In [2]:
# Destination folder: the 'input' directory under the current working directory.
dir_in = os.path.join(os.path.abspath(''), 'input')

# The file name is prefixed with today's date in YYYY-MM-DD form.
now = datetime.datetime.now()
dt = now.strftime("%Y-%m-%d")
name_file = dt+'_df_metadati_pa.csv'

# Full path of the CSV file.
df_file = os.path.join(dir_in, name_file)

In [3]:
# Make sure the destination folder exists before writing into it.
out_dir = os.path.dirname(df_file)
if out_dir and not os.path.isdir(out_dir):
    os.makedirs(out_dir)
# Write UTF-8 explicitly: on Python 2 pandas defaults to ASCII output, which
# fails as soon as a field contains accented characters.
df.to_csv(path_or_buf=df_file, sep=';', encoding='utf-8')



NameErrorTraceback (most recent call last)
<ipython-input-3-f86ec0dac9d0> in <module>()
----> 1 df.to_csv(path_or_buf=df_file, sep=';')

NameError: name 'df' is not defined

Get Dataset from Url


In [158]:
# `r` must be a metadata record whose 'url' entry points at the CSV to load.
# It was never defined in this notebook (leftover kernel state), so the cell
# failed on a fresh kernel; build the record explicitly instead.
# NOTE(review): 'organico-comune-bari' is inferred from the sample dataset
# queried earlier in the notebook -- confirm this is the intended dataset.
r = crea_df_metadato('organico-comune-bari')
df = pd.read_csv(r['url'])

In [159]:
# Preview the first two rows of the dataset loaded from the resource URL
df.head(2)


Out[159]:
cognome nome UFFICIO PROFILO
0 ABBATANTUONO VITO 3^Circoscrizione Dirigente Amministrativo
1 ABBATICCHIO ROBERTO Segreteria Generale-Informazioni Esecutore Informatore-Notificatore