To Do - API_Dati_Pubblici
In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import os
plt.style.use('ggplot')
get_ipython().run_line_magic('matplotlib', 'inline')
import nltk
from nltk.tokenize import word_tokenize
import re
from collections import Counter
from nltk.corpus import stopwords
import string
In [2]:
dir_in = os.path.join(os.path.abspath(''),'input')
dir_out = os.path.join(os.path.abspath(''),'output')
df_file = os.path.join(dir_in, '2017-10-19_DSMetadatiPA.csv')
In [4]:
df = pd.read_csv(df_file, delimiter=';')
In [6]:
df['ds_name'].count()
Out[6]:
In [7]:
df[pd.isnull(df['ds_title'])]['ds_name'].count()
Out[7]:
In [8]:
# Check datasets with missing metadata
# http://www.dati.gov.it/api/3/action/package_show?id=popolazione-straniera
df_test = df[pd.isnull(df['ds_title'])][-5:]
df_test
Out[8]:
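For a single dataset, the missing metadata can be cross-checked against the CKAN API referenced in the comment above. A minimal sketch, assuming the requests library is available and that dati.gov.it exposes the standard CKAN package_show action:

import requests

# Spot-check one dataset against the catalog API (id taken from the URL above)
resp = requests.get('http://www.dati.gov.it/api/3/action/package_show',
                    params={'id': 'popolazione-straniera'})
if resp.ok:
    pkg = resp.json().get('result', {})
    print(pkg.get('title'), pkg.get('metadata_modified'))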
In [9]:
df.head(2)
Out[9]:
In [10]:
df_cat = df.groupby(['_catalog_parent_name']).count().reset_index().sort_values(by='ds_name', ascending=False).set_index('_catalog_parent_name')
In [11]:
df_cat.head(2)
Out[11]:
In [12]:
# Top 10 catalogs by number of datasets
df_cat_10 = df_cat['ds_name'][:10]
df_cat_10.sum()
Out[12]:
In [13]:
# Remaining catalogs
df_cat['ds_name'][10:].count()
Out[13]:
In [14]:
# Number of datasets across the remaining catalogs
df_cat['ds_name'][10:].sum()
Out[14]:
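The chart below claims that the top 10 catalogs account for over 80% of all datasets; the share can be checked directly from the counts computed above. A quick sketch, reusing df_cat and df_cat_10:

# Share of datasets published by the top 10 source catalogs
top10 = df_cat_10.sum()
total = df_cat['ds_name'].sum()
print('{} of {} datasets ({:.1%}) come from the top 10 catalogs'.format(
    top10, total, float(top10) / total))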
In [15]:
# Catalog list -> note: the tick labels below are set by hand
df_cat_10
Out[15]:
In [16]:
# Chart
plt.style.use('fivethirtyeight')
colors = ['red']
tp = df_cat_10.plot(
    kind='bar',
    legend=False,
    figsize=(10, 4),
    color=colors)
for p in tp.patches:
    tp.annotate(str(int(round(p.get_height()))), (p.get_x() * 1.005, p.get_height() * 1.005),
                ha='center', va='center', xytext=(15, 5), textcoords='offset points', fontsize=9)
tp.tick_params(axis='both', which='major', labelsize=10)
tp.set_xlabel('Catalogo', fontsize=8)
tp.set_ylabel('n. dataset', fontsize=8)
tp.axhline(y=0, color='black', linewidth=4, alpha=0.7)
labels = [item.get_text() for item in tp.get_xticklabels()]
labels[0] = 'Dati \nTrentino'
labels[1] = 'Regione \nToscana'
labels[2] = 'Dati \nLombardia'
labels[3] = 'INPS'
labels[4] = 'Comune di \nAlbano Laziale'
labels[5] = 'Regione \nSardegna'
labels[6] = 'Dati \nVeneto'
labels[7] = 'Open Data \nAlto Adige'
labels[8] = 'Dati Emilia \nRomagna'
labels[9] = 'Comune di \nLecce'
tp.set_xticklabels(labels)
plt.xticks(rotation=0)
# Adding a title and a subtitle
tp.text(x=-1.0, y=7640, s="I dati aperti della Pubblica Amministrazione",
        fontsize=14, weight='bold', alpha=.75)
tp.text(x=-1.0, y=6700,
        s=u"I primi 10 cataloghi sorgente producono oltre l'80% dei dataset. Su un totale di 18.000 dataset, 15.600 sono prodotti \ndai primi 10 Enti Pubblici.",
        fontsize=10, alpha=.85)
text = tp.text(x=-1.2, y=-1500,
               s='www.ildatomancante.it Fonte: dati.gov.it',
               fontsize=10, color='#f0f0f0', backgroundcolor='grey')
text.set_url('http://www.ildatomancante.it')
fig_posts = tp.get_figure()
df_file_cat = os.path.join(dir_out, 'Cataloghi Dataset.png')
fig_posts.savefig(df_file_cat, format='png', dpi=300,bbox_inches='tight')
In [17]:
df_grp= df.groupby(['gruppo']).count().reset_index().sort_values(by='ds_name', ascending=False).set_index('gruppo')
In [18]:
df_grp.head(2)
Out[18]:
In [19]:
# Top 10 groups by number of datasets
df_grp_10 = df_grp['ds_name'][:10]
df_grp_10.sum()
Out[19]:
In [20]:
# Remaining groups
df_grp['ds_name'][10:].count()
Out[20]:
In [21]:
# Number of datasets across the remaining groups
df_grp['ds_name'][10:].sum()
Out[21]:
In [22]:
df_grp_10
Out[22]:
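The subtitle of the next chart states that only about 6,400 of the roughly 18,000 datasets have the group field populated; the fill rate can be verified directly on df. A quick sketch:

# How many datasets have the 'gruppo' field populated at all
n_total = df['ds_name'].count()
n_grouped = df['gruppo'].notnull().sum()
print('{} of {} datasets ({:.1%}) have a group assigned'.format(
    n_grouped, n_total, float(n_grouped) / n_total))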
In [24]:
# Chart
plt.style.use('fivethirtyeight')
colors = ['red']
tp = df_grp_10.plot(
    kind='bar',
    legend=False,
    figsize=(10, 4),
    color=colors)
for p in tp.patches:
    tp.annotate(str(int(round(p.get_height()))), (p.get_x() * 1.005, p.get_height() * 1.005),
                ha='center', va='center', xytext=(15, 5), textcoords='offset points', fontsize=9)
tp.tick_params(axis='both', which='major', labelsize=10)
tp.set_xlabel('Gruppo', fontsize=8)
tp.set_ylabel('n. dataset', fontsize=8)
tp.axhline(y=0, color='black', linewidth=4, alpha=0.7)
labels = [item.get_text() for item in tp.get_xticklabels()]
labels[0] = u'Popolazione \ne società'
labels[1] = 'Ambiente'
labels[2] = 'Istruzione, \ncultura \ne sport'
labels[3] = 'Governo e \nsettore \npubblico'
labels[4] = 'Economia \ne finanze'
labels[5] = 'Agricoltura'
labels[6] = 'Trasporti'
labels[7] = 'Salute'
labels[8] = 'Giustizia'
labels[9] = u'Regioni \ne città'
tp.set_xticklabels(labels)
plt.xticks(rotation=0)
# Adding a title and a subtitle
tp.text(x=-1.0, y=2150, s="I dati aperti della Pubblica Amministrazione",
        fontsize=14, weight='bold', alpha=.75)
tp.text(x=-1.0, y=2000,
        s=u"Su ca. 18.000 dataset solo 6.400 hanno il campo gruppo popolato.",
        fontsize=10, alpha=.85)
text = tp.text(x=-1.2, y=-500,
               s='www.ildatomancante.it Fonte: dati.gov.it',
               fontsize=10, color='#f0f0f0', backgroundcolor='grey')
text.set_url('http://www.ildatomancante.it')
fig_posts = tp.get_figure()
df_file_grp = os.path.join(dir_out, 'Gruppi Dataset.png')
fig_posts.savefig(df_file_grp, format='png', dpi=300,bbox_inches='tight')
In [25]:
np.isnan(df['ultima_modifica'][0])
Out[25]:
In [26]:
from datetime import datetime
def calcolo_anno(x):
    # np.isnan raises a TypeError on strings: a NaN timestamp is flagged
    # with '9999', otherwise the first 4 characters (the year) are kept
    try:
        np.isnan(x['ultima_modifica'])
        anno = '9999'
    except TypeError:
        anno = x['ultima_modifica'][0:4]
    return anno
In [27]:
df['dt_ultima_modifica'] = df.apply(calcolo_anno, axis=1)
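The row-wise apply works, but the same column can be built with vectorized string operations. A sketch of an equivalent alternative (the _alt column name is only for comparison and is not used below):

# Vectorized alternative: first 4 characters of the timestamp, '9999' where missing
df['dt_ultima_modifica_alt'] = df['ultima_modifica'].str[:4].fillna('9999')
# (df['dt_ultima_modifica_alt'] == df['dt_ultima_modifica']).all() should be True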
In [28]:
df.head(2)
Out[28]:
In [29]:
df_tms = df.groupby(['dt_ultima_modifica']).count().reset_index().sort_values(by='dt_ultima_modifica', ascending=False).set_index('dt_ultima_modifica')
In [30]:
df_tms
Out[30]:
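The year counts in df_tms can also be turned into a bar chart with the same styling used for the catalogs. A minimal sketch (the output file name is made up):

# Datasets by year of last modification ('9999' marks a missing date)
ax = df_tms['ds_name'].sort_index().plot(kind='bar', legend=False,
                                         figsize=(10, 4), color='red')
ax.set_xlabel('Anno ultima modifica', fontsize=8)
ax.set_ylabel('n. dataset', fontsize=8)
ax.get_figure().savefig(os.path.join(dir_out, 'Anni Dataset.png'),
                        dpi=300, bbox_inches='tight')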
In [31]:
df.head(2)
Out[31]:
In [32]:
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""
regex_str = [
    emoticons_str,
    r'<[^>]+>',  # HTML tags
    r'(?:@[\w_]+)',  # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",  # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',  # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)',  # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])",  # words with - and '
    r'(?:[\w_]+)',  # other words
    r'(?:\S)'  # anything else
]
In [33]:
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
In [34]:
def tokenize(s):
    return tokens_re.findall(s)

def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens
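A quick usage example of the tokenizer on a made-up title, just to show what preprocess returns:

# Example: tokenize a hypothetical dataset title
preprocess(u'Popolazione straniera residente - Anno 2016', lowercase=True)
# roughly: [u'popolazione', u'straniera', u'residente', u'-', u'anno', u'2016']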
In [35]:
count_only = Counter()
punctuation = list(string.punctuation)
# stop = stopwords.words('english') + punctuation + ['rt', 'via']
stop = punctuation
for i, row in df['ds_title'].iteritems():
    # np.isnan raises a TypeError on strings: NaN titles are skipped,
    # every other title is tokenized and counted
    try:
        np.isnan(row)
    except TypeError:
        terms_only = [term for term in preprocess(row)
                      if term not in stop and
                      not term.startswith(('per', '\xc3', 'di', 'del', 'e', '\xa0', 'della',
                                           'Anno', 'a', 'in', '#039', 'al', 'dei',
                                           'con', 'nel', '12', '10', '1', 'da', '31'))]
        count_only.update(terms_only)
In [36]:
word_freq = count_only.most_common(30)
words_json = [{'text': item[0], 'weight': item[1]} for item in word_freq]
words_json
Out[36]:
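The text/weight pairs are already in the shape expected by most word-cloud widgets, so they can be saved under the output directory. A sketch (the file name is made up):

import json

# Save the 30 most common title terms for an external word-cloud widget
with open(os.path.join(dir_out, 'words_titles.json'), 'w') as f:
    json.dump(words_json, f)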
In [46]:
count_only = Counter()
punctuation = list(string.punctuation)
# stop = stopwords.words('english') + punctuation + ['rt', 'via']
stop = punctuation
for i, row in df['note'].iteritems():
    # np.isnan raises a TypeError on strings: NaN notes are skipped,
    # every other note is tokenized and counted
    try:
        np.isnan(row)
    except TypeError:
        terms_only = [term for term in preprocess(row)
                      if term not in stop and
                      not term.startswith(('per', '\xc3', 'di', 'del', 'e', '\xa0', 'della',
                                           'Anno', 'a', 'in', '#039', 'al', 'dei',
                                           'con', 'nel', '12', '10', '1', 'da', '31',
                                           '<p>', '</p>', '<br />', 'i', 'sul',
                                           '</strong>', '<strong>', 'la', 'le', '\x80',
                                           'xe2', 'Tutti', 'il', 'sono', '\xa8', '\x99',
                                           'che', 'una', 'gli', 'Il', 'n', 'o',
                                           '\xe2', 'si', 'un', '\xc2', 'su', 'questo', 'come',
                                           'stesso', 'I', 'Tutte', '\x93', 'Tipo', 'La', '\xb9',
                                           'Per', 'degli', 'formati', 'pi'))]
        count_only.update(terms_only)
In [51]:
word_freq_note = count_only.most_common(30)
words_note_json = [{'text': item[0], 'weight': item[1]} for item in word_freq_note]
words_note_json
Out[51]:
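Many of the prefixes blacklisted above ('<p>', '</strong>', '\xc3', ...) are HTML tags and mangled encoding left inside the note field; an alternative is to strip the markup before tokenizing, which keeps the stop-prefix list short. A sketch of that variant (tag_re and count_clean are throwaway names):

# Strip HTML tags from the notes before tokenizing, instead of blacklisting them
tag_re = re.compile(r'<[^>]+>')
count_clean = Counter()
for i, row in df['note'].iteritems():
    try:
        np.isnan(row)
    except TypeError:
        clean = tag_re.sub(' ', row)
        count_clean.update(term for term in preprocess(clean, lowercase=True)
                           if term not in stop)
count_clean.most_common(10)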