In [33]:
# Carga de librerías
import requests
import urllib.request
from lxml import html
from bs4 import BeautifulSoup
import pandas as pd
from ggplot import *
%matplotlib inline
In [34]:
# Optional (disabled): display floats with "." as the thousands separator; this variant adds no "$" prefix.
# pd.set_option("display.float_format",lambda x: "{:,}".format(x).replace(",","."))
In [35]:
# Download the directory page and parse it into an lxml element tree.
url = 'http://www.gobiernotransparentechile.cl/directorio/servicios'
headers = {'User-Agent': 'My User Agent 1.0'}
# A timeout keeps the notebook from hanging forever on an unresponsive server.
page = requests.get(url, headers=headers, timeout=30)
# Fail fast on 4xx/5xx instead of silently parsing an error page.
page.raise_for_status()
tree = html.fromstring(page.content)
In [36]:
# Service names: the text of every <a> found under <ul class="resultList"><li>.
servicios = tree.xpath("//ul[@class='resultList']/li/a/text()")
In [37]:
# Preview the first five service names.
servicios[:5]
Out[37]:
In [59]:
# Number of service names extracted from the directory page.
len(servicios)
Out[59]:
In [38]:
# URLs of the services: the href of every <a> found under <ul class="resultList"><li>
# (same elements, same order as the names extracted with the matching text() query).
links = tree.xpath("//ul[@class='resultList']/li/a/@href")
In [39]:
# Append the "per_contrata" listing path to every service URL. The query string
# ends with `page_number=` so that a page number can be concatenated afterwards.
CONTRATA_SUFFIX = '/per_contrata/Ao-2016?x=0&y=0&sort=id&direction=asc&page_number='
# Guarded so that re-running this cell does not append the suffix a second time.
links = [s if s.endswith(CONTRATA_SUFFIX) else s + CONTRATA_SUFFIX for s in links]
In [41]:
def get_html(link, timeout=30):
    '''
    Fetch `link` and return its body decoded as UTF-8.

    Parameters
    ----------
    link : str
        URL to download.
    timeout : float, optional
        Seconds to wait on the connection before giving up (default 30).

    Returns
    -------
    str or None
        The decoded document, or None when the request, the read or the
        UTF-8 decoding fails (best-effort: callers test the result for
        truthiness).
    '''
    headers = {'User-Agent': 'My User Agent 1.0'}
    req = urllib.request.Request(link, data=None, headers=headers)
    try:
        # The context manager closes the connection even if read()/decode() fails.
        with urllib.request.urlopen(req, timeout=timeout) as response:
            return response.read().decode('utf-8')
    except Exception:
        # Deliberately broad (network, HTTP and decoding errors all mean "no page"),
        # but unlike a bare `except:` it lets KeyboardInterrupt/SystemExit through.
        return None
def extract_table(html):
    '''
    Return the first <table> inside <div id="main"> of an HTML document.

    Parameters
    ----------
    html : str or None
        HTML source; None or an empty string is accepted and yields None.

    Returns
    -------
    The matching BeautifulSoup table element, or None when the input is
    empty, the document has no <div id="main">, or that div has no <table>.
    '''
    # A failed download arrives here as None; there is nothing to parse.
    if not html:
        return None
    soup = BeautifulSoup(html, 'lxml')
    main_div = soup.find('div', {'id': 'main'})
    if main_div is None:
        return None
    # find() already returns None when no <table> is present.
    return main_div.find('table')
def create_df(table, servicio):
    '''
    Convert one scraped HTML table into a DataFrame.

    Parameters
    ----------
    table
        Parsed table element exposing `find`/`find_all` (e.g. a BeautifulSoup
        tag) with a <thead> holding the column titles and a <tbody> holding
        the data rows.
    servicio : str
        Service name, stored in the added `servicio` column.

    Returns
    -------
    pandas.DataFrame
        Columns 'Estamento', 'Cargo', 'Grado EUS',
        'Remuneración Bruta Mensualizada' (float), 'Fecha de inicio',
        'Fecha de término' and 'servicio'.

    Raises
    ------
    ValueError
        If the number of header titles does not match the row width, or a
        salary cannot be converted to float.
    KeyError
        If one of the expected columns is missing from the table header.
    '''
    salary_col = 'Remuneración Bruta Mensualizada'
    wanted = ['Estamento', 'Cargo', 'Grado EUS', salary_col,
              'Fecha de inicio', 'Fecha de término']

    # Header: read from the `table` argument, not from a notebook-level global.
    header_row = table.find('thead').find('tr')
    titles = [th.text.strip() for th in header_row.find_all('th')]

    # Body rows: one list of stripped cell texts per <tr>.
    data = [
        [td.text.strip() for td in tr.find_all('td')]
        for tr in table.find('tbody').find_all('tr')
    ]

    # Rows shorter than the widest one are padded with missing values by
    # pandas and discarded here.
    df = pd.DataFrame(data).dropna().reset_index(drop=True)
    df.columns = titles

    # '.' is the thousands separator; regex=False makes the literal match explicit.
    df[salary_col] = df[salary_col].str.replace('.', '', regex=False).astype(float)

    # copy() so the column assignment below never targets a view of `df`.
    df = df[wanted].copy()
    df['servicio'] = servicio
    return df
In [61]:
%time
df = pd.DataFrame()
for i in [s for s in list(range(0,len(links)))]:
n_page = 1
link = links[i]
servicio = servicios[i]
print('Procesando servicio ' + str(i+1) + ':' + servicio)
main_html = get_html(link + str(n_page))
if main_html:
main_table = extract_table(main_html)
if main_table:
try:
df_1 = create_df(main_table,servicio)
df = pd.concat([df,df_1])
except:
continue
while True:
n_page = n_page + 1
sub_html = get_html(link + str(n_page))
sub_table = extract_table(sub_html)
# Se pone un factor de seguridad de un máximo de 10 páginas...
if sub_table and n_page<50:
df_extra = create_df(sub_table,servicio)
df = pd.concat([df,df_extra])
else:
break
In [62]:
# (rows, columns) of the combined DataFrame.
df.shape
Out[62]:
In [63]:
# Preview the first rows of the combined DataFrame.
df.head()
Out[63]:
In [45]:
# Distinct service names present in the `servicio` column.
df['servicio'].unique()
Out[45]:
In [46]:
# Tick labels: 0 to 9 million, "$"-prefixed, with "." as the thousands separator.
labels = ['$ {:,}'.format(millions * 1000000).replace(',', '.') for millions in range(10)]
# Histogram of the monthly gross salary column.
(
    ggplot(df, aes(x='Remuneración Bruta Mensualizada'))
    + geom_histogram(aes(fill='steelblue'))
    + scale_x_continuous(labels=labels)
)
Out[46]:
In [65]:
#pd.set_option("display.float_format",lambda x: "$ {:,.0f}".format(x).replace(",","."))
In [66]:
# Summary statistics, skipping the first row of describe()'s output.
df.describe().iloc[1:]
Out[66]:
In [64]:
# Restore pandas' default float display format.
pd.reset_option("display.float_format")
In [58]:
# Persist the DataFrame as a pickle file (relative path, so it is written
# to the current working directory).
df.to_pickle('sueldos_contrata.pkl')