In [2]:
# imports
import urllib2  
from bs4 import BeautifulSoup 
from bs4 import SoupStrainer
import re
from time import sleep
import pandas as pd

In [79]:
# Base URL of the COPOLAD centre/service directory; the trailing `cur=`
# query parameter selects the results page.
open_url = "https://www.copolad.eu/en/directorio-de-centros-y-servicios?p_p_id=directoriocentrosservicios_WAR_directoriocentrosserviciosportlet_INSTANCE_vD5B&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1&_directoriocentrosservicios_WAR_directoriocentrosserviciosportlet_INSTANCE_vD5B_action=directory&_directoriocentrosservicios_WAR_directoriocentrosserviciosportlet_INSTANCE_vD5B_toolbarItem=directory&_directoriocentrosservicios_WAR_directoriocentrosserviciosportlet_INSTANCE_vD5B_delta=20&_directoriocentrosservicios_WAR_directoriocentrosserviciosportlet_INSTANCE_vD5B_keywords=&_directoriocentrosservicios_WAR_directoriocentrosserviciosportlet_INSTANCE_vD5B_advancedSearch=false&_directoriocentrosservicios_WAR_directoriocentrosserviciosportlet_INSTANCE_vD5B_andOperator=true&_directoriocentrosservicios_WAR_directoriocentrosserviciosportlet_INSTANCE_vD5B_country=null&_directoriocentrosservicios_WAR_directoriocentrosserviciosportlet_INSTANCE_vD5B_typeOrg=null&_directoriocentrosservicios_WAR_directoriocentrosserviciosportlet_INSTANCE_vD5B_province=&_directoriocentrosservicios_WAR_directoriocentrosserviciosportlet_INSTANCE_vD5B_city=&_directoriocentrosservicios_WAR_directoriocentrosserviciosportlet_INSTANCE_vD5B_typeInterv=null&_directoriocentrosservicios_WAR_directoriocentrosserviciosportlet_INSTANCE_vD5B_orderByCol=&_directoriocentrosservicios_WAR_directoriocentrosserviciosportlet_INSTANCE_vD5B_orderByType=&cur="

# One URL per results page (pages 1 through 157).
url_list = [open_url + str(page) for page in range(1, 158)]

# Accumulators for the scraped fields — one list per eventual output column.
center_names = []
direction = []
postal = []
city = []
state = []
phone = []
email = []
website = []
def get_regex(list_a, path, string):
    """Append all regex captures of `path` found in `string` to `list_a`.

    Appends the full list of captured groups, or ['NA'] when nothing
    matches.  The 'NA' sentinel is wrapped in a list so that every element
    of `list_a` is itself a list: the results are later flattened one
    level with itertools.chain, and a bare 'NA' string would be split
    into the characters 'N' and 'A', corrupting column alignment.
    """
    string_cleaned = re.findall(path, string)
    if string_cleaned:
        list_a.append(string_cleaned)
    else:
        list_a.append(['NA'])

for services in url_list:

    # Fetch each page ONCE and reuse the raw HTML for both parsing passes.
    # The original opened every URL twice (once per BeautifulSoup pass),
    # doubling the request load on the remote server for no benefit.
    html = urllib2.urlopen(services).read()

    # Pass 1: restrict parsing to the <dt>/<dd> pairs that carry the
    # per-centre field labels and values, then regex the fields out of
    # the serialized fragment.
    only_dd_tags = SoupStrainer(["dd", "dt"])
    soup = BeautifulSoup(html, 'html.parser', parse_only=only_dd_tags)
    string = str(soup)
    get_regex(direction, "<dt>Dirección:</dt><dd>(.*?)</dd><dt>Código Postal", string)
    get_regex(postal, "<dt>Código Postal:</dt><dd>(.*?)</dd><dt>Ciudad / Localidad:", string)
    get_regex(city, "<dt>Ciudad / Localidad:</dt><dd>(.*?)</dd><dt>Departamento / Provincia:", string)
    get_regex(state, "<dt>Departamento / Provincia:</dt><dd>(.*?)</dd><dt>País:", string)
    get_regex(phone, "<dt>Teléfono:</dt><dd>(.*?)<br/></dd><dt>", string)
    get_regex(email, "Correo electrónico:</dt><dd> <a href=\"mailto:(.*?)\" target=\"_blank\">", string)
    get_regex(website, "Web institucional:</dt><dd> <a href=\"(.*?)\" target=\"_blank\">", string)

    # Pass 2: full parse of the same HTML to pull the centre names out of
    # their title divs (these live outside the <dt>/<dd> structure).
    soup_names = BeautifulSoup(html, 'html.parser')
    string_names = soup_names.find_all('div', class_='titulo-sec')
    string_centers = str(string_names)
    get_regex(center_names, "<div class=\"titulo-sec\">(.*?)<a href=", string_centers)

    # Throttle between pages to be polite to the server.
    sleep(20)

In [104]:
import itertools

def unnest_list(list_user):
    """Flatten a list of lists by one level and return a new list."""
    return list(itertools.chain.from_iterable(list_user))

# Pair each output column name with its accumulator list; the order here
# drives both the concat order and the final column labels.
column_sources = [
    ('direction', direction),
    ('postal', postal),
    ('center_name', center_names),
    ('state', state),
    ('website', website),
    ('city', city),
    ('email', email),
]

# Flatten each accumulator one level and wrap it in a single-column frame,
# then join the frames side by side (axis=1 pads shorter columns with NaN).
frames = [pd.DataFrame({name: unnest_list(values)}) for name, values in column_sources]
full_df = pd.concat(frames, ignore_index=True, axis=1)
full_df.columns = [name for name, _ in column_sources]

In [106]:
full_df.to_csv('~/cicad/copolad.csv')