GIS with and

Part III: Data Munging...Combining GIS with Other Tools

Set-up our environment as before

Let's import the packages we will use and set the paths for outputs.


In [1]:
# Let's import pandas and some other basic packages we will use 
from __future__ import division

import pandas as pd
import numpy as np
import os, sys

# GIS packages
import geopandas as gpd
from geopandas.tools import overlay
from shapely.geometry import Polygon, Point
import georasters as gr
# Alias for Geopandas
gp = gpd

# Plotting
import matplotlib as mpl
import seaborn as sns
# Setup seaborn
sns.set()

# Mapping
import geoplot as gplt
import geoplot.crs as gcrs
import mapclassify as mc
import textwrap

%pylab --no-import-all
%matplotlib inline


Using matplotlib backend: MacOSX
Populating the interactive namespace from numpy and matplotlib

In [2]:
# Functions for plotting
def center_wrap(text, cwidth=32, **kw):
    '''Center Text (to be used in legend)'''
    lines = text
    #lines = textwrap.wrap(text, **kw)
    return "\n".join(line.center(cwidth) for line in lines)

def MyChloropleth(mydf, myfile='', myvar='',
                  mylegend='',
                  k=5,
                  extent=[-180, -90, 180, 90],
                  bbox_to_anchor=(0.2, 0.5),
                  edgecolor='white', facecolor='lightgray',
                  scheme='FisherJenks', bins=None, pct=None,
                  legend_labels=None,
                  save=True,
                  percent=False,
                  cmap='Reds',
                  **kwargs):
    # Chloropleth
    # Color scheme
    if scheme=='EqualInterval':
        scheme = mc.EqualInterval(mydf[myvar], k=k)
    elif scheme=='Quantiles':
        scheme = mc.Quantiles(mydf[myvar], k=k)
    elif scheme=='BoxPlot':
        scheme = mc.BoxPlot(mydf[myvar], k=k)
    elif scheme=='FisherJenks':
        scheme = mc.FisherJenks(mydf[myvar], k=k)
    elif scheme=='FisherJenksSampled':
        scheme = mc.FisherJenksSampled(mydf[myvar], k=k)
    elif scheme=='HeadTailBreaks':
        scheme = mc.HeadTailBreaks(mydf[myvar], k=k)
    elif scheme=='JenksCaspall':
        scheme = mc.JenksCaspall(mydf[myvar], k=k)
    elif scheme=='JenksCaspallForced':
        scheme = mc.JenksCaspallForced(mydf[myvar], k=k)
    elif scheme=='JenksCaspallSampled':
        scheme = mc.JenksCaspallSampled(mydf[myvar], k=k)
    elif scheme=='KClassifiers':
        scheme = mc.KClassifiers(mydf[myvar], k=k)
    elif scheme=='Percentiles':
        scheme = mc.Percentiles(mydf[myvar], pct=pct)
    elif scheme=='UserDefined':
        scheme = mc.UserDefined(mydf[myvar], bins=bins)
    
    if legend_labels is None:
        # Format legend
        upper_bounds = scheme.bins
        # get and format all bounds
        bounds = []
        for index, upper_bound in enumerate(upper_bounds):
            if index == 0:
                lower_bound = mydf[myvar].min()
            else:
                lower_bound = upper_bounds[index-1]
            # format the numerical legend here
            if percent:
                bound = f'{lower_bound:.0%} - {upper_bound:.0%}'
            else:
                bound = f'{float(lower_bound):,.0f} - {float(upper_bound):,.0f}'
            bounds.append(bound)
        legend_labels = bounds
    #Plot
    ax = gplt.choropleth(
        mydf, hue=myvar, projection=gcrs.PlateCarree(central_longitude=0.0, globe=None),
        edgecolor='white', linewidth=1,
        cmap=cmap, legend=True,
        scheme=scheme,
        legend_kwargs={'bbox_to_anchor': bbox_to_anchor,
                       'frameon': True,
                       'title':mylegend,
                       },
        legend_labels = legend_labels,
        figsize=(24, 16),
        rasterized=True,
    )
    gplt.polyplot(
        countries, projection=gcrs.PlateCarree(central_longitude=0.0, globe=None),
        edgecolor=edgecolor, facecolor=facecolor,
        ax=ax,
        rasterized=True,
        extent=extent,
    )
    if save:
        plt.savefig(pathgraphs + myfile + '_' + myvar +'.pdf', dpi=300, bbox_inches='tight')
        plt.savefig(pathgraphs + myfile + '_' + myvar +'.png', dpi=300, bbox_inches='tight')
    pass

In [3]:
# Paths
pathout = './data/'

if not os.path.exists(pathout):
    os.mkdir(pathout)
    
pathgraphs = './graphs/'
if not os.path.exists(pathgraphs):
    os.mkdir(pathgraphs)

Let's plot the countries for which Colombian citizens do not require visas

The Colombian Cancillery's website has a list with visa requirements for colombians. Let's use it to map countries for which visas are not required. Below is the link to the information. The problem is that it is a pdf file. Let's open the website and check it out


In [4]:
# Import display options for showing websites
from IPython.display import IFrame

url = 'https://www.cancilleria.gov.co/sites/default/files/FOTOS2020/relacion_de_paises_que_exigen_o_no_visas_a_colombianos_17-04-2020.pdf'
IFrame(url, width=800, height=400)


Out[4]:

Roadblock

Someone forgot to make our life easy and made the data available in a pdf.

Luckily python has tools to deal with this.

So let's download it, save it to disk and use these tools to process the pdf into a pandas.DataFrame.


In [5]:
# Import package for downloading internet content and save it to file
import requests

url = 'https://www.cancilleria.gov.co/sites/default/files/FOTOS2020/relacion_de_paises_que_exigen_o_no_visas_a_colombianos_17-04-2020.pdf'
response = requests.get(url)
with open(pathout + 'visas.pdf', 'wb') as f:
    f.write(response.content)

In [6]:
# Import package to read pdf tables
import camelot
visas = camelot.read_pdf(pathout + 'visas.pdf', pages='1-7')

Let's explore the visas object


In [7]:
visas


Out[7]:
<TableList n=7>

So there are 7 tables in visas. What does Table 1 have?


In [8]:
visas[0]


Out[8]:
<Table shape=(28, 3)>

In [9]:
visas[0].df


Out[9]:
0 1 2
0 MINISTERIO DE RELACIONES EXTERIORES DE COLOMBIA
1 DIRECCION DE ASUNTOS MIGRATORIOS, CONSULARES Y...
2 COORDINACION DE VISAS E INMIGRACION
3 Estados y territorios que exigen o NO visas a ...
4 EXIGEN VISA A
5 PAIS SI NO
6 Afganistán X
7 Albania X
8 Alemania X
9 Andorra X
10 Angola X
11 Antigua y Barbuda X
12 Arabia Saudita X
13 Argelia X
14 Argentina X
15 Armenia
16 Australia X X
17 Austria X
18 Azerbaiyán X (Visa electrónica)
19 Bahamas X
20 Bahréin X (visa a la llegada y visa electrónica)
21 Bangladesh X
22 Barbados X
23 Bélgica X
24 Belice X
25 Benin
26 Belarús X X
27 Bolivia X

Ok, let's concatenate all these pandas dataframes.


In [10]:
visadf = pd.concat([i.df for i in visas])
visadf


Out[10]:
0 1 2
0 MINISTERIO DE RELACIONES EXTERIORES DE COLOMBIA
1 DIRECCION DE ASUNTOS MIGRATORIOS, CONSULARES Y...
2 COORDINACION DE VISAS E INMIGRACION
3 Estados y territorios que exigen o NO visas a ...
4 EXIGEN VISA A
... ... ... ...
20 Taiwan X Visa electrónica
21 Wallis y Futuna (Francia) X
22
23 Actualización 21 -10-2019
24 El presente cuadro presenta generalidades sobr...

225 rows × 3 columns

We need to correct the header


In [11]:
visadf.columns = visadf.iloc[5]

In [12]:
visadf.head(10)


Out[12]:
5 PAIS SI NO
0 MINISTERIO DE RELACIONES EXTERIORES DE COLOMBIA
1 DIRECCION DE ASUNTOS MIGRATORIOS, CONSULARES Y...
2 COORDINACION DE VISAS E INMIGRACION
3 Estados y territorios que exigen o NO visas a ...
4 EXIGEN VISA A
5 PAIS SI NO
6 Afganistán X
7 Albania X
8 Alemania X
9 Andorra X

In [13]:
visadf = visadf.iloc[6:].copy()

In [14]:
visadf.columns.name = ''

In [15]:
visadf.head(10)


Out[15]:
PAIS SI NO
6 Afganistán X
7 Albania X
8 Alemania X
9 Andorra X
10 Angola X
11 Antigua y Barbuda X
12 Arabia Saudita X
13 Argelia X
14 Argentina X
15 Armenia

Let's code SI (YES) as 1 and NO as 0


In [16]:
visadf['visa_req'] = visadf.SI.map({'X':1, '':0})

Let's check whether things were mapped correctly


In [17]:
visadf.loc[visadf.visa_req.isna()]


Out[17]:
PAIS SI NO visa_req
16 Australia X X NaN
18 Azerbaiyán X (Visa electrónica) NaN
20 Bahréin X (visa a la llegada y visa electrónica) NaN
26 Belarús X X NaN
6 Burundi X X X NaN
8 Cabo Verde X (Visa a la llegada) NaN
9 Camboya X (Visa a la llegada) NaN
11 Canadá X X X NaN
18 Congo X X X NaN
22 Costa de Marfil X X NaN
30 Egipto X (Visa a la llegada) NaN
5 Fiji X X NaN
13 Granada X X NaN
17 Guinea-Bissau X X X NaN
25 Irán X X X X X NaN
30 Islas Salomón X X NaN
35 Jordania X X NaN
1 Kenia X Visa a la llegada NaN
3 Kiribati X X NaN
6 Laos República Democrática P X Visa a la llegada NaN
11 Libia X X NaN
17 Malasia X X X NaN
23 Mauricio X X X NaN
32 Myanmar X (Visa a la llegada) NaN
1 Nicaragua X (visa a la llegada para titulares de visa de... NaN
3 Nigeria X X NaN
6 Omán X (Visa de turismo al ingreso a Omán en los pu... NaN
9 Palau X X NaN
22 Ruanda X (Visa electrónica) NaN
4 Sierra Leona X X NaN
9 Sudáfrica X X X X X X NaN
16 Tailandia X X NaN
17 Tanzania X Visa a la llegada NaN
20 Togo X X X X NaN
31 Vanuatu X X NaN
34 Yemen X X X NaN
7 Macao (SARG-China) (*) X Visa a la llegada NaN
20 Taiwan X Visa electrónica NaN

In [18]:
IFrame(url, width=800, height=400)


Out[18]:

In [19]:
visadf.loc[(visadf.SI=='X X') | (visadf.SI.shift(1)=='X X')  | (visadf.SI.shift(-1)=='X X')]


Out[19]:
PAIS SI NO visa_req
15 Armenia 0.0
16 Australia X X NaN
17 Austria X 0.0
25 Benin 0.0
26 Belarús X X NaN
27 Bolivia X 0.0
21 Corea República Popular Dem. 0.0
22 Costa de Marfil X X NaN
23 Costa Rica X A titulares de Visa de EE UU o Schengen vigen... 1.0
4 Etiopía 0.0
5 Fiji X X NaN
6 Filipinas X Hasta por 30 días 0.0
12 Ghana 0.0
13 Granada X X NaN
14 Grecia X 0.0
29 Islas Marshall 0.0
30 Islas Salomón X X NaN
31 Israel X 0.0
34 Japón 0.0
35 Jordania X X NaN
0 Kazajstán X (Hasta por 30 días) 0.0
2 Kirguistán 0.0
3 Kiribati X X NaN
4 Kuwait X 1.0
10 Liberia 0.0
11 Libia X X NaN
12 Liechtenstein X 0.0
2 Níger 0.0
3 Nigeria X X NaN
4 Noruega X 0.0
8 Pakistán 0.0
9 Palau X X NaN
10 Panamá X 0.0
3 Seychelles 0.0
4 Sierra Leona X X NaN
5 Singapur X Hasta por 30 días 0.0
15 Suazilandia 0.0
16 Tailandia X X NaN
17 Tanzania X Visa a la llegada NaN
30 Uzbekistán 0.0
31 Vanuatu X X NaN
32 Venezuela X 0.0

In [20]:
visadf.loc[(visadf.SI=='X X X') | (visadf.SI.shift(1)=='X X X')  | (visadf.SI.shift(-1)=='X X X')]


Out[20]:
PAIS SI NO visa_req
5 Burkina Faso 0.0
6 Burundi X X X NaN
7 Bután 0.0
10 Camerún 0.0
11 Canadá X X X NaN
12 Chad 0.0
17 Comoras 0.0
18 Congo X X X NaN
19 Congo República Democrática 0.0
16 Guinea 0.0
17 Guinea-Bissau X X X NaN
18 Guinea Ecuatorial 0.0
16 Madagascar 0.0
17 Malasia X X X NaN
18 Malawi 0.0
22 Marruecos 0.0
23 Mauricio X X X NaN
24 Mauritania 0.0
33 Vietnam 0.0
34 Yemen X X X NaN
35 Zambia 0.0

Ok it seems we have two types of errors. First, notince that sometimes the type of visa is defined, e.g., Azerbayán. Second, the OCR software has mixed some rows, so that now we have XX, XXX, etc. Looking at the pdf it seems this is due to assigning an X from a previous row to the current row ("X X") or from both the previous and next ("X X X"). Let's try to correct these errors programatically (obviously sometimes it may just be faster and better to export the dataframe, correct it by hand snd then load the corrected one, but we're here to learn, right?).

First, let's replace the repeated X with what seems to be the correct data.

X X


In [21]:
visadf.loc[(visadf.SI=='X X') | (visadf.SI.shift(-1)=='X X'), 'visa_req'] = 1
visadf.loc[(visadf.SI=='X X') | (visadf.SI.shift(-1)=='X X')]


Out[21]:
PAIS SI NO visa_req
15 Armenia 1.0
16 Australia X X 1.0
25 Benin 1.0
26 Belarús X X 1.0
21 Corea República Popular Dem. 1.0
22 Costa de Marfil X X 1.0
4 Etiopía 1.0
5 Fiji X X 1.0
12 Ghana 1.0
13 Granada X X 1.0
29 Islas Marshall 1.0
30 Islas Salomón X X 1.0
34 Japón 1.0
35 Jordania X X 1.0
2 Kirguistán 1.0
3 Kiribati X X 1.0
10 Liberia 1.0
11 Libia X X 1.0
2 Níger 1.0
3 Nigeria X X 1.0
8 Pakistán 1.0
9 Palau X X 1.0
3 Seychelles 1.0
4 Sierra Leona X X 1.0
15 Suazilandia 1.0
16 Tailandia X X 1.0
30 Uzbekistán 1.0
31 Vanuatu X X 1.0

X X X


In [22]:
visadf.loc[(visadf.SI=='X X X') | (visadf.SI.shift(1)=='X X X')  | (visadf.SI.shift(-1)=='X X X'), 'visa_req'] =1
visadf.loc[(visadf.SI=='X X X') | (visadf.SI.shift(1)=='X X X')  | (visadf.SI.shift(-1)=='X X X')]


Out[22]:
PAIS SI NO visa_req
5 Burkina Faso 1.0
6 Burundi X X X 1.0
7 Bután 1.0
10 Camerún 1.0
11 Canadá X X X 1.0
12 Chad 1.0
17 Comoras 1.0
18 Congo X X X 1.0
19 Congo República Democrática 1.0
16 Guinea 1.0
17 Guinea-Bissau X X X 1.0
18 Guinea Ecuatorial 1.0
16 Madagascar 1.0
17 Malasia X X X 1.0
18 Malawi 1.0
22 Marruecos 1.0
23 Mauricio X X X 1.0
24 Mauritania 1.0
33 Vietnam 1.0
34 Yemen X X X 1.0
35 Zambia 1.0

X X X X


In [23]:
visadf.loc[(visadf.SI=='X X X X') | (visadf.SI.shift(1)=='X X X X')  | (visadf.SI.shift(-1)=='X X X X') | (visadf.SI.shift(2)=='X X X X')  | (visadf.SI.shift(-2)=='X X X X')  | (visadf.SI.shift(-3)=='X X X X')]


Out[23]:
PAIS SI NO visa_req
17 Tanzania X Visa a la llegada NaN
18 Tayikistán 0.0
19 Timor Oriental 0.0
20 Togo X X X X NaN
21 Tonga 0.0
22 Trinidad y Tobago X 0.0

In [24]:
visadf.loc[(visadf.SI=='X X X X') | (visadf.SI.shift(1)=='X X X X')  | (visadf.SI.shift(-1)=='X X X X') | (visadf.SI.shift(-2)=='X X X X'), 'visa_req'] = 1
visadf.loc[(visadf.SI=='X X X X') | (visadf.SI.shift(1)=='X X X X')  | (visadf.SI.shift(-1)=='X X X X') | (visadf.SI.shift(-2)=='X X X X')]


Out[24]:
PAIS SI NO visa_req
18 Tayikistán 1.0
19 Timor Oriental 1.0
20 Togo X X X X 1.0
21 Tonga 1.0

X X X X X


In [25]:
visadf.loc[(visadf.SI=='X X X X X') | (visadf.SI.shift(1)=='X X X X X')  | (visadf.SI.shift(-1)=='X X X X X') | (visadf.SI.shift(-2)=='X X X X X') | (visadf.SI.shift(2)=='X X X X X')]


Out[25]:
PAIS SI NO visa_req
23 India 0.0
24 Indonesia 0.0
25 Irán X X X X X NaN
26 Iraq 0.0
27 Irlanda 0.0

In [26]:
visadf.loc[(visadf.SI=='X X X X X') | (visadf.SI.shift(1)=='X X X X X')  | (visadf.SI.shift(-1)=='X X X X X') | (visadf.SI.shift(-2)=='X X X X X') | (visadf.SI.shift(2)=='X X X X X'), 'visa_req'] = 1
visadf.loc[(visadf.SI=='X X X X X') | (visadf.SI.shift(1)=='X X X X X')  | (visadf.SI.shift(-1)=='X X X X X') | (visadf.SI.shift(-2)=='X X X X X') | (visadf.SI.shift(2)=='X X X X X')]


Out[26]:
PAIS SI NO visa_req
23 India 1.0
24 Indonesia 1.0
25 Irán X X X X X 1.0
26 Iraq 1.0
27 Irlanda 1.0

X X X X X X


In [27]:
visadf.loc[(visadf.SI=='X X X X X X') | (visadf.SI.shift(1)=='X X X X X X')  | (visadf.SI.shift(-1)=='X X X X X X') | (visadf.SI.shift(-2)=='X X X X X X') | (visadf.SI.shift(2)=='X X X X X X') | (visadf.SI.shift(-3)=='X X X X X X') | (visadf.SI.shift(3)=='X X X X X X')]


Out[27]:
PAIS SI NO visa_req
6 Siria 0.0
7 Somalia 0.0
8 Sri Lanka 0.0
9 Sudáfrica X X X X X X NaN
10 Sudán del Sur 0.0
11 Sudán 0.0
12 Suecia X 0.0

In [28]:
visadf.loc[(visadf.SI=='X X X X X X') | (visadf.SI.shift(1)=='X X X X X X')  | (visadf.SI.shift(-1)=='X X X X X X') | (visadf.SI.shift(-2)=='X X X X X X') | (visadf.SI.shift(2)=='X X X X X X') | (visadf.SI.shift(-3)=='X X X X X X'), 'visa_req'] = 1
visadf.loc[(visadf.SI=='X X X X X X') | (visadf.SI.shift(1)=='X X X X X X')  | (visadf.SI.shift(-1)=='X X X X X X') | (visadf.SI.shift(-2)=='X X X X X X') | (visadf.SI.shift(2)=='X X X X X X') | (visadf.SI.shift(-3)=='X X X X X X')]


Out[28]:
PAIS SI NO visa_req
6 Siria 1.0
7 Somalia 1.0
8 Sri Lanka 1.0
9 Sudáfrica X X X X X X 1.0
10 Sudán del Sur 1.0
11 Sudán 1.0

Let's also replace visa required for any row that has the word "visa".


In [29]:
visadf.loc[visadf.SI.str.lower().str.find('visa')!=-1]


Out[29]:
PAIS SI NO visa_req
18 Azerbaiyán X (Visa electrónica) NaN
20 Bahréin X (visa a la llegada y visa electrónica) NaN
8 Cabo Verde X (Visa a la llegada) NaN
9 Camboya X (Visa a la llegada) NaN
30 Egipto X (Visa a la llegada) NaN
1 Kenia X Visa a la llegada NaN
6 Laos República Democrática P X Visa a la llegada NaN
32 Myanmar X (Visa a la llegada) NaN
1 Nicaragua X (visa a la llegada para titulares de visa de... NaN
6 Omán X (Visa de turismo al ingreso a Omán en los pu... NaN
22 Ruanda X (Visa electrónica) NaN
17 Tanzania X Visa a la llegada NaN
7 Macao (SARG-China) (*) X Visa a la llegada NaN
20 Taiwan X Visa electrónica NaN

In [30]:
visadf.loc[visadf.SI.str.lower().str.find('visa')!=-1, 'visa_req'] = 1
visadf.loc[visadf.SI.str.lower().str.find('visa')!=-1]


Out[30]:
PAIS SI NO visa_req
18 Azerbaiyán X (Visa electrónica) 1.0
20 Bahréin X (visa a la llegada y visa electrónica) 1.0
8 Cabo Verde X (Visa a la llegada) 1.0
9 Camboya X (Visa a la llegada) 1.0
30 Egipto X (Visa a la llegada) 1.0
1 Kenia X Visa a la llegada 1.0
6 Laos República Democrática P X Visa a la llegada 1.0
32 Myanmar X (Visa a la llegada) 1.0
1 Nicaragua X (visa a la llegada para titulares de visa de... 1.0
6 Omán X (Visa de turismo al ingreso a Omán en los pu... 1.0
22 Ruanda X (Visa electrónica) 1.0
17 Tanzania X Visa a la llegada 1.0
7 Macao (SARG-China) (*) X Visa a la llegada 1.0
20 Taiwan X Visa electrónica 1.0

Let's check again


In [31]:
visadf.loc[visadf.visa_req.isna()]


Out[31]:
PAIS SI NO visa_req

Ok, it seems we have coded which countries need and which do not need visa for colombian citizens. Let's analyze this data a bit.


In [32]:
visadf['visa_req_YN'] = visadf.visa_req.map({0:'NO', 1:'YES'})
visadf


Out[32]:
PAIS SI NO visa_req visa_req_YN
6 Afganistán X 1.0 YES
7 Albania X 0.0 NO
8 Alemania X 0.0 NO
9 Andorra X 0.0 NO
10 Angola X 1.0 YES
... ... ... ... ... ...
20 Taiwan X Visa electrónica 1.0 YES
21 Wallis y Futuna (Francia) X 0.0 NO
22 0.0 NO
23 Actualización 21 -10-2019 0.0 NO
24 El presente cuadro presenta generalidades sobr... 0.0 NO

219 rows × 5 columns


In [33]:
visadf.hist()
visadf.visa_req.describe()


Out[33]:
count    219.000000
mean       0.547945
std        0.498836
min        0.000000
25%        0.000000
50%        1.000000
75%        1.000000
max        1.000000
Name: visa_req, dtype: float64

In [34]:
df = visadf.groupby('visa_req_YN').count().reset_index()
df


Out[34]:
visa_req_YN PAIS SI NO visa_req
0 NO 99 99 99 99
1 YES 120 120 120 120

In [35]:
sns.set(rc={'figure.figsize':(11.7,8.27)})
#sns.reset_orig()
sns.set_context("talk")
# Plot
fig, ax = plt.subplots()
sns.barplot(x='visa_req_YN', y='visa_req', data=df, alpha=1)
ax.tick_params(axis = 'both', which = 'major')
ax.tick_params(axis = 'both', which = 'minor')
ax.set_xlabel('Visa Required')
ax.set_ylabel('Number of Countries')


Out[35]:
Text(0, 0.5, 'Number of Countries')

Let's try to map these countries. First let's get the Natural Earth shapefile.


In [36]:
countries = gpd.read_file('https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/cultural/ne_10m_admin_0_countries.zip')

In [37]:
countries


Out[37]:
featurecla scalerank LABELRANK SOVEREIGNT SOV_A3 ADM0_DIF LEVEL TYPE ADMIN ADM0_A3 ... NAME_KO NAME_NL NAME_PL NAME_PT NAME_RU NAME_SV NAME_TR NAME_VI NAME_ZH geometry
0 Admin-0 country 5 2 Indonesia IDN 0 2 Sovereign country Indonesia IDN ... 인도네시아 Indonesië Indonezja Indonésia Индонезия Indonesien Endonezya Indonesia 印度尼西亚 MULTIPOLYGON (((117.70361 4.16341, 117.70361 4...
1 Admin-0 country 5 3 Malaysia MYS 0 2 Sovereign country Malaysia MYS ... 말레이시아 Maleisië Malezja Malásia Малайзия Malaysia Malezya Malaysia 马来西亚 MULTIPOLYGON (((117.70361 4.16341, 117.69711 4...
2 Admin-0 country 6 2 Chile CHL 0 2 Sovereign country Chile CHL ... 칠레 Chili Chile Chile Чили Chile Şili Chile 智利 MULTIPOLYGON (((-69.51009 -17.50659, -69.50611...
3 Admin-0 country 0 3 Bolivia BOL 0 2 Sovereign country Bolivia BOL ... 볼리비아 Bolivia Boliwia Bolívia Боливия Bolivia Bolivya Bolivia 玻利維亞 POLYGON ((-69.51009 -17.50659, -69.51009 -17.5...
4 Admin-0 country 0 2 Peru PER 0 2 Sovereign country Peru PER ... 페루 Peru Peru Peru Перу Peru Peru Peru 秘鲁 MULTIPOLYGON (((-69.51009 -17.50659, -69.63832...
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
250 Admin-0 country 0 4 China CH1 1 2 Country Macao S.A.R MAC ... 마카오 Macau Makau Macau Макао Macao Makao Ma Cao 澳門 MULTIPOLYGON (((113.55860 22.16303, 113.56943 ...
251 Admin-0 country 6 5 Australia AU1 1 2 Dependency Ashmore and Cartier Islands ATC ... 애시모어 카르티에 제도 Ashmore- en Cartiereilanden Wyspy Ashmore i Cartiera Ilhas Ashmore e Cartier Острова Ашмор и Картье Ashmore- och Cartieröarna Ashmore ve Cartier Adaları Quần đảo Ashmore và Cartier 阿什莫尔和卡捷岛 POLYGON ((123.59702 -12.42832, 123.59775 -12.4...
252 Admin-0 country 6 8 Bajo Nuevo Bank (Petrel Is.) BJN 0 2 Indeterminate Bajo Nuevo Bank (Petrel Is.) BJN ... 바호 누에보 뱅크 Bajo Nuevo Bajo Nuevo Ilha Baixo Novo Бахо-Нуэво Bajo Nuevo Bajo Nuevo Bank Bajo Nuevo Bank 巴霍努埃沃礁 POLYGON ((-79.98929 15.79495, -79.98782 15.796...
253 Admin-0 country 6 5 Serranilla Bank SER 0 2 Indeterminate Serranilla Bank SER ... 세라냐 뱅크 Serranilla Isla Serranilla Ilha Serranilla Серранилья-Банк Serranilla Bank Serranilla Bank Serranilla Bank 塞拉纳浅滩 POLYGON ((-78.63707 15.86209, -78.64041 15.864...
254 Admin-0 country 6 6 Scarborough Reef SCR 0 2 Indeterminate Scarborough Reef SCR ... 스카버러 암초 Scarborough-rif Huangyan Dao Recife de Scarborough Скарборо-Шол Scarboroughrevet Scarborough Shoal Bãi cạn Scarborough 黄岩岛 POLYGON ((117.75389 15.15437, 117.75569 15.151...

255 rows × 95 columns

Luckily there are country names in Spanish. Let's see if we can merge these two data sets.


In [38]:
countries.NAME_ES


Out[38]:
0                    Indonesia
1                      Malasia
2                        Chile
3                      Bolivia
4                         Perú
                ...           
250                      Macao
251    Islas Ashmore y Cartier
252                 Bajo Nuevo
253            Isla Serranilla
254           Bajo de Masinloc
Name: NAME_ES, Length: 255, dtype: object

In [39]:
col_visa = countries.merge(visadf, left_on='NAME_ES', right_on='PAIS')

In [40]:
cmap = mpl.colors.ListedColormap(['blue', 'red'])
mylegend = center_wrap(["Visa Requirements", "For Colombian Citizens"], cwidth=32, width=32)
MyChloropleth(mydf=col_visa, myfile='col_visa', myvar='visa_req', mylegend=mylegend, k=1, bbox_to_anchor=(0.25, 0.3),
                  edgecolor='white', facecolor='lightgray', cmap=cmap, scheme='UserDefined', bins=[0,1], legend_labels=['NO', 'YES'],
                  save=False)


So it seems not everythung merged correctly


In [41]:
col_visa.shape


Out[41]:
(164, 100)

In [42]:
visadf.shape


Out[42]:
(219, 5)

In [43]:
col_visa.loc[col_visa.visa_req.isna(), 'NAME_ES'].sort_values()


Out[43]:
Series([], Name: NAME_ES, dtype: object)

So we are not linking all countries. This is usually due to symbols like accents and ~, but in this case also because the tail of the data frame includes territories of countries, so their names are non-standard (and OCR may have made some mistakes).


In [44]:
visadf.tail(25)


Out[44]:
PAIS SI NO visa_req visa_req_YN
0 OTROS TERRITORIOS 0.0 NO
1 Aruba (Países Bajos) X 0.0 NO
2 Bonaire (Países Bajos) X 0.0 NO
3 Curazao (Países Bajos) X 0.0 NO
4 Guadalupe (Francia) X 0.0 NO
5 Guyana Francesa X 0.0 NO
6 Hong Kong (SARG-China) X Por 90 días 0.0 NO
7 Macao (SARG-China) (*) X Visa a la llegada 1.0 YES
8 Martinica (Francia) X 0.0 NO
9 Mayotte (Francia) X 0.0 NO
10 Nueva Caledonia (Francia) X 0.0 NO
11 Palestina X 1.0 YES
12 Polinesia Francesa X 0.0 NO
13 Réunion (Francia) X 0.0 NO
14 Saba (Países Bajos) X 0.0 NO
15 Saint Barthélémy (Francia) X 1.0 YES
16 Saint Pïerre et Miquelon (Francia) X 0.0 NO
17 Saint Martin (Francia) X 1.0 YES
18 Sint Maarten (Países Bajos) X 0.0 NO
19 Sint Eustatius (Países Bajos) X 0.0 NO
20 Taiwan X Visa electrónica 1.0 YES
21 Wallis y Futuna (Francia) X 0.0 NO
22 0.0 NO
23 Actualización 21 -10-2019 0.0 NO
24 El presente cuadro presenta generalidades sobr... 0.0 NO

Let's correct the country names to improve matching. It's always a good practice to keep the original names.


In [45]:
visadf['PAIS_OR'] = visadf.PAIS

In [46]:
visadf.loc[visadf.PAIS.str.find('(')!=-1, 'PAIS'] = visadf.loc[visadf.PAIS_OR.str.find('(')!=-1, 'PAIS_OR'].apply(lambda x: x[:x.find('(')])
visadf.PAIS = visadf.PAIS.str.strip()

In [47]:
visadf.tail(30)


Out[47]:
PAIS SI NO visa_req visa_req_YN PAIS_OR
32 Venezuela X 0.0 NO Venezuela
33 Vietnam 1.0 YES Vietnam
34 Yemen X X X 1.0 YES Yemen
35 Zambia 1.0 YES Zambia
36 Zimbabwe X 1.0 YES Zimbabwe
0 OTROS TERRITORIOS 0.0 NO OTROS TERRITORIOS
1 Aruba X 0.0 NO Aruba (Países Bajos)
2 Bonaire X 0.0 NO Bonaire (Países Bajos)
3 Curazao X 0.0 NO Curazao (Países Bajos)
4 Guadalupe X 0.0 NO Guadalupe (Francia)
5 Guyana Francesa X 0.0 NO Guyana Francesa
6 Hong Kong X Por 90 días 0.0 NO Hong Kong (SARG-China)
7 Macao X Visa a la llegada 1.0 YES Macao (SARG-China) (*)
8 Martinica X 0.0 NO Martinica (Francia)
9 Mayotte X 0.0 NO Mayotte (Francia)
10 Nueva Caledonia X 0.0 NO Nueva Caledonia (Francia)
11 Palestina X 1.0 YES Palestina
12 Polinesia Francesa X 0.0 NO Polinesia Francesa
13 Réunion X 0.0 NO Réunion (Francia)
14 Saba X 0.0 NO Saba (Países Bajos)
15 Saint Barthélémy X 1.0 YES Saint Barthélémy (Francia)
16 Saint Pïerre et Miquelon X 0.0 NO Saint Pïerre et Miquelon (Francia)
17 Saint Martin X 1.0 YES Saint Martin (Francia)
18 Sint Maarten X 0.0 NO Sint Maarten (Países Bajos)
19 Sint Eustatius X 0.0 NO Sint Eustatius (Países Bajos)
20 Taiwan X Visa electrónica 1.0 YES Taiwan
21 Wallis y Futuna X 0.0 NO Wallis y Futuna (Francia)
22 0.0 NO
23 Actualización 21 -10-2019 0.0 NO Actualización 21 -10-2019
24 El presente cuadro presenta generalidades sobr... 0.0 NO El presente cuadro presenta generalidades sobr...

In [48]:
col_visa = countries.merge(visadf, left_on='NAME_ES', right_on='PAIS')
cmap = mpl.colors.ListedColormap(['blue', 'red'])
mylegend = center_wrap(["Visa Requirements", "For Colombian Citizens"], cwidth=32, width=32)
MyChloropleth(mydf=col_visa, myfile='col_visa', myvar='visa_req', mylegend=mylegend, k=1, bbox_to_anchor=(0.25, 0.3),
                  edgecolor='white', facecolor='lightgray', cmap=cmap, scheme='UserDefined', bins=[0,1], legend_labels=['NO', 'YES'],
                  save=False)



In [49]:
col_visa.shape


Out[49]:
(171, 101)

Ok, that helped a bit. Let's see what else is different. Let's start by finding which countries are not linked.


In [50]:
miss_countries = list(set(countries.NAME_ES).difference(col_visa.NAME_ES))
miss_countries.remove(None)
#miss_countries.sort()
miss_visadf = list(set(visadf.PAIS).difference(col_visa.PAIS))
miss_visadf.remove('')
miss_visadf.sort()
print('Misssing countries', miss_countries)
print('')
print('Missing PAIS', miss_visadf)


Misssing countries ['Puerto Rico', 'Islas del Mar del Coral', 'Islas ultramarinas de Estados Unidos', 'Zimbabue', 'Papúa Nueva Guinea', 'Lesoto', 'Línea Verde', 'Laos', 'San Pedro y Miquelón', 'Bielorrusia', 'Jersey', 'Brunéi', 'Guernsey', 'Colombia', 'República del Congo', 'Islas Pitcairn', 'Islas Ashmore y Cartier', 'Corea del Norte', 'República Turca del Norte de Chipre', 'Antártida', 'Base Naval de la Bahía de Guantánamo', 'Somalilandia', 'Baikonur', 'Isla de Man', 'República Popular China', 'San Cristóbal y Nieves', 'San Martín', 'Guam', 'Malaui', 'Tierras Australes y Antárticas Francesas', 'Bermudas', 'Irak', 'Islas Spratly', 'Birmania', 'Bajo de Masinloc', 'Benín', 'Catar', 'San Bartolomé', 'Islas Caimán', 'Islas Vírgenes de los Estados Unidos', 'Samoa Estadounidense', 'Islas Cook', 'Palaos', 'Fiyi', 'República de Macedonia', 'Yibuti', 'Ciudad del Vaticano', 'Islas Feroe', 'Montserrat', 'Islas Heard y McDonald', 'Bangladés', 'Isla Clipperton', 'Isla Santa Elena', 'Moldavia', 'Guinea-Bisáu', 'Gibraltar', 'Islas Georgias del Sur y Sándwich del Sur', 'Islas Malvinas', 'Taiwán', 'Baréin', 'Estados Unidos', 'Corea del Sur', 'eSwatini', 'Bajo Nuevo', 'República Democrática del Congo', 'Isla Norfolk', 'Anguila', 'Sahara Occidental', 'Kazajistán', 'Glaciar de Siachen', 'Islas Turcas y Caicos', 'República Checa', 'Territorios Australianos del Océano Índico', 'Reino Unido', 'Territorio Británico del Océano Índico', 'Islas Marianas del Norte', 'Isla Serranilla', 'Groenlandia', 'Åland', 'Islas Vírgenes Británicas', 'Rusia', 'Dekelia', 'Niue']

Missing PAIS ['Actualización 21 -10-2019', 'Bahréin', 'Bangladesh', 'Belarús', 'Benin', 'Bonaire', 'Brunei Darussalam', 'Checa República', 'China República Popular', 'Congo', 'Congo República Democrática', 'Corea República', 'Corea República Popular Dem.', 'Djibouti', 'El presente cuadro presenta generalidades sobre la política de visas de otros países y no compromete la responsabilidad del Ministerio de Relaciones Exteriores. Se \nrecomienda dirigirse directamente a la Oficina Consular del país o territorio de su interés para obtener mayor información sobre turismo, visitas e inmigración.', 'Estados Unidos de América', 'Fiji', 'Guadalupe', 'Guinea-Bissau', 'Guyana Francesa', 'Iraq', 'Kazajstán', 'Laos República Democrática P', 'Lesotho', 'Macedonia', 'Malawi', 'Martinica', 'Mayotte', 'Moldova', 'Myanmar', 'OTROS TERRITORIOS', 'Palau', 'Papua Nueva Guinea', 'Qatar', 'Reino Unido Gran Bretaña e Irlanda del  Norte', 'Rusia Federación', 'Réunion', 'Saba', 'Saint Barthélémy', 'Saint Kitts y Nevis', 'Saint Martin', 'Saint Pïerre et Miquelon', 'Santa Sede', 'Sint Eustatius', 'Suazilandia', 'Taiwan', 'Zimbabwe']

Let's choose one example to see why/how they differ


In [51]:
countries.loc[countries.NAME_ES.str.find('Congo')!=-1, 'NAME_ES']


Out[51]:
31                 República del Congo
32     República Democrática del Congo
171                               None
Name: NAME_ES, dtype: object

In [52]:
visadf.loc[visadf.PAIS.str.find('Congo')!=-1, 'PAIS']


Out[52]:
18                          Congo
19    Congo República Democrática
Name: PAIS, dtype: object

OK, so not an easy fix. We can correct by hand the missing one or perhaps if we can find a way of linking for each missing country in one dataframe the most similar country in the other we may be able to simplify our work. If you google for help you will find e.g., that the package difflib can help.


In [53]:
# Import package to match text
import difflib

Let's create a dataframe to keep the matches we create between the country name in countries and visadf.


In [54]:
matches = pd.DataFrame(miss_countries, columns=['countries'])
matches = matches.loc[matches.countries.isna()==False].reset_index(drop=True).copy()
matches


Out[54]:
countries
0 Puerto Rico
1 Islas del Mar del Coral
2 Islas ultramarinas de Estados Unidos
3 Zimbabue
4 Papúa Nueva Guinea
... ...
78 Åland
79 Islas Vírgenes Británicas
80 Rusia
81 Dekelia
82 Niue

83 rows × 1 columns

Now, let's use the difflib.get_close_matches function to find the closest match to each country name in countries to visadf.


In [55]:
matches['visadf'] = matches.countries.apply(lambda x: difflib.get_close_matches(x, miss_visadf, cutoff=0.8))
matches.loc[matches.visadf.apply(lambda x: x!=[])]


Out[55]:
countries visadf
3 Zimbabue [Zimbabwe]
4 Papúa Nueva Guinea [Papua Nueva Guinea]
5 Lesoto [Lesotho]
26 San Martín [Saint Martin]
28 Malaui [Malawi]
35 Benín [Benin]
36 Catar [Qatar]
50 Bangladés [Bangladesh]
53 Moldavia [Moldova]
54 Guinea-Bisáu [Guinea-Bissau]
58 Taiwán [Taiwan]
59 Baréin [Bahréin]
68 Kazajistán [Kazajstán]

So it works! Of course now we need to improve matches and try to find as many as we can so we do not have to do it by hand. One way to do it is to keep the correct matches and decrease the cutoff required for a match.


In [56]:
matches.loc[matches.visadf.apply(lambda x: x!=[] and len(x)==1), 'k'] = 0.8
matches.loc[matches.visadf.apply(lambda x: x!=[] and len(x)==1), 'visadf_matched'] = matches.loc[matches.visadf.apply(lambda x: x!=[] and len(x)==1), 'visadf'].apply(lambda x: x[0])
matches


Out[56]:
countries visadf k visadf_matched
0 Puerto Rico [] NaN NaN
1 Islas del Mar del Coral [] NaN NaN
2 Islas ultramarinas de Estados Unidos [] NaN NaN
3 Zimbabue [Zimbabwe] 0.8 Zimbabwe
4 Papúa Nueva Guinea [Papua Nueva Guinea] 0.8 Papua Nueva Guinea
... ... ... ... ...
78 Åland [] NaN NaN
79 Islas Vírgenes Británicas [] NaN NaN
80 Rusia [] NaN NaN
81 Dekelia [] NaN NaN
82 Niue [] NaN NaN

83 rows × 4 columns


In [57]:
for k in np.arange(0.9,0.1,-0.025):
    if matches.visadf_matched.isna().sum()!=0:
        print(k)
        matches['visadf'] = matches.countries.apply(lambda x: difflib.get_close_matches(x, miss_visadf, cutoff=k))
        matches.loc[(matches.visadf.apply(lambda x: x!=[] and len(x)==1)) & (matches.visadf_matched.isna()), 'k'] = k
        matches.loc[(matches.visadf.apply(lambda x: x!=[] and len(x)==1)) & (matches.visadf_matched.isna()), 'visadf_matched'] = matches.loc[(matches.visadf.apply(lambda x: x!=[] and len(x)==1)) & (matches.visadf_matched.isna()), 'visadf'].apply(lambda x: x[0])
matches


0.9
0.875
0.85
0.825
0.7999999999999999
0.7749999999999999
0.7499999999999999
0.7249999999999999
0.6999999999999998
0.6749999999999998
0.6499999999999998
0.6249999999999998
0.5999999999999998
0.5749999999999997
0.5499999999999997
0.5249999999999997
0.49999999999999967
0.47499999999999964
0.4499999999999996
0.4249999999999996
0.3999999999999996
0.37499999999999956
0.34999999999999953
0.3249999999999995
0.2999999999999995
0.27499999999999947
0.24999999999999944
0.22499999999999942
0.1999999999999994
0.17499999999999938
0.14999999999999936
0.12499999999999933
Out[57]:
countries visadf k visadf_matched
0 Puerto Rico [Martinica, Corea República, Checa República] 0.400 Martinica
1 Islas del Mar del Coral [Laos República Democrática P, Estados Unidos ... 0.425 Laos República Democrática P
2 Islas ultramarinas de Estados Unidos [Estados Unidos de América, Sint Eustatius, Sa... 0.450 Estados Unidos de América
3 Zimbabue [Zimbabwe, Djibouti, Saba] 0.800 Zimbabwe
4 Papúa Nueva Guinea [Papua Nueva Guinea, Guinea-Bissau, Santa Sede] 0.800 Papua Nueva Guinea
... ... ... ... ...
78 Åland [Suazilandia, Santa Sede, Palau] 0.500 Suazilandia
79 Islas Vírgenes Británicas [Estados Unidos de América, Martinica, Corea R... 0.425 Estados Unidos de América
80 Rusia [Réunion, Rusia Federación, Suazilandia] 0.500 Réunion
81 Dekelia [Corea República, Checa República, Belarús] NaN NaN
82 Niue [Bonaire, Zimbabwe, Djibouti] 0.350 Bonaire

83 rows × 4 columns


In [58]:
matches.sort_values('k', ascending=False)


Out[58]:
countries visadf k visadf_matched
35 Benín [Benin, Réunion, Bonaire] 0.8 Benin
26 San Martín [Saint Martin, Saint Barthélémy, Martinica] 0.8 Saint Martin
59 Baréin [Bahréin, Benin, Martinica] 0.8 Bahréin
58 Taiwán [Taiwan, Bahréin, Martinica] 0.8 Taiwan
54 Guinea-Bisáu [Guinea-Bissau, Papua Nueva Guinea, Guadalupe] 0.8 Guinea-Bissau
... ... ... ... ...
65 Isla Norfolk [Lesotho, Belarús, Laos República Democrática P] NaN NaN
67 Sahara Occidental [Saint Barthélémy, Bahréin, Saint Martin] NaN NaN
71 República Checa [Congo República Democrática, Laos República D... NaN NaN
74 Territorio Británico del Océano Índico [Estados Unidos de América, Reino Unido Gran B... NaN NaN
81 Dekelia [Corea República, Checa República, Belarús] NaN NaN

83 rows × 4 columns

Let's create the opposite match


In [59]:
matches2 = pd.DataFrame(miss_visadf, columns=['visadf'])
matches2 = matches2.loc[matches2.visadf.isna()==False].reset_index(drop=True).copy()
matches2['countries'] = matches2.visadf.apply(lambda x: difflib.get_close_matches(x, miss_countries, cutoff=0.9))
matches2.loc[matches2.countries.apply(lambda x: x!=[] and len(x)==1), 'k'] = 0.8
matches2.loc[matches2.countries.apply(lambda x: x!=[] and len(x)==1), 'countries_matched'] = matches2.loc[matches2.countries.apply(lambda x: x!=[] and len(x)==1), 'countries'].apply(lambda x: x[0])
for k in np.arange(0.9,0.1,-0.025):
    if matches2.countries_matched.isna().sum()!=0:
        print(k)
        matches2['countries'] = matches2.visadf.apply(lambda x: difflib.get_close_matches(x, miss_countries, cutoff=k))
        matches2.loc[(matches2.countries.apply(lambda x: x!=[] and len(x)==1)) & (matches2.countries_matched.isna()), 'k'] = k
        matches2.loc[(matches2.countries.apply(lambda x: x!=[] and len(x)==1)) & (matches2.countries_matched.isna()), 'countries_matched'] = matches2.loc[(matches2.countries.apply(lambda x: x!=[] and len(x)==1)) & (matches2.countries_matched.isna()), 'countries'].apply(lambda x: x[0])
matches2


0.9
0.875
0.85
0.825
0.7999999999999999
0.7749999999999999
0.7499999999999999
0.7249999999999999
0.6999999999999998
0.6749999999999998
0.6499999999999998
0.6249999999999998
0.5999999999999998
0.5749999999999997
0.5499999999999997
0.5249999999999997
0.49999999999999967
0.47499999999999964
0.4499999999999996
0.4249999999999996
0.3999999999999996
0.37499999999999956
0.34999999999999953
0.3249999999999995
0.2999999999999995
0.27499999999999947
0.24999999999999944
0.22499999999999942
0.1999999999999994
0.17499999999999938
0.14999999999999936
0.12499999999999933
Out[59]:
visadf countries k countries_matched
0 Actualización 21 -10-2019 [Kazajistán, Glaciar de Siachen, Somalilandia] NaN NaN
1 Bahréin [Baréin, Brunéi, Baikonur] 0.800 Baréin
2 Bangladesh [Bangladés, Palaos, Anguila] 0.825 Bangladés
3 Belarús [Bielorrusia, Bermudas, Bangladés] 0.550 Bielorrusia
4 Benin [Benín, Brunéi, Baréin] 0.800 Benín
5 Bonaire [Baikonur, Baréin, Moldavia] 0.525 Baikonur
6 Brunei Darussalam [Brunéi, Bielorrusia, San Bartolomé] NaN NaN
7 Checa República [República Checa, República del Congo, Repúbli... 0.600 República Checa
8 China República Popular [República Popular China, República Checa, Rep... 0.725 República Popular China
9 Congo [Colombia, República del Congo, Islas Cook] 0.450 Colombia
10 Congo República Democrática [República Democrática del Congo, República Ch... 0.700 República Democrática del Congo
11 Corea República [República Checa, Corea del Sur, Corea del Norte] 0.600 República Checa
12 Corea República Popular Dem. [República Popular China, Corea del Norte, Cor... 0.700 República Popular China
13 Djibouti [Yibuti, Bielorrusia, Zimbabue] 0.700 Yibuti
14 El presente cuadro presenta generalidades sobr... [] NaN NaN
15 Estados Unidos de América [Estados Unidos, Islas ultramarinas de Estados... 0.700 Estados Unidos
16 Fiji [Fiyi, Yibuti, eSwatini] 0.750 Fiyi
17 Guadalupe [Guam, Malaui, Zimbabue] 0.450 Guam
18 Guinea-Bissau [Guinea-Bisáu, Groenlandia, Papúa Nueva Guinea] 0.875 Guinea-Bisáu
19 Guyana Francesa [Sahara Occidental, Tierras Australes y Antárt... 0.425 Sahara Occidental
20 Iraq [Irak, Islas Spratly, Birmania] 0.750 Irak
21 Kazajstán [Kazajistán, Taiwán, Palaos] 0.800 Kazajistán
22 Laos República Democrática P [República Democrática del Congo, República Ch... 0.725 República Democrática del Congo
23 Lesotho [Lesoto, Laos, Islas Cook] 0.800 Lesoto
24 Macedonia [República de Macedonia, Moldavia, Birmania] 0.575 República de Macedonia
25 Malawi [Malaui, Moldavia, Palaos] 0.825 Malaui
26 Martinica [eSwatini, Baréin, San Martín] 0.575 eSwatini
27 Mayotte [Montserrat, Laos, San Martín] 0.450 Montserrat
28 Moldova [Moldavia, Colombia, Montserrat] 0.650 Moldavia
29 Myanmar [Catar, San Martín, Malaui] 0.500 Catar
30 OTROS TERRITORIOS [Sahara Occidental, Samoa Estadounidense, San ... 0.175 Sahara Occidental
31 Palau [Palaos, Malaui, Gibraltar] NaN NaN
32 Papua Nueva Guinea [Papúa Nueva Guinea, República Popular China, ... 0.800 Papúa Nueva Guinea
33 Qatar [Catar, Gibraltar, Islas Pitcairn] 0.800 Catar
34 Reino Unido Gran Bretaña e Irlanda del Norte [Corea del Norte, República Turca del Norte de... 0.450 Corea del Norte
35 Rusia Federación [Isla de Man, Rusia, República de Macedonia] 0.500 Isla de Man
36 Réunion [Rusia, Brunéi, Baréin] 0.500 Rusia
37 Saba [Catar, San Martín, Palaos] NaN NaN
38 Saint Barthélémy [San Bartolomé, San Martín, San Cristóbal y Ni... 0.675 San Bartolomé
39 Saint Kitts y Nevis [San Cristóbal y Nieves, San Pedro y Miquelón,... 0.625 San Cristóbal y Nieves
40 Saint Martin [San Martín, San Bartolomé, eSwatini] 0.800 San Martín
41 Saint Pïerre et Miquelon [San Pedro y Miquelón, San Bartolomé, Bajo de ... 0.725 San Pedro y Miquelón
42 Santa Sede [Línea Verde, Isla Santa Elena, San Pedro y Mi... 0.550 Línea Verde
43 Sint Eustatius [Samoa Estadounidense, San Cristóbal y Nieves,... 0.450 Samoa Estadounidense
44 Suazilandia [Somalilandia, Groenlandia, eSwatini] 0.775 Somalilandia
45 Taiwan [Taiwán, Baréin, Somalilandia] 0.825 Taiwán
46 Zimbabwe [Zimbabue, Colombia, Birmania] 0.875 Zimbabue

Clearly, this still will need some work...some were linked correctly, others not (although the correct ones seem o be in the list countries) and still there are some that do not seem to be there at all! This is partly due to the fact that the Natural Earth shapefile does not seem to have some countries (e.g., Bonaire, Sint Eustatius, Saba, Reunion). Given the missing locations in countries it may be easier to use matches2 to finish the matching.


In [60]:
namecols = ['SOVEREIGNT', 'NAME_ES'] + [col for col in countries.columns if col.find('NAME')!=-1] 
countries.loc[countries.apply(lambda row: row.astype(str).str.lower().str.contains('bonaire').any(), axis=1), namecols]


Out[60]:
SOVEREIGNT NAME_ES NAME NAME_LONG BRK_NAME NAME_CIAWF NAME_SORT NAME_ALT NAME_LEN NAME_AR ... NAME_JA NAME_KO NAME_NL NAME_PL NAME_PT NAME_RU NAME_SV NAME_TR NAME_VI NAME_ZH

0 rows × 30 columns


In [61]:
countries.loc[countries.apply(lambda row: row.astype(str).str.lower().str.contains('eust').any(), axis=1), namecols]


Out[61]:
SOVEREIGNT NAME_ES NAME NAME_LONG BRK_NAME NAME_CIAWF NAME_SORT NAME_ALT NAME_LEN NAME_AR ... NAME_JA NAME_KO NAME_NL NAME_PL NAME_PT NAME_RU NAME_SV NAME_TR NAME_VI NAME_ZH

0 rows × 30 columns


In [62]:
countries.loc[countries.apply(lambda row: row.astype(str).str.lower().str.contains('sab').any(), axis=1), namecols]


Out[62]:
SOVEREIGNT NAME_ES NAME NAME_LONG BRK_NAME NAME_CIAWF NAME_SORT NAME_ALT NAME_LEN NAME_AR ... NAME_JA NAME_KO NAME_NL NAME_PL NAME_PT NAME_RU NAME_SV NAME_TR NAME_VI NAME_ZH

0 rows × 30 columns


In [63]:
countries.loc[countries.apply(lambda row: row.astype(str).str.lower().str.contains('reun').any(), axis=1), namecols]


Out[63]:
SOVEREIGNT NAME_ES NAME NAME_LONG BRK_NAME NAME_CIAWF NAME_SORT NAME_ALT NAME_LEN NAME_AR ... NAME_JA NAME_KO NAME_NL NAME_PL NAME_PT NAME_RU NAME_SV NAME_TR NAME_VI NAME_ZH

0 rows × 30 columns

Or have different writing/names (e.g., Myanmar, Swaziland) or because the Spanish name used by the Colombian Cancillery is non-standard (e.g., Santa Sede vs Vaticano)


In [64]:
countries.loc[countries.apply(lambda row: row.astype(str).str.lower().str.contains('myan').any(), axis=1), namecols]


Out[64]:
SOVEREIGNT NAME_ES NAME NAME_LONG BRK_NAME NAME_CIAWF NAME_SORT NAME_ALT NAME_LEN NAME_AR ... NAME_JA NAME_KO NAME_NL NAME_PL NAME_PT NAME_RU NAME_SV NAME_TR NAME_VI NAME_ZH
126 Myanmar Birmania Myanmar Myanmar Myanmar Burma Myanmar None 7 ميانمار ... ミャンマー 미얀마 Myanmar Mjanma Mianmar Мьянма Burma Myanmar Myanma 缅甸

1 rows × 30 columns


In [65]:
countries.loc[countries.apply(lambda row: row.astype(str).str.lower().str.contains('swazi').any(), axis=1), namecols]


Out[65]:
SOVEREIGNT NAME_ES NAME NAME_LONG BRK_NAME NAME_CIAWF NAME_SORT NAME_ALT NAME_LEN NAME_AR ... NAME_JA NAME_KO NAME_NL NAME_PL NAME_PT NAME_RU NAME_SV NAME_TR NAME_VI NAME_ZH
123 eSwatini eSwatini eSwatini eSwatini eSwatini eSwatini eSwatini None 8 سوازيلاند ... スワジランド 스와질란드 Swaziland Suazi eSwatini Свазиленд Swaziland Svaziland Swaziland 斯威士兰

1 rows × 30 columns


In [66]:
countries.loc[countries.apply(lambda row: row.astype(str).str.lower().str.contains('vatic').any(), axis=1), namecols]


Out[66]:
SOVEREIGNT NAME_ES NAME NAME_LONG BRK_NAME NAME_CIAWF NAME_SORT NAME_ALT NAME_LEN NAME_AR ... NAME_JA NAME_KO NAME_NL NAME_PL NAME_PT NAME_RU NAME_SV NAME_TR NAME_VI NAME_ZH
166 Vatican Ciudad del Vaticano Vatican Vatican Vatican Holy See (Vatican City) Vatican (Holy See) Holy See 7 الفاتيكان ... バチカン 바티칸 시국 Vaticaanstad Watykan Vaticano Ватикан Vatikanstaten Vatikan Thành Vatican 梵蒂冈

1 rows × 30 columns

Let's try to see how goodf the best macthes are


In [67]:
matches2.sort_values('k', ascending=False)


Out[67]:
visadf countries k countries_matched
46 Zimbabwe [Zimbabue, Colombia, Birmania] 0.875 Zimbabue
18 Guinea-Bissau [Guinea-Bisáu, Groenlandia, Papúa Nueva Guinea] 0.875 Guinea-Bisáu
45 Taiwan [Taiwán, Baréin, Somalilandia] 0.825 Taiwán
25 Malawi [Malaui, Moldavia, Palaos] 0.825 Malaui
2 Bangladesh [Bangladés, Palaos, Anguila] 0.825 Bangladés
32 Papua Nueva Guinea [Papúa Nueva Guinea, República Popular China, ... 0.800 Papúa Nueva Guinea
23 Lesotho [Lesoto, Laos, Islas Cook] 0.800 Lesoto
21 Kazajstán [Kazajistán, Taiwán, Palaos] 0.800 Kazajistán
1 Bahréin [Baréin, Brunéi, Baikonur] 0.800 Baréin
40 Saint Martin [San Martín, San Bartolomé, eSwatini] 0.800 San Martín
33 Qatar [Catar, Gibraltar, Islas Pitcairn] 0.800 Catar
4 Benin [Benín, Brunéi, Baréin] 0.800 Benín
44 Suazilandia [Somalilandia, Groenlandia, eSwatini] 0.775 Somalilandia
16 Fiji [Fiyi, Yibuti, eSwatini] 0.750 Fiyi
20 Iraq [Irak, Islas Spratly, Birmania] 0.750 Irak
41 Saint Pïerre et Miquelon [San Pedro y Miquelón, San Bartolomé, Bajo de ... 0.725 San Pedro y Miquelón
22 Laos República Democrática P [República Democrática del Congo, República Ch... 0.725 República Democrática del Congo
8 China República Popular [República Popular China, República Checa, Rep... 0.725 República Popular China
12 Corea República Popular Dem. [República Popular China, Corea del Norte, Cor... 0.700 República Popular China
13 Djibouti [Yibuti, Bielorrusia, Zimbabue] 0.700 Yibuti
15 Estados Unidos de América [Estados Unidos, Islas ultramarinas de Estados... 0.700 Estados Unidos
10 Congo República Democrática [República Democrática del Congo, República Ch... 0.700 República Democrática del Congo
38 Saint Barthélémy [San Bartolomé, San Martín, San Cristóbal y Ni... 0.675 San Bartolomé
28 Moldova [Moldavia, Colombia, Montserrat] 0.650 Moldavia
39 Saint Kitts y Nevis [San Cristóbal y Nieves, San Pedro y Miquelón,... 0.625 San Cristóbal y Nieves
11 Corea República [República Checa, Corea del Sur, Corea del Norte] 0.600 República Checa
7 Checa República [República Checa, República del Congo, Repúbli... 0.600 República Checa
26 Martinica [eSwatini, Baréin, San Martín] 0.575 eSwatini
24 Macedonia [República de Macedonia, Moldavia, Birmania] 0.575 República de Macedonia
42 Santa Sede [Línea Verde, Isla Santa Elena, San Pedro y Mi... 0.550 Línea Verde
3 Belarús [Bielorrusia, Bermudas, Bangladés] 0.550 Bielorrusia
5 Bonaire [Baikonur, Baréin, Moldavia] 0.525 Baikonur
29 Myanmar [Catar, San Martín, Malaui] 0.500 Catar
35 Rusia Federación [Isla de Man, Rusia, República de Macedonia] 0.500 Isla de Man
36 Réunion [Rusia, Brunéi, Baréin] 0.500 Rusia
27 Mayotte [Montserrat, Laos, San Martín] 0.450 Montserrat
34 Reino Unido Gran Bretaña e Irlanda del Norte [Corea del Norte, República Turca del Norte de... 0.450 Corea del Norte
9 Congo [Colombia, República del Congo, Islas Cook] 0.450 Colombia
43 Sint Eustatius [Samoa Estadounidense, San Cristóbal y Nieves,... 0.450 Samoa Estadounidense
17 Guadalupe [Guam, Malaui, Zimbabue] 0.450 Guam
19 Guyana Francesa [Sahara Occidental, Tierras Australes y Antárt... 0.425 Sahara Occidental
30 OTROS TERRITORIOS [Sahara Occidental, Samoa Estadounidense, San ... 0.175 Sahara Occidental
0 Actualización 21 -10-2019 [Kazajistán, Glaciar de Siachen, Somalilandia] NaN NaN
6 Brunei Darussalam [Brunéi, Bielorrusia, San Bartolomé] NaN NaN
14 El presente cuadro presenta generalidades sobr... [] NaN NaN
31 Palau [Palaos, Malaui, Gibraltar] NaN NaN
37 Saba [Catar, San Martín, Palaos] NaN NaN

Seems we won't be able to improve, so let's finish by hand (using code of course, since we want replicability of our results).


In [68]:
# Correct matches2
matches2.loc[matches2.visadf=='Suazilandia', 'countries_matched'] = 'eSwatini'
matches2.loc[matches2.visadf=='Laos República Democrática P', 'countries_matched'] = 'Laos'
matches2.loc[matches2.visadf=='Corea República Popular Dem.', 'countries_matched'] = 'Corea del Norte'
matches2.loc[matches2.visadf=='Corea República', 'countries_matched'] = 'Corea del Sur'
matches2.loc[matches2.visadf=='Martinica', 'countries_matched'] = ''
matches2.loc[matches2.visadf=='Santa Sede', 'countries_matched'] = 'Ciudad del Vaticano'
matches2.loc[matches2.visadf=='Bonaire', 'countries_matched'] = ''
matches2.loc[matches2.visadf=='Myanmar', 'countries_matched'] = 'Birmania'
matches2.loc[matches2.visadf=='Rusia Federación', 'countries_matched'] = 'Rusia'
matches2.loc[matches2.visadf=='Réunion', 'countries_matched'] = ''
matches2.loc[matches2.visadf=='Mayotte', 'countries_matched'] = ''
matches2.loc[matches2.visadf=='Reino Unido Gran Bretaña e Irlanda del  Norte', 'countries_matched'] = 'Reino Unido'
matches2.loc[matches2.visadf=='Congo', 'countries_matched'] = 'República del Congo'
matches2.loc[matches2.visadf=='Sint Eustatius', 'countries_matched'] = ''
matches2.loc[matches2.visadf=='Guadalupe', 'countries_matched'] = ''
matches2.loc[matches2.visadf=='Guyana Francesa', 'countries_matched'] = ''
matches2.loc[matches2.visadf=='Brunei Darussalam', 'countries_matched'] = 'Brunéi'
matches2.loc[matches2.visadf=='Palau', 'countries_matched'] = 'Palaos'
matches2.loc[matches2.visadf=='Saba', 'countries_matched'] = ''
#matches2.loc[matches2.visadf=='', 'countries_matched'] = ''
#matches2.loc[matches2.visadf=='', 'countries_matched'] = ''
matches2.sort_values('k', ascending=False)


Out[68]:
visadf countries k countries_matched
46 Zimbabwe [Zimbabue, Colombia, Birmania] 0.875 Zimbabue
18 Guinea-Bissau [Guinea-Bisáu, Groenlandia, Papúa Nueva Guinea] 0.875 Guinea-Bisáu
45 Taiwan [Taiwán, Baréin, Somalilandia] 0.825 Taiwán
25 Malawi [Malaui, Moldavia, Palaos] 0.825 Malaui
2 Bangladesh [Bangladés, Palaos, Anguila] 0.825 Bangladés
32 Papua Nueva Guinea [Papúa Nueva Guinea, República Popular China, ... 0.800 Papúa Nueva Guinea
23 Lesotho [Lesoto, Laos, Islas Cook] 0.800 Lesoto
21 Kazajstán [Kazajistán, Taiwán, Palaos] 0.800 Kazajistán
1 Bahréin [Baréin, Brunéi, Baikonur] 0.800 Baréin
40 Saint Martin [San Martín, San Bartolomé, eSwatini] 0.800 San Martín
33 Qatar [Catar, Gibraltar, Islas Pitcairn] 0.800 Catar
4 Benin [Benín, Brunéi, Baréin] 0.800 Benín
44 Suazilandia [Somalilandia, Groenlandia, eSwatini] 0.775 eSwatini
16 Fiji [Fiyi, Yibuti, eSwatini] 0.750 Fiyi
20 Iraq [Irak, Islas Spratly, Birmania] 0.750 Irak
41 Saint Pïerre et Miquelon [San Pedro y Miquelón, San Bartolomé, Bajo de ... 0.725 San Pedro y Miquelón
22 Laos República Democrática P [República Democrática del Congo, República Ch... 0.725 Laos
8 China República Popular [República Popular China, República Checa, Rep... 0.725 República Popular China
12 Corea República Popular Dem. [República Popular China, Corea del Norte, Cor... 0.700 Corea del Norte
13 Djibouti [Yibuti, Bielorrusia, Zimbabue] 0.700 Yibuti
15 Estados Unidos de América [Estados Unidos, Islas ultramarinas de Estados... 0.700 Estados Unidos
10 Congo República Democrática [República Democrática del Congo, República Ch... 0.700 República Democrática del Congo
38 Saint Barthélémy [San Bartolomé, San Martín, San Cristóbal y Ni... 0.675 San Bartolomé
28 Moldova [Moldavia, Colombia, Montserrat] 0.650 Moldavia
39 Saint Kitts y Nevis [San Cristóbal y Nieves, San Pedro y Miquelón,... 0.625 San Cristóbal y Nieves
11 Corea República [República Checa, Corea del Sur, Corea del Norte] 0.600 Corea del Sur
7 Checa República [República Checa, República del Congo, Repúbli... 0.600 República Checa
26 Martinica [eSwatini, Baréin, San Martín] 0.575
24 Macedonia [República de Macedonia, Moldavia, Birmania] 0.575 República de Macedonia
42 Santa Sede [Línea Verde, Isla Santa Elena, San Pedro y Mi... 0.550 Ciudad del Vaticano
3 Belarús [Bielorrusia, Bermudas, Bangladés] 0.550 Bielorrusia
5 Bonaire [Baikonur, Baréin, Moldavia] 0.525
29 Myanmar [Catar, San Martín, Malaui] 0.500 Birmania
35 Rusia Federación [Isla de Man, Rusia, República de Macedonia] 0.500 Rusia
36 Réunion [Rusia, Brunéi, Baréin] 0.500
27 Mayotte [Montserrat, Laos, San Martín] 0.450
34 Reino Unido Gran Bretaña e Irlanda del Norte [Corea del Norte, República Turca del Norte de... 0.450 Reino Unido
9 Congo [Colombia, República del Congo, Islas Cook] 0.450 República del Congo
43 Sint Eustatius [Samoa Estadounidense, San Cristóbal y Nieves,... 0.450
17 Guadalupe [Guam, Malaui, Zimbabue] 0.450
19 Guyana Francesa [Sahara Occidental, Tierras Australes y Antárt... 0.425
30 OTROS TERRITORIOS [Sahara Occidental, Samoa Estadounidense, San ... 0.175 Sahara Occidental
0 Actualización 21 -10-2019 [Kazajistán, Glaciar de Siachen, Somalilandia] NaN NaN
6 Brunei Darussalam [Brunéi, Bielorrusia, San Bartolomé] NaN Brunéi
14 El presente cuadro presenta generalidades sobr... [] NaN NaN
31 Palau [Palaos, Malaui, Gibraltar] NaN Palaos
37 Saba [Catar, San Martín, Palaos] NaN

In [69]:
#countries.loc[countries.apply(lambda row: row.astype(str).str.lower().str.contains('aba').any(), axis=1), namecols]

This is as good as we can do it with this dataset and shapefile (of course we may need a different shapefile if we really need to ensure that we are plotting all the correct information. E.g., does French Guyana have the same visa requirements than France and the other French Territories represented in Natural Earth's shapefile as France? If so, then we are ok! Otherwise we would need another shapefile or transform this one).


In [70]:
visadf['countries_matched'] = visadf.PAIS
visadf.loc[visadf.PAIS.apply(lambda x: x in miss_visadf), 'countries_matched'] = visadf.loc[visadf.PAIS.apply(lambda x: x in miss_visadf)].PAIS.map(matches2[['visadf', 'countries_matched']].set_index('visadf').to_dict()['countries_matched'])
visadf


Out[70]:
PAIS SI NO visa_req visa_req_YN PAIS_OR countries_matched
6 Afganistán X 1.0 YES Afganistán Afganistán
7 Albania X 0.0 NO Albania Albania
8 Alemania X 0.0 NO Alemania Alemania
9 Andorra X 0.0 NO Andorra Andorra
10 Angola X 1.0 YES Angola Angola
... ... ... ... ... ... ... ...
20 Taiwan X Visa electrónica 1.0 YES Taiwan Taiwán
21 Wallis y Futuna X 0.0 NO Wallis y Futuna (Francia) Wallis y Futuna
22 0.0 NO
23 Actualización 21 -10-2019 0.0 NO Actualización 21 -10-2019 NaN
24 El presente cuadro presenta generalidades sobr... 0.0 NO El presente cuadro presenta generalidades sobr... NaN

219 rows × 7 columns


In [71]:
col_visa = countries.merge(visadf, left_on='NAME_ES', right_on='countries_matched')
cmap = mpl.colors.ListedColormap(['blue', 'red'])
mylegend = center_wrap(["Visa Requirements", "For Colombian Citizens"], cwidth=32, width=32)
MyChloropleth(mydf=col_visa, myfile='col_visa', myvar='visa_req', mylegend=mylegend, k=1, bbox_to_anchor=(0.25, 0.3),
                  edgecolor='white', facecolor='lightgray', cmap=cmap, scheme='UserDefined', bins=[0,1], legend_labels=['NO', 'YES'],
                  save=False)