notebook.community

Edit and run



In [2]:

    
import os
import re
import zipfile

import requests
import pandas as pd
import matplotlib.pyplot as plt

Primero creamos una función que nos facilite descargar los datos del INEGI, para poder reutilizarla más adelante:



In [3]:

    
def get_data(url, target_file, chunk_size=1024, timeout=60):
    """Download ``url`` into ``target_file`` and unpack it if it is a zip archive.

    Parameters
    ----------
    url : str
        Address of the resource to download.
    target_file : str
        Path the downloaded bytes are written to.
    chunk_size : int, optional
        Number of bytes requested per iteration while streaming the body.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so that a stalled server cannot hang
        the notebook forever.

    Returns
    -------
    list of str
        Names of the archive members that were extracted into the current
        working directory; an empty list when the download is not a zip file.

    Raises
    ------
    requests.HTTPError
        If the server answers with any status other than 200. Nothing is
        written to ``target_file`` in that case.
    """
    response = requests.get(url, stream=True, timeout=timeout)
    try:
        # Fail loudly: silently skipping the download only postpones the
        # error to whichever cell tries to read the missing file.
        if response.status_code != 200:
            raise requests.HTTPError(
                "Unexpected HTTP status %s while downloading %s"
                % (response.status_code, url)
            )
        with open(target_file, 'wb') as fd:
            for chunk in response.iter_content(chunk_size):
                fd.write(chunk)
    finally:
        # A streamed response keeps its connection open until closed.
        response.close()

    if not zipfile.is_zipfile(target_file):
        return []

    # Extract all content from the zip file into the current directory
    with zipfile.ZipFile(target_file) as zf:
        zf.extractall()
        return zf.namelist()

Para empezar, vamos a obtener los datos asociados a la *entidad federativa* de Nuevo León.



In [6]:

    
# Build the download URL for the Nuevo León (entity 19) TSV bundle.
base_path = "http://www3.inegi.org.mx/sistemas/descarga/descargaArchivo.aspx?file=Por+entidad+federativa"
nuevo_leon = "%2f19+Nuevo+Le%f3n%2f19_NuevoLeon_tsv.zip"
url_target = "".join([base_path, nuevo_leon])
fname = "nuevo_leon.zip"

# Work inside ./data. The guards keep this cell re-runnable: the directory is
# only created when missing, and we do not descend again once already inside.
# `0o755` is the octal spelling accepted by both Python 2.6+ and Python 3
# (the bare `0755` form is a SyntaxError on Python 3).
data_dir = "data"
if os.path.basename(os.getcwd()) != data_dir:
    if not os.path.isdir(data_dir):
        os.mkdir(data_dir, 0o755)
    os.chdir(data_dir)

get_data(url_target, fname)



In [7]:

    
# Load the tab-separated indicator table and report its (rows, columns) size
input_data = pd.read_csv("19_nuevoleon_valor.tsv", sep="\t")
input_data.shape









    Out[7]:





(33941, 74)



In [8]:

    
# Number of rows available for each indicator id
input_data["Id_Indicador"].value_counts()









    Out[8]:





1006000065    55
1006000060    55
1006000057    55
1006000063    55
1006000055    55
1006000059    55
1006000058    55
1006000061    55
1006000062    55
1006000064    55
1005000055    54
1005000054    54
1006000047    54
1005000049    54
1006000046    54
...
6200002103    1
6200002104    1
6200002105    1
6200002106    1
6200002108    1
6200002117    1
6200002109    1
6200002110    1
6200002111    1
6200002112    1
6200002113    1
6200002114    1
6200002115    1
6200002116    1
6300000259    1
Length: 890, dtype: int64



In [7]:

    
# 3106002001 -> life expectancy at birth
is_life_expectancy = input_data["Id_Indicador"] == 3106002001
indicator = input_data[is_life_expectancy]
indicator









    Out[7]:






  
    
      
      Cve_Entidad
      Desc_Entidad
      Cve_Municipio
      Desc_Municipio
      Tema_nivel_1
      Tema_nivel_2
      Tema_nivel_3
      Id_Indicador
      Indicador
      1895
      1900
      1910
      1921
      1930
      1940
      1950
      1952
      1960
      1970
      1971
      
    
  
  
    
      418
       19
       Nuevo León
       0
       Estatal
       Población, Hogares y Vivienda
       Mortalidad
       Mortalidad
       3106002001
       Esperanza de vida al nacimiento
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
       56.3
      NaN
       65.3
       67.2
      NaN
      ...
    
  

1 rows × 74 columns

Algunas de las columnas mostradas previamente contienen únicamente datos nulos (no representan un número). Por lo tanto, vamos a eliminar dichas columnas, ya que no son de interés para nuestro análisis.



In [8]:

    
# Keep only the columns that hold at least one non-null value
has_data = indicator.notnull().any(axis=0)
indicator = indicator.loc[:, has_data]
indicator









    Out[8]:






  
    
      
      Cve_Entidad
      Desc_Entidad
      Cve_Municipio
      Desc_Municipio
      Tema_nivel_1
      Tema_nivel_2
      Tema_nivel_3
      Id_Indicador
      Indicador
      1950
      1960
      1970
      1980
      1990
      1991
      1992
      1993
      1994
      1995
      1996
      
    
  
  
    
      418
       19
       Nuevo León
       0
       Estatal
       Población, Hogares y Vivienda
       Mortalidad
       Mortalidad
       3106002001
       Esperanza de vida al nacimiento
       56.3
       65.3
       67.2
       71.5
       72.7
       73
       73.2
       73.4
       73.6
       73.9
       74.1
      ...
    
  

1 rows × 34 columns



In [9]:

    
# Keep the column labels that start with four digits, optionally followed by
# "/<digit>", and nothing else
pattern = re.compile(r"\d{4}(\/\d)?$")
years = list(filter(pattern.match, indicator.columns))
years









    Out[9]:





['1950',
 '1960',
 '1970',
 '1980',
 '1990',
 '1991',
 '1992',
 '1993',
 '1994',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010']



In [10]:

    
# Restrict the frame to the year columns, then flip it so each year becomes a row
year_columns = indicator.loc[:, years]
indicator = year_columns.transpose()
indicator









    Out[10]:






  
    
      
      418
    
  
  
    
      1950
       56.3
    
    
      1960
       65.3
    
    
      1970
       67.2
    
    
      1980
       71.5
    
    
      1990
       72.7
    
    
      1991
       73.0
    
    
      1992
       73.2
    
    
      1993
       73.4
    
    
      1994
       73.6
    
    
      1995
       73.9
    
    
      1996
       74.1
    
    
      1997
       74.3
    
    
      1998
       74.6
    
    
      1999
       74.7
    
    
      2000
       74.4
    
    
      2001
       74.6
    
    
      2002
       74.6
    
    
      2003
       74.7
    
    
      2004
       74.8
    
    
      2005
       74.9
    
    
      2006
       75.2
    
    
      2007
       75.3
    
    
      2008
       75.4
    
    
      2009
       75.5
    
    
      2010
       75.6
    
  

25 rows × 1 columns



In [13]:

    
# The index holds year labels as strings ("1950", possibly "1950/1").
# Convert them to integers so the x axis is numeric: the years are unevenly
# spaced (decades first, then yearly) and string labels would be drawn as
# equidistant categories.
year_values = [int(str(label)[:4]) for label in indicator.index]

fig, axes = plt.subplots()
axes.plot(year_values, indicator.values, label="Nuevo Leon", color="green", alpha=0.5, lw=2)
axes.grid(True)
axes.set_xlabel("Tiempo")
# The plotted indicator is life expectancy at birth, measured in years;
# it is neither a percentage nor a probability.
axes.set_ylabel(u"Esperanza de vida (años)")
axes.set_title("Esperanza de vida al nacer")
# Show the series label that is passed to plot()
axes.legend(loc="best")
# Render the figure without leaking the repr of the last call as cell output
plt.show()









    Out[13]:





<matplotlib.text.Text at 0x11609b090>



In [ ]:

	418
1950	56.3
1960	65.3
1970	67.2
1980	71.5
1990	72.7
1991	73.0
1992	73.2
1993	73.4
1994	73.6
1995	73.9
1996	74.1
1997	74.3
1998	74.6
1999	74.7
2000	74.4
2001	74.6
2002	74.6
2003	74.7
2004	74.8
2005	74.9
2006	75.2
2007	75.3
2008	75.4
2009	75.5
2010	75.6