Database migration rules


In [1]:
from fludashboard import settings

import fludashboard as flud
import glob
import os
import pandas as pd
import sqlalchemy as sqla

In [2]:
# pandas configuration
pd.set_option('display.max_columns', 99)

In [3]:
def get_filename_from_path(file_path: str):
    """
    Return the file name, without its extension, from a file path.
    """
    return file_path.split(os.path.sep)[-1].split('.')[0]

In [4]:
path_data_files = os.path.join(flud.__path__[0], 'data', '*.csv')

In [5]:
dfs = {}
pks = {}

In [6]:
print('Data files:')
for file_path in glob.glob(path_data_files):
    filename = get_filename_from_path(file_path)
    
    print(filename)
    
    dfs[filename] = pd.read_csv(file_path)


Data files:
mem-report
current_estimated_values
clean_data_epiweek-weekly-incidence_w_situation
/home/xmn/miniconda3/envs/fludashboard/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2728: DtypeWarning: Columns (23) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
historical_estimated_values
mem-typical
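
The DtypeWarning above is emitted while reading one of the CSVs: pandas inferred mixed types in column 23. As the message itself suggests, it can be avoided by passing low_memory=False (or an explicit dtype) to read_csv, e.g.:

# hedged sketch: silence the mixed-type inference warning on import
dfs[filename] = pd.read_csv(file_path, low_memory=False)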

1. Setting IDs

1.1 Datasets


In [7]:
dataset_id = {
    'srag': 1,
    'sragflu': 2,
    'obitoflu': 3,
}

1.2 Scale


In [8]:
scale_id = {
    'incidência': 1,
    'casos': 2
}

1.3 Situation


In [9]:
situation_id = {
    'unknown': 1, 
    'estimated': 2, 
    'stable': 3,
    'incomplete': 4
}

1.4 Territory Type


In [10]:
territory_type_id = {
    'Estado': 1, 
    'Regional': 2,
    'Região': 3,
    'País': 4
}

1.5 Region ID conversion


In [11]:
region_id = {
    'BR': 0,
    'RegN': 1001,
    'RegL': 1002,
    'RegC': 1003,
    'RegS': 1004
}

1.6 Territory Table


In [12]:
df_territory = pd.DataFrame([
    {'id': 11, 'initials': 'RO', 'name': 'Rondônia', 'territory_type_id': 1},
    {'id': 12, 'initials': 'AC', 'name': 'Acre', 'territory_type_id': 1},
    {'id': 13, 'initials': 'AM', 'name': 'Amazonas', 'territory_type_id': 1},
    {'id': 14, 'initials': 'RR', 'name': 'Roraima', 'territory_type_id': 1},
    {'id': 15, 'initials': 'PA', 'name': 'Pará', 'territory_type_id': 1},
    {'id': 16, 'initials': 'AP', 'name': 'Amapá', 'territory_type_id': 1},
    {'id': 17, 'initials': 'TO', 'name': 'Tocantins', 'territory_type_id': 1},
    {'id': 21, 'initials': 'MA', 'name': 'Maranhão', 'territory_type_id': 1},
    {'id': 22, 'initials': 'PI', 'name': 'Piauí', 'territory_type_id': 1},
    {'id': 23, 'initials': 'CE', 'name': 'Ceará', 'territory_type_id': 1},
    {'id': 24, 'initials': 'RN', 'name': 'Rio Grande do Norte', 'territory_type_id': 1},
    {'id': 25, 'initials': 'PB', 'name': 'Paraíba', 'territory_type_id': 1},
    {'id': 26, 'initials': 'PE', 'name': 'Pernambuco', 'territory_type_id': 1},
    {'id': 27, 'initials': 'AL', 'name': 'Alagoas', 'territory_type_id': 1},
    {'id': 28, 'initials': 'SE', 'name': 'Sergipe', 'territory_type_id': 1},
    {'id': 29, 'initials': 'BA', 'name': 'Bahia', 'territory_type_id': 1},
    {'id': 31, 'initials': 'MG', 'name': 'Minas Gerais', 'territory_type_id': 1},
    {'id': 32, 'initials': 'ES', 'name': 'Espírito Santo', 'territory_type_id': 1},
    {'id': 33, 'initials': 'RJ', 'name': 'Rio de Janeiro', 'territory_type_id': 1},
    {'id': 35, 'initials': 'SP', 'name': 'São Paulo', 'territory_type_id': 1},
    {'id': 41, 'initials': 'PR', 'name': 'Paraná', 'territory_type_id': 1},
    {'id': 42, 'initials': 'SC', 'name': 'Santa Catarina', 'territory_type_id': 1},
    {'id': 43, 'initials': 'RS', 'name': 'Rio Grande do Sul', 'territory_type_id': 1},
    {'id': 50, 'initials': 'MS', 'name': 'Mato Grosso do Sul', 'territory_type_id': 1},
    {'id': 51, 'initials': 'MT', 'name': 'Mato Grosso', 'territory_type_id': 1},
    {'id': 52, 'initials': 'GO', 'name': 'Goiás', 'territory_type_id': 1},
    {'id': 53, 'initials': 'DF', 'name': 'Distrito Federal', 'territory_type_id': 1},
    {'id': 0, 'initials': 'BR', 'name': 'Brasil', 'territory_type_id': 4},
    {'id': 1003, 'initials': 'RegC', 'name': 'Regional Centro', 'territory_type_id': 2},
    {'id': 1002, 'initials': 'RegL', 'name': 'Regional Leste', 'territory_type_id': 2},
    {'id': 1001, 'initials': 'RegN', 'name': 'Regional Norte', 'territory_type_id': 2},
    {'id': 1004, 'initials': 'RegS', 'name': 'Regional Sul', 'territory_type_id': 2},
    {'id': 1, 'initials': 'N', 'name': 'Norte', 'territory_type_id': 3},
    {'id': 2, 'initials': 'NE', 'name': 'Nordeste', 'territory_type_id': 3},
    {'id': 3, 'initials': 'SE', 'name': 'Sudeste', 'territory_type_id': 3},
    {'id': 5, 'initials': 'CO', 'name': 'Centro-oeste', 'territory_type_id': 3},
    {'id': 4, 'initials': 'S', 'name': 'Sul', 'territory_type_id': 3},
])

df_territory.set_index('id', inplace=True)

df_territory


Out[12]:
initials name territory_type_id
id
11 RO Rondônia 1
12 AC Acre 1
13 AM Amazonas 1
14 RR Roraima 1
15 PA Pará 1
16 AP Amapá 1
17 TO Tocantins 1
21 MA Maranhão 1
22 PI Piauí 1
23 CE Ceará 1
24 RN Rio Grande do Norte 1
25 PB Paraíba 1
26 PE Pernambuco 1
27 AL Alagoas 1
28 SE Sergipe 1
29 BA Bahia 1
31 MG Minas Gerais 1
32 ES Espírito Santo 1
33 RJ Rio de Janeiro 1
35 SP São Paulo 1
41 PR Paraná 1
42 SC Santa Catarina 1
43 RS Rio Grande do Sul 1
50 MS Mato Grosso do Sul 1
51 MT Mato Grosso 1
52 GO Goiás 1
53 DF Distrito Federal 1
0 BR Brasil 4
1003 RegC Regional Centro 2
1002 RegL Regional Leste 2
1001 RegN Regional Norte 2
1004 RegS Regional Sul 2
1 N Norte 3
2 NE Nordeste 3
3 SE Sudeste 3
5 CO Centro-oeste 3
4 S Sul 3
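
Before moving on, a cheap consistency check can catch typos in this hand-written table. A minimal sketch, not part of the original run:

# hedged check: every territory_type_id used above must be one of the
# values declared in the territory_type_id dict from section 1.4
assert df_territory['territory_type_id'].isin(
    list(territory_type_id.values())
).all()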

2. current_estimated_values


In [13]:
dataset = 'current_estimated_values'

In [14]:
dfs[dataset].head()


Out[14]:
UF epiyear epiweek SRAG Tipo Situation mean 50% 2.5% 97.5% L0 L1 L2 L3 Run date dado escala
0 11 2009 1 0.0 Estado stable 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2017-10-11 srag incidência
1 11 2009 2 0.0 Estado stable 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2017-10-11 srag incidência
2 11 2009 3 0.0 Estado stable 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2017-10-11 srag incidência
3 11 2009 4 0.0 Estado stable 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2017-10-11 srag incidência
4 11 2009 5 0.0 Estado stable 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2017-10-11 srag incidência

In [15]:
migration_rules = {
    'UF': 'territory_id',
    'SRAG': 'value',
    'Tipo': 'territory_type',  # Not needed in the table
    'Situation': 'situation_id',
    '50%': 'median',
    '2.5%': 'ci_lower',
    '97.5%': 'ci_upper',
    'L0': 'low_level',
    'L1': 'epidemic_level',
    'L2': 'high_level',
    'L3': 'very_high_level',
    'Run date': 'run_date',
    'dado': 'dataset_id',
    'escala': 'scale_id'
}

# rename columns
dfs[dataset].rename(
    columns=migration_rules, inplace=True
)

# apply categories
dfs[dataset].dataset_id = dfs[dataset].dataset_id.map(dataset_id)
dfs[dataset].scale_id = dfs[dataset].scale_id.map(scale_id)
dfs[dataset].situation_id = dfs[dataset].situation_id.map(situation_id)
# convert region initials (e.g. 'BR', 'RegN') to numeric territory ids
region_indices = dfs[dataset].territory_id.isin([
    'BR', 'RegN', 'RegL', 'RegC', 'RegS'
])
dfs[dataset].loc[region_indices, 'territory_id'] = dfs[dataset].loc[
    region_indices, 'territory_id'
].map(region_id)
dfs[dataset].territory_id = dfs[dataset].territory_id.astype(int)

# remove unnecessary fields
dfs[dataset].drop(['territory_type'], axis=1, inplace=True)

dfs[dataset].head()


Out[15]:
territory_id epiyear epiweek value situation_id mean median ci_lower ci_upper low_level epidemic_level high_level very_high_level run_date dataset_id scale_id
0 11 2009 1 0.0 3 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2017-10-11 1 1
1 11 2009 2 0.0 3 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2017-10-11 1 1
2 11 2009 3 0.0 3 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2017-10-11 1 1
3 11 2009 4 0.0 3 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2017-10-11 1 1
4 11 2009 5 0.0 3 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2017-10-11 1 1
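
The rename columns → map IDs → convert regions steps above are repeated verbatim for every dataset in the sections below, so they could be factored into a single helper. A hedged sketch (the name apply_id_mappings is hypothetical, not part of the original notebook):

def apply_id_mappings(df):
    """Map textual categories onto the integer IDs from section 1."""
    df.dataset_id = df.dataset_id.map(dataset_id)
    df.scale_id = df.scale_id.map(scale_id)
    if 'situation_id' in df:
        df.situation_id = df.situation_id.map(situation_id)
    # convert region initials (BR, RegN, ...) to numeric territory ids
    region_rows = df.territory_id.isin(list(region_id))
    df.loc[region_rows, 'territory_id'] = df.loc[
        region_rows, 'territory_id'
    ].map(region_id)
    df.territory_id = df.territory_id.astype(int)
    return df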

In [16]:
dfs[dataset].tail()


Out[16]:
territory_id epiyear epiweek value situation_id mean median ci_lower ci_upper low_level epidemic_level high_level very_high_level run_date dataset_id scale_id
86203 1004 2017 27 9.0 2 10.0 9.0 9.0 11.0 0.000 0.988 0.012 0.000 2017-10-11 3 2
86204 1004 2017 28 8.0 2 8.0 8.0 7.0 10.0 0.800 0.200 0.000 0.000 2017-10-11 3 2
86205 1004 2017 29 9.0 2 10.0 10.0 9.0 13.0 0.000 0.856 0.140 0.004 2017-10-11 3 2
86206 1004 2017 30 2.0 2 2.0 2.0 0.0 7.0 0.996 0.004 0.000 0.000 2017-10-11 3 2
86207 1004 2017 31 3.0 2 9.0 8.0 3.0 18.0 0.560 0.256 0.136 0.048 2017-10-11 3 2

In [17]:
dfs[dataset].dtypes


Out[17]:
territory_id         int64
epiyear              int64
epiweek              int64
value              float64
situation_id         int64
mean               float64
median             float64
ci_lower           float64
ci_upper           float64
low_level          float64
epidemic_level     float64
high_level         float64
very_high_level    float64
run_date            object
dataset_id           int64
scale_id             int64
dtype: object

In [18]:
# primary_keys
pks[dataset] = ['dataset_id', 'scale_id', 'territory_id', 'epiyear', 'epiweek']

dfs[dataset].set_index(pks[dataset], inplace=True)
dfs[dataset].head()


Out[18]:
value situation_id mean median ci_lower ci_upper low_level epidemic_level high_level very_high_level run_date
dataset_id scale_id territory_id epiyear epiweek
1 1 11 2009 1 0.0 3 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2017-10-11
2 0.0 3 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2017-10-11
3 0.0 3 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2017-10-11
4 0.0 3 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2017-10-11
5 0.0 3 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2017-10-11
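
set_index does not enforce uniqueness, so it is worth confirming that these five columns really identify each row before treating them as a primary key. A minimal check:

# hedged sanity check: the composite key must be unique per row
assert dfs[dataset].index.is_unique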

3. historical_estimated_values


In [19]:
dataset = 'historical_estimated_values'

In [20]:
dfs[dataset].head()


Out[20]:
UF epiyear epiweek SRAG Tipo Situation mean 50% 2.5% 97.5% L0 L1 L2 L3 Run date base_epiyearweek base_epiyear base_epiweek dado escala
0 11 2017 7 0.055377 Estado estimated 0.055377 0.055377 0.055377 0.055377 0.976 0.024 0.0 0.0 2017-10-11 2017W23 2017 23 srag incidência
1 11 2017 8 0.055377 Estado estimated 0.055377 0.055377 0.055377 0.110755 0.972 0.028 0.0 0.0 2017-10-11 2017W23 2017 23 srag incidência
2 11 2017 9 0.000000 Estado estimated 0.000000 0.000000 0.000000 0.000000 1.000 0.000 0.0 0.0 2017-10-11 2017W23 2017 23 srag incidência
3 11 2017 10 0.000000 Estado estimated 0.000000 0.000000 0.000000 0.055377 0.996 0.004 0.0 0.0 2017-10-11 2017W23 2017 23 srag incidência
4 11 2017 11 0.055377 Estado estimated 0.055377 0.055377 0.055377 0.110755 0.964 0.036 0.0 0.0 2017-10-11 2017W23 2017 23 srag incidência

In [21]:
migration_rules = {
    'UF': 'territory_id',
    'SRAG': 'value',
    'Tipo': 'territory_type',  # Not needed in the table
    'Situation': 'situation_id',
    '50%': 'median',
    '2.5%': 'ci_lower',
    '97.5%': 'ci_upper',
    'L0': 'low_level',
    'L1': 'epidemic_level',
    'L2': 'high_level',
    'L3': 'very_high_level',
    'Run date': 'run_date',
    'dado': 'dataset_id',  # or origin
    'escala': 'scale_id'
}

dfs[dataset].rename(
    columns=migration_rules, inplace=True
)
# apply categories
dfs[dataset].dataset_id = dfs[dataset].dataset_id.map(dataset_id)
dfs[dataset].scale_id = dfs[dataset].scale_id.map(scale_id)
dfs[dataset].situation_id = dfs[dataset].situation_id.map(situation_id)
region_indices = dfs[dataset].territory_id.isin([
    'BR', 'RegN', 'RegL', 'RegC', 'RegS'
])
dfs[dataset].loc[region_indices, 'territory_id'] = dfs[dataset].loc[
    region_indices, 'territory_id'
].map(region_id)
dfs[dataset].territory_id = dfs[dataset].territory_id.astype(int)

# remove unnecessary fields
dfs[dataset].drop(['territory_type'], axis=1, inplace=True)

dfs[dataset].head()


Out[21]:
territory_id epiyear epiweek value situation_id mean median ci_lower ci_upper low_level epidemic_level high_level very_high_level run_date base_epiyearweek base_epiyear base_epiweek dataset_id scale_id
0 11 2017 7 0.055377 2 0.055377 0.055377 0.055377 0.055377 0.976 0.024 0.0 0.0 2017-10-11 2017W23 2017 23 1 1
1 11 2017 8 0.055377 2 0.055377 0.055377 0.055377 0.110755 0.972 0.028 0.0 0.0 2017-10-11 2017W23 2017 23 1 1
2 11 2017 9 0.000000 2 0.000000 0.000000 0.000000 0.000000 1.000 0.000 0.0 0.0 2017-10-11 2017W23 2017 23 1 1
3 11 2017 10 0.000000 2 0.000000 0.000000 0.000000 0.055377 0.996 0.004 0.0 0.0 2017-10-11 2017W23 2017 23 1 1
4 11 2017 11 0.055377 2 0.055377 0.055377 0.055377 0.110755 0.964 0.036 0.0 0.0 2017-10-11 2017W23 2017 23 1 1

In [22]:
dfs[dataset].tail()


Out[22]:
territory_id epiyear epiweek value situation_id mean median ci_lower ci_upper low_level epidemic_level high_level very_high_level run_date base_epiyearweek base_epiyear base_epiweek dataset_id scale_id
20567 1004 2017 27 9.0 2 10.0 9.0 9.0 11.0 0.000 0.988 0.012 0.000 2017-10-11 2017W31 2017 31 3 2
20568 1004 2017 28 8.0 2 8.0 8.0 7.0 10.0 0.800 0.200 0.000 0.000 2017-10-11 2017W31 2017 31 3 2
20569 1004 2017 29 9.0 2 10.0 10.0 9.0 13.0 0.000 0.856 0.140 0.004 2017-10-11 2017W31 2017 31 3 2
20570 1004 2017 30 2.0 2 2.0 2.0 0.0 7.0 0.996 0.004 0.000 0.000 2017-10-11 2017W31 2017 31 3 2
20571 1004 2017 31 3.0 2 9.0 8.0 3.0 18.0 0.560 0.256 0.136 0.048 2017-10-11 2017W31 2017 31 3 2

In [23]:
dfs[dataset].dtypes


Out[23]:
territory_id          int64
epiyear               int64
epiweek               int64
value               float64
situation_id          int64
mean                float64
median              float64
ci_lower            float64
ci_upper            float64
low_level           float64
epidemic_level      float64
high_level          float64
very_high_level     float64
run_date             object
base_epiyearweek     object
base_epiyear          int64
base_epiweek          int64
dataset_id            int64
scale_id              int64
dtype: object

In [24]:
# primary_keys
pks[dataset] = [
    'dataset_id', 'scale_id', 'territory_id', 
    'base_epiyear', 'base_epiweek', 
    'epiyear', 'epiweek'
]

dfs[dataset].set_index(pks[dataset], inplace=True)
dfs[dataset].head()


Out[24]:
value situation_id mean median ci_lower ci_upper low_level epidemic_level high_level very_high_level run_date base_epiyearweek
dataset_id scale_id territory_id base_epiyear base_epiweek epiyear epiweek
1 1 11 2017 23 2017 7 0.055377 2 0.055377 0.055377 0.055377 0.055377 0.976 0.024 0.0 0.0 2017-10-11 2017W23
8 0.055377 2 0.055377 0.055377 0.055377 0.110755 0.972 0.028 0.0 0.0 2017-10-11 2017W23
9 0.000000 2 0.000000 0.000000 0.000000 0.000000 1.000 0.000 0.0 0.0 2017-10-11 2017W23
10 0.000000 2 0.000000 0.000000 0.000000 0.055377 0.996 0.004 0.0 0.0 2017-10-11 2017W23
11 0.055377 2 0.055377 0.055377 0.055377 0.110755 0.964 0.036 0.0 0.0 2017-10-11 2017W23

4. clean_data_epiweek-weekly-incidence_w_situation


In [25]:
dataset = 'clean_data_epiweek-weekly-incidence_w_situation'

In [26]:
dfs[dataset].head()


Out[26]:
0-4 anos 10-19 anos 2-4 anos 20-29 anos 30-39 anos 40-49 anos 5-9 anos 50-59 anos 60+ anos < 2 anos DELAYED FLU_A FLU_B INCONCLUSIVE Idade desconhecida NEGATIVE NOTTESTED OTHERS POSITIVE_CASES SRAG Situation TESTING_IGNORED Tipo UF Unidade da Federação VSR dado epiweek epiyear epiyearweek escala sexo
0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 stable 0.0 Estado 11 Rondônia 0.0 srag 1 2009 2009W01 incidência F
1 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 stable 0.0 Estado 11 Rondônia 0.0 srag 1 2009 2009W01 incidência M
2 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 stable 0.0 Estado 11 Rondônia 0.0 srag 1 2009 2009W01 incidência Total
3 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 stable 0.0 Estado 11 Rondônia 0.0 srag 2 2009 2009W02 incidência F
4 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 stable 0.0 Estado 11 Rondônia 0.0 srag 2 2009 2009W02 incidência M

In [27]:
dfs[dataset].keys()


Out[27]:
Index(['0-4 anos', '10-19 anos', '2-4 anos', '20-29 anos', '30-39 anos',
       '40-49 anos', '5-9 anos', '50-59 anos', '60+ anos', '< 2 anos',
       'DELAYED', 'FLU_A', 'FLU_B', 'INCONCLUSIVE', 'Idade desconhecida',
       'NEGATIVE', 'NOTTESTED', 'OTHERS', 'POSITIVE_CASES', 'SRAG',
       'Situation', 'TESTING_IGNORED', 'Tipo', 'UF', 'Unidade da Federação',
       'VSR', 'dado', 'epiweek', 'epiyear', 'epiyearweek', 'escala', 'sexo'],
      dtype='object')

In [28]:
migration_rules = {
    '0-4 anos': 'years_0_4',
    '10-19 anos': 'years_10_19',
    '2-4 anos': 'years_2_4',
    '20-29 anos': 'years_20_29',
    '30-39 anos': 'years_30_39',
    '40-49 anos': 'years_40_49',
    '5-9 anos': 'years_5_9',
    '50-59 anos': 'years_50_59',
    '60+ anos': 'years_60_or_more',
    '< 2 anos': 'years_lt_2',
    'DELAYED': 'delayed',
    'FLU_A': 'flu_a',
    'FLU_B': 'flu_b',
    'INCONCLUSIVE': 'inconclusive',
    'Idade desconhecida': 'unknown_age',
    'NEGATIVE': 'negative',
    'NOTTESTED': 'not_tested',
    'OTHERS': 'others',
    'POSITIVE_CASES': 'positive_cases',
    'SRAG': 'value',
    'Situation': 'situation_id',
    'TESTING_IGNORED': 'testing_ignored',
    'Tipo': 'territory_type',  # Not needed in the table
    'UF': 'territory_id',
    'Unidade da Federação': 'state_country_name',  # Not needed in the table
    'VSR': 'vsr',
    'dado': 'dataset_id',
    'escala': 'scale_id',
    'sexo': 'gender'
}

dfs[dataset].rename(
    columns=migration_rules, inplace=True
)
# apply categories
dfs[dataset].dataset_id = dfs[dataset].dataset_id.map(dataset_id)
dfs[dataset].scale_id = dfs[dataset].scale_id.map(scale_id)
dfs[dataset].situation_id = dfs[dataset].situation_id.map(situation_id)
region_indices = dfs[dataset].territory_id.isin([
    'BR', 'RegN', 'RegL', 'RegC', 'RegS'
])
dfs[dataset].loc[region_indices, 'territory_id'] = dfs[dataset].loc[
    region_indices, 'territory_id'
].map(region_id)
dfs[dataset].territory_id = dfs[dataset].territory_id.astype(int)

# remove unnecessary fields
dfs[dataset].drop([
    'state_country_name', 'territory_type'
], axis=1, inplace=True)


dfs[dataset].head()


Out[28]:
years_0_4 years_10_19 years_2_4 years_20_29 years_30_39 years_40_49 years_5_9 years_50_59 years_60_or_more years_lt_2 delayed flu_a flu_b inconclusive unknown_age negative not_tested others positive_cases value situation_id testing_ignored territory_id vsr dataset_id epiweek epiyear epiyearweek scale_id gender
0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 3 0.0 11 0.0 1 1 2009 2009W01 1 F
1 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 3 0.0 11 0.0 1 1 2009 2009W01 1 M
2 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 3 0.0 11 0.0 1 1 2009 2009W01 1 Total
3 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 3 0.0 11 0.0 1 2 2009 2009W02 1 F
4 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 3 0.0 11 0.0 1 2 2009 2009W02 1 M

In [29]:
dfs[dataset].tail()


Out[29]:
years_0_4 years_10_19 years_2_4 years_20_29 years_30_39 years_40_49 years_5_9 years_50_59 years_60_or_more years_lt_2 delayed flu_a flu_b inconclusive unknown_age negative not_tested others positive_cases value situation_id testing_ignored territory_id vsr dataset_id epiweek epiyear epiyearweek scale_id gender
301723 NaN 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 2.0 1 0.0 1004 0.0 3 30 2017 2017W30 2 Total
301724 NaN 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1 0.0 1004 0.0 3 31 2017 2017W31 2 F
301725 NaN 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1 0.0 1004 0.0 3 31 2017 2017W31 2 I
301726 NaN 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 2.0 2.0 1 0.0 1004 0.0 3 31 2017 2017W31 2 M
301727 NaN 0.0 0.0 0.0 0.0 0.0 0.0 1.0 2.0 0.0 0.0 1.0 2.0 0.0 0.0 0.0 0.0 0.0 3.0 3.0 1 0.0 1004 0.0 3 31 2017 2017W31 2 Total

In [30]:
dfs[dataset].dtypes


Out[30]:
years_0_4           float64
years_10_19         float64
years_2_4           float64
years_20_29         float64
years_30_39         float64
years_40_49         float64
years_5_9           float64
years_50_59         float64
years_60_or_more    float64
years_lt_2          float64
delayed             float64
flu_a               float64
flu_b               float64
inconclusive        float64
unknown_age         float64
negative            float64
not_tested          float64
others              float64
positive_cases      float64
value               float64
situation_id          int64
testing_ignored     float64
territory_id          int64
vsr                 float64
dataset_id            int64
epiweek               int64
epiyear               int64
epiyearweek          object
scale_id              int64
gender               object
dtype: object

In [31]:
# primary_keys
pks[dataset] = ['dataset_id', 'scale_id', 'territory_id', 'epiyear', 'epiweek']

dfs[dataset].set_index(pks[dataset], inplace=True)
dfs[dataset].head()


Out[31]:
years_0_4 years_10_19 years_2_4 years_20_29 years_30_39 years_40_49 years_5_9 years_50_59 years_60_or_more years_lt_2 delayed flu_a flu_b inconclusive unknown_age negative not_tested others positive_cases value situation_id testing_ignored vsr epiyearweek gender
dataset_id scale_id territory_id epiyear epiweek
1 1 11 2009 1 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 3 0.0 0.0 2009W01 F
1 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 3 0.0 0.0 2009W01 M
1 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 3 0.0 0.0 2009W01 Total
2 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 3 0.0 0.0 2009W02 F
2 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 3 0.0 0.0 2009W02 M
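
Note that the head above already shows repeated index tuples: there is one row per gender (F, M, Total) for the same epiweek, so this composite key does not identify rows uniquely, and gender would have to be part of the key for a strict primary key. A quick check:

# hedged check: expected to print False here, since F/M/Total rows
# share the same (dataset, scale, territory, epiyear, epiweek)
print(dfs[dataset].index.is_unique)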

5. mem-report


In [32]:
dataset = 'mem-report'

In [33]:
dfs[dataset].head()


Out[33]:
UF População Média geométrica do pico de infecção das temporadas regulares região de baixa atividade típica limiar pré-epidêmico intensidade alta intensidade muito alta SE típica do início do surto SE típica do início do surto - IC inferior (2,5%) SE típica do início do surto - IC superior (97,5%) duração típica do surto duração típica do surto - IC inferior (2,5%) duração típica do surto - IC superior (97,5%) temporadas utilizadas para os corredores endêmicos ano Unidade da Federação Tipo dado escala
0 11 1805788 0.803458 0 0.103162 0.863224 1.168160 12.0 9.0 22.0 8.0 4.0 15.0 2010, 2013, 2014, 2016 2017 Rondônia Estado srag incidência
1 12 829619 0.953503 0 0.685939 1.783885 3.053363 24.0 5.0 49.0 6.0 3.0 16.0 2010, 2011, 2012, 2013, 2014, 2015, 2016 2017 Acre Estado srag incidência
2 13 4063614 0.119378 0 0.042171 0.165851 0.212193 12.0 4.0 25.0 12.0 6.0 13.0 2010, 2012, 2013, 2014, 2015, 2016 2017 Amazonas Estado srag incidência
3 14 522636 0.473615 0 0.414595 0.593964 0.801757 26.0 13.0 50.0 6.0 3.0 10.0 2012, 2013, 2014, 2015, 2016 2017 Roraima Estado srag incidência
4 15 8366628 0.256230 0 0.083865 0.466878 0.859727 12.0 10.0 18.0 10.0 5.0 13.0 2012, 2014, 2015, 2016 2017 Pará Estado srag incidência

In [34]:
dfs[dataset].keys()


Out[34]:
Index(['UF', 'População',
       'Média geométrica do pico de infecção das temporadas regulares',
       'região de baixa atividade típica', 'limiar pré-epidêmico',
       'intensidade alta', 'intensidade muito alta',
       'SE típica do início do surto',
       'SE típica do início do surto - IC inferior (2,5%)',
       'SE típica do início do surto - IC superior (97,5%)',
       'duração típica do surto',
       'duração típica do surto - IC inferior (2,5%)',
       'duração típica do surto - IC superior (97,5%)',
       'temporadas utilizadas para os corredores endêmicos', 'ano',
       'Unidade da Federação', 'Tipo', 'dado', 'escala'],
      dtype='object')

In [35]:
migration_rules = {
    'UF': 'territory_id',
    'População': 'population',
    'Média geométrica do pico de infecção das temporadas regulares': 'geom_average_peak',
    'região de baixa atividade típica': 'low_activity_region',
    'limiar pré-epidêmico': 'pre_epidemic_threshold',
    'intensidade alta': 'high_threshold',
    'intensidade muito alta': 'very_high_threshold',
    'SE típica do início do surto': 'epi_start',
    'SE típica do início do surto - IC inferior (2,5%)': 'epi_start_ci_lower',
    'SE típica do início do surto - IC superior (97,5%)': 'epi_start_ci_upper',
    'duração típica do surto': 'epi_duration',
    'duração típica do surto - IC inferior (2,5%)': 'epi_duration_ci_lower',
    'duração típica do surto - IC superior (97,5%)': 'epi_duration_ci_upper',
    'temporadas utilizadas para os corredores endêmicos': 'regular_seasons',
    'ano': 'year',
    'Unidade da Federação': 'state_country_name',  # Not needed in the table
    'Tipo': 'territory_type',  # Not needed in the table
    'dado': 'dataset_id',
    'escala': 'scale_id'
}

dfs[dataset].rename(
    columns=migration_rules, inplace=True
)
# apply categories
dfs[dataset].dataset_id = dfs[dataset].dataset_id.map(dataset_id)
dfs[dataset].scale_id = dfs[dataset].scale_id.map(scale_id)
region_indices = dfs[dataset].territory_id.isin([
    'BR', 'RegN', 'RegL', 'RegC', 'RegS'
])
dfs[dataset].loc[region_indices, 'territory_id'] = dfs[dataset].loc[
    region_indices, 'territory_id'
].map(region_id)
dfs[dataset].territory_id = dfs[dataset].territory_id.astype(int)

# remove unnecessary fields
dfs[dataset].drop([
    'state_country_name', 'territory_type'
], axis=1, inplace=True)

dfs[dataset].head()


Out[35]:
territory_id population geom_average_peak low_activity_region pre_epidemic_threshold high_threshold very_high_threshold epi_start epi_start_ci_lower epi_start_ci_upper epi_duration epi_duration_ci_lower epi_duration_ci_upper regular_seasons year dataset_id scale_id
0 11 1805788 0.803458 0 0.103162 0.863224 1.168160 12.0 9.0 22.0 8.0 4.0 15.0 2010, 2013, 2014, 2016 2017 1 1
1 12 829619 0.953503 0 0.685939 1.783885 3.053363 24.0 5.0 49.0 6.0 3.0 16.0 2010, 2011, 2012, 2013, 2014, 2015, 2016 2017 1 1
2 13 4063614 0.119378 0 0.042171 0.165851 0.212193 12.0 4.0 25.0 12.0 6.0 13.0 2010, 2012, 2013, 2014, 2015, 2016 2017 1 1
3 14 522636 0.473615 0 0.414595 0.593964 0.801757 26.0 13.0 50.0 6.0 3.0 10.0 2012, 2013, 2014, 2015, 2016 2017 1 1
4 15 8366628 0.256230 0 0.083865 0.466878 0.859727 12.0 10.0 18.0 10.0 5.0 13.0 2012, 2014, 2015, 2016 2017 1 1

In [36]:
dfs[dataset].tail()


Out[36]:
territory_id population geom_average_peak low_activity_region pre_epidemic_threshold high_threshold very_high_threshold epi_start epi_start_ci_lower epi_start_ci_upper epi_duration epi_duration_ci_lower epi_duration_ci_upper regular_seasons year dataset_id scale_id
187 0 207660929 26.289172 0 8.484101 49.118258 91.799762 16.0 5.0 21.0 12.0 11.0 15.0 2010, 2012, 2013, 2014, 2015 2017 3 2
188 1003 27644815 4.280383 0 1.046539 4.452711 5.507278 15.0 7.0 20.0 10.0 5.0 18.0 2010, 2012, 2013, 2014, 2015 2017 3 2
189 1002 67770757 2.771309 0 1.257426 3.756206 4.770510 16.0 8.0 30.0 8.0 3.0 17.0 2010, 2011, 2012, 2013, 2014, 2015 2017 3 2
190 1001 16386007 4.228626 0 1.182767 6.055162 7.565655 12.0 2.0 16.0 8.0 1.0 11.0 2010, 2012, 2013, 2014, 2015, 2016 2017 3 2
191 1004 95859350 6.935301 0 8.946847 11.831033 16.324451 25.0 17.0 33.0 10.0 7.0 14.0 2010, 2011, 2014, 2015 2017 3 2

In [37]:
dfs[dataset].dtypes


Out[37]:
territory_id                int64
population                  int64
geom_average_peak         float64
low_activity_region         int64
pre_epidemic_threshold    float64
high_threshold            float64
very_high_threshold       float64
epi_start                 float64
epi_start_ci_lower        float64
epi_start_ci_upper        float64
epi_duration              float64
epi_duration_ci_lower     float64
epi_duration_ci_upper     float64
regular_seasons            object
year                        int64
dataset_id                  int64
scale_id                    int64
dtype: object

In [38]:
# primary_keys
pks[dataset] = ['dataset_id', 'scale_id', 'territory_id', 'year']

dfs[dataset].set_index(pks[dataset], inplace=True)
dfs[dataset].head()


Out[38]:
population geom_average_peak low_activity_region pre_epidemic_threshold high_threshold very_high_threshold epi_start epi_start_ci_lower epi_start_ci_upper epi_duration epi_duration_ci_lower epi_duration_ci_upper regular_seasons
dataset_id scale_id territory_id year
1 1 11 2017 1805788 0.803458 0 0.103162 0.863224 1.168160 12.0 9.0 22.0 8.0 4.0 15.0 2010, 2013, 2014, 2016
12 2017 829619 0.953503 0 0.685939 1.783885 3.053363 24.0 5.0 49.0 6.0 3.0 16.0 2010, 2011, 2012, 2013, 2014, 2015, 2016
13 2017 4063614 0.119378 0 0.042171 0.165851 0.212193 12.0 4.0 25.0 12.0 6.0 13.0 2010, 2012, 2013, 2014, 2015, 2016
14 2017 522636 0.473615 0 0.414595 0.593964 0.801757 26.0 13.0 50.0 6.0 3.0 10.0 2012, 2013, 2014, 2015, 2016
15 2017 8366628 0.256230 0 0.083865 0.466878 0.859727 12.0 10.0 18.0 10.0 5.0 13.0 2012, 2014, 2015, 2016

6. mem-typical


In [39]:
dataset = 'mem-typical'

In [40]:
dfs[dataset].head()


Out[40]:
UF População epiweek corredor baixo corredor mediano corredor alto ano Unidade da Federação Tipo dado escala
0 11 1805788 1 0.0 0.013703 0.041109 2017 Rondônia Estado srag incidência
1 11 1805788 2 0.0 0.000000 0.000000 2017 Rondônia Estado srag incidência
2 11 1805788 3 0.0 0.013703 0.041109 2017 Rondônia Estado srag incidência
3 11 1805788 4 0.0 0.028790 0.087644 2017 Rondônia Estado srag incidência
4 11 1805788 5 0.0 0.000000 0.000000 2017 Rondônia Estado srag incidência

In [41]:
dfs[dataset].keys()


Out[41]:
Index(['UF', 'População', 'epiweek', 'corredor baixo', 'corredor mediano',
       'corredor alto', 'ano', 'Unidade da Federação', 'Tipo', 'dado',
       'escala'],
      dtype='object')

In [42]:
migration_rules = {
    'UF': 'territory_id',
    'População': 'population',
    'corredor baixo': 'low',
    'corredor mediano': 'median',
    'corredor alto': 'high',
    'ano': 'year',
    'Unidade da Federação': 'state_country_name',  # Not needed in the table
    'Tipo': 'territory_type',  # Not needed in the table
    'dado': 'dataset_id',
    'escala': 'scale_id'
}

dfs[dataset].rename(
    columns=migration_rules, inplace=True
)
# apply categories
dfs[dataset].dataset_id = dfs[dataset].dataset_id.map(dataset_id)
dfs[dataset].scale_id = dfs[dataset].scale_id.map(scale_id)
region_indices = dfs[dataset].territory_id.isin([
    'BR', 'RegN', 'RegL', 'RegC', 'RegS'
])
dfs[dataset].loc[region_indices, 'territory_id'] = dfs[dataset].loc[
    region_indices, 'territory_id'
].map(region_id)
dfs[dataset].territory_id = dfs[dataset].territory_id.astype(int)

# remove unnecessary fields
dfs[dataset].drop([
    'state_country_name', 'territory_type'
], axis=1, inplace=True)

dfs[dataset].head()


Out[42]:
territory_id population epiweek low median high year dataset_id scale_id
0 11 1805788 1 0.0 0.013703 0.041109 2017 1 1
1 11 1805788 2 0.0 0.000000 0.000000 2017 1 1
2 11 1805788 3 0.0 0.013703 0.041109 2017 1 1
3 11 1805788 4 0.0 0.028790 0.087644 2017 1 1
4 11 1805788 5 0.0 0.000000 0.000000 2017 1 1

In [43]:
dfs[dataset].tail()


Out[43]:
territory_id population epiweek low median high year dataset_id scale_id
9979 1004 95859350 48 0.000000 0.506813 1.500926 2017 3 2
9980 1004 95859350 49 0.194681 1.032562 1.871175 2017 3 2
9981 1004 95859350 50 0.208623 1.026247 1.844569 2017 3 2
9982 1004 95859350 51 0.269861 0.779518 1.289445 2017 3 2
9983 1004 95859350 52 0.000000 0.000000 0.000000 2017 3 2

In [44]:
dfs[dataset].dtypes


Out[44]:
territory_id      int64
population        int64
epiweek           int64
low             float64
median          float64
high            float64
year              int64
dataset_id        int64
scale_id          int64
dtype: object

In [45]:
# primary_keys
pks[dataset] = ['dataset_id', 'scale_id', 'territory_id', 'year', 'epiweek']

dfs[dataset].set_index(pks[dataset], inplace=True)
dfs[dataset].head()


Out[45]:
population low median high
dataset_id scale_id territory_id year epiweek
1 1 11 2017 1 1805788 0.0 0.013703 0.041109
2 1805788 0.0 0.000000 0.000000
3 1805788 0.0 0.013703 0.041109
4 1805788 0.0 0.028790 0.087644
5 1805788 0.0 0.000000 0.000000

7. SQL Migration


In [46]:
# SQL type overrides for columns that should not use the default
# pandas-to-SQL type mapping
exception_type_field = {
    'run_date': 'DATE'
}
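
exception_type_field is not used by the migration loop below; a hedged sketch of how it could be wired into to_sql through its dtype argument (the sql_types lookup is an assumption, not from the original notebook):

# hedged sketch: translate the type names above into SQLAlchemy types,
# to be passed along as to_sql(..., dtype=dtype_map)
sql_types = {'DATE': sqla.types.Date}
dtype_map = {
    column: sql_types[type_name]()
    for column, type_name in exception_type_field.items()
}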

In [48]:
# creating dataset dataframe
df_dataset = pd.DataFrame({
    'id': list(dataset_id.values()), 
    'name': list(dataset_id.keys())
}).set_index('id')

dfs['dataset'] = df_dataset

df_dataset


Out[48]:
name
id
1 srag
2 sragflu
3 obitoflu

In [49]:
# creating situation dataframe
df_situation = pd.DataFrame({
    'id': list(situation_id.values()), 
    'name': list(situation_id.keys())
}).set_index('id')

dfs['situation'] = df_situation

df_situation


Out[49]:
name
id
1 unknown
2 estimated
3 stable
4 incomplete

In [50]:
# creating scale dataframe
df_scale = pd.DataFrame({
    'id': list(scale_id.values()), 
    'name': list(scale_id.keys())
}).set_index('id')

dfs['scale'] = df_scale

df_scale


Out[50]:
name
id
1 incidência
2 casos

In [51]:
# creating territory_type dataframe
df_territory_type = pd.DataFrame({
    'id': list(territory_type_id.values()), 
    'name': list(territory_type_id.keys())
}).set_index('id')

dfs['territory_type'] = df_territory_type

df_territory_type


Out[51]:
name
id
1 Estado
2 Regional
3 Região
4 País

In [52]:
dfs['territory'] = df_territory

df_territory.head()


Out[52]:
initials name territory_type_id
id
11 RO Rondônia 1
12 AC Acre 1
13 AM Amazonas 1
14 RR Roraima 1
15 PA Pará 1

In [53]:
dsn = 'postgresql://%(USER)s:%(PASSWORD)s@%(HOST)s/%(NAME)s'
engine = sqla.create_engine(dsn % settings.DATABASE)

for k, df in dfs.items():
    # replace hyphens so the table name is a valid SQL identifier
    k_new = k.replace('-', '_')

    print('Migrating %s ...' % k_new)
    df.to_sql(
        k_new, engine, if_exists='replace',
        chunksize=2048
    )


Migrating mem_report ...
Migrating current_estimated_values ...
Migrating clean_data_epiweek_weekly_incidence_w_situation ...
Migrating historical_estimated_values ...
Migrating mem_typical ...
Migrating dataset ...
Migrating situation ...
Migrating scale ...
Migrating territory_type ...
Migrating territory ...
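
to_sql with if_exists='replace' recreates each table without primary key constraints, so the keys collected in pks could be declared afterwards. A hedged sketch, assuming the era-appropriate SQLAlchemy API where engine.execute accepts raw SQL (and noting that the clean_data table would first need gender added to its key, per section 4):

# hedged sketch: declare the composite primary keys gathered in pks
for k, key_columns in pks.items():
    table_name = k.replace('-', '_')
    engine.execute(
        'ALTER TABLE %s ADD PRIMARY KEY (%s)'
        % (table_name, ', '.join(key_columns))
    )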