Database migration rules


In [1]:
from fludashboard import settings

import fludashboard as flud
import glob
import os
import pandas as pd
import sqlalchemy as sqla

In [2]:
# pandas configuration
pd.set_option('display.max_columns', 99)

In [3]:
def get_filename_from_path(file_path: str):
    """
    Return the file name, without its extension, from a file path.
    """
    return file_path.split(os.path.sep)[-1].split('.')[0]

In [4]:
path_data_files = os.path.join(flud.__path__[0], 'data', '*.csv')

In [5]:
dfs = {}
pks = {}

In [6]:
print('Data files:')
for file_path in glob.glob(path_data_files):
    filename = get_filename_from_path(file_path)
    
    print(filename)
    
    dfs[filename] = pd.read_csv(file_path)


Data files:
mem-report
current_estimated_values
clean_data_epiweek-weekly-incidence_w_situation
/home/xmn/miniconda3/envs/fludashboard/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2728: DtypeWarning: Columns (23) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
historical_estimated_values
mem-typical
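
The DtypeWarning above is emitted while reading one of the CSVs: pandas inferred mixed types in column 23. As the message itself suggests, it can be avoided by passing low_memory=False (or an explicit dtype) to read_csv, e.g.:

# hedged sketch: silence the mixed-type inference warning on import
dfs[filename] = pd.read_csv(file_path, low_memory=False)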

1. Setting IDs

1.1 Datasets


In [7]:
dataset_id = {
    'srag': 1,
    'sragflu': 2,
    'obitoflu': 3,
}

1.2 Scale


In [8]:
scale_id = {
    'incidência': 1,
    'casos': 2
}

1.3 Situation


In [9]:
situation_id = {
    'unknown': 1, 
    'estimated': 2, 
    'stable': 3,
    'incomplete': 4
}

1.4 Territory Type


In [10]:
territory_type_id = {
    'Estado': 1, 
    'Regional': 2,
    'Região': 3,
    'País': 4
}

1.5 Region ID conversion


In [11]:
region_id = {
    'BR': 0,
    'RegN': 1001,
    'RegL': 1002,
    'RegC': 1003,
    'RegS': 1004
}

1.6 Territory Table


In [12]:
df_territory = pd.DataFrame([
    {'id': 11, 'initials': 'RO', 'name': 'Rondônia', 'territory_type_id': 1},
    {'id': 12, 'initials': 'AC', 'name': 'Acre', 'territory_type_id': 1},
    {'id': 13, 'initials': 'AM', 'name': 'Amazonas', 'territory_type_id': 1},
    {'id': 14, 'initials': 'RR', 'name': 'Roraima', 'territory_type_id': 1},
    {'id': 15, 'initials': 'PA', 'name': 'Pará', 'territory_type_id': 1},
    {'id': 16, 'initials': 'AP', 'name': 'Amapá', 'territory_type_id': 1},
    {'id': 17, 'initials': 'TO', 'name': 'Tocantins', 'territory_type_id': 1},
    {'id': 21, 'initials': 'MA', 'name': 'Maranhão', 'territory_type_id': 1},
    {'id': 22, 'initials': 'PI', 'name': 'Piauí', 'territory_type_id': 1},
    {'id': 23, 'initials': 'CE', 'name': 'Ceará', 'territory_type_id': 1},
    {'id': 24, 'initials': 'RN', 'name': 'Rio Grande do Norte', 'territory_type_id': 1},
    {'id': 25, 'initials': 'PB', 'name': 'Paraíba', 'territory_type_id': 1},
    {'id': 26, 'initials': 'PE', 'name': 'Pernambuco', 'territory_type_id': 1},
    {'id': 27, 'initials': 'AL', 'name': 'Alagoas', 'territory_type_id': 1},
    {'id': 28, 'initials': 'SE', 'name': 'Sergipe', 'territory_type_id': 1},
    {'id': 29, 'initials': 'BA', 'name': 'Bahia', 'territory_type_id': 1},
    {'id': 31, 'initials': 'MG', 'name': 'Minas Gerais', 'territory_type_id': 1},
    {'id': 32, 'initials': 'ES', 'name': 'Espírito Santo', 'territory_type_id': 1},
    {'id': 33, 'initials': 'RJ', 'name': 'Rio de Janeiro', 'territory_type_id': 1},
    {'id': 35, 'initials': 'SP', 'name': 'São Paulo', 'territory_type_id': 1},
    {'id': 41, 'initials': 'PR', 'name': 'Paraná', 'territory_type_id': 1},
    {'id': 42, 'initials': 'SC', 'name': 'Santa Catarina', 'territory_type_id': 1},
    {'id': 43, 'initials': 'RS', 'name': 'Rio Grande do Sul', 'territory_type_id': 1},
    {'id': 50, 'initials': 'MS', 'name': 'Mato Grosso do Sul', 'territory_type_id': 1},
    {'id': 51, 'initials': 'MT', 'name': 'Mato Grosso', 'territory_type_id': 1},
    {'id': 52, 'initials': 'GO', 'name': 'Goiás', 'territory_type_id': 1},
    {'id': 53, 'initials': 'DF', 'name': 'Distrito Federal', 'territory_type_id': 1},
    {'id': 0, 'initials': 'BR', 'name': 'Brasil', 'territory_type_id': 4},
    {'id': 1003, 'initials': 'RegC', 'name': 'Regional Centro', 'territory_type_id': 2},
    {'id': 1002, 'initials': 'RegL', 'name': 'Regional Leste', 'territory_type_id': 2},
    {'id': 1001, 'initials': 'RegN', 'name': 'Regional Norte', 'territory_type_id': 2},
    {'id': 1004, 'initials': 'RegS', 'name': 'Regional Sul', 'territory_type_id': 2},
    {'id': 1, 'initials': 'N', 'name': 'Norte', 'territory_type_id': 3},
    {'id': 2, 'initials': 'NE', 'name': 'Nordeste', 'territory_type_id': 3},
    {'id': 3, 'initials': 'SE', 'name': 'Sudeste', 'territory_type_id': 3},
    {'id': 5, 'initials': 'CO', 'name': 'Centro-oeste', 'territory_type_id': 3},
    {'id': 4, 'initials': 'S', 'name': 'Sul', 'territory_type_id': 3},
])

df_territory.set_index('id', inplace=True)

df_territory


Out[12]:
initials name territory_type_id
id
11 RO Rondônia 1
12 AC Acre 1
13 AM Amazonas 1
14 RR Roraima 1
15 PA Pará 1
16 AP Amapá 1
17 TO Tocantins 1
21 MA Maranhão 1
22 PI Piauí 1
23 CE Ceará 1
24 RN Rio Grande do Norte 1
25 PB Paraíba 1
26 PE Pernambuco 1
27 AL Alagoas 1
28 SE Sergipe 1
29 BA Bahia 1
31 MG Minas Gerais 1
32 ES Espírito Santo 1
33 RJ Rio de Janeiro 1
35 SP São Paulo 1
41 PR Paraná 1
42 SC Santa Catarina 1
43 RS Rio Grande do Sul 1
50 MS Mato Grosso do Sul 1
51 MT Mato Grosso 1
52 GO Goiás 1
53 DF Distrito Federal 1
0 BR Brasil 4
1003 RegC Regional Centro 2
1002 RegL Regional Leste 2
1001 RegN Regional Norte 2
1004 RegS Regional Sul 2
1 N Norte 3
2 NE Nordeste 3
3 SE Sudeste 3
5 CO Centro-oeste 3
4 S Sul 3
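
Before moving on, a cheap consistency check can catch typos in this hand-written table. A minimal sketch, not part of the original run:

# hedged check: every territory_type_id used above must be one of the
# values declared in the territory_type_id dict from section 1.4
assert df_territory['territory_type_id'].isin(
    list(territory_type_id.values())
).all()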

2. current_estimated_values


In [13]:
dataset = 'current_estimated_values'

In [14]:
dfs[dataset].head()


Out[14]:
UF epiyear epiweek SRAG Tipo Situation mean 50% 2.5% 97.5% L0 L1 L2 L3 Run date dado escala
0 11 2009 1 0.0 Estado stable 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2017-10-11 srag incidência
1 11 2009 2 0.0 Estado stable 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2017-10-11 srag incidência
2 11 2009 3 0.0 Estado stable 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2017-10-11 srag incidência
3 11 2009 4 0.0 Estado stable 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2017-10-11 srag incidência
4 11 2009 5 0.0 Estado stable 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2017-10-11 srag incidência

In [15]:
migration_rules = {
    'UF': 'territory_id',
    'SRAG': 'value',
    'Tipo': 'territory_type',  # Not needed in the table
    'Situation': 'situation_id',
    '50%': 'median',
    '2.5%': 'ci_lower',
    '97.5%': 'ci_upper',
    'L0': 'low_level',
    'L1': 'epidemic_level',
    'L2': 'high_level',
    'L3': 'very_high_level',
    'Run date': 'run_date',
    'dado': 'dataset_id',
    'escala': 'scale_id'
}

# rename columns
dfs[dataset].rename(
    columns=migration_rules, inplace=True
)

# apply categories
dfs[dataset].dataset_id = dfs[dataset].dataset_id.map(dataset_id)
dfs[dataset].scale_id = dfs[dataset].scale_id.map(scale_id)
dfs[dataset].situation_id = dfs[dataset].situation_id.map(situation_id)
# convert region initials (e.g. 'BR', 'RegN') to numeric territory ids
region_indices = dfs[dataset].territory_id.isin([
    'BR', 'RegN', 'RegL', 'RegC', 'RegS'
])
dfs[dataset].loc[region_indices, 'territory_id'] = dfs[dataset].loc[
    region_indices, 'territory_id'
].map(region_id)
dfs[dataset].territory_id = dfs[dataset].territory_id.astype(int)

# remove unnecessary fields
dfs[dataset].drop(['territory_type'], axis=1, inplace=True)

dfs[dataset].head()


Out[15]:
territory_id epiyear epiweek value situation_id mean median ci_lower ci_upper low_level epidemic_level high_level very_high_level run_date dataset_id scale_id
0 11 2009 1 0.0 3 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2017-10-11 1 1
1 11 2009 2 0.0 3 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2017-10-11 1 1
2 11 2009 3 0.0 3 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2017-10-11 1 1
3 11 2009 4 0.0 3 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2017-10-11 1 1
4 11 2009 5 0.0 3 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2017-10-11 1 1
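
The rename columns → map IDs → convert regions steps above are repeated verbatim for every dataset in the sections below, so they could be factored into a single helper. A hedged sketch (the name apply_id_mappings is hypothetical, not part of the original notebook):

def apply_id_mappings(df):
    """Map textual categories onto the integer IDs from section 1."""
    df.dataset_id = df.dataset_id.map(dataset_id)
    df.scale_id = df.scale_id.map(scale_id)
    if 'situation_id' in df:
        df.situation_id = df.situation_id.map(situation_id)
    # convert region initials (BR, RegN, ...) to numeric territory ids
    region_rows = df.territory_id.isin(list(region_id))
    df.loc[region_rows, 'territory_id'] = df.loc[
        region_rows, 'territory_id'
    ].map(region_id)
    df.territory_id = df.territory_id.astype(int)
    return df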

In [16]:
dfs[dataset].tail()


Out[16]:
territory_id epiyear epiweek value situation_id mean median ci_lower ci_upper low_level epidemic_level high_level very_high_level run_date dataset_id scale_id
86203 1004 2017 27 9.0 2 10.0 9.0 9.0 11.0 0.000 0.988 0.012 0.000 2017-10-11 3 2
86204 1004 2017 28 8.0 2 8.0 8.0 7.0 10.0 0.800 0.200 0.000 0.000 2017-10-11 3 2
86205 1004 2017 29 9.0 2 10.0 10.0 9.0 13.0 0.000 0.856 0.140 0.004 2017-10-11 3 2
86206 1004 2017 30 2.0 2 2.0 2.0 0.0 7.0 0.996 0.004 0.000 0.000 2017-10-11 3 2
86207 1004 2017 31 3.0 2 9.0 8.0 3.0 18.0 0.560 0.256 0.136 0.048 2017-10-11 3 2

In [17]:
dfs[dataset].dtypes


Out[17]:
territory_id         int64
epiyear              int64
epiweek              int64
value              float64
situation_id         int64
mean               float64
median             float64
ci_lower           float64
ci_upper           float64
low_level          float64
epidemic_level     float64
high_level         float64
very_high_level    float64
run_date            object
dataset_id           int64
scale_id             int64
dtype: object

In [18]:
# primary_keys
pks[dataset] = ['dataset_id', 'scale_id', 'territory_id', 'epiyear', 'epiweek']

dfs[dataset].set_index(pks[dataset], inplace=True)
dfs[dataset].head()


Out[18]:
value situation_id mean median ci_lower ci_upper low_level epidemic_level high_level very_high_level run_date
dataset_id scale_id territory_id epiyear epiweek
1 1 11 2009 1 0.0 3 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2017-10-11
2 0.0 3 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2017-10-11
3 0.0 3 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2017-10-11
4 0.0 3 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2017-10-11
5 0.0 3 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 2017-10-11
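
set_index does not enforce uniqueness, so it is worth confirming that these five columns really identify each row before treating them as a primary key. A minimal check:

# hedged sanity check: the composite key must be unique per row
assert dfs[dataset].index.is_unique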

3. historical_estimated_values


In [19]:
dataset = 'historical_estimated_values'

In [20]:
dfs[dataset].head()


Out[20]:
UF epiyear epiweek SRAG Tipo Situation mean 50% 2.5% 97.5% L0 L1 L2 L3 Run date base_epiyearweek base_epiyear base_epiweek dado escala
0 11 2017 7 0.055377 Estado estimated 0.055377 0.055377 0.055377 0.055377 0.976 0.024 0.0 0.0 2017-10-11 2017W23 2017 23 srag incidência
1 11 2017 8 0.055377 Estado estimated 0.055377 0.055377 0.055377 0.110755 0.972 0.028 0.0 0.0 2017-10-11 2017W23 2017 23 srag incidência
2 11 2017 9 0.000000 Estado estimated 0.000000 0.000000 0.000000 0.000000 1.000 0.000 0.0 0.0 2017-10-11 2017W23 2017 23 srag incidência
3 11 2017 10 0.000000 Estado estimated 0.000000 0.000000 0.000000 0.055377 0.996 0.004 0.0 0.0 2017-10-11 2017W23 2017 23 srag incidência
4 11 2017 11 0.055377 Estado estimated 0.055377 0.055377 0.055377 0.110755 0.964 0.036 0.0 0.0 2017-10-11 2017W23 2017 23 srag incidência

In [21]:
migration_rules = {
    'UF': 'territory_id',
    'SRAG': 'value',
    'Tipo': 'territory_type',  # Not needed in the table
    'Situation': 'situation_id',
    '50%': 'median',
    '2.5%': 'ci_lower',
    '97.5%': 'ci_upper',
    'L0': 'low_level',
    'L1': 'epidemic_level',
    'L2': 'high_level',
    'L3': 'very_high_level',
    'Run date': 'run_date',
    'dado': 'dataset_id',  # or origin
    'escala': 'scale_id'
}

dfs[dataset].rename(
    columns=migration_rules, inplace=True
)
# apply categories
dfs[dataset].dataset_id = dfs[dataset].dataset_id.map(dataset_id)
dfs[dataset].scale_id = dfs[dataset].scale_id.map(scale_id)
dfs[dataset].situation_id = dfs[dataset].situation_id.map(situation_id)
region_indices = dfs[dataset].territory_id.isin([
    'BR', 'RegN', 'RegL', 'RegC', 'RegS'
])
dfs[dataset].loc[region_indices, 'territory_id'] = dfs[dataset].loc[
    region_indices, 'territory_id'
].map(region_id)
dfs[dataset].territory_id = dfs[dataset].territory_id.astype(int)

# remove unnecessary fields
dfs[dataset].drop(['territory_type'], axis=1, inplace=True)

dfs[dataset].head()


Out[21]:
territory_id epiyear epiweek value situation_id mean median ci_lower ci_upper low_level epidemic_level high_level very_high_level run_date base_epiyearweek base_epiyear base_epiweek dataset_id scale_id
0 11 2017 7 0.055377 2 0.055377 0.055377 0.055377 0.055377 0.976 0.024 0.0 0.0 2017-10-11 2017W23 2017 23 1 1
1 11 2017 8 0.055377 2 0.055377 0.055377 0.055377 0.110755 0.972 0.028 0.0 0.0 2017-10-11 2017W23 2017 23 1 1
2 11 2017 9 0.000000 2 0.000000 0.000000 0.000000 0.000000 1.000 0.000 0.0 0.0 2017-10-11 2017W23 2017 23 1 1
3 11 2017 10 0.000000 2 0.000000 0.000000 0.000000 0.055377 0.996 0.004 0.0 0.0 2017-10-11 2017W23 2017 23 1 1
4 11 2017 11 0.055377 2 0.055377 0.055377 0.055377 0.110755 0.964 0.036 0.0 0.0 2017-10-11 2017W23 2017 23 1 1

In [22]:
dfs[dataset].tail()


Out[22]:
territory_id epiyear epiweek value situation_id mean median ci_lower ci_upper low_level epidemic_level high_level very_high_level run_date base_epiyearweek base_epiyear base_epiweek dataset_id scale_id
20567 1004 2017 27 9.0 2 10.0 9.0 9.0 11.0 0.000 0.988 0.012 0.000 2017-10-11 2017W31 2017 31 3 2
20568 1004 2017 28 8.0 2 8.0 8.0 7.0 10.0 0.800 0.200 0.000 0.000 2017-10-11 2017W31 2017 31 3 2
20569 1004 2017 29 9.0 2 10.0 10.0 9.0 13.0 0.000 0.856 0.140 0.004 2017-10-11 2017W31 2017 31 3 2
20570 1004 2017 30 2.0 2 2.0 2.0 0.0 7.0 0.996 0.004 0.000 0.000 2017-10-11 2017W31 2017 31 3 2
20571 1004 2017 31 3.0 2 9.0 8.0 3.0 18.0 0.560 0.256 0.136 0.048 2017-10-11 2017W31 2017 31 3 2

In [23]:
dfs[dataset].dtypes


Out[23]:
territory_id          int64
epiyear               int64
epiweek               int64
value               float64
situation_id          int64
mean                float64
median              float64
ci_lower            float64
ci_upper            float64
low_level           float64
epidemic_level      float64
high_level          float64
very_high_level     float64
run_date             object
base_epiyearweek     object
base_epiyear          int64
base_epiweek          int64
dataset_id            int64
scale_id              int64
dtype: object

In [24]:
# primary_keys
pks[dataset] = [
    'dataset_id', 'scale_id', 'territory_id', 
    'base_epiyear', 'base_epiweek', 
    'epiyear', 'epiweek'
]

dfs[dataset].set_index(pks[dataset], inplace=True)
dfs[dataset].head()


Out[24]:
value situation_id mean median ci_lower ci_upper low_level epidemic_level high_level very_high_level run_date base_epiyearweek
dataset_id scale_id territory_id base_epiyear base_epiweek epiyear epiweek
1 1 11 2017 23 2017 7 0.055377 2 0.055377 0.055377 0.055377 0.055377 0.976 0.024 0.0 0.0 2017-10-11 2017W23
8 0.055377 2 0.055377 0.055377 0.055377 0.110755 0.972 0.028 0.0 0.0 2017-10-11 2017W23
9 0.000000 2 0.000000 0.000000 0.000000 0.000000 1.000 0.000 0.0 0.0 2017-10-11 2017W23
10 0.000000 2 0.000000 0.000000 0.000000 0.055377 0.996 0.004 0.0 0.0 2017-10-11 2017W23
11 0.055377 2 0.055377 0.055377 0.055377 0.110755 0.964 0.036 0.0 0.0 2017-10-11 2017W23

4. clean_data_epiweek-weekly-incidence_w_situation


In [25]:
dataset = 'clean_data_epiweek-weekly-incidence_w_situation'

In [26]:
dfs[dataset].head()


Out[26]:
0-4 anos 10-19 anos 2-4 anos 20-29 anos 30-39 anos 40-49 anos 5-9 anos 50-59 anos 60+ anos < 2 anos DELAYED FLU_A FLU_B INCONCLUSIVE Idade desconhecida NEGATIVE NOTTESTED OTHERS POSITIVE_CASES SRAG Situation TESTING_IGNORED Tipo UF Unidade da Federação VSR dado epiweek epiyear epiyearweek escala sexo
0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 stable 0.0 Estado 11 Rondônia 0.0 srag 1 2009 2009W01 incidência F
1 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 stable 0.0 Estado 11 Rondônia 0.0 srag 1 2009 2009W01 incidência M
2 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 stable 0.0 Estado 11 Rondônia 0.0 srag 1 2009 2009W01 incidência Total
3 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 stable 0.0 Estado 11 Rondônia 0.0 srag 2 2009 2009W02 incidência F
4 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 stable 0.0 Estado 11 Rondônia 0.0 srag 2 2009 2009W02 incidência M

In [27]:
dfs[dataset].keys()


Out[27]:
Index(['0-4 anos', '10-19 anos', '2-4 anos', '20-29 anos', '30-39 anos',
       '40-49 anos', '5-9 anos', '50-59 anos', '60+ anos', '< 2 anos',
       'DELAYED', 'FLU_A', 'FLU_B', 'INCONCLUSIVE', 'Idade desconhecida',
       'NEGATIVE', 'NOTTESTED', 'OTHERS', 'POSITIVE_CASES', 'SRAG',
       'Situation', 'TESTING_IGNORED', 'Tipo', 'UF', 'Unidade da Federação',
       'VSR', 'dado', 'epiweek', 'epiyear', 'epiyearweek', 'escala', 'sexo'],
      dtype='object')

In [28]:
migration_rules = {
    '0-4 anos': 'years_0_4',
    '10-19 anos': 'years_10_19',
    '2-4 anos': 'years_2_4',
    '20-29 anos': 'years_20_29',
    '30-39 anos': 'years_30_39',
    '40-49 anos': 'years_40_49',
    '5-9 anos': 'years_5_9',
    '50-59 anos': 'years_50_59',
    '60+ anos': 'years_60_or_more',
    '< 2 anos': 'years_lt_2',
    'DELAYED': 'delayed',
    'FLU_A': 'flu_a',
    'FLU_B': 'flu_b',
    'INCONCLUSIVE': 'inconclusive',
    'Idade desconhecida': 'unknown_age',
    'NEGATIVE': 'negative',
    'NOTTESTED': 'not_tested',
    'OTHERS': 'others',
    'POSITIVE_CASES': 'positive_cases',
    'SRAG': 'value',
    'Situation': 'situation_id',
    'TESTING_IGNORED': 'testing_ignored',
    'Tipo': 'territory_type',  # Not needed in the table
    'UF': 'territory_id',
    'Unidade da Federação': 'state_country_name',  # Not needed in the table
    'VSR': 'vsr',
    'dado': 'dataset_id',
    'escala': 'scale_id',
    'sexo': 'gender'
}

dfs[dataset].rename(
    columns=migration_rules, inplace=True
)
# apply categories
dfs[dataset].dataset_id = dfs[dataset].dataset_id.map(dataset_id)
dfs[dataset].scale_id = dfs[dataset].scale_id.map(scale_id)
dfs[dataset].situation_id = dfs[dataset].situation_id.map(situation_id)
region_indices = dfs[dataset].territory_id.isin([
    'BR', 'RegN', 'RegL', 'RegC', 'RegS'
])
dfs[dataset].loc[region_indices, 'territory_id'] = dfs[dataset].loc[
    region_indices, 'territory_id'
].map(region_id)
dfs[dataset].territory_id = dfs[dataset].territory_id.astype(int)

# remove unnecessary fields
dfs[dataset].drop([
    'state_country_name', 'territory_type'
], axis=1, inplace=True)


dfs[dataset].head()


Out[28]:
years_0_4 years_10_19 years_2_4 years_20_29 years_30_39 years_40_49 years_5_9 years_50_59 years_60_or_more years_lt_2 delayed flu_a flu_b inconclusive unknown_age negative not_tested others positive_cases value situation_id testing_ignored territory_id vsr dataset_id epiweek epiyear epiyearweek scale_id gender
0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 3 0.0 11 0.0 1 1 2009 2009W01 1 F
1 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 3 0.0 11 0.0 1 1 2009 2009W01 1 M
2 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 3 0.0 11 0.0 1 1 2009 2009W01 1 Total
3 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 3 0.0 11 0.0 1 2 2009 2009W02 1 F
4 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 3 0.0 11 0.0 1 2 2009 2009W02 1 M

In [29]:
dfs[dataset].tail()


Out[29]:
years_0_4 years_10_19 years_2_4 years_20_29 years_30_39 years_40_49 years_5_9 years_50_59 years_60_or_more years_lt_2 delayed flu_a flu_b inconclusive unknown_age negative not_tested others positive_cases value situation_id testing_ignored territory_id vsr dataset_id epiweek epiyear epiyearweek scale_id gender
301723 NaN 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 2.0 1 0.0 1004 0.0 3 30 2017 2017W30 2 Total
301724 NaN 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1 0.0 1004 0.0 3 31 2017 2017W31 2 F
301725 NaN 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1 0.0 1004 0.0 3 31 2017 2017W31 2 I
301726 NaN 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 2.0 2.0 1 0.0 1004 0.0 3 31 2017 2017W31 2 M
301727 NaN 0.0 0.0 0.0 0.0 0.0 0.0 1.0 2.0 0.0 0.0 1.0 2.0 0.0 0.0 0.0 0.0 0.0 3.0 3.0 1 0.0 1004 0.0 3 31 2017 2017W31 2 Total

In [30]:
dfs[dataset].dtypes


Out[30]:
years_0_4           float64
years_10_19         float64
years_2_4           float64
years_20_29         float64
years_30_39         float64
years_40_49         float64
years_5_9           float64
years_50_59         float64
years_60_or_more    float64
years_lt_2          float64
delayed             float64
flu_a               float64
flu_b               float64
inconclusive        float64
unknown_age         float64
negative            float64
not_tested          float64
others              float64
positive_cases      float64
value               float64
situation_id          int64
testing_ignored     float64
territory_id          int64
vsr                 float64
dataset_id            int64
epiweek               int64
epiyear               int64
epiyearweek          object
scale_id              int64
gender               object
dtype: object

In [31]:
# primary_keys
pks[dataset] = ['dataset_id', 'scale_id', 'territory_id', 'epiyear', 'epiweek']

dfs[dataset].set_index(pks[dataset], inplace=True)
dfs[dataset].head()


Out[31]:
years_0_4 years_10_19 years_2_4 years_20_29 years_30_39 years_40_49 years_5_9 years_50_59 years_60_or_more years_lt_2 delayed flu_a flu_b inconclusive unknown_age negative not_tested others positive_cases value situation_id testing_ignored vsr epiyearweek gender
dataset_id scale_id territory_id epiyear epiweek
1 1 11 2009 1 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 3 0.0 0.0 2009W01 F
1 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 3 0.0 0.0 2009W01 M
1 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 3 0.0 0.0 2009W01 Total
2 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 3 0.0 0.0 2009W02 F
2 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 NaN 0.0 0.0 0.0 0.0 0.0 3 0.0 0.0 2009W02 M
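
Note that the head above already shows repeated index tuples: there is one row per gender (F, M, Total) for the same epiweek, so this composite key does not identify rows uniquely, and gender would have to be part of the key for a strict primary key. A quick check:

# hedged check: expected to print False here, since F/M/Total rows
# share the same (dataset, scale, territory, epiyear, epiweek)
print(dfs[dataset].index.is_unique)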

5. mem-report


In [32]:
dataset = 'mem-report'

In [33]:
dfs[dataset].head()


Out[33]:
UF População Média geométrica do pico de infecção das temporadas regulares região de baixa atividade típica limiar pré-epidêmico intensidade alta intensidade muito alta SE típica do início do surto SE típica do início do surto - IC inferior (2,5%) SE típica do início do surto - IC superior (97,5%) duração típica do surto duração típica do surto - IC inferior (2,5%) duração típica do surto - IC superior (97,5%) temporadas utilizadas para os corredores endêmicos ano Unidade da Federação Tipo dado escala
0 11 1805788 0.803458 0 0.103162 0.863224 1.168160 12.0 9.0 22.0 8.0 4.0 15.0 2010, 2013, 2014, 2016 2017 Rondônia Estado srag incidência
1 12 829619 0.953503 0 0.685939 1.783885 3.053363 24.0 5.0 49.0 6.0 3.0 16.0 2010, 2011, 2012, 2013, 2014, 2015, 2016 2017 Acre Estado srag incidência
2 13 4063614 0.119378 0 0.042171 0.165851 0.212193 12.0 4.0 25.0 12.0 6.0 13.0 2010, 2012, 2013, 2014, 2015, 2016 2017 Amazonas Estado srag incidência
3 14 522636 0.473615 0 0.414595 0.593964 0.801757 26.0 13.0 50.0 6.0 3.0 10.0 2012, 2013, 2014, 2015, 2016 2017 Roraima Estado srag incidência
4 15 8366628 0.256230 0 0.083865 0.466878 0.859727 12.0 10.0 18.0 10.0 5.0 13.0 2012, 2014, 2015, 2016 2017 Pará Estado srag incidência

In [34]:
dfs[dataset].keys()


Out[34]:
Index(['UF', 'População',
       'Média geométrica do pico de infecção das temporadas regulares',
       'região de baixa atividade típica', 'limiar pré-epidêmico',
       'intensidade alta', 'intensidade muito alta',
       'SE típica do início do surto',
       'SE típica do início do surto - IC inferior (2,5%)',
       'SE típica do início do surto - IC superior (97,5%)',
       'duração típica do surto',
       'duração típica do surto - IC inferior (2,5%)',
       'duração típica do surto - IC superior (97,5%)',
       'temporadas utilizadas para os corredores endêmicos', 'ano',
       'Unidade da Federação', 'Tipo', 'dado', 'escala'],
      dtype='object')

In [35]:
migration_rules = {
    'UF': 'territory_id',
    'População': 'population',
    'Média geométrica do pico de infecção das temporadas regulares': 'geom_average_peak',
    'região de baixa atividade típica': 'low_activity_region',
    'limiar pré-epidêmico': 'pre_epidemic_threshold',
    'intensidade alta': 'high_threshold',
    'intensidade muito alta': 'very_high_threshold',
    'SE típica do início do surto': 'epi_start',
    'SE típica do início do surto - IC inferior (2,5%)': 'epi_start_ci_lower',
    'SE típica do início do surto - IC superior (97,5%)': 'epi_start_ci_upper',
    'duração típica do surto': 'epi_duration',
    'duração típica do surto - IC inferior (2,5%)': 'epi_duration_ci_lower',
    'duração típica do surto - IC superior (97,5%)': 'epi_duration_ci_upper',
    'temporadas utilizadas para os corredores endêmicos': 'regular_seasons',
    'ano': 'year',
    'Unidade da Federação': 'state_country_name',  # Not needed in the table
    'Tipo': 'territory_type',  # Not needed in the table
    'dado': 'dataset_id',
    'escala': 'scale_id'
}

dfs[dataset].rename(
    columns=migration_rules, inplace=True
)
# apply categories
dfs[dataset].dataset_id = dfs[dataset].dataset_id.map(dataset_id)
dfs[dataset].scale_id = dfs[dataset].scale_id.map(scale_id)
region_indices = dfs[dataset].territory_id.isin([
    'BR', 'RegN', 'RegL', 'RegC', 'RegS'
])
dfs[dataset].loc[region_indices, 'territory_id'] = dfs[dataset].loc[
    region_indices, 'territory_id'
].map(region_id)
dfs[dataset].territory_id = dfs[dataset].territory_id.astype(int)

# remove unnecessary fields
dfs[dataset].drop([
    'state_country_name', 'territory_type'
], axis=1, inplace=True)

dfs[dataset].head()


Out[35]:
territory_id population geom_average_peak low_activity_region pre_epidemic_threshold high_threshold very_high_threshold epi_start epi_start_ci_lower epi_start_ci_upper epi_duration epi_duration_ci_lower epi_duration_ci_upper regular_seasons year dataset_id scale_id
0 11 1805788 0.803458 0 0.103162 0.863224 1.168160 12.0 9.0 22.0 8.0 4.0 15.0 2010, 2013, 2014, 2016 2017 1 1
1 12 829619 0.953503 0 0.685939 1.783885 3.053363 24.0 5.0 49.0 6.0 3.0 16.0 2010, 2011, 2012, 2013, 2014, 2015, 2016 2017 1 1
2 13 4063614 0.119378 0 0.042171 0.165851 0.212193 12.0 4.0 25.0 12.0 6.0 13.0 2010, 2012, 2013, 2014, 2015, 2016 2017 1 1
3 14 522636 0.473615 0 0.414595 0.593964 0.801757 26.0 13.0 50.0 6.0 3.0 10.0 2012, 2013, 2014, 2015, 2016 2017 1 1
4 15 8366628 0.256230 0 0.083865 0.466878 0.859727 12.0 10.0 18.0 10.0 5.0 13.0 2012, 2014, 2015, 2016 2017 1 1

In [36]:
dfs[dataset].tail()


Out[36]:
territory_id population geom_average_peak low_activity_region pre_epidemic_threshold high_threshold very_high_threshold epi_start epi_start_ci_lower epi_start_ci_upper epi_duration epi_duration_ci_lower epi_duration_ci_upper regular_seasons year dataset_id scale_id
187 0 207660929 26.289172 0 8.484101 49.118258 91.799762 16.0 5.0 21.0 12.0 11.0 15.0 2010, 2012, 2013, 2014, 2015 2017 3 2
188 1003 27644815 4.280383 0 1.046539 4.452711 5.507278 15.0 7.0 20.0 10.0 5.0 18.0 2010, 2012, 2013, 2014, 2015 2017 3 2
189 1002 67770757 2.771309 0 1.257426 3.756206 4.770510 16.0 8.0 30.0 8.0 3.0 17.0 2010, 2011, 2012, 2013, 2014, 2015 2017 3 2
190 1001 16386007 4.228626 0 1.182767 6.055162 7.565655 12.0 2.0 16.0 8.0 1.0 11.0 2010, 2012, 2013, 2014, 2015, 2016 2017 3 2
191 1004 95859350 6.935301 0 8.946847 11.831033 16.324451 25.0 17.0 33.0 10.0 7.0 14.0 2010, 2011, 2014, 2015 2017 3 2

In [37]:
dfs[dataset].dtypes


Out[37]:
territory_id                int64
population                  int64
geom_average_peak         float64
low_activity_region         int64
pre_epidemic_threshold    float64
high_threshold            float64
very_high_threshold       float64
epi_start                 float64
epi_start_ci_lower        float64
epi_start_ci_upper        float64
epi_duration              float64
epi_duration_ci_lower     float64
epi_duration_ci_upper     float64
regular_seasons            object
year                        int64
dataset_id                  int64
scale_id                    int64
dtype: object

In [38]:
# primary_keys
pks[dataset] = ['dataset_id', 'scale_id', 'territory_id', 'year']

dfs[dataset].set_index(pks[dataset], inplace=True)
dfs[dataset].head()


Out[38]:
population geom_average_peak low_activity_region pre_epidemic_threshold high_threshold very_high_threshold epi_start epi_start_ci_lower epi_start_ci_upper epi_duration epi_duration_ci_lower epi_duration_ci_upper regular_seasons
dataset_id scale_id territory_id year
1 1 11 2017 1805788 0.803458 0 0.103162 0.863224 1.168160 12.0 9.0 22.0 8.0 4.0 15.0 2010, 2013, 2014, 2016
12 2017 829619 0.953503 0 0.685939 1.783885 3.053363 24.0 5.0 49.0 6.0 3.0 16.0 2010, 2011, 2012, 2013, 2014, 2015, 2016
13 2017 4063614 0.119378 0 0.042171 0.165851 0.212193 12.0 4.0 25.0 12.0 6.0 13.0 2010, 2012, 2013, 2014, 2015, 2016
14 2017 522636 0.473615 0 0.414595 0.593964 0.801757 26.0 13.0 50.0 6.0 3.0 10.0 2012, 2013, 2014, 2015, 2016
15 2017 8366628 0.256230 0 0.083865 0.466878 0.859727 12.0 10.0 18.0 10.0 5.0 13.0 2012, 2014, 2015, 2016

6. mem-typical


In [39]:
dataset = 'mem-typical'

In [40]:
dfs[dataset].head()


Out[40]:
UF População epiweek corredor baixo corredor mediano corredor alto ano Unidade da Federação Tipo dado escala
0 11 1805788 1 0.0 0.013703 0.041109 2017 Rondônia Estado srag incidência
1 11 1805788 2 0.0 0.000000 0.000000 2017 Rondônia Estado srag incidência
2 11 1805788 3 0.0 0.013703 0.041109 2017 Rondônia Estado srag incidência
3 11 1805788 4 0.0 0.028790 0.087644 2017 Rondônia Estado srag incidência
4 11 1805788 5 0.0 0.000000 0.000000 2017 Rondônia Estado srag incidência

In [41]:
dfs[dataset].keys()


Out[41]:
Index(['UF', 'População', 'epiweek', 'corredor baixo', 'corredor mediano',
       'corredor alto', 'ano', 'Unidade da Federação', 'Tipo', 'dado',
       'escala'],
      dtype='object')

In [42]:
migration_rules = {
    'UF': 'territory_id',
    'População': 'population',
    'corredor baixo': 'low',
    'corredor mediano': 'median',
    'corredor alto': 'high',
    'ano': 'year',
    'Unidade da Federação': 'state_country_name',  # Not needed in the table
    'Tipo': 'territory_type',  # Not needed in the table
    'dado': 'dataset_id',
    'escala': 'scale_id'
}

dfs[dataset].rename(
    columns=migration_rules, inplace=True
)
# apply categories
dfs[dataset].dataset_id = dfs[dataset].dataset_id.map(dataset_id)
dfs[dataset].scale_id = dfs[dataset].scale_id.map(scale_id)
region_indices = dfs[dataset].territory_id.isin([
    'BR', 'RegN', 'RegL', 'RegC', 'RegS'
])
dfs[dataset].loc[region_indices, 'territory_id'] = dfs[dataset].loc[
    region_indices, 'territory_id'
].map(region_id)
dfs[dataset].territory_id = dfs[dataset].territory_id.astype(int)

# remove unnecessary fields
dfs[dataset].drop([
    'state_country_name', 'territory_type'
], axis=1, inplace=True)

dfs[dataset].head()


Out[42]:
territory_id population epiweek low median high year dataset_id scale_id
0 11 1805788 1 0.0 0.013703 0.041109 2017 1 1
1 11 1805788 2 0.0 0.000000 0.000000 2017 1 1
2 11 1805788 3 0.0 0.013703 0.041109 2017 1 1
3 11 1805788 4 0.0 0.028790 0.087644 2017 1 1
4 11 1805788 5 0.0 0.000000 0.000000 2017 1 1

In [43]:
dfs[dataset].tail()


Out[43]:
territory_id population epiweek low median high year dataset_id scale_id
9979 1004 95859350 48 0.000000 0.506813 1.500926 2017 3 2
9980 1004 95859350 49 0.194681 1.032562 1.871175 2017 3 2
9981 1004 95859350 50 0.208623 1.026247 1.844569 2017 3 2
9982 1004 95859350 51 0.269861 0.779518 1.289445 2017 3 2
9983 1004 95859350 52 0.000000 0.000000 0.000000 2017 3 2

In [44]:
dfs[dataset].dtypes


Out[44]:
territory_id      int64
population        int64
epiweek           int64
low             float64
median          float64
high            float64
year              int64
dataset_id        int64
scale_id          int64
dtype: object

In [45]:
# primary_keys
pks[dataset] = ['dataset_id', 'scale_id', 'territory_id', 'year', 'epiweek']

dfs[dataset].set_index(pks[dataset], inplace=True)
dfs[dataset].head()


Out[45]:
population low median high
dataset_id scale_id territory_id year epiweek
1 1 11 2017 1 1805788 0.0 0.013703 0.041109
2 1805788 0.0 0.000000 0.000000
3 1805788 0.0 0.013703 0.041109
4 1805788 0.0 0.028790 0.087644
5 1805788 0.0 0.000000 0.000000

7. SQL Migration


In [46]:
# SQL type overrides for columns that should not use the default
# pandas-to-SQL type mapping
exception_type_field = {
    'run_date': 'DATE'
}
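
exception_type_field is not used by the migration loop below; a hedged sketch of how it could be wired into to_sql through its dtype argument (the sql_types lookup is an assumption, not from the original notebook):

# hedged sketch: translate the type names above into SQLAlchemy types,
# to be passed along as to_sql(..., dtype=dtype_map)
sql_types = {'DATE': sqla.types.Date}
dtype_map = {
    column: sql_types[type_name]()
    for column, type_name in exception_type_field.items()
}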

In [48]:
# creating dataset dataframe
df_dataset = pd.DataFrame({
    'id': list(dataset_id.values()), 
    'name': list(dataset_id.keys())
}).set_index('id')

dfs['dataset'] = df_dataset

df_dataset


Out[48]:
name
id
1 srag
2 sragflu
3 obitoflu

In [49]:
# creating situation dataframe
df_situation = pd.DataFrame({
    'id': list(situation_id.values()), 
    'name': list(situation_id.keys())
}).set_index('id')

dfs['situation'] = df_situation

df_situation


Out[49]:
name
id
1 unknown
2 estimated
3 stable
4 incomplete

In [50]:
# creating scale dataframe
df_scale = pd.DataFrame({
    'id': list(scale_id.values()), 
    'name': list(scale_id.keys())
}).set_index('id')

dfs['scale'] = df_scale

df_scale


Out[50]:
name
id
1 incidência
2 casos

In [51]:
# creating territory_type dataframe
df_territory_type = pd.DataFrame({
    'id': list(territory_type_id.values()), 
    'name': list(territory_type_id.keys())
}).set_index('id')

dfs['territory_type'] = df_territory_type

df_territory_type


Out[51]:
name
id
1 Estado
2 Regional
3 Região
4 País

In [52]:
dfs['territory'] = df_territory

df_territory.head()


Out[52]:
initials name territory_type_id
id
11 RO Rondônia 1
12 AC Acre 1
13 AM Amazonas 1
14 RR Roraima 1
15 PA Pará 1

In [53]:
dsn = 'postgresql://%(USER)s:%(PASSWORD)s@%(HOST)s/%(NAME)s'
engine = sqla.create_engine(dsn % settings.DATABASE)

for k, df in dfs.items():
    # replace hyphens so the table name is a valid SQL identifier
    k_new = k.replace('-', '_')

    print('Migrating %s ...' % k_new)
    df.to_sql(
        k_new, engine, if_exists='replace',
        chunksize=2048
    )


Migrating mem_report ...
Migrating current_estimated_values ...
Migrating clean_data_epiweek_weekly_incidence_w_situation ...
Migrating historical_estimated_values ...
Migrating mem_typical ...
Migrating dataset ...
Migrating situation ...
Migrating scale ...
Migrating territory_type ...
Migrating territory ...
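
to_sql with if_exists='replace' recreates each table without primary key constraints, so the keys collected in pks could be declared afterwards. A hedged sketch, assuming the era-appropriate SQLAlchemy API where engine.execute accepts raw SQL (and noting that the clean_data table would first need gender added to its key, per section 4):

# hedged sketch: declare the composite primary keys gathered in pks
for k, key_columns in pks.items():
    table_name = k.replace('-', '_')
    engine.execute(
        'ALTER TABLE %s ADD PRIMARY KEY (%s)'
        % (table_name, ', '.join(key_columns))
    )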