In [1]:
# -*- coding: UTF-8 -*-

# Render our plots inline
%matplotlib inline 

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import re

In [2]:
# Data source: downloaded from the OECD and cleaned.
# Dataset: Demographic Statistics_TL3, regional typology (urban=1 / intermediate=2 / rural=3,4,5)

In [3]:
# Read the exported CSV into a DataFrame; the file is decoded as UTF-8.
data = pd.read_csv(
    "5ce591ed-8cc2-4176-8eb7-69d10501b672.csv",
    encoding="utf-8",
)

In [4]:
# Preview the first two rows of the loaded table.
data.head(2)


Out[4]:
country region
0 AUS: Australia AU105: Sydney, NSW
1 NaN AU110: Hunter, NSW

In [5]:
# Inspect the first cell (first row, first column) with a single positional
# lookup. The chained form `data.loc[0][0]` depends on integer-position
# fallback on a string-labelled Series, which newer pandas deprecates.
data.iloc[0, 0]


Out[5]:
u'AUS: Australia'

In [6]:
# Empty lookup tables, filled while parsing the rows of `data`:
#   countries: country name -> country code
#   regions:   region code  -> {"name", "code", "country"} record
countries, regions = dict(), dict()

In [7]:
# Fill `countries` (name -> code) and `regions` (code -> record) from `data`.
#
# The country column is only populated on some rows (the second row in the
# preview above is NaN), so the most recently seen country is carried forward
# and attached to every region row that follows it.
#
# Changes relative to the previous version of this cell:
#   * country codes no longer keep their trailing colon ("AUS:" -> "AUS"),
#     matching how region codes are stored;
#   * columns are read by explicit position (`iloc`) instead of relying on
#     integer fallback indexing of a label-indexed row;
#   * the Python-2-only `unicode()` call is gone, so the cell also runs on
#     Python 3 (values read with encoding="utf-8" are already text);
#   * a file whose first row has no country now fails with a clear message
#     instead of a NameError.
current_country_code = None
for country_field, region_field in zip(data.iloc[:, 0], data.iloc[:, 1]):
    if pd.notnull(country_field):
        # e.g. "AUS: Australia" -> code "AUS", name "Australia"
        raw_code, current_country_name = country_field.split(" ", 1)
        current_country_code = raw_code.rstrip(":")
        countries[current_country_name] = current_country_code
    if current_country_code is None:
        raise ValueError("first row of data has no country; cannot assign a country to its region")

    # e.g. "AU105: Sydney, NSW" -> code "AU105", name "Sydney, NSW"
    raw_region_code, current_region_name = region_field.split(": ", 1)
    # The original cell carried the note "FI194" at this point -- presumably a
    # region that needed special text handling; TODO confirm it still parses.
    # Codes are normalised by dropping any embedded spaces.
    current_region_code = raw_region_code.replace(" ", "")
    regions[current_region_code] = {
        "name": current_region_name,
        "code": current_region_code,
        "country": current_country_code,
    }

In [8]:
import json

# Serialise the region lookup table and write it to regions.json in the
# current working directory.
with open('regions.json', 'w') as out_file:
    out_file.write(json.dumps(regions))

In [ ]: