In [1]:
import numpy as np, requests, zipfile, StringIO, pandas as pd, json, copy

In [2]:
#data source WHO Mortality database
#http://www.who.int/healthinfo/statistics/mortality_rawdata/en/

In [3]:
# you might want to skip this cell, depending on your system configuration
import sys  
reload(sys)  
sys.setdefaultencoding('utf8')

In [4]:
#read country codes
z = zipfile.ZipFile('country_codes.zip') 
cc = pd.read_csv(z.open('country_codes'),low_memory=False).set_index('country')

In [5]:
def part1():
    #read data
    z = zipfile.ZipFile('morticd10_part1.zip') 
    df = pd.read_csv(z.open('Morticd10_part1'),low_memory=False)

    #filter out non-country (subdivision) data
    df = df[np.isnan(df['Admin1'])]
    df = df[np.isnan(df['SubDiv'])]
    df = df.drop(['Admin1','SubDiv'],axis=1)

    #filter out data after ICD10 classification
    df = df[df['List']=='104']
    df = df.drop(['List'],axis=1)

    #filter out non-detailed mortality
    #df = df[df['Frmat']==1]
    #df = df.drop(['Frmat'],axis=1)

    #filter out detailed infant mortality
    #df = df[df['IM_Frmat']==8]
    df = df.drop(['IM_Frmat'],axis=1)

    df=df.drop(['IM_Deaths1'],axis=1).\
        drop(['IM_Deaths2'],axis=1).\
        drop(['IM_Deaths3'],axis=1).\
        drop(['IM_Deaths4'],axis=1)
    return df

In [6]:
def part2():
    #read data
    z = zipfile.ZipFile('morticd10_part2.zip') 
    df2 = pd.read_csv(z.open('Morticd10_part2'),low_memory=False)

    #filter out non-country (subdivision) data
    df2 = df2[np.isnan(df2['Admin1'])]
    df2 = df2[df2['SubDiv']!='A30']
    df2 = df2.drop(['Admin1','SubDiv'],axis=1)

    #filter out data after ICD10 classification
    df2 = df2[df2['List']=='104']
    df2 = df2.drop(['List'],axis=1)

    #filter out non-detailed mortality
    #df2 = df2[df2['Frmat']==1]
    #df2 = df2.drop(['Frmat'],axis=1)

    #filter out detailed infant mortality
    #df2 = df2[df2['IM_Frmat']==8]
    df2 = df2.drop(['IM_Frmat'],axis=1)

    df2=df2.drop(['IM_Deaths1'],axis=1).\
        drop(['IM_Deaths2'],axis=1).\
        drop(['IM_Deaths3'],axis=1).\
        drop(['IM_Deaths4'],axis=1)
    return df2

In [7]:
def load_df(index):
    #merge the two dataframes
    df = pd.concat([part1(),part2()])
    #set index
    df = df.set_index(index)
    #set column names
    df.columns=['Deaths1']+range(5)+list(np.arange(1,20)*5)+['Deaths26']
    #normalize data
    dg0=df.loc[0].copy()
    dg1=df.loc[1].copy()
    dg1[90]=df.loc[1][85]*3/10.0
    dg1[95]=df.loc[1][85]*1/10.0
    dg1[85]=df.loc[2][85]*6/10.0
    dg2=df.loc[2].copy()
    dg2[90]=df.loc[2][85]*3/10.0
    dg2[95]=df.loc[2][85]*1/10.0
    dg2[85]=df.loc[2][85]*6/10.0
    dg2[2]=df.loc[2][1]*1/4.0
    dg2[3]=df.loc[2][1]*1/4.0
    dg2[4]=df.loc[2][1]*1/4.0
    dg2[1]=df.loc[2][1]*1/4.0
    return pd.concat([dg0,dg1,dg2])

In [8]:
#Load df for processing
df=load_df(['Frmat','Country','Year','Cause','Sex'])

In [9]:
countries=pd.read_html('http://www.geonames.org/countries/',header=0)[1]
countries.columns=['ISO2','ISO3','ISONUM','FIPS','Country','Capital','Area','Population','Continent']
countries.set_index('Country',drop=True,inplace=True)
countries=countries['ISONUM']
countries.head(5)


Out[9]:
Country
Andorra                  20
United Arab Emirates    784
Afghanistan               4
Antigua and Barbuda      28
Anguilla                660
Name: ISONUM, dtype: int64

In [10]:
ch={}
for i in countries.index:
    try: ch[countries.loc[i]]=i
    except: pass

In [11]:
def ccv(f):
    if f=='Russian Federation':return 'Russia'
    elif f=='Brunei Darussalam':return 'Brunei'
    elif f=='Reunion':return u'R\xc3\xa9union'
    elif f=='Saint Vincent and Grenadines':return 'Saint Vincent and the Grenadines'
    elif f=='United States of America':return 'United States'
    elif f=='Virgin Islands (USA)':return 'U.S. Virgin Islands'
    elif f=='Hong Kong SAR':return 'Hong Kong'
    elif f=='Republic of Korea':return 'South Korea'
    elif f=='Republic of Moldova':return 'Moldova'
    elif f=='Serbia and Montenegro, Former':return 'Serbia'
    else: return f

todrop=set()
for i in list(df.index.levels[0]):
    if ccv(cc.loc[i][0]) not in list(countries.index):
        print cc.loc[i][0]
        todrop.add(i)

In [12]:
#icd=pd.read_excel('icd.xlsx').set_index('code')
icd=pd.read_excel('icd_hun.xlsx',).set_index('code')

In [13]:
igroup=[1000,
1001,
1026,
1048,
1051,
1055,
1058,
1062,
1063,
1064,
1072,
1078,
1082,
1083,
1084,
1087,
1092,
1093,
1094,
1095,
2000]#protector dummy

In [14]:
#create hierarchy of diseases

#numbers first
hierarchy={}
currenti=0
for k in icd.T.iteritems():
    i=k[0]
    if i>igroup[currenti]: currenti+=1
    if igroup[currenti] not in hierarchy:
        hierarchy[igroup[currenti]]={'sub':{}}
    if i<igroup[currenti]:
        hierarchy[igroup[currenti-1]]['sub'][str(i)]={}
    if i not in hierarchy:hierarchy[i]={'parent':str(igroup[currenti-1])}
    hierarchy[i]['name']=k[1][1]
    hierarchy[i]['hun']=k[1][2]
        
hierarchy.pop(2000); #pop out protector dummy


Out[14]:
{'sub': {}}

In [15]:
# stringifiy all dictionary entries for JSON
for k in hierarchy.keys():
    hierarchy[str(k)]=hierarchy[k]
    hierarchy.pop(k);

In [16]:
#characters after
for i in icd.T.iteritems():
    try: 
        if np.isnan(i[1][0]): 
            hierarchy['AAA']={'parent':'1000'}
            #hierarchy[1000]['sub']['AAA']={'m':0,'f':0}
            hierarchy['1000']['sub']['AAA']={}
    except: 
        #only sub-groups, no major groups 
        a=False
        b=False
        w=str(i[0])
        if 'sub' in hierarchy[w]:
            if hierarchy[w]['sub']=={}: a=True
        if 'parent' in hierarchy[w]:
            b=True
        if (a or b):
            groups=i[1][0].split(',')
            for g in groups:
                if '-' in g:
                    first=g[:g.find('-')].strip()
                    second=g[g.find('-')+1:].strip()
                    if first[:1]==second[:1]:
                        for k in range(int(first[1:]),int(second[1:])+1):
                            disease = first[:1]+str(k).zfill(2)
                            hierarchy[disease]={'parent':w}
                    else: 
                        #character-break (A-B, X-Y etc) of category
                        for k in range(int(first[1:]),100):
                            disease = first[:1]+str(k).zfill(2)
                            hierarchy[disease]={'parent':w}
                        for k in range(1,int(second[1:])+1):
                            disease = second[:1]+str(k).zfill(2)
                            hierarchy[disease]={'parent':w}
                else:
                    hierarchy[g.strip()]={'parent':w}

In [17]:
#fill up the sub-hierarchies
for i in hierarchy:
    if 'parent' in hierarchy[i]:
        parent1=hierarchy[i]['parent']
        if 'parent' in hierarchy[parent1]:
            parent2=hierarchy[parent1]['parent']
            if 'sub' not in hierarchy[parent2]['sub'][parent1]:hierarchy[parent2]['sub'][parent1]['sub']={}
            hierarchy[parent2]['sub'][parent1]['sub'][i]={}
        else: 
            hierarchy[parent1]['sub'][i]={}

In [18]:
#group_getter
def get_group(i):
    if 'parent' in hierarchy[i]:
        parent1=hierarchy[i]['parent']
        if 'parent' in hierarchy[parent1]:
            return hierarchy[parent1]['parent']
        else: return parent1
    else: return i

In [19]:
#parent_getter
def get_parent(i):
    if 'parent' in hierarchy[i]:
        return hierarchy[i]['parent']
    else: return i

In [20]:
#read data for population
z = zipfile.ZipFile('Pop.zip') 
pop = pd.read_csv(z.open('pop'),low_memory=False)
pop = pop.set_index(['Frmat','Country','Year','Sex'])

In [21]:
#filter out non-country (subdivision) data
pop = pop[np.isnan(pop['Admin1'])]
pop = pop[pop['SubDiv']!='A30']
pop = pop[pop['SubDiv']!='A20']
pop = pop[pop['SubDiv']!='A35']
pop = pop[pop['SubDiv']!='A41']
pop = pop[pop['SubDiv']!='A51']
pop = pop[pop['SubDiv']!='A70']
pop = pop.drop(['Admin1','SubDiv'],axis=1)
pop = pop.drop(['Pop1','Pop26','Lb'],axis=1)
pop.columns=range(5)+list(np.arange(1,20)*5)

In [22]:
#normalize formatting
dr0=pop.loc[0].copy()
dr1=pop.loc[1].copy()
dr1[90]=pop.loc[1][85]*3/10.0
dr1[95]=pop.loc[1][85]*1/10.0
dr1[85]=pop.loc[1][85]*6/10.0
dr2=pop.loc[2].copy()
dr2[90]=pop.loc[2][85]*3/10.0
dr2[95]=pop.loc[2][85]*1/10.0
dr2[85]=pop.loc[2][85]*6/10.0
dr2[2]=pop.loc[2][1]*1/4.0
dr2[3]=pop.loc[2][1]*1/4.0
dr2[4]=pop.loc[2][1]*1/4.0
dr2[1]=pop.loc[2][1]*1/4.0
dr3=pop.loc[3].copy()
dr3[90]=pop.loc[3][75]*1/121.0
dr3[95]=pop.loc[3][75]*3/121.0
dr3[85]=pop.loc[3][75]*9/121.0
dr3[80]=pop.loc[3][75]*27/121.0
dr3[75]=pop.loc[3][75]*81/121.0
dr4=pop.loc[4].copy()
dr4[90]=pop.loc[4][75]*1/121.0
dr4[95]=pop.loc[4][75]*3/121.0
dr4[85]=pop.loc[4][75]*9/121.0
dr4[80]=pop.loc[4][75]*27/121.0
dr4[75]=pop.loc[4][75]*81/121.0
dr4[2]=pop.loc[4][1]*1/4.0
dr4[3]=pop.loc[4][1]*1/4.0
dr4[4]=pop.loc[4][1]*1/4.0
dr4[1]=pop.loc[4][1]*1/4.0
dr5=pop.loc[5].copy()
dr5[90]=pop.loc[5][70]*1/364.0
dr5[95]=pop.loc[5][70]*3/364.0
dr5[85]=pop.loc[5][70]*9/364.0
dr5[80]=pop.loc[5][70]*27/364.0
dr5[75]=pop.loc[5][70]*81/364.0
dr5[70]=pop.loc[5][70]*243/364.0
dr5[2]=pop.loc[5][1]*1/4.0
dr5[3]=pop.loc[5][1]*1/4.0
dr5[4]=pop.loc[5][1]*1/4.0
dr5[1]=pop.loc[5][1]*1/4.0
dr6=pop.loc[6].copy()
dr6[90]=pop.loc[6][65]*1/1093.0
dr6[95]=pop.loc[6][65]*3/1093.0
dr6[85]=pop.loc[6][65]*9/1093.0
dr6[80]=pop.loc[6][65]*27/1093.0
dr6[75]=pop.loc[6][65]*81/1093.0
dr6[70]=pop.loc[6][65]*243/1093.0
dr6[65]=pop.loc[6][65]*729/1093.0
dr6[2]=pop.loc[6][1]*1/4.0
dr6[3]=pop.loc[6][1]*1/4.0
dr6[4]=pop.loc[6][1]*1/4.0
dr6[1]=pop.loc[6][1]*1/4.0
dr7=pop.loc[7].copy()
dr7[2]=pop.loc[7][1]*1/4.0
dr7[3]=pop.loc[7][1]*1/4.0
dr7[4]=pop.loc[7][1]*1/4.0
dr7[1]=pop.loc[7][1]*1/4.0
dr7[10]=(pop.loc[7][5]+pop.loc[7][15])/4.0
dr7[20]=(pop.loc[7][15]+pop.loc[7][25])/4.0
dr7[30]=(pop.loc[7][25]+pop.loc[7][35])/4.0
dr7[40]=(pop.loc[7][35]+pop.loc[7][45])/4.0
dr7[50]=(pop.loc[7][45]+pop.loc[7][55])/4.0
dr7[60]=(pop.loc[7][55]+pop.loc[7][65])/4.0
dr7[70]=(pop.loc[7][65])/2.0
dr7[5]=pop.loc[7][5]/2.0
dr7[15]=pop.loc[7][15]/2.0
dr7[25]=pop.loc[7][25]/2.0
dr7[35]=pop.loc[7][35]/2.0
dr7[45]=pop.loc[7][45]/2.0
dr7[55]=pop.loc[7][55]/2.0
dr7[65]=pop.loc[7][65]/2.0
dr7[90]=pop.loc[7][75]*1/121.0
dr7[95]=pop.loc[7][75]*3/121.0
dr7[85]=pop.loc[7][75]*9/121.0
dr7[80]=pop.loc[7][75]*27/121.0
dr7[75]=pop.loc[7][75]*81/121.0
dr8=pop.loc[8].copy()
dr8[2]=pop.loc[8][1]*1/4.0
dr8[3]=pop.loc[8][1]*1/4.0
dr8[4]=pop.loc[8][1]*1/4.0
dr8[1]=pop.loc[8][1]*1/4.0
dr8[10]=(pop.loc[8][5]+pop.loc[7][15])/4.0
dr8[20]=(pop.loc[8][15]+pop.loc[7][25])/4.0
dr8[30]=(pop.loc[8][25]+pop.loc[7][35])/4.0
dr8[40]=(pop.loc[8][35]+pop.loc[7][45])/4.0
dr8[50]=(pop.loc[8][45]+pop.loc[7][55])/4.0
dr8[60]=(pop.loc[8][55])/2.0
dr8[5]=pop.loc[8][5]/2.0
dr8[15]=pop.loc[8][15]/2.0
dr8[25]=pop.loc[8][25]/2.0
dr8[35]=pop.loc[8][35]/2.0
dr8[45]=pop.loc[8][45]/2.0
dr8[55]=pop.loc[8][55]/2.0
dr8[90]=pop.loc[8][65]*1/1093.0
dr8[95]=pop.loc[8][65]*3/1093.0
dr8[85]=pop.loc[8][65]*9/1093.0
dr8[80]=pop.loc[8][65]*27/1093.0
dr8[75]=pop.loc[8][65]*81/1093.0
dr8[70]=pop.loc[8][65]*243/1093.0
dr8[65]=pop.loc[8][65]*729/1093.0

In [23]:
pop=pd.concat([dr0,dr1,dr2,dr3,dr4,dr5,dr6,dr7,dr8])

In [24]:
def ccw(c):
    if c=='Sao Tome and Principe':return u'S\xc3\xa3o Tom\xc3\xa9 and Pr\xc3\xadncipe'
    elif c=='Falkland Islands (Malvinas)':return 'Falkland Islands'
    elif c=='China: Province of Taiwan only':return 'Taiwan'
    elif c=='Iran (Islamic Republic of)':return 'Iran'
    elif c=='Syrian Arab Republic':return 'Syria'
    elif c=='TFYR Macedonia':return 'Macedonia' 
    else:return c

In [25]:
pp={'007':{}} #007 for world
for p in pop.T.iteritems():
    try:
        country_id=countries.loc[ccw(ccv(cc.loc[p[0][0]][0]))]
        if str(country_id) not in pp: pp[str(country_id)]={}
        if p[0][1]>1985: #no mortality data before that
            if str(p[0][1]) not in pp[str(country_id)]:pp[str(country_id)][str(p[0][1])]={}
            if str(p[0][1]) not in pp['007']:pp['007'][str(p[0][1])]={}
            if p[0][2]>1:g='f'
            else: g='m'
            if g not in pp[str(country_id)][str(p[0][1])]:pp[str(country_id)][str(p[0][1])][g]={}
            if g not in pp['007'][str(p[0][1])]:pp['007'][str(p[0][1])][g]={}
            for j in pop.columns:
                v=p[1][j]
                if np.isnan(p[1][j]):v=0
                pp[str(country_id)][str(p[0][1])][g][str(j)]=str(v)
                if str(j) not in pp['007'][str(p[0][1])][g]:pp['007'][str(p[0][1])][g][str(j)]=0
                pp['007'][str(p[0][1])][g][str(j)]+=v
    except: pass

In [26]:
#convert to str
for y in pp['007']:
    for g in pp['007'][y]:
        for a in pp['007'][y][g]:
            pp['007'][y][g][a]=str(pp['007'][y][g][a])

Alternative population counter


In [27]:
#load pop data from http://esa.un.org/unpd/wpp/DVD/Files/1_Indicators%20(Standard)/EXCEL_FILES/1_Population/WPP2015_POP_F15_2_ANNUAL_POPULATION_BY_AGE_MALE.XLS
wd=pd.read_excel('WPP2015_POP_F15_2_ANNUAL_POPULATION_BY_AGE_MALE.XLS',skiprows=16)
wd=wd.drop(['Major area, region, country or area *','Index','Variant','Notes','80+'],axis=1)
wd.columns=['Country','Year']+list(np.arange(21)*5)
wd=wd.set_index(['Country','Year'])

#load pop data from http://esa.un.org/unpd/wpp/DVD/Files/1_Indicators%20(Standard)/EXCEL_FILES/1_Population/WPP2015_POP_F15_2_ANNUAL_POPULATION_BY_AGE_MALE.XLS
wf=pd.read_excel('WPP2015_POP_F15_3_ANNUAL_POPULATION_BY_AGE_FEMALE.XLS',skiprows=16)
wf=wf.drop(['Major area, region, country or area *','Index','Variant','Notes','80+'],axis=1)
wf.columns=['Country','Year']+list(np.arange(21)*5)
wf=wf.set_index(['Country','Year'])

In [28]:
#fill missing years and aggregates
wd[2]=wd[0]*1/5.0
wd[3]=wd[0]*1/5.0
wd[4]=wd[0]*1/5.0
wd[1]=wd[0]*1/5.0
wd[0]=wd[0]*1/5.0
wd[95]=wd[95]+wd[100]
wd=wd.drop(100,axis=1)

#fill missing years and aggregates
wf[2]=wf[0]*1/5.0
wf[3]=wf[0]*1/5.0
wf[4]=wf[0]*1/5.0
wf[1]=wf[0]*1/5.0
wf[0]=wf[0]*1/5.0
wf[95]=wf[95]+wf[100]
wf=wf.drop(100,axis=1)

In [29]:
wdp={}
for p in wd.T.iteritems():
    try:
        country_id=p[0][0]
        if str(country_id) not in wdp: wdp[str(country_id)]={}
        if p[0][1]>1985: #no mortality data before that
            if str(p[0][1]) not in wdp[str(country_id)]:wdp[str(country_id)][str(p[0][1])]={}
            g='m'
            if g not in wdp[str(country_id)][str(p[0][1])]:wdp[str(country_id)][str(p[0][1])][g]={}
            for j in wd.columns:
                v=p[1][j]
                if np.isnan(p[1][j]):v=0
                wdp[str(country_id)][str(p[0][1])][g][str(j)]=str(v*1000.0)
    except: pass
    
for p in wf.T.iteritems():
    try:
        country_id=p[0][0]
        if str(country_id) not in wdp: wdp[str(country_id)]={}
        if p[0][1]>1985: #no mortality data before that
            if str(p[0][1]) not in wdp[str(country_id)]:wdp[str(country_id)][str(p[0][1])]={}
            g='f'
            if g not in wdp[str(country_id)][str(p[0][1])]:wdp[str(country_id)][str(p[0][1])][g]={}
            for j in wd.columns:
                v=p[1][j]
                if np.isnan(p[1][j]):v=0
                wdp[str(country_id)][str(p[0][1])][g][str(j)]=str(v*1000.0)
    except: pass

Data parser


In [ ]:
hierarchy2={}
c={}
for country in df.index.get_level_values('Country').unique():
    if ccv(cc.loc[country][0]) not in ['Netherlands Antilles',
                                       'United Kingdom, England and Wales',
                                       'United Kingdom, Scotland',
                                       'United Kingdom, Northern Ireland',
                                       'Rodrigues']:
        
        try:
            country_id=countries.loc[ccv(cc.loc[country][0])]            
            print country_id,ccv(cc.loc[country][0])
            c[country_id]=cc.loc[country][0]
            
            data={}
            data3=[]

            dk=df.loc[country].drop(['Deaths1','Deaths26'],axis=1)
            dk.columns=range(5)+list(np.arange(1,20)*5) 

            for i in dk.stack().iteritems():
                if i[0][2]>1:gender='f'
                else: gender='m'
                cause=i[0][1].strip()
                if cause not in 'AAA':
                    key='A'+str(i[0][3])+'C'+str(cause)+'T'+str(i[0][0])
                    if key not in data: data[key]={}
                    data[key]['a']=i[0][3]
                    data[key]['c']=cause
                    data[key]['t']=i[0][0]
                    data[key]['s']=i[1]
                    data[key]['g']=gender
                    if cause not in hierarchy2: hierarchy2[cause]={}
                    cause2=cause[:3]
                    hierarchy2[cause]["cause2"]=cause2
                    hierarchy2[cause]["parent"]=get_parent(cause2)
                    hierarchy2[cause]["group"]=get_group(cause2)
                    data3.append(data[key])
                        
            file('db/data.json','w').write(json.dumps(data3))  
            try:
                import zlib
                compression = zipfile.ZIP_DEFLATED
            except:
                compression = zipfile.ZIP_STORED
            zf = zipfile.ZipFile('db/'+str(country_id)+'.zip', mode='w')
            zf.write('db/data.json','data.json',compress_type=compression)
            zf.close()
            
        except: print 'error',country_id,ccv(cc.loc[country][0])

In [563]:
file('hierarchy.json','w').write(json.dumps(hierarchy2))  #only do once ever, dont overwrite!

In [76]:
hierarchy3={}
c={}
for country in df.index.get_level_values('Country').unique():
    if ccv(cc.loc[country][0]) not in ['Netherlands Antilles',
                                       'United Kingdom, England and Wales',
                                       'United Kingdom, Scotland',
                                       'United Kingdom, Northern Ireland',
                                       'Rodrigues']:
        
        try:
            country_id=countries.loc[ccv(cc.loc[country][0])]            
            print country_id,ccv(cc.loc[country][0])
            c[country_id]=cc.loc[country][0]
            
            data={}
            data3=[]

            dk=df.loc[country].drop(['Deaths1','Deaths26'],axis=1)
            dk.columns=range(5)+list(np.arange(1,20)*5) 

            for i in dk.stack().iteritems():
                if i[0][2]>1:gender='f'
                else: gender='m'
                cause=i[0][1].strip()
                if cause not in 'AAA':
                    cause=get_parent(i[0][1].strip()[:3])
                    key='A'+str(i[0][3])+'C'+str(cause)+'T'+str(i[0][0])+'G'+gender
                    if key not in data: data[key]={}
                    data[key]['a']=i[0][3]
                    data[key]['c']=cause
                    data[key]['g']=gender
                    data[key]['t']=i[0][0]
                    if 's' not in data[key]:data[key]['s']=0
                    data[key]['s']+=i[1]
                    if cause not in hierarchy3: hierarchy3[cause]={}
                    cause2=cause
                    hierarchy3[cause]["cause2"]=cause
                    hierarchy3[cause]["parent"]=cause
                    hierarchy3[cause]["group"]=get_group(cause2)
            
            for key in data:
                data3.append(data[key])
                        
            file('db2/data.json','w').write(json.dumps(data3))  
            try:
                import zlib
                compression = zipfile.ZIP_DEFLATED
            except:
                compression = zipfile.ZIP_STORED
            zf = zipfile.ZipFile('db2/'+str(country_id)+'.zip', mode='w')
            zf.write('db2/data.json','data.json',compress_type=compression)
            zf.close()
            
        except: pass#print 'error',country_id,ccv(cc.loc[country][0])

In [565]:
file('hierarchy2.json','w').write(json.dumps(hierarchy3))  #only do once ever, dont overwrite!

In [71]:
#run once!
#save country population for which there is modrtality data
mdata=['900']

for country in df.index.get_level_values('Country').unique():
    if ccv(cc.loc[country][0]) not in ['Netherlands Antilles',
                                       'United Kingdom, England and Wales',
                                       'United Kingdom, Scotland',
                                       'United Kingdom, Northern Ireland',
                                       'Rodrigues']:
        
        try:
            country_id=countries.loc[ccv(cc.loc[country][0])]            
            mdata.append(str(country_id)) #append country id to list of available countries
        except: print 'error',country_id,ccv(cc.loc[country][0])

for c in wdp.keys():
    if c not in mdata:
        wdp.pop(c);
file('pop.json','w').write(json.dumps(wdp))  #only do once ever, dont overwrite!
file('wpop.json','w').write(json.dumps(wdp))  #only do once ever, dont overwrite!

In [94]:
pp['32'].keys()


Out[94]:
['1986',
 '1987',
 '1993',
 '1992',
 '1995',
 '1994',
 '1990',
 '1996',
 '1988',
 '1989',
 '1991']

In [254]:
#get scaler of world vs. actual data
dr=[]
for y in range(1990,2005): #years with best overall data availability
    yr=str(y)
    for g in ['f','m']:
        for i in pp['007'][yr][g].keys():
            dr.append(float(pp['007'][yr][g][i])/float(wdp['900'][yr][g][i]))
wsc=np.array(dr).mean()

In [30]:
#Load df for global data
df=load_df(['Frmat','Sex', 'Year', 'Cause','Country'])
years=df.loc[1].index.get_level_values('Year').unique()

In [258]:
c={}
country_id=900 #world
with open("db2/data.json", "w") as data3: data3.write("")
with open("db2/data.json", "a") as data3: 
    data3.write("[")
    for y in range(2000,2014):#years:
        print y
        for g in [1,2]:
            try:
                dk=df.loc[g].loc[y].stack().unstack('Country').T.sum().unstack().drop(['Deaths1','Deaths26'],axis=1)
                dk.columns=range(5)+list(np.arange(1,20)*5) 
                data={}
                for i in dk.stack().iteritems():
                    if g>1:gender='f'
                    else: gender='m'
                    cause=i[0][0].strip()
                    if cause not in 'AAA':
                        #if i[1]>0:
                            cause=get_parent(i[0][0].strip()[:3])
                            key='A'+str(i[0][1])+'C'+str(cause)+'T'+str(y)+'G'+gender
                            if key not in data: data[key]={}
                            data[key]['a']=i[0][1]
                            data[key]['c']=cause
                            data[key]['g']=gender
                            data[key]['t']=str(y)
                            if 's' not in data[key]:data[key]['s']=0
                            data[key]['s']+=i[1]/wsc #multiply with global scaler
                for key in data:
                    data3.write(json.dumps(data[key])+',')    

            except: pass#print 'error',country_id,ccv(cc.loc[country][0])

In [259]:
#remove last comma character
#from here http://stackoverflow.com/questions/1877999/delete-final-line-in-file-via-python
with open("db2/data.json", "r+") as data3:
    #Move the pointer (similar to a cursor in a text editor) to the end of the file. 
    data3.seek(0, os.SEEK_END)

    #This code means the following code skips the very last character in the file - 
    #i.e. in the case the last line is null we delete the last line 
    #and the penultimate one
    pos = data3.tell() - 1
    data3.seek(pos, os.SEEK_SET)
    data3.truncate()

    data3.close()
with open("db2/data.json", "a") as data3: data3.write("]")

In [260]:
#try this, if it freezes, then just zip manually
try:
    import zlib
    compression = zipfile.ZIP_DEFLATED
except:
    compression = zipfile.ZIP_STORED
zf = zipfile.ZipFile('db2/'+str(country_id)+'.zip', mode='w')
zf.write('db2/data.json','data.json',compress_type=compression)
zf.close()

In [261]:
c={}
country_id=900 #world
with open("db/data.json", "w") as data3: data3.write("")
with open("db/data.json", "a") as data3: 
    data3.write("[")
    for y in range(2001,2014):#years:
        print y
        for g in [1,2]:
            try:
                dk=df.loc[g].loc[y].stack().unstack('Country').T.sum().unstack().drop(['Deaths1','Deaths26'],axis=1)
                dk.columns=range(5)+list(np.arange(1,20)*5) 
                data={}
                for i in dk.stack().iteritems():
                    if g>1:gender='f'
                    else: gender='m'
                    cause=i[0][0].strip()
                    if cause not in 'AAA':
                        #if i[1]>0:
                            cause=i[0][0].strip()[:3]
                            key='A'+str(i[0][1])+'C'+str(cause)+'T'+str(y)+'G'+gender
                            if key not in data: data[key]={}
                            data[key]['a']=i[0][1]
                            data[key]['c']=cause
                            data[key]['g']=gender
                            data[key]['t']=str(y)
                            if 's' not in data[key]:data[key]['s']=0
                            data[key]['s']+=i[1]/wsc #multiply with global scaler
                for key in data:
                    data3.write(json.dumps(data[key])+',')    

            except: pass#print 'error',country_id,ccv(cc.loc[country][0])

In [262]:
#remove last comma character
#from here http://stackoverflow.com/questions/1877999/delete-final-line-in-file-via-python
with open("db/data.json", "r+") as data3:
    #Move the pointer (similar to a cursor in a text editor) to the end of the file. 
    data3.seek(0, os.SEEK_END)

    #This code means the following code skips the very last character in the file - 
    #i.e. in the case the last line is null we delete the last line 
    #and the penultimate one
    pos = data3.tell() - 1
    data3.seek(pos, os.SEEK_SET)
    data3.truncate()

    data3.close()
with open("db/data.json", "a") as data3: data3.write("]")

In [263]:
#try this, if it freezes, then just zip manually
try:
    import zlib
    compression = zipfile.ZIP_DEFLATED
except:
    compression = zipfile.ZIP_STORED
zf = zipfile.ZipFile('db/'+str(country_id)+'.zip', mode='w')
zf.write('db/data.json','data.json',compress_type=compression)
zf.close()

In [217]:
#pretty country nams, EN+HUN
cnames=json.loads(file("../szekelyfold lakossag 2/cnames.json").read())
hnames=json.loads(file("../szekelyfold lakossag 2/hnames.json").read())

In [218]:
#need ot run only once
def hun(c):
    if c=='Antigua and Barbuda': return u'Antigua és Barbuda'
    elif c=='Bahamas': return u'Bahamák'
    elif c=='British Virgin Islands': return u'Brit Virgin-szigetek'
    elif c=='Cayman Islands': return u'Kajmán-szigetek'
    elif c=='Dominica': return u'Dominika'
    elif c==u'R\xc3\xa9union': return u'Réunion'
    elif c=='French Guiana': return u'Francia Guyana'
    elif c=='Saint Kitts and Nevis': return u'Saint Kitts és Nevis'
    elif c=='Saint Vincent and the Grenadines': return u'Szent Vincent és a Grenadine-szigetek'
    elif c=='Turks and Caicos Islands': return u'Turks és Caicos-szigetek'
    elif c=='U.S. Virgin Islands': return u'U.S. Virgin-szigetek'
    elif c=='Saint Pierre and Miquelon': return u'Saint Pierre és Miquelon'
    elif c=='World': return u'Egész Világ'
    else: return c
    
for country in df.index.get_level_values('Country').unique():
    try:
        country_id=countries.loc[ccv(cc.loc[country][0])]    
        if str(country_id) not in cnames:
            cnames[str(country_id)]=ccv(cc.loc[country][0])
        if ccv(cc.loc[country][0]) not in hnames:
            print repr(ccv(cc.loc[country][0]))
            hnames[ccv(cc.loc[country][0])]=hun(ccv(cc.loc[country][0]))
    except:pass
    
cnames[u'900']=u"World"
hnames[u'World']=u"Egész Világ"
file('cnames.json','w').write(json.dumps(cnames))  
file('hnames.json','w').write(json.dumps(hnames))

In [219]:
#pretty country nams, EN+HUN
cnames=json.loads(file("cnames.json").read())
hnames=json.loads(file("hnames.json").read())

In [220]:
def hc(c):
    if c=="Reunion": return u'R\xc3\xa9union'
    if c=='Saint Vincent and Grenadines': return u'Saint Vincent and the Grenadines'
    if c=='Serbia and Montenegro, Former': return "Serbia"
    if c=='United States of America': return "United States"
    if c=='Virgin Islands (USA)': return 'U.S. Virgin Islands'
    if c=='Hong Kong SAR': return 'Hong Kong'
    if c=='Republic of Moldova': return 'Moldova'
    if c=='Republic of Korea': return 'South Korea'
    return c

In [221]:
#only run if you haven't run country data parser
#recreate country list
c={}
for country in df.index.get_level_values('Country').unique():
    if ccv(cc.loc[country][0]) not in ['Netherlands Antilles',
                                       'United Kingdom, England and Wales',
                                       'United Kingdom, Scotland',
                                       'United Kingdom, Northern Ireland',
                                       'Rodrigues']:
            country_id=countries.loc[ccv(cc.loc[country][0])]            
            c[country_id]=cc.loc[country][0]

In [229]:
q={}
for i in c:
    q[hnames[hc(c[i])]]=i
e=[]
itera=q.keys()
itera.sort(cmp=locale.strcoll)
for i in itera:
    e.append(str(q[i]))
file('countries.json','w').write(json.dumps(['900']+e))

In [236]:
q={}
for i in c:
    q[hc(c[i])]=i
e=[]
itera=q.keys()
itera.sort(cmp=locale.strcoll)
for i in itera:
    e.append(str(q[i]))
file('wcountries.json','w').write(json.dumps(['900']+e))