In [1]:
import copy
import json
import locale  # needed by the countries.json cells (list.sort(cmp=locale.strcoll))
import os      # needed by the JSON-truncation cells (os.SEEK_END / os.SEEK_SET)
import StringIO
import zipfile

import numpy as np
import pandas as pd
import requests
In [2]:
#data source WHO Mortality database
#http://www.who.int/healthinfo/statistics/mortality_rawdata/en/
In [3]:
# You might want to skip this cell, depending on your system configuration.
# Python 2 only: force UTF-8 as the default codec so implicit str<->unicode
# conversions of accented country names do not raise UnicodeDecodeError.
import sys
reload(sys)
sys.setdefaultencoding('utf8')
In [4]:
# Read WHO country codes (numeric 'country' code -> name), indexed by code.
z = zipfile.ZipFile('country_codes.zip')
cc = pd.read_csv(z.open('country_codes'),low_memory=False).set_index('country')
In [5]:
def part1():
    """Load WHO ICD-10 mortality extract, part 1.

    Keeps only whole-country rows (no Admin1/SubDiv) classified with the
    condensed list 104, and drops the infant-mortality detail columns,
    which are unused downstream. 'Frmat' is kept: the age-format
    normalisation in load_df() needs it.
    """
    archive = zipfile.ZipFile('morticd10_part1.zip')
    deaths = pd.read_csv(archive.open('Morticd10_part1'), low_memory=False)
    # country-level rows only: subdivision fields are empty (NaN)
    deaths = deaths[np.isnan(deaths['Admin1'])]
    deaths = deaths[np.isnan(deaths['SubDiv'])]
    # keep the ICD-10 condensed list 104 classification only
    deaths = deaths[deaths['List'] == '104']
    deaths = deaths.drop(['Admin1', 'SubDiv', 'List'], axis=1)
    # drop the detailed infant-mortality breakdown
    deaths = deaths.drop(['IM_Frmat', 'IM_Deaths1', 'IM_Deaths2',
                          'IM_Deaths3', 'IM_Deaths4'], axis=1)
    return deaths
In [6]:
def part2():
    """Load WHO ICD-10 mortality extract, part 2.

    Same filtering as part1(), except the subdivision filter: here rows
    are excluded only when SubDiv equals 'A30' (NaN SubDiv rows pass the
    != test and are kept).
    """
    archive = zipfile.ZipFile('morticd10_part2.zip')
    deaths = pd.read_csv(archive.open('Morticd10_part2'), low_memory=False)
    # country-level rows only
    deaths = deaths[np.isnan(deaths['Admin1'])]
    deaths = deaths[deaths['SubDiv'] != 'A30']
    # keep the ICD-10 condensed list 104 classification only
    deaths = deaths[deaths['List'] == '104']
    deaths = deaths.drop(['Admin1', 'SubDiv', 'List'], axis=1)
    # drop the detailed infant-mortality breakdown
    deaths = deaths.drop(['IM_Frmat', 'IM_Deaths1', 'IM_Deaths2',
                          'IM_Deaths3', 'IM_Deaths4'], axis=1)
    return deaths
In [7]:
def load_df(index):
    """Concatenate both mortality extracts and normalise age formats.

    index: column names for the MultiIndex; 'Frmat' must come first so
    the per-format selection below works. Returns a frame indexed by the
    remaining levels, with columns 'Deaths1', single years 0-4, 5-year
    bins 5..95, and 'Deaths26'.
    """
    df = pd.concat([part1(), part2()])
    df = df.set_index(index)
    # Deaths2..Deaths25 relabelled as age-bin start years
    df.columns = ['Deaths1'] + range(5) + list(np.arange(1, 20) * 5) + ['Deaths26']
    # Frmat 0 already carries the full age detail
    dg0 = df.loc[0].copy()
    # Frmat 1: aggregated 85+ bin split 6:3:1 across 85/90/95
    dg1 = df.loc[1].copy()
    dg1[90] = df.loc[1][85] * 3 / 10.0
    dg1[95] = df.loc[1][85] * 1 / 10.0
    # FIX: this line previously read df.loc[2][85] (copy-paste from the
    # Frmat-2 section below), mixing format-2 rows into format-1 data.
    dg1[85] = df.loc[1][85] * 6 / 10.0
    # Frmat 2: 85+ split as above, plus the aggregated 1-4 bin quartered
    dg2 = df.loc[2].copy()
    dg2[90] = df.loc[2][85] * 3 / 10.0
    dg2[95] = df.loc[2][85] * 1 / 10.0
    dg2[85] = df.loc[2][85] * 6 / 10.0
    dg2[2] = df.loc[2][1] * 1 / 4.0
    dg2[3] = df.loc[2][1] * 1 / 4.0
    dg2[4] = df.loc[2][1] * 1 / 4.0
    dg2[1] = df.loc[2][1] * 1 / 4.0
    return pd.concat([dg0, dg1, dg2])
In [8]:
# Load the mortality frame indexed for per-country processing.
df=load_df(['Frmat','Country','Year','Cause','Sex'])
In [9]:
# Scrape the geonames country table to map country names -> ISO numeric
# codes. NOTE(review): depends on table #1 on the page staying in place.
countries=pd.read_html('http://www.geonames.org/countries/',header=0)[1]
countries.columns=['ISO2','ISO3','ISONUM','FIPS','Country','Capital','Area','Population','Continent']
countries.set_index('Country',drop=True,inplace=True)
countries=countries['ISONUM']
countries.head(5)
Out[9]:
In [10]:
# Reverse lookup: ISO numeric code -> country name.
ch={}
for i in countries.index:
    # bare except: best-effort, skips entries whose lookup fails
    try: ch[countries.loc[i]]=i
    except: pass
In [11]:
def ccv(f):
    """Translate a WHO country name to its geonames spelling.

    Returns the input unchanged when no special-case mapping applies.
    """
    renames = {
        'Russian Federation': 'Russia',
        'Brunei Darussalam': 'Brunei',
        'Reunion': u'R\xc3\xa9union',
        'Saint Vincent and Grenadines': 'Saint Vincent and the Grenadines',
        'United States of America': 'United States',
        'Virgin Islands (USA)': 'U.S. Virgin Islands',
        'Hong Kong SAR': 'Hong Kong',
        'Republic of Korea': 'South Korea',
        'Republic of Moldova': 'Moldova',
        'Serbia and Montenegro, Former': 'Serbia',
    }
    return renames.get(f, f)
# Collect WHO country codes with no geonames match; print the misses.
todrop=set()
for i in list(df.index.levels[0]):
    if ccv(cc.loc[i][0]) not in list(countries.index):
        print cc.loc[i][0]
        todrop.add(i)
In [12]:
#icd=pd.read_excel('icd.xlsx').set_index('code')
# ICD condensed-list lookup table (Hungarian edition), indexed by code.
icd=pd.read_excel('icd_hun.xlsx',).set_index('code')
In [13]:
# Top-level ICD condensed-list group boundary codes: a code c belongs to
# group igroup[k] while igroup[k] <= c < igroup[k+1].
igroup=[1000,
        1001,
        1026,
        1048,
        1051,
        1055,
        1058,
        1062,
        1063,
        1064,
        1072,
        1078,
        1082,
        1083,
        1084,
        1087,
        1092,
        1093,
        1094,
        1095,
        2000]  # sentinel ("protector dummy") so the scan never walks off the end
In [14]:
#create hierarchy of diseases -- numeric condensed-list codes first.
# hierarchy[group]['sub'][code] nests subgroup codes under their group;
# hierarchy[code]['parent'] points back at the group (keys stringified later).
# NOTE(review): indentation reconstructed from a flattened export -- verify.
hierarchy={}
currenti=0
for k in icd.T.iteritems():  # k = (code, column Series)
    i=k[0]
    if i>igroup[currenti]: currenti+=1
    if igroup[currenti] not in hierarchy:
        hierarchy[igroup[currenti]]={'sub':{}}
    if i<igroup[currenti]:
        # i sits between two boundaries -> subgroup of the previous group
        hierarchy[igroup[currenti-1]]['sub'][str(i)]={}
        if i not in hierarchy:hierarchy[i]={'parent':str(igroup[currenti-1])}
    hierarchy[i]['name']=k[1][1]  # English name column
    hierarchy[i]['hun']=k[1][2]   # Hungarian name column
hierarchy.pop(2000); #pop out the sentinel dummy
Out[14]:
In [15]:
# Stringify all dictionary keys for JSON serialisation.
# NOTE(review): safe only while every key is an int -- for a str key,
# str(k) == k, so the pop() below would delete the entry it just copied.
for k in hierarchy.keys():
    hierarchy[str(k)]=hierarchy[k]
    hierarchy.pop(k);
In [16]:
# Character-coded ICD ranges (e.g. 'A00-B99') assigned to their groups.
# The try/except is control flow: np.isnan() raises TypeError on string
# cells, so the except branch handles every real range cell.
for i in icd.T.iteritems():
    try:
        if np.isnan(i[1][0]):
            # blank range cell: register the all-cause code under group 1000
            hierarchy['AAA']={'parent':'1000'}
            #hierarchy[1000]['sub']['AAA']={'m':0,'f':0}
            hierarchy['1000']['sub']['AAA']={}
    except:
        #only sub-groups, no major groups
        a=False
        b=False
        w=str(i[0])
        if 'sub' in hierarchy[w]:
            if hierarchy[w]['sub']=={}: a=True
        if 'parent' in hierarchy[w]:
            b=True
        if (a or b):
            groups=i[1][0].split(',')
            for g in groups:
                if '-' in g:
                    first=g[:g.find('-')].strip()
                    second=g[g.find('-')+1:].strip()
                    if first[:1]==second[:1]:
                        # same letter: enumerate e.g. A00..A09
                        for k in range(int(first[1:]),int(second[1:])+1):
                            disease = first[:1]+str(k).zfill(2)
                            hierarchy[disease]={'parent':w}
                    else:
                        #character-break (A-B, X-Y etc) of category
                        for k in range(int(first[1:]),100):
                            disease = first[:1]+str(k).zfill(2)
                            hierarchy[disease]={'parent':w}
                        # NOTE(review): starts at 1, so e.g. B00 of an
                        # A..-B.. range is never registered -- verify
                        for k in range(1,int(second[1:])+1):
                            disease = second[:1]+str(k).zfill(2)
                            hierarchy[disease]={'parent':w}
                else:
                    # single code, no range
                    hierarchy[g.strip()]={'parent':w}
In [17]:
# Materialise the nested 'sub' trees from the flat parent links.
for i in hierarchy:
    if 'parent' in hierarchy[i]:
        parent1=hierarchy[i]['parent']
        if 'parent' in hierarchy[parent1]:
            # grandparent exists: file i under group -> subgroup -> i
            parent2=hierarchy[parent1]['parent']
            if 'sub' not in hierarchy[parent2]['sub'][parent1]:hierarchy[parent2]['sub'][parent1]['sub']={}
            hierarchy[parent2]['sub'][parent1]['sub'][i]={}
        else:
            hierarchy[parent1]['sub'][i]={}
In [18]:
#group_getter
def get_group(i, h=None):
    """Return the top-level group code for disease code ``i``.

    Follows up to two 'parent' links (code -> subgroup -> group).
    ``h`` is the hierarchy mapping; defaults to the module-level
    ``hierarchy`` (parameterised for testing/reuse -- backward compatible).
    """
    if h is None:
        h = hierarchy
    if 'parent' in h[i]:
        parent1 = h[i]['parent']
        if 'parent' in h[parent1]:
            return h[parent1]['parent']
        return parent1
    return i
In [19]:
#parent_getter
def get_parent(i, h=None):
    """Return the direct parent code of ``i``, or ``i`` itself for roots.

    ``h`` is the hierarchy mapping; defaults to the module-level
    ``hierarchy`` (parameterised for testing/reuse -- backward compatible).
    """
    if h is None:
        h = hierarchy
    if 'parent' in h[i]:
        return h[i]['parent']
    return i
In [20]:
# Read population data, indexed like the mortality frame (minus Cause).
z = zipfile.ZipFile('Pop.zip')
pop = pd.read_csv(z.open('pop'),low_memory=False)
pop = pop.set_index(['Frmat','Country','Year','Sex'])
In [21]:
# Keep whole-country population rows only.
pop = pop[np.isnan(pop['Admin1'])]
# Drop the known subdivision codes one by one; rows with NaN SubDiv pass
# every != test and are therefore kept.
for subdiv_code in ['A30', 'A20', 'A35', 'A41', 'A51', 'A70']:
    pop = pop[pop['SubDiv'] != subdiv_code]
pop = pop.drop(['Admin1', 'SubDiv'], axis=1)
pop = pop.drop(['Pop1', 'Pop26', 'Lb'], axis=1)
# Pop2..Pop25 relabelled as age-bin start years: 0-4 singly, then 5..95
pop.columns = range(5) + list(np.arange(1, 20) * 5)
In [22]:
# Normalise every population age format (Frmat 0-8) onto the detailed
# grid: single years 0-4, then 5-year bins 5..95. Conventions:
#  * an open-ended old-age bin is split with a ratio-3 geometric series
#    (weights ...81:27:9:3:1; denominators 121/364/1093 for longer tails);
#  * an aggregated 1-4 bin is split into four equal single years;
#  * 10-year bins (formats 7/8) are halved onto their two 5-year bins and
#    the missing in-between bins are averaged from the neighbours.
# All right-hand sides read the original `pop`, so statement order within
# a format block does not matter.
dr0=pop.loc[0].copy()  # Frmat 0: already fully detailed
# Frmat 1: 85+ aggregated
dr1=pop.loc[1].copy()
dr1[90]=pop.loc[1][85]*3/10.0
dr1[95]=pop.loc[1][85]*1/10.0
dr1[85]=pop.loc[1][85]*6/10.0
# Frmat 2: 85+ and 1-4 aggregated
dr2=pop.loc[2].copy()
dr2[90]=pop.loc[2][85]*3/10.0
dr2[95]=pop.loc[2][85]*1/10.0
dr2[85]=pop.loc[2][85]*6/10.0
dr2[2]=pop.loc[2][1]*1/4.0
dr2[3]=pop.loc[2][1]*1/4.0
dr2[4]=pop.loc[2][1]*1/4.0
dr2[1]=pop.loc[2][1]*1/4.0
# Frmat 3: 75+ aggregated
dr3=pop.loc[3].copy()
dr3[90]=pop.loc[3][75]*1/121.0
dr3[95]=pop.loc[3][75]*3/121.0
dr3[85]=pop.loc[3][75]*9/121.0
dr3[80]=pop.loc[3][75]*27/121.0
dr3[75]=pop.loc[3][75]*81/121.0
# Frmat 4: 75+ and 1-4 aggregated
dr4=pop.loc[4].copy()
dr4[90]=pop.loc[4][75]*1/121.0
dr4[95]=pop.loc[4][75]*3/121.0
dr4[85]=pop.loc[4][75]*9/121.0
dr4[80]=pop.loc[4][75]*27/121.0
dr4[75]=pop.loc[4][75]*81/121.0
dr4[2]=pop.loc[4][1]*1/4.0
dr4[3]=pop.loc[4][1]*1/4.0
dr4[4]=pop.loc[4][1]*1/4.0
dr4[1]=pop.loc[4][1]*1/4.0
# Frmat 5: 70+ and 1-4 aggregated
dr5=pop.loc[5].copy()
dr5[90]=pop.loc[5][70]*1/364.0
dr5[95]=pop.loc[5][70]*3/364.0
dr5[85]=pop.loc[5][70]*9/364.0
dr5[80]=pop.loc[5][70]*27/364.0
dr5[75]=pop.loc[5][70]*81/364.0
dr5[70]=pop.loc[5][70]*243/364.0
dr5[2]=pop.loc[5][1]*1/4.0
dr5[3]=pop.loc[5][1]*1/4.0
dr5[4]=pop.loc[5][1]*1/4.0
dr5[1]=pop.loc[5][1]*1/4.0
# Frmat 6: 65+ and 1-4 aggregated
dr6=pop.loc[6].copy()
dr6[90]=pop.loc[6][65]*1/1093.0
dr6[95]=pop.loc[6][65]*3/1093.0
dr6[85]=pop.loc[6][65]*9/1093.0
dr6[80]=pop.loc[6][65]*27/1093.0
dr6[75]=pop.loc[6][65]*81/1093.0
dr6[70]=pop.loc[6][65]*243/1093.0
dr6[65]=pop.loc[6][65]*729/1093.0
dr6[2]=pop.loc[6][1]*1/4.0
dr6[3]=pop.loc[6][1]*1/4.0
dr6[4]=pop.loc[6][1]*1/4.0
dr6[1]=pop.loc[6][1]*1/4.0
# Frmat 7: 10-year bins 5-14..65-74, 75+ aggregated, 1-4 aggregated
dr7=pop.loc[7].copy()
dr7[2]=pop.loc[7][1]*1/4.0
dr7[3]=pop.loc[7][1]*1/4.0
dr7[4]=pop.loc[7][1]*1/4.0
dr7[1]=pop.loc[7][1]*1/4.0
dr7[10]=(pop.loc[7][5]+pop.loc[7][15])/4.0
dr7[20]=(pop.loc[7][15]+pop.loc[7][25])/4.0
dr7[30]=(pop.loc[7][25]+pop.loc[7][35])/4.0
dr7[40]=(pop.loc[7][35]+pop.loc[7][45])/4.0
dr7[50]=(pop.loc[7][45]+pop.loc[7][55])/4.0
dr7[60]=(pop.loc[7][55]+pop.loc[7][65])/4.0
dr7[70]=(pop.loc[7][65])/2.0
dr7[5]=pop.loc[7][5]/2.0
dr7[15]=pop.loc[7][15]/2.0
dr7[25]=pop.loc[7][25]/2.0
dr7[35]=pop.loc[7][35]/2.0
dr7[45]=pop.loc[7][45]/2.0
dr7[55]=pop.loc[7][55]/2.0
dr7[65]=pop.loc[7][65]/2.0
dr7[90]=pop.loc[7][75]*1/121.0
dr7[95]=pop.loc[7][75]*3/121.0
dr7[85]=pop.loc[7][75]*9/121.0
dr7[80]=pop.loc[7][75]*27/121.0
dr7[75]=pop.loc[7][75]*81/121.0
# Frmat 8: 10-year bins 5-14..55-64, 65+ aggregated, 1-4 aggregated
dr8=pop.loc[8].copy()
dr8[2]=pop.loc[8][1]*1/4.0
dr8[3]=pop.loc[8][1]*1/4.0
dr8[4]=pop.loc[8][1]*1/4.0
dr8[1]=pop.loc[8][1]*1/4.0
# FIX: the five interpolations below previously mixed pop.loc[7]
# (format-7 rows) into the format-8 frame; since the two Frmat slices
# cover different (Country,Year,Sex) rows, the aligned sums would
# likely produce NaNs. All reads now come from pop.loc[8].
dr8[10]=(pop.loc[8][5]+pop.loc[8][15])/4.0
dr8[20]=(pop.loc[8][15]+pop.loc[8][25])/4.0
dr8[30]=(pop.loc[8][25]+pop.loc[8][35])/4.0
dr8[40]=(pop.loc[8][35]+pop.loc[8][45])/4.0
dr8[50]=(pop.loc[8][45]+pop.loc[8][55])/4.0
dr8[60]=(pop.loc[8][55])/2.0
dr8[5]=pop.loc[8][5]/2.0
dr8[15]=pop.loc[8][15]/2.0
dr8[25]=pop.loc[8][25]/2.0
dr8[35]=pop.loc[8][35]/2.0
dr8[45]=pop.loc[8][45]/2.0
dr8[55]=pop.loc[8][55]/2.0
dr8[90]=pop.loc[8][65]*1/1093.0
dr8[95]=pop.loc[8][65]*3/1093.0
dr8[85]=pop.loc[8][65]*9/1093.0
dr8[80]=pop.loc[8][65]*27/1093.0
dr8[75]=pop.loc[8][65]*81/1093.0
dr8[70]=pop.loc[8][65]*243/1093.0
dr8[65]=pop.loc[8][65]*729/1093.0
In [23]:
pop=pd.concat([dr0,dr1,dr2,dr3,dr4,dr5,dr6,dr7,dr8])  # all formats, one frame
In [24]:
def ccw(c):
    """Translate WHO population-table country names to geonames spelling.

    Returns the input unchanged when no special-case mapping applies.
    """
    renames = {
        'Sao Tome and Principe': u'S\xc3\xa3o Tom\xc3\xa9 and Pr\xc3\xadncipe',
        'Falkland Islands (Malvinas)': 'Falkland Islands',
        'China: Province of Taiwan only': 'Taiwan',
        'Iran (Islamic Republic of)': 'Iran',
        'Syrian Arab Republic': 'Syria',
        'TFYR Macedonia': 'Macedonia',
    }
    return renames.get(c, c)
In [25]:
# Build pp[iso][year][gender][age] = population (as str) from the WHO
# table; key '007' accumulates a world total over the covered countries.
pp={'007':{}} #007 for world
for p in pop.T.iteritems():  # p = ((Country, Year, Sex), age Series)
    # bare except: rows whose country has no geonames code are skipped
    try:
        country_id=countries.loc[ccw(ccv(cc.loc[p[0][0]][0]))]
        if str(country_id) not in pp: pp[str(country_id)]={}
        if p[0][1]>1985: #no mortality data before that
            if str(p[0][1]) not in pp[str(country_id)]:pp[str(country_id)][str(p[0][1])]={}
            if str(p[0][1]) not in pp['007']:pp['007'][str(p[0][1])]={}
            if p[0][2]>1:g='f'
            else: g='m'
            if g not in pp[str(country_id)][str(p[0][1])]:pp[str(country_id)][str(p[0][1])][g]={}
            if g not in pp['007'][str(p[0][1])]:pp['007'][str(p[0][1])][g]={}
            for j in pop.columns:
                v=p[1][j]
                if np.isnan(p[1][j]):v=0   # treat missing counts as zero
                pp[str(country_id)][str(p[0][1])][g][str(j)]=str(v)
                if str(j) not in pp['007'][str(p[0][1])][g]:pp['007'][str(p[0][1])][g][str(j)]=0
                pp['007'][str(p[0][1])][g][str(j)]+=v
    except: pass
In [26]:
# Convert the accumulated world ('007') totals from float to str, to
# match the per-country entries for JSON output.
for y in pp['007']:
    for g in pp['007'][y]:
        for a in pp['007'][y][g]:
            pp['007'][y][g][a]=str(pp['007'][y][g][a])
Alternative population counter
In [27]:
#load male pop data from http://esa.un.org/unpd/wpp/DVD/Files/1_Indicators%20(Standard)/EXCEL_FILES/1_Population/WPP2015_POP_F15_2_ANNUAL_POPULATION_BY_AGE_MALE.XLS
wd=pd.read_excel('WPP2015_POP_F15_2_ANNUAL_POPULATION_BY_AGE_MALE.XLS',skiprows=16)
wd=wd.drop(['Major area, region, country or area *','Index','Variant','Notes','80+'],axis=1)
wd.columns=['Country','Year']+list(np.arange(21)*5)
wd=wd.set_index(['Country','Year'])
#load female pop data from http://esa.un.org/unpd/wpp/DVD/Files/1_Indicators%20(Standard)/EXCEL_FILES/1_Population/WPP2015_POP_F15_3_ANNUAL_POPULATION_BY_AGE_FEMALE.XLS
wf=pd.read_excel('WPP2015_POP_F15_3_ANNUAL_POPULATION_BY_AGE_FEMALE.XLS',skiprows=16)
wf=wf.drop(['Major area, region, country or area *','Index','Variant','Notes','80+'],axis=1)
wf.columns=['Country','Year']+list(np.arange(21)*5)
wf=wf.set_index(['Country','Year'])
In [28]:
# Spread the UN 0-4 bin evenly over single years 0..4 (the four new
# columns read wd[0] first; wd[0] itself is overwritten last) and fold
# the 100+ bin into 95 so columns match the WHO age grid.
wd[2]=wd[0]*1/5.0
wd[3]=wd[0]*1/5.0
wd[4]=wd[0]*1/5.0
wd[1]=wd[0]*1/5.0
wd[0]=wd[0]*1/5.0
wd[95]=wd[95]+wd[100]
wd=wd.drop(100,axis=1)
# same treatment for the female table
wf[2]=wf[0]*1/5.0
wf[3]=wf[0]*1/5.0
wf[4]=wf[0]*1/5.0
wf[1]=wf[0]*1/5.0
wf[0]=wf[0]*1/5.0
wf[95]=wf[95]+wf[100]
wf=wf.drop(100,axis=1)
In [29]:
# Build wdp[country][year][gender][age] = population (persons, as str)
# from the UN WPP male/female tables; UN figures are in thousands.
wdp={}
for p in wd.T.iteritems():  # p = ((Country, Year), age Series)
    # bare except: skips malformed rows (e.g. aggregate headers)
    try:
        country_id=p[0][0]
        if str(country_id) not in wdp: wdp[str(country_id)]={}
        if p[0][1]>1985: #no mortality data before that
            if str(p[0][1]) not in wdp[str(country_id)]:wdp[str(country_id)][str(p[0][1])]={}
            g='m'
            if g not in wdp[str(country_id)][str(p[0][1])]:wdp[str(country_id)][str(p[0][1])][g]={}
            for j in wd.columns:
                v=p[1][j]
                if np.isnan(p[1][j]):v=0
                wdp[str(country_id)][str(p[0][1])][g][str(j)]=str(v*1000.0)  # thousands -> persons
    except: pass
for p in wf.T.iteritems():
    try:
        country_id=p[0][0]
        if str(country_id) not in wdp: wdp[str(country_id)]={}
        if p[0][1]>1985: #no mortality data before that
            if str(p[0][1]) not in wdp[str(country_id)]:wdp[str(country_id)][str(p[0][1])]={}
            g='f'
            if g not in wdp[str(country_id)][str(p[0][1])]:wdp[str(country_id)][str(p[0][1])][g]={}
            # FIX: iterate wf's own columns here (was wd.columns -- only
            # harmless because both frames share identical columns)
            for j in wf.columns:
                v=p[1][j]
                if np.isnan(p[1][j]):v=0
                wdp[str(country_id)][str(p[0][1])][g][str(j)]=str(v*1000.0)
    except: pass
Data parser
In [ ]:
# Per-country detailed parser: writes db/<iso>.zip with one record per
# (age, cause, year) and builds hierarchy2 (cause -> parent/group codes).
hierarchy2={}
c={}
for country in df.index.get_level_values('Country').unique():
    # skip territories without a geonames entry of their own
    if ccv(cc.loc[country][0]) not in ['Netherlands Antilles',
                                       'United Kingdom, England and Wales',
                                       'United Kingdom, Scotland',
                                       'United Kingdom, Northern Ireland',
                                       'Rodrigues']:
        try:
            country_id=countries.loc[ccv(cc.loc[country][0])]
            print country_id,ccv(cc.loc[country][0])
            c[country_id]=cc.loc[country][0]
            data={}
            data3=[]
            dk=df.loc[country].drop(['Deaths1','Deaths26'],axis=1)
            dk.columns=range(5)+list(np.arange(1,20)*5)
            for i in dk.stack().iteritems():
                # i = ((Year, Cause, Sex, age), deaths)
                if i[0][2]>1:gender='f'
                else: gender='m'
                cause=i[0][1].strip()
                # NOTE(review): substring test -- also excludes 'A'/'AA';
                # presumably only the all-cause code 'AAA' was meant
                if cause not in 'AAA':
                    # NOTE(review): key has no gender part, so the second
                    # sex overwrites the first and the shared dict gets
                    # appended twice -- verify (the db2 parser below adds
                    # 'G'+gender to the key)
                    key='A'+str(i[0][3])+'C'+str(cause)+'T'+str(i[0][0])
                    if key not in data: data[key]={}
                    data[key]['a']=i[0][3]   # age-bin start
                    data[key]['c']=cause     # ICD cause code
                    data[key]['t']=i[0][0]   # year
                    data[key]['s']=i[1]      # death count
                    data[key]['g']=gender
                    if cause not in hierarchy2: hierarchy2[cause]={}
                    cause2=cause[:3]
                    hierarchy2[cause]["cause2"]=cause2
                    hierarchy2[cause]["parent"]=get_parent(cause2)
                    hierarchy2[cause]["group"]=get_group(cause2)
                    data3.append(data[key])
            file('db/data.json','w').write(json.dumps(data3))
            # deflate when zlib is available, otherwise store uncompressed
            try:
                import zlib
                compression = zipfile.ZIP_DEFLATED
            except:
                compression = zipfile.ZIP_STORED
            zf = zipfile.ZipFile('db/'+str(country_id)+'.zip', mode='w')
            zf.write('db/data.json','data.json',compress_type=compression)
            zf.close()
        except: print 'error',country_id,ccv(cc.loc[country][0])
In [563]:
file('hierarchy.json','w').write(json.dumps(hierarchy2)) #only do once ever, dont overwrite!
In [76]:
# Per-country aggregated parser: causes rolled up to their condensed-list
# parent and summed over sub-causes; writes db2/<iso>.zip.
hierarchy3={}
c={}
for country in df.index.get_level_values('Country').unique():
    # skip territories without a geonames entry of their own
    if ccv(cc.loc[country][0]) not in ['Netherlands Antilles',
                                       'United Kingdom, England and Wales',
                                       'United Kingdom, Scotland',
                                       'United Kingdom, Northern Ireland',
                                       'Rodrigues']:
        try:
            country_id=countries.loc[ccv(cc.loc[country][0])]
            print country_id,ccv(cc.loc[country][0])
            c[country_id]=cc.loc[country][0]
            data={}
            data3=[]
            dk=df.loc[country].drop(['Deaths1','Deaths26'],axis=1)
            dk.columns=range(5)+list(np.arange(1,20)*5)
            for i in dk.stack().iteritems():
                # i = ((Year, Cause, Sex, age), deaths)
                if i[0][2]>1:gender='f'
                else: gender='m'
                cause=i[0][1].strip()
                # NOTE(review): substring test -- also excludes 'A'/'AA'
                if cause not in 'AAA':
                    # roll up to the condensed-list parent code
                    cause=get_parent(i[0][1].strip()[:3])
                    key='A'+str(i[0][3])+'C'+str(cause)+'T'+str(i[0][0])+'G'+gender
                    if key not in data: data[key]={}
                    data[key]['a']=i[0][3]
                    data[key]['c']=cause
                    data[key]['g']=gender
                    data[key]['t']=i[0][0]
                    if 's' not in data[key]:data[key]['s']=0
                    data[key]['s']+=i[1]   # accumulate over sub-causes
                    if cause not in hierarchy3: hierarchy3[cause]={}
                    cause2=cause
                    # flat hierarchy: each rolled-up cause is its own parent
                    hierarchy3[cause]["cause2"]=cause
                    hierarchy3[cause]["parent"]=cause
                    hierarchy3[cause]["group"]=get_group(cause2)
            for key in data:
                data3.append(data[key])
            file('db2/data.json','w').write(json.dumps(data3))
            # deflate when zlib is available, otherwise store uncompressed
            try:
                import zlib
                compression = zipfile.ZIP_DEFLATED
            except:
                compression = zipfile.ZIP_STORED
            zf = zipfile.ZipFile('db2/'+str(country_id)+'.zip', mode='w')
            zf.write('db2/data.json','data.json',compress_type=compression)
            zf.close()
        except: pass#print 'error',country_id,ccv(cc.loc[country][0])
In [565]:
file('hierarchy2.json','w').write(json.dumps(hierarchy3)) #only do once ever, dont overwrite!
In [71]:
#run once!
#save country population for countries which have mortality data
mdata=['900']  # '900' = world
for country in df.index.get_level_values('Country').unique():
    if ccv(cc.loc[country][0]) not in ['Netherlands Antilles',
                                       'United Kingdom, England and Wales',
                                       'United Kingdom, Scotland',
                                       'United Kingdom, Northern Ireland',
                                       'Rodrigues']:
        try:
            country_id=countries.loc[ccv(cc.loc[country][0])]
            mdata.append(str(country_id)) #append country id to list of available countries
        except: print 'error',country_id,ccv(cc.loc[country][0])
for c in wdp.keys():
    if c not in mdata:
        wdp.pop(c);
# NOTE(review): both files are written from wdp (UN-based populations);
# pop.json was presumably meant to dump pp (WHO-based) -- verify.
file('pop.json','w').write(json.dumps(wdp)) #only do once ever, dont overwrite!
file('wpop.json','w').write(json.dumps(wdp)) #only do once ever, dont overwrite!
In [94]:
# Quick sanity check: years present for ISO numeric code 32.
pp['32'].keys()
Out[94]:
In [254]:
# Global scaler: mean ratio of WHO-covered population (pp['007']) to the
# UN world population (wdp['900']) over every age/gender cell.
dr=[]
for y in range(1990,2005): #years with best overall data availability
    yr=str(y)
    for g in ['f','m']:
        for i in pp['007'][yr][g].keys():
            dr.append(float(pp['007'][yr][g][i])/float(wdp['900'][yr][g][i]))
wsc=np.array(dr).mean()
In [30]:
# Reload df indexed for global aggregation (Sex first, Country last;
# Frmat is consumed by load_df's normalisation).
df=load_df(['Frmat','Sex', 'Year', 'Cause','Country'])
years=df.loc[1].index.get_level_values('Year').unique()
In [258]:
# Stream the world-level (country 900) aggregate into db2/data.json as a
# hand-built JSON array; causes rolled up like the per-country db2 parser.
c={}
country_id=900 #world
with open("db2/data.json", "w") as data3: data3.write("")   # truncate
with open("db2/data.json", "a") as data3:
    data3.write("[")
    for y in range(2000,2014):#years:
        print y
        for g in [1,2]:
            # bare except: skip year/sex slices that are missing entirely
            try:
                # sum the age x cause table over all countries for this sex/year
                dk=df.loc[g].loc[y].stack().unstack('Country').T.sum().unstack().drop(['Deaths1','Deaths26'],axis=1)
                dk.columns=range(5)+list(np.arange(1,20)*5)
                data={}
                for i in dk.stack().iteritems():
                    if g>1:gender='f'
                    else: gender='m'
                    cause=i[0][0].strip()
                    # NOTE(review): substring test -- also excludes 'A'/'AA'
                    if cause not in 'AAA':
                        #if i[1]>0:
                        cause=get_parent(i[0][0].strip()[:3])
                        key='A'+str(i[0][1])+'C'+str(cause)+'T'+str(y)+'G'+gender
                        if key not in data: data[key]={}
                        data[key]['a']=i[0][1]
                        data[key]['c']=cause
                        data[key]['g']=gender
                        data[key]['t']=str(y)
                        if 's' not in data[key]:data[key]['s']=0
                        # divide by wsc: scale WHO-covered deaths up to a
                        # world estimate (wsc = covered/world population)
                        data[key]['s']+=i[1]/wsc
                for key in data:
                    data3.write(json.dumps(data[key])+',')
            except: pass#print 'error',country_id,ccv(cc.loc[country][0])
In [259]:
# Remove the trailing comma left by the streaming JSON writer above,
# then close the array. Technique adapted from
# http://stackoverflow.com/questions/1877999/delete-final-line-in-file-via-python
with open("db2/data.json", "r+") as data3:
    # move to end of file, step back one character and cut it off
    data3.seek(0, os.SEEK_END)
    pos = data3.tell() - 1
    data3.seek(pos, os.SEEK_SET)
    data3.truncate()
    # FIX: the original also called data3.close() here; the with-block
    # already closes the file, so the redundant call was removed
with open("db2/data.json", "a") as data3: data3.write("]")
In [260]:
#try this, if it freezes, then just zip manually
# deflate when zlib is available, otherwise store uncompressed
try:
    import zlib
    compression = zipfile.ZIP_DEFLATED
except:
    compression = zipfile.ZIP_STORED
zf = zipfile.ZipFile('db2/'+str(country_id)+'.zip', mode='w')
zf.write('db2/data.json','data.json',compress_type=compression)
zf.close()
In [261]:
# World-level detailed stream (db/): like the db2 world cell above, but
# causes keep their 3-character detail and years start at 2001.
c={}
country_id=900 #world
with open("db/data.json", "w") as data3: data3.write("")   # truncate
with open("db/data.json", "a") as data3:
    data3.write("[")
    for y in range(2001,2014):#years:
        print y
        for g in [1,2]:
            # bare except: skip year/sex slices that are missing entirely
            try:
                dk=df.loc[g].loc[y].stack().unstack('Country').T.sum().unstack().drop(['Deaths1','Deaths26'],axis=1)
                dk.columns=range(5)+list(np.arange(1,20)*5)
                data={}
                for i in dk.stack().iteritems():
                    if g>1:gender='f'
                    else: gender='m'
                    cause=i[0][0].strip()
                    if cause not in 'AAA':
                        #if i[1]>0:
                        cause=i[0][0].strip()[:3]
                        key='A'+str(i[0][1])+'C'+str(cause)+'T'+str(y)+'G'+gender
                        if key not in data: data[key]={}
                        data[key]['a']=i[0][1]
                        data[key]['c']=cause
                        data[key]['g']=gender
                        data[key]['t']=str(y)
                        if 's' not in data[key]:data[key]['s']=0
                        # divide by wsc: scale up to a world estimate
                        data[key]['s']+=i[1]/wsc
                for key in data:
                    data3.write(json.dumps(data[key])+',')
            except: pass#print 'error',country_id,ccv(cc.loc[country][0])
In [262]:
# Remove the trailing comma left by the streaming JSON writer above,
# then close the array. Technique adapted from
# http://stackoverflow.com/questions/1877999/delete-final-line-in-file-via-python
with open("db/data.json", "r+") as data3:
    # move to end of file, step back one character and cut it off
    data3.seek(0, os.SEEK_END)
    pos = data3.tell() - 1
    data3.seek(pos, os.SEEK_SET)
    data3.truncate()
    # FIX: the original also called data3.close() here; the with-block
    # already closes the file, so the redundant call was removed
with open("db/data.json", "a") as data3: data3.write("]")
In [263]:
#try this, if it freezes, then just zip manually
# deflate when zlib is available, otherwise store uncompressed
try:
    import zlib
    compression = zipfile.ZIP_DEFLATED
except:
    compression = zipfile.ZIP_STORED
zf = zipfile.ZipFile('db/'+str(country_id)+'.zip', mode='w')
zf.write('db/data.json','data.json',compress_type=compression)
zf.close()
In [217]:
# Pretty country names, EN + HUN, seeded from an earlier project.
cnames=json.loads(file("../szekelyfold lakossag 2/cnames.json").read())
hnames=json.loads(file("../szekelyfold lakossag 2/hnames.json").read())
In [218]:
# Needs to run only once.
def hun(c):
    """Return the Hungarian display name for an English country name.

    Returns the input unchanged when no translation is listed.
    """
    renames = {
        'Antigua and Barbuda': u'Antigua és Barbuda',
        'Bahamas': u'Bahamák',
        'British Virgin Islands': u'Brit Virgin-szigetek',
        'Cayman Islands': u'Kajmán-szigetek',
        'Dominica': u'Dominika',
        u'R\xc3\xa9union': u'Réunion',
        'French Guiana': u'Francia Guyana',
        'Saint Kitts and Nevis': u'Saint Kitts és Nevis',
        'Saint Vincent and the Grenadines': u'Szent Vincent és a Grenadine-szigetek',
        'Turks and Caicos Islands': u'Turks és Caicos-szigetek',
        'U.S. Virgin Islands': u'U.S. Virgin-szigetek',
        'Saint Pierre and Miquelon': u'Saint Pierre és Miquelon',
        'World': u'Egész Világ',
    }
    return renames.get(c, c)
# Fill cnames (id -> EN name) and hnames (EN -> HUN name) for every
# country present in the mortality data, then persist both.
for country in df.index.get_level_values('Country').unique():
    # bare except: skip countries with no geonames code
    try:
        country_id=countries.loc[ccv(cc.loc[country][0])]
        if str(country_id) not in cnames:
            cnames[str(country_id)]=ccv(cc.loc[country][0])
        if ccv(cc.loc[country][0]) not in hnames:
            print repr(ccv(cc.loc[country][0]))
            hnames[ccv(cc.loc[country][0])]=hun(ccv(cc.loc[country][0]))
    except:pass
cnames[u'900']=u"World"
hnames[u'World']=u"Egész Világ"
file('cnames.json','w').write(json.dumps(cnames))
file('hnames.json','w').write(json.dumps(hnames))
In [219]:
# Reload the pretty country names saved by the cell above.
cnames=json.loads(file("cnames.json").read())
hnames=json.loads(file("hnames.json").read())
In [220]:
def hc(c):
    """Harmonise WHO country names with the keys used in hnames.

    Returns the input unchanged when no special-case mapping applies.
    """
    renames = {
        "Reunion": u'R\xc3\xa9union',
        'Saint Vincent and Grenadines': u'Saint Vincent and the Grenadines',
        'Serbia and Montenegro, Former': "Serbia",
        'United States of America': "United States",
        'Virgin Islands (USA)': 'U.S. Virgin Islands',
        'Hong Kong SAR': 'Hong Kong',
        'Republic of Moldova': 'Moldova',
        'Republic of Korea': 'South Korea',
    }
    return renames.get(c, c)
In [221]:
#only run if you haven't run the country data parser
#recreate the country_id -> WHO-name map for countries with usable data
c={}
for country in df.index.get_level_values('Country').unique():
    if ccv(cc.loc[country][0]) not in ['Netherlands Antilles',
                                       'United Kingdom, England and Wales',
                                       'United Kingdom, Scotland',
                                       'United Kingdom, Northern Ireland',
                                       'Rodrigues']:
        country_id=countries.loc[ccv(cc.loc[country][0])]
        c[country_id]=cc.loc[country][0]
In [229]:
# Write the country-id list ordered by locale collation of the Hungarian
# names; '900' (world) is prepended first.
q={}
for i in c:
    q[hnames[hc(c[i])]]=i
e=[]
itera=q.keys()
itera.sort(cmp=locale.strcoll)  # Python 2 list.sort(cmp=...); needs locale
for i in itera:
    e.append(str(q[i]))
file('countries.json','w').write(json.dumps(['900']+e))
In [236]:
# Same as above, but ordered by the English country names.
q={}
for i in c:
    q[hc(c[i])]=i
e=[]
itera=q.keys()
itera.sort(cmp=locale.strcoll)  # Python 2 list.sort(cmp=...); needs locale
for i in itera:
    e.append(str(q[i]))
file('wcountries.json','w').write(json.dumps(['900']+e))