In [1]:
import pandas as pd
In [2]:
import json
from pandas.io.json import json_normalize
In [3]:
from numpy import nan
Using data in file 'data/world_bank_projects.json' and the techniques demonstrated above,
In [4]:
pd.read_json('data/world_bank_projects.json')['countryname'].value_counts()[:10]
Out[4]:
In [5]:
wb_projects_str=json.load((open('data/world_bank_projects.json')))
In [6]:
themes=json_normalize(wb_projects_str,'mjtheme_namecode')
codecounts=themes['code'].value_counts()
codecounts.name='counts'
codecounts
Out[6]:
Make somthing to tell us what each code means
In [7]:
codedict=themes.replace('',nan).dropna().groupby('code').last()
codedict
Out[7]:
Answer:
Combine the two
In [8]:
pd.DataFrame([codedict['name'],codecounts]).transpose().sort('counts',ascending=False)
Out[8]:
Basically we just copy the code column and then replace all the codes with the corresponding name. This disturbs my sensibilities as it's doing lots more replacements than it has to, but it's faster than anything else I came up with.
In [9]:
%%time
themes['name']=themes['code']
themes['name'].replace(to_replace=codedict.index.values,value=codedict.values,inplace=True)
In [10]:
themes
Out[10]: