In [3]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
Using data in file 'data/world_bank_projects.json' and the techniques demonstrated above,
In [2]:
# Answer to Exercise 1 (Find the 10 countries with most projects.)
import pandas as pd
import json from pandas.io.json
import json_normalize
# load as Pandas dataframe
exercise_json_df = pd.read_json('data/world_bank_projects.json')
exercise_json_df
#Get frequency that the country name value occurs, this will signify the amount of projects each country has
country_project_counts = exercise_json_df.countryname.value_counts()
#Display only the top 10 countries with most projects
country_project_counts[:10]
Out[2]:
In [6]:
#Answer to Exercise 2 (Find the top 10 major project themes (using column 'mjtheme_namecode').)
import pandas as pd
import json
from pandas.io.json import json_normalize
# load json as string
wbprojects = json.load((open('data/world_bank_projects.json')))
# normalize 'mjtheme_namecode' JSON data into a flat table
# NOTE that there are blank names associated with the code.
mjtheme_normalized = json_normalize(wbprojects, 'mjtheme_namecode')
# Use both 'code' and 'name' as index but get a frequency count by the code,
# This way we won't run into the blank name issue.
mjtheme_normalized['Counts'] = mjtheme_normalized.groupby('code')['name'].transform('count')
# Now, sort by the frequency count field in a descending manner.
# Display only the first 10 most occuring project themes.
freq_sort = mjtheme_normalized.set_index(['Counts']).sort_index(axis=0,ascending=False)
freq_sort_no_dup = freq_sort.drop_duplicates('code',inplace = False)
freq_sort_no_dup[:10]
Out[6]:
In [7]:
# Answer to Exercise 3 (In 2. above you will notice that some entries have only the code and the name is missing. Create a dataframe with the missing names filled in.)
import pandas as pd
import numpy as np
import json
from pandas.io.json import json_normalize
# load json as string
wbprojects = json.load((open('data/world_bank_projects.json')))
# normalize 'mjtheme_namecode' JSON data into a flat table
# NOTE that there are blank names associated with the code.
mjtheme_normalized = json_normalize(wbprojects, 'mjtheme_namecode')
# We want to create a reference table of all valid names and values in 'mjtheme_namecode'
# In order to replace the missing name values in the original dataframe.
# First replace the empty name(s) with a null qualifier so pandas can recognize it as such.
mjtheme_normalized['name'].replace('', np.nan, inplace = True)
# Drop the rows with null name values and then make a unique set of rows to make our proper reference table.
mjtheme_no_dup = mjtheme_normalized.dropna(subset = ['name']).drop_duplicates('name', inplace = False, keep = 'last')
mjtheme_no_dup
# Now, merge reference table with mjtheme_namecode dataframe to populate the missing blank names.
left = mjtheme_normalized
right = mjtheme_no_dup
complete_mjtheme = pd.merge(left,right, on = ['code','name'], right_index= 'TRUE')
# Restore the sort
complete_mjtheme.sort_index()
Out[7]:
In [ ]: