In [69]:
import pandas as pd
In [70]:
import json
from pandas.io.json import json_normalize
In [71]:
# define json string
data = [{'state': 'Florida',
'shortname': 'FL',
'info': {'governor': 'Rick Scott'},
'counties': [{'name': 'Dade', 'population': 12345},
{'name': 'Broward', 'population': 40000},
{'name': 'Palm Beach', 'population': 60000}]},
{'state': 'Ohio',
'shortname': 'OH',
'info': {'governor': 'John Kasich'},
'counties': [{'name': 'Summit', 'population': 1234},
{'name': 'Cuyahoga', 'population': 1337}]}]
In [72]:
# use normalization to create tables from nested element
json_normalize(data, 'counties')
Out[72]:
In [73]:
# further populate tables created from nested element
json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']])
Out[73]:
In [74]:
# load json as string
json.load((open('data/world_bank_projects_less.json')))
In [ ]:
# load as Pandas dataframe
sample_json_df = pd.read_json('data/world_bank_projects_less.json')
sample_json_df
Using data in file 'data/world_bank_projects.json' and the techniques demonstrated above,
In [127]:
#import packages
import pandas as pd
import json
from pandas.io.json import json_normalize
# Displays all DataFrame columns and rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Load json file as a string
world_bank_file = json.load((open('world_bank_projects.json')))
# Load json file into a Pandas DataFrame
world_bank_df = pd.read_json('world_bank_projects.json')
#print(world_bank_df.head(5))
#print(world_bank_df.columns)
print(world_bank_df.info())
#print(world_bank_df[['mjthemecode','mjtheme']])
In [126]:
# 1. Find the 10 countries with the most projects
# Select the countryname and project_name columns from the Pandas DataFrame
most_projects = world_bank_df[['countryname', 'project_name']]
# Group by countryname, count and sort values from highest to lowest
most_projects = most_projects.groupby("countryname").size().sort_values(ascending = False)
# Print first 10 rows
print(most_projects.head(10))
In [124]:
# 2. Find the top 10 major project themes (using column ‘mjtheme_namecode’)
# Use normalization to create tables from nested element
major_themes = json_normalize(world_bank_file, 'mjtheme_namecode')
# Group by countryname, count and sort values from highest to lowest
major_themes = major_themes.groupby("name").size().sort_values(ascending = False)
# Print first 10 rows
print(major_themes)
In [125]:
# 3. In 2. above you will notice that some entries have only the code and the name is missing.
# Create a dataframe with the missing names filled in.
import numpy as np
# Use normalization to create table from 'mjtheme_namecode' column
missing_names = json_normalize(world_bank_file, 'mjtheme_namecode')
# Replacing empty entries with NaNs
missing_names = missing_names.apply(lambda x: x.str.strip()).replace('', np.nan)
#print(missing_names)
# Finding unique values
code_name_map = missing_names.loc[missing_names['name'] != '', :]
unique_names = code_name_map.drop_duplicates().dropna()
# Creating dictionary from unique_names
unique_names_dict_list = unique_names.set_index('code').T.to_dict(orient = 'records')
# Extracting dictionary from list
unique_names_dict = unique_names_dict_list[0]
# Fills NaNs with values from dictionary
filled_values = missing_names['name'].fillna(missing_names['code'].map(unique_names_dict))
print(filled_values.value_counts())