In [1]:
%matplotlib inline
from __future__ import division
import pickle
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
pd.set_option('display.max_columns', 25)
In [2]:
# Load the pickled dataset from un_reshape.pkl into `undata`.
# Pickle is a binary format, so the file is opened with 'rb'; text mode ('r')
# corrupts the stream on Windows and fails outright on Python 3.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('un_reshape.pkl', 'rb') as picklefile:
    undata = pickle.load(picklefile)
In [3]:
# (rows, columns) of the freshly loaded data
undata.shape
Out[3]:
In [4]:
# summary statistics of the loaded data, before any cleaning
undata.describe()
Out[4]:
In [5]:
def clean_column_name(column):
    """Return a lowercase, underscore-joined version of a raw column label.

    The label is split on whitespace; each word is lower-cased and stripped of
    every character other than a-z and 0-9; the words are then joined with '_'.
    A word consisting only of stripped characters leaves an empty slot, which
    is why some cleaned names contain a double underscore.
    """
    words = [re.sub("[^a-z0-9]", "", word.lower()) for word in column.split()]
    return '_'.join(words)


# Rename all columns in ONE pass. Renaming in place inside a loop (as before)
# lets a column that was already cleaned be renamed a second time when a later
# raw label happens to equal its cleaned name.
# NOTE: the cleaning is not idempotent ('_' is itself stripped), so run this
# cell only once per load of `undata`.
undata = undata.rename(columns=clean_column_name)
In [6]:
# List the cleaned column names alphabetically.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# unlike the Python 2-only print statement.
for column in sorted(undata.columns):
    print(column)
In [7]:
def convert_strings(col):
    """Cast one column of the global ``undata`` frame to float.

    Every value of ``undata[col]`` is passed through ``float``; the converted
    values overwrite the column on the global frame, which is then returned.
    """
    as_float = undata[col].map(float)
    undata[col] = as_float
    return undata
In [8]:
# Short aliases for long column labels of `undata` that are used repeatedly below.
# NOTE(review): the double underscores presumably mark a token (e.g. '%') that the
# column-cleaning step stripped entirely -- confirm against the raw labels.
contra = 'contraceptive_prevalence__of_women_ages_1549'
net_odadol = 'net_official_development_assistance_received_current_us'
net_odadol2 = 'net_official_development_assistance_and_official_aid_received_current_us'
net_odapc = 'net_oda_received_per_capita_current_us'
net_odapct = 'net_oda_received__of_gni'
pop = 'population_total'
hiv_prev = 'prevalence_of_hiv_total__of_population_ages_1549'
gni = 'gni_per_capita_atlas_method_current_us'
# columns to convert to float with convert_strings
string_cols = [contra, net_odadol, net_odadol2, net_odapc, net_odapct, pop, hiv_prev, gni]
In [9]:
# Cast every listed column to float. convert_strings updates the global frame
# and hands it back, so the rebinding keeps `undata` pointing at that frame.
for column_name in string_cols:
    undata = convert_strings(column_name)
Create new variables of net ODA (Official Development Assistance) received, expressed in millions of US$.
In [10]:
# Rescale both net-ODA dollar columns to millions (whole-column division
# instead of a per-element lambda).
undata['net_oda_mill'] = undata[net_odadol] / 1000000
undata['net_odaplus_mill'] = undata[net_odadol2] / 1000000
In [11]:
# Restrict the analysis to rows with 1990 <= year < 2014.
unlimit = undata[(undata['year'] >= 1990) & (undata['year'] < 2014)]
Save new dataframe to pickle file for use in further explorations.
In [12]:
# Persist the year-limited frame to un_explore.pkl.
# Pickle output is binary, so the file is opened with 'wb'; text mode ('w')
# corrupts the stream on Windows and fails on Python 3.
with open('un_explore.pkl', 'wb') as picklefile:
    pickle.dump(unlimit, picklefile)
Created functions for easy plotting.

- `plot_me_data`: plots the mean of a variable over time as a bar plot
- `plot_su_data`: plots the sum of a variable per year over time as a bar plot
- `plot_by_country`: plots each country over time in a single line plot
- `plot_by_region`: plots each region over time in a single line plot

It is difficult to read the `plot_by_country` output because there are so many countries and a legend would obscure the plot. However, these plots give a sense of general trends for more detailed exploration later.
In [13]:
def plot_me_data(row, ylabel='percent'):
    """Bar-plot the yearly mean of one column of the global ``unlimit`` frame.

    row    -- label of the column to average (a column label, despite the name)
    ylabel -- text for the y axis; the column label is used as the title
    """
    yearly = unlimit.groupby('year')[row].mean()
    fig, ax = plt.subplots(figsize=(12, 5))
    # `yearly` is a Series indexed by year; x=/y= only apply to DataFrame
    # plots and are ignored for a Series, so they are not passed.
    yearly.plot(kind='bar', ax=ax)
    ax.set_title(row)
    ax.set_ylabel(ylabel)
In [14]:
def plot_su_data(row, ylabel='percent'):
    """Bar-plot the yearly sum of one column of the global ``unlimit`` frame.

    row    -- label of the column to total (a column label, despite the name)
    ylabel -- text for the y axis; the column label is used as the title
    """
    yearly = unlimit.groupby('year')[row].sum()
    fig, ax = plt.subplots(figsize=(12, 5))
    # `yearly` is a Series indexed by year; x=/y= only apply to DataFrame
    # plots and are ignored for a Series, so they are not passed.
    yearly.plot(kind='bar', ax=ax)
    ax.set_title(row)
    ax.set_ylabel(ylabel)
In [15]:
def plot_by_country(col, ylabel='percent'):
    """Draw one line per country of ``col`` against year, from global ``unlimit``.

    col    -- label of the column to plot; also used as the title
    ylabel -- text for the y axis

    No legend is drawn: each line carries its country as label, but with this
    many lines a legend would cover the plot.
    """
    plt.figure(figsize=(12, 5))
    # groupby yields the countries in sorted order (stable line colours between
    # runs) and splits the frame once instead of re-filtering it per country.
    for country, rows in unlimit.groupby('countryname'):
        # DataFrame.sort(columns=...) was removed from pandas; sort_values
        # is its replacement.
        rows = rows.sort_values(by='year')
        plt.plot(rows['year'], rows[col], label=country)
    plt.title(col)
    plt.xlabel('year')
    plt.ylabel(ylabel)
    plt.show()
In [16]:
def plot_by_region(col, ylabel='percent'):
    """Draw one line per MDG region: the yearly mean of ``col`` in that region.

    col    -- label of the column of global ``unlimit`` to average; also the title
    ylabel -- text for the y axis
    """
    plt.figure(figsize=(12, 5))
    # groupby yields regions in sorted order, so colours and legend entries
    # are stable between runs.
    for region, rows in unlimit.groupby('mdgregions'):
        # The argument-less DataFrame.sort() sorted by index and was removed
        # from pandas; sort_index() is the explicit equivalent.
        yearly = rows.groupby('year')[col].mean().sort_index()
        # Pass the years explicitly so the x axis does not depend on how
        # matplotlib interprets a bare pandas object.
        plt.plot(yearly.index, yearly.values, label=region)
    plt.title(col)
    plt.xlabel('year')
    plt.ylabel(ylabel)
    plt.legend(prop={'size': 8})
    plt.show()
In [17]:
# AIDS deaths: yearly total, then yearly mean per region, then one line per country
plot_su_data('aids_deaths', 'number of deaths')
plot_by_region('aids_deaths', 'number of deaths')
plot_by_country('aids_deaths', 'number of deaths')
In [18]:
# Country/year holding the single largest aids_deaths value.
# .ix was removed from pandas; idxmax() returns an index label, so .loc is
# the matching label-based accessor.
# NOTE(review): this searches all of `undata`, not the year-limited `unlimit`
# frame that the plots use -- confirm that is intended.
undata.loc[undata['aids_deaths'].idxmax(), ['countryname', 'year', 'aids_deaths']]
Out[18]:
In [19]:
# HIV incidence rate: yearly mean, yearly mean per region, one line per country
hiv = 'hiv_incidence_rate_1549_years_old_percentage_midpoint'
plot_me_data(hiv)
plot_by_region(hiv)
plot_by_country(hiv)
In [20]:
# worldwide yearly mean of HIV incidence rate
# (mean over all rows of the year-limited `unlimit` frame, per year)
unlimit.groupby(['year'])[hiv].mean()
Out[20]:
In [21]:
# countries and years with highest HIV incidence rates
# DataFrame.sort(columns=...) was removed from pandas; sort_values is its replacement.
# NOTE(review): ranks all of `undata`, not the year-limited `unlimit` frame.
undata[['countryname', 'year', hiv]].sort_values(by=hiv, ascending=False).head(15)
Out[21]:
In [22]:
# Share of people living with HIV: yearly mean, per-region mean, per-country lines
hiv_pct = 'people_living_with_hiv_1549_years_old_percentage'
plot_me_data(hiv_pct)
plot_by_region(hiv_pct)
plot_by_country(hiv_pct)
In [23]:
# Contraceptive prevalence: yearly mean and per-country lines
plot_me_data(contra)
plot_by_country(contra)
Contraceptive use data may be too sparse to use in modeling
In [24]:
# Mean HIV incidence per (country, ISO3 code, year); unstack() moves the
# innermost index level (year) into the columns, and the resulting table is
# written to hiv_incidence.csv.
yearly = unlimit.groupby(['countryname', 'iso3code', 'year'])[hiv].mean().to_frame()
bycountry = yearly.unstack()
bycountry.to_csv('hiv_incidence.csv', encoding='utf-8')
In [25]:
# preview of the country-by-year table that was written to hiv_incidence.csv
bycountry.head()
Out[25]:
In [26]:
# Adolescent birth rate: yearly mean and per-country lines
abr = 'adolescent_birth_rate_per_1000_women'
plot_me_data(abr, 'rate')
plot_by_country(abr, 'rate')
In [27]:
# Antenatal care coverage (at least one visit), one line per country
plot_by_country('antenatal_care_coverage_at_least_one_visit_percentage')
In [28]:
# Antenatal care coverage (at least four visits), one line per country
plot_by_country('antenatal_care_coverage_at_least_four_visits_percentage')
Antenatal care coverage data is too sparse to use in modeling
In [29]:
pv_tot = 'population_below_national_poverty_line_total_percentage'
# Country/year holding the largest value of pv_tot.
# .ix was removed from pandas; idxmax() returns an index label, so .loc is
# the matching label-based accessor.
# NOTE(review): searches all of `undata`, not the year-limited `unlimit` frame.
undata.loc[undata[pv_tot].idxmax(), ['countryname', 'year', pv_tot]]
Out[29]:
In [30]:
# Population below the national poverty line (total): yearly mean, per-region mean, per-country lines
plot_me_data(pv_tot)
plot_by_region(pv_tot)
plot_by_country(pv_tot)
In [31]:
# print sorted dataframe since line colors repeat on plot_by_region plot
# DataFrame.sort(columns=...) was removed from pandas; sort_values is its replacement.
# NOTE(review): the means are taken over all of `undata`, whereas plot_by_region
# uses the year-limited `unlimit` frame -- the table and the plot may disagree.
pd.DataFrame(undata.groupby(['mdgregions'])[pv_tot].mean()).sort_values(by=pv_tot, ascending=False)
Out[31]:
In [32]:
# Population below the national poverty line (urban): yearly mean, per-region mean, per-country lines
pv_urb = 'population_below_national_poverty_line_urban_percentage'
plot_me_data(pv_urb)
plot_by_region(pv_urb)
plot_by_country(pv_urb)
In [33]:
# Population below the national poverty line (rural): yearly mean, per-region mean, per-country lines
pv_rur = 'population_below_national_poverty_line_rural_percentage'
plot_me_data(pv_rur)
plot_by_region(pv_rur)
plot_by_country(pv_rur)
In [34]:
# Population below the 125 PPP-per-day column: yearly mean, per-region mean, per-country lines
# NOTE(review): "125" is presumably $1.25 with the punctuation stripped by the column cleaning -- confirm.
pv_125 = 'population_below_125_ppp_per_day_percentage'
plot_me_data(pv_125)
plot_by_region(pv_125)
plot_by_country(pv_125)
The general downward trend of these poverty variables is encouraging; however, the data may be too sparse to use in modeling.
In [35]:
# Net ODA in millions: yearly total, per-country lines, per-region yearly mean.
# The axis label previously read 'miilions' (typo shown on every plot).
label = 'current US$ in millions'
plot_su_data('net_oda_mill', label)
plot_by_country('net_oda_mill', label)
plot_by_region('net_oda_mill', label)
In [36]:
# Country/year holding the largest net_oda_mill value.
# .ix was removed from pandas; idxmax() returns an index label, so .loc is
# the matching label-based accessor.
# NOTE(review): searches all of `undata`, not the year-limited `unlimit` frame.
undata.loc[undata['net_oda_mill'].idxmax(), ['countryname', 'year', 'net_oda_mill']]
Out[36]:
In [37]:
# countries receiving the most development assistance (top 15 country-years)
# DataFrame.sort(columns=...) was removed from pandas; sort_values is its replacement.
undata[['countryname', 'year', 'net_oda_mill']].sort_values(by='net_oda_mill', ascending=False).head(15)
Out[37]:
In [38]:
# Net ODA as a share of GNI: yearly mean, per-country lines, per-region yearly mean
plot_me_data(net_odapct)
plot_by_country(net_odapct)
plot_by_region(net_odapct)
In [39]:
# countries receiving the most development assistance as a % of GNI in this period
oda1 = pd.DataFrame(undata.groupby(['countryname'])[net_odapct].mean())
oda1.sort(columns=net_odapct, ascending=False).head(10)
Out[39]:
In [40]:
# countries receiving the most development assistance as a % of GNI by year
# DataFrame.sort(columns=...) was removed from pandas; sort_values is its replacement.
undata[['countryname', 'year', net_odapct]].sort_values(by=net_odapct, ascending=False).head(15)
Out[40]:
In [41]:
# Net ODA per capita: yearly mean, per-country lines, per-region yearly mean
# NOTE(review): plot_me_data labels the y axis 'percent' by default, but this
# column is a per-capita US$ amount according to its name -- consider passing a ylabel.
plot_me_data(net_odapc)
plot_by_country(net_odapc)
plot_by_region(net_odapc)
In [42]:
# countries receiving the most development assistance per capita in this period
oda2 = pd.DataFrame(undata.groupby(['countryname'])[net_odapc].mean())
oda2.sort(columns=net_odapc, ascending=False).head(10)
Out[42]:
In [43]:
# countries receiving the most development assistance per capita by year
# DataFrame.sort(columns=...) was removed from pandas; sort_values is its replacement.
undata[['countryname', 'year', net_odapc]].sort_values(by=net_odapc, ascending=False).head(15)
Out[43]:
In [ ]: