In [1]:
import pandas as pd
import numpy as np
import pickle
from copy import deepcopy
from sqlalchemy_conn import engine
I set up a Postgres database on Amazon Web Services and used a SQLAlchemy engine to run SQL queries directly from Python. sqlalchemy_conn is a script containing the connection details, including the hostname and password needed to connect to the database.
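For reference, here is a minimal sketch of what such a connection script might look like; the username, password, host, and database name below are placeholders, not the real values:

# sqlalchemy_conn.py -- connection details (all values here are placeholders)
from sqlalchemy import create_engine

# format: postgresql://user:password@host:port/database
engine = create_engine(
    'postgresql://username:password@mydbinstance.rds.amazonaws.com:5432/undata')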
I was most interested in goals 1, 5, 6, and 8.
I first queried the database to see the specific targets associated with each of these goals.
In [2]:
query = 'SELECT DISTINCT targetname FROM undata WHERE goalid = 1'
targets1 = pd.read_sql(query, engine)
for target in targets1['targetname']:
    print(target)
In [3]:
query = 'SELECT DISTINCT targetname FROM undata WHERE goalid = 5'
targets5 = pd.read_sql(query, engine)
for target in targets5['targetname']:
    print(target)
In [4]:
query = 'SELECT DISTINCT targetname FROM undata WHERE goalid = 6'
targets6 = pd.read_sql(query, engine)
for target in targets6['targetname']:
    print(target)
In [5]:
query = 'SELECT DISTINCT targetname FROM undata WHERE goalid = 8'
targets8 = pd.read_sql(query, engine)
for target in targets8['targetname']:
    print(target)
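The four cells above differ only in the goal id; the same information could be pulled with a single query and printed grouped by goal, for example:

# one query for all four goals, printed grouped by goal
query = 'SELECT DISTINCT goalid, targetname FROM undata WHERE goalid IN (1, 5, 6, 8)'
targets = pd.read_sql(query, engine)
for goalid, group in targets.groupby('goalid'):
    print('Goal %d:' % goalid)
    for target in group['targetname']:
        print(target)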
I was particularly interested in Goal 6 and Target 6.A: "Have halted by 2015 and begun to reverse the spread of HIV/AIDS."
I suspected that data associated with goals 1, 5, and 8 could also help answer my questions about this target: I wanted to examine the relationship between the reduction of HIV/AIDS and reductions in poverty, improvements in maternal health, and development assistance.
Next, I queried the database to see which series were available, taking note of the target ids for series that seemed likely to help answer my questions.
In [6]:
# PRINT FULL SERIES NAMES FOR GOAL 1
query = "SELECT DISTINCT targetid, seriesrowid, seriesname FROM undata WHERE goalid = 1;"
series1 = pd.read_sql(query, engine)
for name in series1['seriesname']:
    print(name)
In [7]:
# PRINT TABLE
series1
Out[7]:
In [8]:
# PRINT FULL SERIES NAMES FOR GOAL 5
query = "SELECT DISTINCT targetid, seriesrowid, seriesname FROM undata WHERE goalid = 5;"
series5 = pd.read_sql(query, engine)
for name in series5['seriesname']:
    print(name)
In [9]:
# PRINT TABLE
series5
Out[9]:
In [10]:
# PRINT FULL SERIES NAMES FOR GOAL 6
query = "SELECT DISTINCT targetid, seriesrowid, seriesname FROM undata WHERE goalid = 6;"
series6 = pd.read_sql(query, engine)
for name in series6['seriesname']:
    print(name)
In [11]:
# PRINT TABLE
series6
Out[11]:
In [12]:
# PRINT FULL SERIES NAMES FOR GOAL 8
query = "SELECT DISTINCT targetid, seriesrowid, seriesname FROM undata WHERE goalid = 8;"
series8 = pd.read_sql(query, engine)
for name in series8['seriesname']:
    print(name)
In [13]:
# PRINT TABLE
series8
Out[13]:
In [14]:
query = "SELECT * FROM undata WHERE targetid IN (1, 7, 12, 13, 14, 15, 19, 20);"
undata = pd.read_sql(query, engine)
undata.shape
Out[14]:
I wanted to use data for all countries, but the former Sudan was giving me errors when I tried to unstack and reshape the data. I took a look at what series data was available for the former Sudan.
In [15]:
print(len(undata[undata.isformer == 1]))
print(len(set(undata[undata.isformer == 1]['seriesname'])))
print(set(undata[undata.isformer == 1]['seriesname']))
In [16]:
undata = undata[undata.isformer == 0]
undata = undata.drop(['isformer'], axis=1)
In [17]:
columns = [column for column in undata.columns]
columns
Out[17]:
In [18]:
unsimple = undata[['countryname', 'iso3code', 'year', 'isdeveloped', 'mdgregions',
'isldc2014', 'islldc', 'ismdgcountry', 'seriesname', 'gdppc2012',
'population2012', 'value']]
In [19]:
unsimple.describe()
Out[19]:
In [20]:
unsimple.shape
Out[20]:
The dataframe included a column called 'value' which held the observations for all of the different series. In order to explore the data and make predictions, I unstacked the dataframe, turning the value column into multiple columns, one for each series name.
I started by extracting the values for the series 'AIDS deaths' and then looped through a list of the other series, merging them with the reshaped dataframe.
In [21]:
all_series = list(set(unsimple['seriesname']))
row = 'AIDS deaths'
all_series.remove(row)
In [22]:
unreshape = deepcopy(unsimple[unsimple['seriesname'] == row])
unreshape.rename(columns={'value': row}, inplace=True)
unreshape.shape
Out[22]:
In [23]:
for series in all_series:
    new_cols = deepcopy(unsimple[unsimple['seriesname'] == series])
    new_cols.rename(columns={'value': series}, inplace=True)
    new_cols = new_cols.drop(['seriesname'], axis=1)
    keys = ['countryname', 'iso3code', 'year', 'isdeveloped', 'mdgregions',
            'ismdgcountry', 'gdppc2012', 'population2012', 'isldc2014', 'islldc']
    unreshape = pd.merge(unreshape, new_cols, how='outer', on=keys)
unreshape.shape
Out[23]:
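As an aside, the same reshape can usually be done in one step with pivot_table, assuming each key/series combination appears only once and the key columns have no missing values (rows with NaN in an index key would be dropped); I used the merge loop above instead. A sketch:

# equivalent one-step reshape (assumes one value per key/series combination)
keys = ['countryname', 'iso3code', 'year', 'isdeveloped', 'mdgregions',
        'ismdgcountry', 'gdppc2012', 'population2012', 'isldc2014', 'islldc']
pivoted = unsimple.pivot_table(index=keys, columns='seriesname',
                               values='value', aggfunc='first').reset_index()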
In the World Bank database I found more variables that I wanted to include in my analysis.
In this data the years were individual columns, so I needed to transform it into a shape similar to the original data before I could unstack it and merge it with my reshaped dataframe.
In [24]:
undata_new = pd.read_csv('data/Data_Extract_From_Millennium_Development_Goals_Data.csv')
list(set(undata_new['Series Name']))
Out[24]:
In [25]:
undata_new.head()
Out[25]:
In [26]:
# drop series code
undata_new = undata_new.drop(['Series Code'], axis = 1)
In [27]:
# rename columns (the year columns run from 1990 through 2014)
years = list(range(1990, 2015))
undata_new.columns = ['countryname', 'iso3code', 'seriesname'] + years
In [28]:
undata_new.head()
Out[28]:
In [29]:
# set a multi-level index and stack the data
undata_new = undata_new.set_index(keys=['countryname', 'iso3code', 'seriesname'])
undata_new = pd.DataFrame(undata_new.stack(level=-1))
In [30]:
# rename value column
undata_new.columns = ['value']
In [31]:
undata_new.head()
Out[31]:
In [32]:
# reset the index to turn these variables back into columns
undata_new = undata_new.reset_index()
# rename columns
undata_new.columns = ['countryname', 'iso3code', 'seriesname', 'year', 'value']
undata_new.head()
Out[32]:
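The set_index/stack/reset_index sequence above is equivalent to a single pd.melt call, which may read more directly; a sketch, where undata_wide is a hypothetical name standing in for the dataframe as it looked after In [28]:

# equivalent reshape with melt: year columns become a single 'year' column
melted = pd.melt(undata_wide, id_vars=['countryname', 'iso3code', 'seriesname'],
                 var_name='year', value_name='value')
# melt keeps rows with missing values, which stack drops by default:
melted = melted.dropna(subset=['value'])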
To properly merge the data provided by the UN with the data downloaded from the World Bank, the two dataframes needed to have the same key columns. To add these columns to the new dataframe, I created a dictionary keyed by ISO code, storing the values for each of the columns to be added.
In [33]:
# columns to add
cols = ['countryname', 'isdeveloped', 'mdgregions', 'ismdgcountry', 'gdppc2012',
'population2012', 'isldc2014', 'islldc']
add_cols = {}
for code in set(unreshape['iso3code']):
    country_cols = {}
    for col in cols:
        try:
            temp = list(unreshape[unreshape['iso3code'] == code][col])[0]
            country_cols[col] = temp
        except IndexError:
            country_cols[col] = np.NaN
    add_cols[code] = country_cols
In [34]:
add_cols['AFG']
Out[34]:
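The same dictionary can be built without explicit loops by deduplicating on the ISO code and converting to a nested dict; a sketch, assuming (as the loop above does) that the first row per country is representative:

# equivalent construction: one row per ISO code, as a {code: {column: value}} dict
add_cols = (unreshape[['iso3code'] + cols]
            .drop_duplicates(subset='iso3code')
            .set_index('iso3code')
            .to_dict(orient='index'))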
A few countries in the World Bank data had ISO codes which differed from those in the original UN dataset; these are possibly outdated codes. I replaced them with the codes used in the UN dataset.
A number of countries in the new World Bank data were not present in the original UN dataset. I removed these to avoid errors.
In [36]:
undata_new['iso3code'] = undata_new['iso3code'].replace('ZAR', 'COD')
undata_new['iso3code'] = undata_new['iso3code'].replace('TMP', 'TLS')
In [39]:
missing_data = list(set(undata_new['iso3code']).difference(set(unreshape['iso3code'])))
sorted(missing_data)
Out[39]:
In [40]:
undata_new.shape
Out[40]:
In [41]:
for country in missing_data:
    undata_new = undata_new[undata_new['iso3code'] != country]
In [42]:
undata_new.shape
Out[42]:
These loops add the columns 'countryname', 'isdeveloped', 'mdgregions', 'ismdgcountry', 'gdppc2012', 'population2012', 'isldc2014', and 'islldc' to the World Bank dataframe so that it can be properly merged with the UN dataframe.
In [43]:
for col in cols:
    new_col = []
    for code in undata_new['iso3code']:
        new_col.append(add_cols[code][col])
    undata_new[col] = new_col

for col in cols:
    new_col = []
    for code in unreshape['iso3code']:
        new_col.append(add_cols[code][col])
    unreshape[col] = new_col
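The loops above are equivalent to mapping each ISO code through the dictionary with Series.map, for example:

# equivalent, loop-free version of the column assignments above
for col in cols:
    undata_new[col] = undata_new['iso3code'].map(lambda c: add_cols[c][col])
    unreshape[col] = unreshape['iso3code'].map(lambda c: add_cols[c][col])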
In [44]:
undata_new.shape
Out[44]:
In [45]:
undata_new.dtypes
Out[45]:
In [46]:
for series in set(undata_new['seriesname']):
    new_cols = deepcopy(undata_new[undata_new['seriesname'] == series])
    new_cols.rename(columns={'value': series}, inplace=True)
    new_cols = new_cols.drop(['seriesname'], axis=1)
    keys = ['countryname', 'iso3code', 'year', 'isdeveloped', 'mdgregions',
            'ismdgcountry', 'gdppc2012', 'population2012', 'isldc2014', 'islldc']
    unreshape = pd.merge(unreshape, new_cols, how='outer', on=keys)
unreshape.shape
Out[46]:
In [47]:
# replace '..' values with NaN
unreshape = unreshape.replace('..', np.NaN)
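Because the World Bank values were read in from CSV as strings, the merged series columns can end up with an object dtype even after the '..' placeholders are replaced. A sketch of one way to coerce them, assuming everything outside the key columns should be numeric:

# coerce non-key columns to numeric; anything unparseable becomes NaN
key_cols = ['countryname', 'iso3code', 'year', 'isdeveloped', 'mdgregions',
            'ismdgcountry', 'seriesname']
for col in unreshape.columns:
    if col not in key_cols:
        unreshape[col] = pd.to_numeric(unreshape[col], errors='coerce')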
In [48]:
# inspect how many observations are available for each variable
for col in sorted(unreshape.columns):
    print(unreshape[col].count(), col)
In [49]:
# drop rows with no value for countryname or region
unreshape = unreshape.dropna(subset=['mdgregions', 'countryname'])
In [50]:
# now that series have been transformed into columns, seriesname can be dropped
unreshape = unreshape.drop(['seriesname'], axis = 1)
In [51]:
with open('un_reshape.pkl', 'wb') as picklefile:
    pickle.dump(unreshape, picklefile)
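pandas also offers a one-line equivalent, if preferred:

# equivalent: let pandas handle the pickle protocol and file mode
unreshape.to_pickle('un_reshape.pkl')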