In [1]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
%matplotlib inline
Objectives:
- Import data into a pandas DataFrame
- Export a DataFrame to a csv file

Download a file called crossCountryIncomePerCapita.csv by visiting http://www.briancjenkins.com/data/international/ and following the link for: "GDP per capita (constant US 2005 PPP $, levels)"
In [2]:
# Use the requests module to download cross country GDP per capita
url = 'http://www.briancjenkins.com/data/international/csv/crossCountryIncomePerCapita.csv'
filename = 'crossCountryIncomePerCapita.csv'

# Bound the wait so a stalled server cannot hang the notebook indefinitely
r = requests.get(url, verify=True, timeout=30)
# Fail loudly on a 4xx/5xx response instead of saving an error page as the csv
r.raise_for_status()

# Write the raw response bytes to disk; the file is closed when the block exits
with open(filename, 'wb') as newFile:
    newFile.write(r.content)
In [3]:
# Read the downloaded csv into incomeDf, using the first column as the row index
incomeDf = pd.read_csv(
    filepath_or_buffer='crossCountryIncomePerCapita.csv',
    index_col=0,
)

# Preview: first five rows of incomeDf
print(incomeDf.iloc[:5])
In [4]:
# Show the column labels of incomeDf (one column per country)
country_columns = incomeDf.columns
print(country_columns)
In [5]:
# Number of countries in incomeDf = number of columns (second entry of .shape)
print(incomeDf.shape[1])
In [6]:
# Show the row labels of incomeDf (one row per year)
year_index = incomeDf.index
print(year_index)
In [7]:
# Number of years of data in incomeDf = number of rows (first entry of .shape)
print(incomeDf.shape[0])
In [8]:
# First five rows of the 'United States - USA' column of incomeDf
print(incomeDf.loc[:, 'United States - USA'].iloc[:5])
In [9]:
# Last five rows of the 'United States - USA' column of incomeDf
print(incomeDf.loc[:, 'United States - USA'].iloc[-5:])
In [10]:
# Line plot of US income per capita over the full sample (1960 to 2011)
usa_income = incomeDf['United States - USA']
plt.plot(usa_income.index, usa_income, linewidth=3, alpha=0.7)
plt.grid()
plt.ylabel('Dollars')
# Pin the horizontal axis to the first and last years in the data
plt.xlim(incomeDf.index[0], incomeDf.index[-1])
plt.title('Income per capita: United States')
Out[10]:
In [11]:
# Create a plot of income per capita from 1960 to 2011 for another country in the dataset
# Use the random module to randomly draw a value from the column titles of incomeDf
some_country = random.choice(incomeDf.columns)

# Column titles look like 'United States - USA'; keep only the name before the
# final ' - ' separator so the title is right whatever the length of the code
country_name = some_country.rsplit(' - ', 1)[0]

plt.plot(incomeDf[some_country].index, incomeDf[some_country], lw=3, alpha=0.7)
plt.grid()
plt.ylabel('Dollars')
# Pin the horizontal axis to the first and last years in the data
plt.xlim([incomeDf.index[0], incomeDf.index[-1]])
plt.title('Income per capita: ' + country_name)
Out[11]:
In [12]:
# Create a new variable called income60 equal to the 1960 row from incomeDf
# (.loc selects by label, so 1960 is matched against the year index)
income60 = incomeDf.loc[1960]

# Print the index of income60 (the labels, not the whole Series)
print(income60.index)
In [13]:
# Cross-country mean of income per capita in 1960
print('average income per capita in 1960: ', income60.mean())

# Cross-country standard deviation of income per capita in 1960.
# ddof=0 gives the population form, i.e. the square root of np.var's default variance.
print('standard deviation of income per capita in 1960:', income60.std(ddof=0))
In [14]:
# Five highest incomes per capita in 1960: sort descending, keep the first five entries
print(income60.sort_values(ascending=False).iloc[:5])
In [15]:
# Five lowest incomes per capita in 1960: sort ascending (the default), keep the first five entries
print(income60.sort_values().iloc[:5])
In [16]:
# income11 is the 2011 row of incomeDf, selected by its year label
income11 = incomeDf.loc[2011]

# Cross-country mean of income per capita in 2011
print('average income per capita in 2011: ', income11.mean())

# Cross-country standard deviation of income per capita in 2011.
# ddof=0 gives the population form, i.e. the square root of np.var's default variance.
print('standard deviation of income per capita in 2011:', income11.std(ddof=0))
In [17]:
# Five highest incomes per capita in 2011: sort descending, keep the first five entries
print(income11.sort_values(ascending=False).iloc[:5])
In [18]:
# Five lowest incomes per capita in 2011: sort ascending (the default), keep the first five entries
print(income11.sort_values().iloc[:5])
In [19]:
# Assemble growthDf: one row per entry of income60's index, with the 1960 and
# 2011 income per capita values side by side as columns
growth_columns = {
    'income 1960': income60,
    'income 2011': income11,
}
growthDf = pd.DataFrame(data=growth_columns, index=income60.index)
In [20]:
# Add a 'difference' column: income per capita in 2011 minus income per capita in 1960
growthDf['difference'] = growthDf['income 2011'].sub(growthDf['income 1960'])
Let $y_t$ denote income per capita for some country in some year $t$ and let $g$ denote the average annual growth in income per capita between years 0 and $T$. $g$ is defined by:
\begin{align}
y_T & = (1+g)^T y_0
\end{align}
which implies:
\begin{align}
g & = \left(\frac{y_T}{y_0}\right)^{1/T} - 1
\end{align}
Note that since our data run from 1960 to 2011, $T = 51$, which is also equal to `len(incomeDf.index)-1`.
In [21]:
# Add a 'growth' column: the average annual growth rate for each country
# between 1960 and 2011, g = (y_T / y_0)**(1/T) - 1

# T is one less than the number of rows (years) in incomeDf
T = len(incomeDf) - 1

income_ratio = growthDf['income 2011'].div(growthDf['income 1960'])
growthDf['growth'] = income_ratio.pow(1/T).sub(1)
In [22]:
# Preview: first five rows of growthDf
print(growthDf.iloc[:5])
In [23]:
# Five highest average annual growth rates: sort descending, keep the first five entries
print(growthDf['growth'].sort_values(ascending=False).iloc[:5])
In [24]:
# Five lowest average annual growth rates: sort ascending (the default), keep the first five entries
print(growthDf['growth'].sort_values().iloc[:5])
In [25]:
# Print the average annual growth rate of income per capita from 1960 to 2011.
# The label names the full 1960 to 2011 span because the statistic is a growth
# rate over that period, not a 2011 level.
print('average growth in income per capita from 1960 to 2011: ', np.mean(growthDf['growth']))
print()
# Print the standard deviation of the annual growth rate of income per capita
# from 1960 to 2011 (square root of np.var, which defaults to ddof=0)
print('standard deviation of growth in income per capita from 1960 to 2011:', np.sqrt(np.var(growthDf['growth'])))
In [26]:
# Scatter plot of initial income against subsequent growth:
#   horizontal axis: income per capita in 1960
#   vertical axis:   average annual growth rate, 1960 to 2011
# Points are drawn semi-transparent (alpha = 0.3) so overlapping countries stay visible.
plt.scatter(x=growthDf['income 1960'], y=growthDf['growth'], s=100, alpha=0.3)
plt.xlim(-1000, 20000)
plt.grid()
plt.xlabel('income per capita in 1960')
plt.ylabel('growth in income per capita\nfrom 1960 to 2011')
# The title reports how many rows (countries) growthDf holds
plt.title(f'income per capita versus growth for {len(growthDf)} countries')
Out[26]:
In [27]:
# Export the growthDf DataFrame to a csv file called 'growth_data.csv'
# (written to the current working directory; the index is saved as the first column)
growthDf.to_csv('growth_data.csv')