In [1]:

    
import requests
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

Class 6: More Pandas

Objectives:

Analize some cross-country GDP per capita data
Create a new DataFrame
Export a DataFrame to a csv file

Exercise: Cross-country income per capita statistics

Download a file called corssCountryIncomePerCapita.csv by visiting http://www.briancjenkins.com/data/international/ and following the link for: "GDP per capita (constant US 2005 PPP $, levels)"



In [2]:

    
# Use the requests module to download cross country GDP per capita
url = 'http://www.briancjenkins.com/data/international/csv/crossCountryIncomePerCapita.csv'
filename='crossCountryIncomePerCapita.csv'
r = requests.get(url,verify=True)

with open(filename,'wb') as newFile:
    
    newFile.write(r.content)



In [3]:

    
# Import the cross-country GDP data into a DataFrame called incomeDf with index_col=0
incomeDf = pd.read_csv('crossCountryIncomePerCapita.csv',index_col=0)

# Print the first five rows of incomeDf
print(incomeDf.head())









    



      Argentina - ARG  Australia - AUS  Austria - AUT  Bangladesh - BGD  \
year                                                                      
1960       2374.35805      12534.46265     8527.15975        1344.53631   
1961       2442.92853      12558.37869     8993.13422        1321.01632   
1962       2293.42091      12998.71801     9200.24846        1377.41183   
1963       2243.03616      13832.15347     9538.33838        1427.27810   
1964       2359.96841      14118.12087     9925.30068        1388.65044   

      Barbados - BRB  Belgium - BEL  Benin - BEN  Bolivia - BOL  \
year                                                              
1960      7452.94975     9259.07163    872.85321     1291.84641   
1961      7751.97503     9615.28509    877.09696     1335.75053   
1962      8080.90294    10056.05140    829.98223     1345.47941   
1963      8480.47368    10309.19080    846.89217     1397.78684   
1964      8159.19884    10952.67963    863.03304     1474.19237   

      Botswana - BWA  Brazil - BRA       ...        Tunisia - TUN  \
year                                     ...                        
1960       358.82249    1902.53245       ...           1338.01648   
1961       381.14701    2088.12578       ...           1436.05106   
1962       404.71385    2158.40684       ...           1432.59096   
1963       418.82307    2230.05748       ...           1539.26974   
1964       441.53648    2276.03838       ...           1520.38178   

      Turkey - TUR  Uganda - UGA  United Kingdom - GBR  \
year                                                     
1960    4034.88230     895.86465           10028.58681   
1961    3986.79466     860.05237           10236.88410   
1962    4113.18607     880.03861           10295.56741   
1963    4389.74825     926.36159           10592.26758   
1964    4466.95280    1013.20679           11123.29282   

      United Republic of Tanzania: Mainland - TZA  United States - USA  \
year                                                                     
1960                                    808.76931          15140.91585   
1961                                    817.70475          15262.48089   
1962                                    879.45183          15966.36299   
1963                                    930.85585          16406.56485   
1964                                   1037.90778          17108.68366   

      Uruguay - URY  Venezuela - VEN  Zambia - ZMB  Zimbabwe - ZWE  
year                                                                
1960     6377.31094       6894.31626    2875.21204      1748.01303  
1961     6405.48865       6799.56243    2727.09732      1794.26310  
1962     6456.85453       6838.95386    2608.08675      1726.02894  
1963     6182.08360       7070.81703    2688.18032      1708.91012  
1964     6361.93244       7472.00737    3062.45208      1610.33559  

[5 rows x 107 columns]



In [4]:

    
# Print the columns of incomeDf
print(incomeDf.columns)









    



Index(['Argentina - ARG', 'Australia - AUS', 'Austria - AUT',
       'Bangladesh - BGD', 'Barbados - BRB', 'Belgium - BEL', 'Benin - BEN',
       'Bolivia - BOL', 'Botswana - BWA', 'Brazil - BRA',
       ...
       'Tunisia - TUN', 'Turkey - TUR', 'Uganda - UGA', 'United Kingdom - GBR',
       'United Republic of Tanzania: Mainland - TZA', 'United States - USA',
       'Uruguay - URY', 'Venezuela - VEN', 'Zambia - ZMB', 'Zimbabwe - ZWE'],
      dtype='object', length=107)



In [5]:

    
# Print the number of countries represented in incomeDf
print(len(incomeDf.columns))



In [6]:

    
# Print the index of incomeDf
print(incomeDf.index)









    



Int64Index([1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970,
            1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981,
            1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992,
            1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
            2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011],
           dtype='int64', name='year')



In [7]:

    
# Print the number of years of data in incomeDf
print(len(incomeDf.index))



In [8]:

    
# Print the first five rows of the 'United States - USA 'column of incomeDf
print(incomeDf['United States - USA'].head())









    



year
1960    15140.91585
1961    15262.48089
1962    15966.36299
1963    16406.56485
1964    17108.68366
Name: United States - USA, dtype: float64



In [9]:

    
# Print the last five rows of the 'United States - USA' column of incomeDf
print(incomeDf['United States - USA'].tail())









    



year
2007    43715.28213
2008    42929.99502
2009    41465.62005
2010    42047.45150
2011    42426.36945
Name: United States - USA, dtype: float64



In [10]:

    
# Create a plot of income per capita from 1960 to 2011 for the US
plt.plot(incomeDf['United States - USA'].index,incomeDf['United States - USA'],lw=3,alpha = 0.7)
plt.grid()
plt.ylabel('Dollars')
plt.xlim([incomeDf.index[0],incomeDf.index[-1]])
plt.title('Income per capita: United States')









    Out[10]:





<matplotlib.text.Text at 0x1154fbf98>



In [11]:

    
# Create a plot of income per capita from 1960 to 2011 for another country in the dataset 


# Use the random module to randomly draw a value from the column titles of incomeDf
import random
some_country = random.choice(incomeDf.columns)

plt.plot(incomeDf[some_country].index,incomeDf[some_country],lw=3,alpha = 0.7)
plt.grid()
plt.ylabel('Dollars')
plt.xlim([incomeDf.index[0],incomeDf.index[-1]])
plt.title('Income per capita: '+some_country[:-6])









    Out[11]:





<matplotlib.text.Text at 0x1153a4198>



In [12]:

    
# Create a new variable called income60 equal to the 1960 row from incomeDf
income60 = incomeDf.loc[1960]

# Print the index of income60
print(income60)









    



Argentina - ARG                                 2374.35805
Australia - AUS                                12534.46265
Austria - AUT                                   8527.15975
Bangladesh - BGD                                1344.53631
Barbados - BRB                                  7452.94975
Belgium - BEL                                   9259.07163
Benin - BEN                                      872.85321
Bolivia - BOL                                   1291.84641
Botswana - BWA                                   358.82249
Brazil - BRA                                    1902.53245
Burkina Faso - BFA                               576.58114
Burundi - BDI                                    575.05689
Cameroon - CMR                                  1061.15810
Canada - CAN                                   11858.26614
Cape Verde - CPV                                 841.19532
Central African Republic - CAF                  1031.31880
Chad - TCD                                      1078.10670
Chile - CHL                                     4537.82473
China, People's Republic of - CHN               1031.00816
China: Hong Kong SAR - HKG                      3514.01216
Colombia - COL                                  3219.04904
Comoros - COM                                    878.20470
Congo - COG                                      836.95227
Costa Rica - CRI                                4174.28630
Cote d'Ivoire - CIV                             1526.24936
Cyprus - CYP                                    3053.45604
Democratic Republic of the Congo - COD          1183.82576
Denmark - DNK                                  10875.00779
Dominican Republic - DOM                        2056.13949
Ecuador - ECU                                   2113.40113
                                                  ...     
Panama - PAN                                    2419.14884
Paraguay - PRY                                  1414.77478
Peru - PER                                      2392.39407
Philippines - PHL                               1719.19745
Portugal - PRT                                  3713.70242
Republic of Korea - KOR                         1099.74644
Romania - ROU                                   1207.29389
Rwanda - RWA                                     850.66915
Senegal - SEN                                   1893.93382
Singapore - SGP                                 2507.68247
South Africa - ZAF                              4924.05532
Spain - ESP                                     5121.62619
Sri Lanka - LKA                                 2292.63616
Sweden - SWE                                   11086.47012
Switzerland - CHE                              17880.66844
Syrian Arab Republic - SYR                      1635.61014
Taiwan - TWN                                    2197.17626
Thailand - THA                                  1008.32626
Togo - TGO                                       836.26248
Trinidad and Tobago - TTO                       6374.55343
Tunisia - TUN                                   1338.01648
Turkey - TUR                                    4034.88230
Uganda - UGA                                     895.86465
United Kingdom - GBR                           10028.58681
United Republic of Tanzania: Mainland - TZA      808.76931
United States - USA                            15140.91585
Uruguay - URY                                   6377.31094
Venezuela - VEN                                 6894.31626
Zambia - ZMB                                    2875.21204
Zimbabwe - ZWE                                  1748.01303
Name: 1960, dtype: float64



In [13]:

    
# Print the average world income per capita in 1960
print('average income per capita in 1960:              ',np.mean(income60))

# Print the standard deviation in world income per capita in 1960
print('standard deviation of income per capita in 1960:',np.sqrt(np.var(income60)))









    



average income per capita in 1960:               3678.890915700932
standard deviation of income per capita in 1960: 3841.34100936



In [14]:

    
# Print the names of the five countries with the highest five incomes per capita in 1960
print(income60.sort_values(ascending=False).head())









    



Switzerland - CHE      17880.66844
Luxembourg - LUX       17104.97220
United States - USA    15140.91585
Australia - AUS        12534.46265
New Zealand - NZL      12007.53784
Name: 1960, dtype: float64



In [15]:

    
# Print the names of the five countries with the lowest five incomes per capita in 1960
print(income60.sort_values(ascending=True).head())









    



Botswana - BWA             358.82249
Mozambique - MOZ           391.21951
Equatorial Guinea - GNQ    404.96432
Mali - MLI                 420.69136
Ethiopia - ETH             432.37363
Name: 1960, dtype: float64



In [16]:

    
# Create a new variable called income11 equal to the 2011 row from incomeDf
income11 = incomeDf.loc[2011]

# Print the average world income per capita in 2011
print('average income per capita in 2011:              ',np.mean(income11))

# Print the standard deviation in world income per capita in 2011
print('standard deviation of income per capita in 2011:',np.sqrt(np.var(income11)))









    



average income per capita in 2011:               14345.862188504672
standard deviation of income per capita in 2011: 15324.9562137



In [17]:

    
# Print the names of the five countries with the highest five incomes per capita in 2011
print(income11.sort_values(ascending=False).head())









    



Luxembourg - LUX              78203.51249
Singapore - SGP               59149.46642
Norway - NOR                  52217.16995
China: Hong Kong SAR - HKG    44882.03114
Switzerland - CHE             44641.57380
Name: 2011, dtype: float64



In [18]:

    
# Print the names of the five countries with the lowest five incomes per capita in 2011
print(income11.sort_values(ascending=True).head())









    



Democratic Republic of the Congo - COD    404.14219
Burundi - BDI                             612.27458
Niger - NER                               646.85764
Central African Republic - CAF            764.89164
Madagascar - MDG                          932.46069
Name: 2011, dtype: float64

Creating a new `DataFrame`

Now we'll use our cross-country income per capita data to create a new DataFrame containing growth data.



In [19]:

    
# Create a DataFrame called growthDf with columns 'income 1960' and 'income 2011' equal to income per capita
# in 1960 and 2011 and an index equal to the index of income60
growthDf = pd.DataFrame({'income 1960':income60,'income 2011':income11},index=income60.index)



In [20]:

    
# Create a new column equal to the difference between 'income 2011' and 'income 1960' for each country
growthDf['difference'] = growthDf['income 2011']-growthDf['income 1960']

Let $y_t$ denotes income per capita for some country in some year $t$ and let $g$ denotes the average annual growth in income per capita between years 0 and $T$. $g$ is defined by: \begin{align} y_T & = (1+g)^T y_0 \end{align} which implies: \begin{align} g & = \left(\frac{y_T}{y_0}\right)^{1/T} - 1 \end{align} Note that since our data are from 1960 to 2011, $T = 51$. Which is also equal to len(incomeDf.index)-1.



In [21]:

    
# Create a new column equal to the average annual growth rate between for each country between 1960 and 2011
T = len(incomeDf.index) -1
growthDf['growth'] = (growthDf['income 2011']/growthDf['income 1960'])**(1/T) - 1



In [22]:

    
# Print the first five rows of growthDf
print(growthDf.head())









    



                  income 1960  income 2011   difference    growth
Argentina - ARG    2374.35805  15302.97358  12928.61553  0.037211
Australia - AUS   12534.46265  38322.50654  25788.04389  0.022155
Austria - AUT      8527.15975  37151.06454  28623.90479  0.029278
Bangladesh - BGD   1344.53631   1757.30744    412.77113  0.005263
Barbados - BRB     7452.94975  20586.55069  13133.60094  0.020122



In [23]:

    
# Print the names of the five countries with the highest average annual growth rates
print(growthDf['growth'].sort_values(ascending=False).head())









    



Botswana - BWA             0.075280
Equatorial Guinea - GNQ    0.067361
Republic of Korea - KOR    0.065051
Singapore - SGP            0.063935
Taiwan - TWN               0.054510
Name: growth, dtype: float64



In [24]:

    
# Print the names of the five countries with the lowest average annual growth rates
print(growthDf['growth'].sort_values(ascending=True).head())









    



Democratic Republic of the Congo - COD   -0.020853
Niger - NER                              -0.011847
Guinea - GIN                             -0.009272
Central African Republic - CAF           -0.005843
Zambia - ZMB                             -0.002752
Name: growth, dtype: float64



In [25]:

    
# Print the average annual growth rate of income per capita from 1960 to 2011
print('average growth in income per capita in 2011:              ',np.mean(growthDf['growth']))
print()

# Print the standard deviation of the annual growth rate of income per capita from 1960 to 2011
print('standard deviation of growth in income per capita in 2011:',np.sqrt(np.var(growthDf['growth'])))









    



average growth in income per capita in 2011:               0.02264690983222154

standard deviation of growth in income per capita in 2011: 0.0164334080634



In [26]:

    
# Construct a scatter plot:
#    Use the plt.scatter function
#    income per capita in 1960 on the horizontal axis and average annual growth rate on the vertical axis
#    Set the opacity of the points to something like 0.25 - 0.35 
#    Label the plot clearly with axis labels and a title

plt.scatter(growthDf['income 1960'],growthDf['growth'],s=100,alpha = 0.3)
plt.xlim([-1000,20000])
plt.grid()
plt.xlabel('income per capita in 1960')
plt.ylabel('growth in income per capita\nfrom 1960 to 2011')
plt.title('income per capita versus growth for '+str(len(growthDf.index))+' countries')









    Out[26]:





<matplotlib.text.Text at 0x11541c860>

Exporting a `DataFrame` to csv

Use the DataFrame method to_csv().



In [27]:

    
# Export the growthDf DataFrame to a csv file called 'growth_data.csv'
growthDf.to_csv('my_growth_data.csv')

Class 6: More Pandas

Exercise: Cross-country income per capita statistics

Creating a new DataFrame

Exporting a DataFrame to csv

Creating a new `DataFrame`

Exporting a `DataFrame` to csv