In [1]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Location of the World Development Indicators CSV. Set the WDI_INDICATORS_CSV
# environment variable to point at a different copy; when it is unset the
# original hard-coded path is used, so existing runs behave as before.
data_path = os.environ.get(
    'WDI_INDICATORS_CSV',
    r'C:\Users\hrao\Documents\Personal\HK\Python\world-development-indicators\Indicators.csv',
)
data = pd.read_csv(data_path)

In [3]:
# Dimensions of the loaded table as a (rows, columns) tuple.
data.shape


Out[3]:
(5656458, 6)

In [4]:
# Preview the first ten records of the raw table.
data.head(10)


Out[4]:
CountryName CountryCode IndicatorName IndicatorCode Year Value
0 Arab World ARB Adolescent fertility rate (births per 1,000 wo... SP.ADO.TFRT 1960 1.335609e+02
1 Arab World ARB Age dependency ratio (% of working-age populat... SP.POP.DPND 1960 8.779760e+01
2 Arab World ARB Age dependency ratio, old (% of working-age po... SP.POP.DPND.OL 1960 6.634579e+00
3 Arab World ARB Age dependency ratio, young (% of working-age ... SP.POP.DPND.YG 1960 8.102333e+01
4 Arab World ARB Arms exports (SIPRI trend indicator values) MS.MIL.XPRT.KD 1960 3.000000e+06
5 Arab World ARB Arms imports (SIPRI trend indicator values) MS.MIL.MPRT.KD 1960 5.380000e+08
6 Arab World ARB Birth rate, crude (per 1,000 people) SP.DYN.CBRT.IN 1960 4.769789e+01
7 Arab World ARB CO2 emissions (kt) EN.ATM.CO2E.KT 1960 5.956399e+04
8 Arab World ARB CO2 emissions (metric tons per capita) EN.ATM.CO2E.PC 1960 6.439635e-01
9 Arab World ARB CO2 emissions from gaseous fuel consumption (%... EN.ATM.CO2E.GF.ZS 1960 5.041292e+00

In [5]:
# Number of distinct values in the CountryName column.
len(data['CountryName'].drop_duplicates())


Out[5]:
247

In [6]:
# Number of distinct values in the CountryCode column.
len(data['CountryCode'].drop_duplicates())


Out[6]:
247

In [7]:
# Number of distinct values in the IndicatorCode column.
len(data['IndicatorCode'].drop_duplicates())


Out[7]:
1344

In [8]:
# Number of distinct values in the Year column.
len(data['Year'].drop_duplicates())


Out[8]:
56

In [9]:
# Report the first and last year in the table. The Series reductions are
# vectorised; the Python builtins min()/max() would walk every row one
# element at a time.
year_col = data['Year']
print('Range of Years:', year_col.min(), 'to', year_col.max())


Range of Years: 1960 to 2015

In [10]:
# Select the rows for one country and one indicator.
hist_country = 'USA'
# Compare the code for equality: str.contains() would treat it as a regex
# pattern and also match any code that merely contains it.
mask1 = data['CountryCode'] == hist_country

# Raw string: '\(' is a regex escape for a literal parenthesis and is not a
# valid Python string escape, so a plain literal triggers a warning.
hist_indicator = r'CO2 emissions \(metric'
mask2 = data['IndicatorName'].str.contains(hist_indicator)

stage = data[mask1 & mask2]

In [11]:
# Show the first five selected rows.
stage.head(5)


Out[11]:
CountryName CountryCode IndicatorName IndicatorCode Year Value
22232 United States USA CO2 emissions (metric tons per capita) EN.ATM.CO2E.PC 1960 15.999779
48708 United States USA CO2 emissions (metric tons per capita) EN.ATM.CO2E.PC 1961 15.681256
77087 United States USA CO2 emissions (metric tons per capita) EN.ATM.CO2E.PC 1962 16.013937
105704 United States USA CO2 emissions (metric tons per capita) EN.ATM.CO2E.PC 1963 16.482762
134742 United States USA CO2 emissions (metric tons per capita) EN.ATM.CO2E.PC 1964 16.968119

In [12]:
# Extract the year and value columns as arrays for plotting.
years, co2 = stage['Year'].values, stage['Value'].values

In [13]:
# Bar chart of the yearly values, labelled so the figure can be read on its own.
plt.bar(years, co2)
plt.xlabel('Year')
plt.ylabel(stage['IndicatorName'].iloc[0])
plt.title('CO2 Emissions in USA')
plt.show()



In [14]:
# Line chart of the selected indicator over time, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.plot(stage['Year'].values, stage['Value'].values)
ax.set_xlabel('Year')
ax.set_ylabel(stage['IndicatorName'].iloc[0])
ax.axis([1959, 2011, 0, 25])  # fixed view limits: [xmin, xmax, ymin, ymax]
ax.set_title('CO2 Emissions in USA')
plt.show()



In [15]:
# Values of the selected indicator as an array, used for the histogram below.
hist_data = np.asarray(stage['Value'])

In [16]:
# Number of observations going into the histogram.
hist_data.shape[0]


Out[16]:
52

In [17]:
# Histogram of the yearly values in ten bins. The old 'normed' keyword is no
# longer accepted by current Matplotlib; raw counts are the default, so the
# argument is simply omitted.
plt.hist(hist_data, bins=10, facecolor='green')
plt.xlabel(stage['IndicatorName'].iloc[0])
plt.ylabel('# of Years')
plt.title('Histogram Example')
plt.grid(True)
plt.show()



In [18]:
# Select one indicator across all entities for a single year.
# Raw string: '\(' is a regex escape for a literal parenthesis and is not a
# valid Python string escape.
hist_indicator = r'CO2 emissions \(metric'
hist_year = 2011

mask1 = data['IndicatorName'].str.contains(hist_indicator)
# A single year is compared directly instead of via a one-element isin() list.
mask2 = data['Year'] == hist_year

co2_2011 = data[mask1 & mask2]
co2_2011.head()


Out[18]:
CountryName CountryCode IndicatorName IndicatorCode Year Value
5026275 Arab World ARB CO2 emissions (metric tons per capita) EN.ATM.CO2E.PC 2011 4.724500
5026788 Caribbean small states CSS CO2 emissions (metric tons per capita) EN.ATM.CO2E.PC 2011 9.692960
5027295 Central Europe and the Baltics CEB CO2 emissions (metric tons per capita) EN.ATM.CO2E.PC 2011 6.911131
5027870 East Asia & Pacific (all income levels) EAS CO2 emissions (metric tons per capita) EN.ATM.CO2E.PC 2011 5.859548
5028456 East Asia & Pacific (developing only) EAP CO2 emissions (metric tons per capita) EN.ATM.CO2E.PC 2011 5.302499

In [19]:
# Number of rows selected for the chosen year.
co2_2011.shape[0]


Out[19]:
232

In [20]:
# Histogram of the single-year values, drawn entirely through the Axes
# created here rather than mixing it with the pyplot state machine.
fig, ax = plt.subplots()

# Arrow marking the USA; both end points are hard-coded data coordinates.
ax.annotate("USA", xy=(18, 5), xycoords='data',
            xytext=(18, 30), textcoords='data',
            arrowprops=dict(arrowstyle='->',
                            connectionstyle='arc3'))

# The old 'normed' keyword is no longer accepted by current Matplotlib; raw
# counts are the default, so the argument is omitted.
ax.hist(co2_2011['Value'], bins=10, facecolor='green')
ax.set_xlabel(co2_2011['IndicatorName'].iloc[0])
ax.set_ylabel('# of Countries')
ax.set_title('Histogram of CO2 Emissions Per Capita')
ax.grid(True)

plt.show()



In [21]:
# Select the GDP-per-capita rows for one country.
# Raw string: '\(' is a regex escape for a literal parenthesis and is not a
# valid Python string escape.
hist_indicator = r'GDP per capita \(constant 2005'
hist_country = 'USA'

mask1 = data['IndicatorName'].str.contains(hist_indicator)
# Compare the code for equality instead of treating it as a regex pattern.
mask2 = data['CountryCode'] == hist_country

gdp_stage = data[mask1 & mask2]

In [22]:
# First five GDP rows (five is the default for head()).
gdp_stage.head()


Out[22]:
CountryName CountryCode IndicatorName IndicatorCode Year Value
22282 United States USA GDP per capita (constant 2005 US$) NY.GDP.PCAP.KD 1960 15482.707760
48759 United States USA GDP per capita (constant 2005 US$) NY.GDP.PCAP.KD 1961 15578.409657
77142 United States USA GDP per capita (constant 2005 US$) NY.GDP.PCAP.KD 1962 16276.426685
105760 United States USA GDP per capita (constant 2005 US$) NY.GDP.PCAP.KD 1963 16749.789436
134798 United States USA GDP per capita (constant 2005 US$) NY.GDP.PCAP.KD 1964 17476.822248

In [23]:
# First five CO2 rows, shown for comparison with the GDP rows.
stage.head()


Out[23]:
CountryName CountryCode IndicatorName IndicatorCode Year Value
22232 United States USA CO2 emissions (metric tons per capita) EN.ATM.CO2E.PC 1960 15.999779
48708 United States USA CO2 emissions (metric tons per capita) EN.ATM.CO2E.PC 1961 15.681256
77087 United States USA CO2 emissions (metric tons per capita) EN.ATM.CO2E.PC 1962 16.013937
105704 United States USA CO2 emissions (metric tons per capita) EN.ATM.CO2E.PC 1963 16.482762
134742 United States USA CO2 emissions (metric tons per capita) EN.ATM.CO2E.PC 1964 16.968119

In [24]:
# Line chart of GDP per capita over time, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.plot(gdp_stage['Year'].values, gdp_stage['Value'].values)
ax.set_xlabel('Year')
ax.set_ylabel(gdp_stage['IndicatorName'].iloc[0])
ax.set_title('GDP Per Capita USA')
plt.show()



In [25]:
# Compare the year coverage of the two series before pairing them up.
gdp_years = gdp_stage['Year']
co2_years = stage['Year']
print('GDP Min Year', gdp_years.min(), "Max: ", gdp_years.max())
print('CO2 Min Year', co2_years.min(), "Max: ", co2_years.max())


GDP Min Year 1960 Max:  2014
CO2 Min Year 1960 Max:  2011

In [26]:
# Restrict the GDP series to the years that are present in the CO2 series,
# rather than relying on a hard-coded cutoff year.
gdp_stage_trunc = gdp_stage[gdp_stage['Year'].isin(stage['Year'])]
# The two lengths must match for the element-wise pairing done further down.
print(len(gdp_stage_trunc))
print(len(stage))


52
52

In [27]:
%matplotlib inline
import matplotlib.pyplot as plt

fig, axis = plt.subplots()

axis.yaxis.grid(True)
axis.set_title('CO2 Emissions vs GDP \(per capita\)', fontsize=10)
axis.set_xlabel(gdp_stage_trunc['IndicatorName'].iloc[0], fontsize=10)
axis.set_ylabel(stage['IndicatorName'].iloc[0], fontsize=10)

X = gdp_stage_trunc['Value']
Y = stage['Value']

axis.scatter(X,Y)

plt.show()



In [28]:
# Correlation matrix between the GDP values and the CO2 values.
np.corrcoef(gdp_stage_trunc['Value'].values, stage['Value'].values)


Out[28]:
array([[ 1.        ,  0.07676005],
       [ 0.07676005,  1.        ]])

Correlation analysis for different countries


In [29]:
# Another look at the first ten records of the raw table.
data.head(10)


Out[29]:
CountryName CountryCode IndicatorName IndicatorCode Year Value
0 Arab World ARB Adolescent fertility rate (births per 1,000 wo... SP.ADO.TFRT 1960 1.335609e+02
1 Arab World ARB Age dependency ratio (% of working-age populat... SP.POP.DPND 1960 8.779760e+01
2 Arab World ARB Age dependency ratio, old (% of working-age po... SP.POP.DPND.OL 1960 6.634579e+00
3 Arab World ARB Age dependency ratio, young (% of working-age ... SP.POP.DPND.YG 1960 8.102333e+01
4 Arab World ARB Arms exports (SIPRI trend indicator values) MS.MIL.XPRT.KD 1960 3.000000e+06
5 Arab World ARB Arms imports (SIPRI trend indicator values) MS.MIL.MPRT.KD 1960 5.380000e+08
6 Arab World ARB Birth rate, crude (per 1,000 people) SP.DYN.CBRT.IN 1960 4.769789e+01
7 Arab World ARB CO2 emissions (kt) EN.ATM.CO2E.KT 1960 5.956399e+04
8 Arab World ARB CO2 emissions (metric tons per capita) EN.ATM.CO2E.PC 1960 6.439635e-01
9 Arab World ARB CO2 emissions from gaseous fuel consumption (%... EN.ATM.CO2E.GF.ZS 1960 5.041292e+00

In [30]:
# Select the CO2-per-capita rows for a second country.
# Raw string: '\(' is a regex escape for a literal parenthesis and is not a
# valid Python string escape.
hist_indicator = r'CO2 emissions \(metric'
hist_country = 'IND'

mask1 = data['IndicatorName'].str.contains(hist_indicator)
# Compare the code for equality instead of treating it as a regex pattern.
mask2 = data['CountryCode'] == hist_country

ind_data = data[mask1 & mask2]

In [31]:
# Show the first five selected rows.
ind_data.head(5)


Out[31]:
CountryName CountryCode IndicatorName IndicatorCode Year Value
11577 India IND CO2 emissions (metric tons per capita) EN.ATM.CO2E.PC 1960 0.268161
36513 India IND CO2 emissions (metric tons per capita) EN.ATM.CO2E.PC 1961 0.284292
64049 India IND CO2 emissions (metric tons per capita) EN.ATM.CO2E.PC 1962 0.306519
92493 India IND CO2 emissions (metric tons per capita) EN.ATM.CO2E.PC 1963 0.322533
121290 India IND CO2 emissions (metric tons per capita) EN.ATM.CO2E.PC 1964 0.308900

In [32]:
# Histogram of the yearly values in ten bins. The old 'normed' keyword is no
# longer accepted by current Matplotlib; raw counts are the default, so the
# argument is omitted.
plt.hist(ind_data['Value'], bins=10, facecolor='green')

plt.xlabel(ind_data['IndicatorName'].iloc[0])
plt.ylabel('# of Years')

plt.title('Histogram of CO2 Emissions')

# Turn the grid on explicitly instead of relying on the no-argument call.
plt.grid(True)
plt.show()



In [33]:
# Select one indicator across all entities for a single year.
# Raw string: '\(' is a regex escape for a literal parenthesis and is not a
# valid Python string escape.
hist_indicator = r'CO2 emissions \(metric'
hist_year = 2011

mask1 = data['IndicatorName'].str.contains(hist_indicator)
# A single year is compared directly instead of via a one-element isin() list.
mask2 = data['Year'] == hist_year

co2_2011_dev = data[mask1 & mask2]

In [34]:
# Show the first five selected rows.
co2_2011_dev.head(5)


Out[34]:
CountryName CountryCode IndicatorName IndicatorCode Year Value
5026275 Arab World ARB CO2 emissions (metric tons per capita) EN.ATM.CO2E.PC 2011 4.724500
5026788 Caribbean small states CSS CO2 emissions (metric tons per capita) EN.ATM.CO2E.PC 2011 9.692960
5027295 Central Europe and the Baltics CEB CO2 emissions (metric tons per capita) EN.ATM.CO2E.PC 2011 6.911131
5027870 East Asia & Pacific (all income levels) EAS CO2 emissions (metric tons per capita) EN.ATM.CO2E.PC 2011 5.859548
5028456 East Asia & Pacific (developing only) EAP CO2 emissions (metric tons per capita) EN.ATM.CO2E.PC 2011 5.302499

In [35]:
# Histogram of the single-year values in ten bins. The old 'normed' keyword is
# no longer accepted by current Matplotlib; raw counts are the default, so the
# argument is omitted.
plt.hist(co2_2011_dev['Value'], bins=10, facecolor='green')

plt.xlabel(co2_2011_dev['IndicatorName'].iloc[0])
plt.ylabel('# of Countries')
plt.title('Histogram of CO2 Emissions Per Capita')

# Arrow marking India; both end points are hard-coded data coordinates.
# NOTE(review): confirm x=0.3 matches the country's value for this year.
plt.annotate("IND",
             xy=(0.3, 135), xycoords='data',
             xytext=(0.3, 160), textcoords='data',
             arrowprops=dict(arrowstyle='->',
                             connectionstyle='arc3'))

plt.grid(True)
plt.show()



In [36]:
# Select the GDP-per-capita rows for the second country.
# Raw string: '\(' is a regex escape for a literal parenthesis and is not a
# valid Python string escape.
hist_indicator = r'GDP per capita \(constant 2005'
hist_country = 'IND'

mask1 = data['IndicatorName'].str.contains(hist_indicator)
# Compare the code for equality instead of treating it as a regex pattern.
mask2 = data['CountryCode'] == hist_country

gdp_dev_stage = data[mask1 & mask2]

In [37]:
# Show the first five selected rows.
gdp_dev_stage.head(5)


Out[37]:
CountryName CountryCode IndicatorName IndicatorCode Year Value
11616 India IND GDP per capita (constant 2005 US$) NY.GDP.PCAP.KD 1960 228.304470
36555 India IND GDP per capita (constant 2005 US$) NY.GDP.PCAP.KD 1961 232.142053
64095 India IND GDP per capita (constant 2005 US$) NY.GDP.PCAP.KD 1962 234.166685
92540 India IND GDP per capita (constant 2005 US$) NY.GDP.PCAP.KD 1963 243.176418
121337 India IND GDP per capita (constant 2005 US$) NY.GDP.PCAP.KD 1964 255.963668

In [38]:
# Line chart of GDP per capita over time, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.plot(gdp_dev_stage['Year'], gdp_dev_stage['Value'])
ax.set_xlabel('Year')
ax.set_ylabel(gdp_dev_stage['IndicatorName'].iloc[0])
ax.set_title('GDP Per Capita IND')

plt.show()



In [39]:
# Compare the year coverage of the two series before pairing them up.
print('GDP Min Year = ', gdp_dev_stage['Year'].min(), 'Max: ', gdp_dev_stage['Year'].max())
# This line reports the CO2 series (ind_data), so it is labelled as such.
print('CO2 Min Year = ', ind_data['Year'].min(), 'Max: ', ind_data['Year'].max())


GDP Min Year =  1960 Max:  2014
GDP Min Year =  1960 Max:  2011

In [40]:
# Restrict the GDP series to the years that are present in the CO2 series,
# rather than relying on a hard-coded cutoff year.
gdp_dev_stage_trunc = gdp_dev_stage[gdp_dev_stage['Year'].isin(ind_data['Year'])]

In [41]:
# Print both series lengths, one per line, to check that they match.
print(len(gdp_dev_stage_trunc), len(ind_data), sep='\n')


52
52

In [46]:
%matplotlib inline
import matplotlib.pyplot as plt

fig, axis = plt.subplots()

axis.yaxis.grid(True)
axis.set_xlabel(gdp_dev_stage_trunc['IndicatorName'].iloc[0], fontsize=12)
axis.set_ylabel(ind_data['IndicatorName'].iloc[0], fontsize=12)
axis.set_title('CO2 Emissions vs GDP (per capita)', fontsize = 12)

X = gdp_dev_stage_trunc['Value']
Y = ind_data['Value']

axis.scatter(X, Y)
plt.show()



In [43]:
# Correlation matrix between the GDP values and the CO2 values.
np.corrcoef(gdp_dev_stage_trunc['Value'].values, ind_data['Value'].values)


Out[43]:
array([[ 1.        ,  0.96753758],
       [ 0.96753758,  1.        ]])