In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
In [2]:
# Location of the World Development Indicators CSV; this is an absolute,
# machine-specific path and must be adjusted when running elsewhere.
indicators_csv = r'C:\Users\hrao\Documents\Personal\HK\Python\world-development-indicators\Indicators.csv'
data = pd.read_csv(indicators_csv)
In [3]:
# Dimensions of the loaded table as (rows, columns).
data.shape
Out[3]:
In [4]:
# Preview the first ten rows of the loaded table.
data.head(n=10)
Out[4]:
In [5]:
# Number of distinct country names (a missing value, if any, counts as one).
data['CountryName'].nunique(dropna=False)
Out[5]:
In [6]:
# Number of distinct country codes (a missing value, if any, counts as one).
data['CountryCode'].nunique(dropna=False)
Out[6]:
In [7]:
# Number of distinct indicator codes (a missing value, if any, counts as one).
data['IndicatorCode'].nunique(dropna=False)
Out[7]:
In [8]:
# Number of distinct years present in the table.
data['Year'].nunique(dropna=False)
Out[8]:
In [9]:
# First and last year covered by the table. Series.min()/max() are vectorized,
# unlike the builtin min()/max(), which iterate over every row in Python.
print('Range of Years:', data['Year'].min(), 'to', data['Year'].max())
In [10]:
# Select the USA rows of the indicator whose name contains
# "CO2 emissions (metric".
hist_country = 'USA'
# Country codes are compared for equality: a regex substring search is not
# needed to pick one specific code.
mask1 = data['CountryCode'] == hist_country
# str.contains treats the pattern as a regex, so the literal "(" is escaped.
# The raw string keeps the backslash without relying on Python's handling of
# the invalid "\(" escape, which emits a warning on modern interpreters.
hist_indicator = r'CO2 emissions \(metric'
mask2 = data['IndicatorName'].str.contains(hist_indicator)
stage = data[mask1 & mask2]
In [11]:
# Preview the first rows of the filtered USA CO2 subset.
stage.head()
Out[11]:
In [12]:
# Pull the two columns to plot out of the USA CO2 subset as NumPy arrays.
years, co2 = (stage[column].values for column in ('Year', 'Value'))
In [13]:
# Bar chart of the USA CO2 series: one bar per year, bar height is the value.
plt.bar(x=years, height=co2)
plt.show()
In [14]:
# Line chart of the USA CO2 series, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.plot(stage['Year'].values, stage['Value'].values)
ax.set_xlabel('Year')
# The y label is taken from the data so it always matches the plotted indicator.
ax.set_ylabel(stage['IndicatorName'].iloc[0])
ax.set_title('CO2 Emissions in USA')
# Fixed viewport: [xmin, xmax, ymin, ymax].
ax.axis([1959, 2011, 0, 25])
plt.show()
In [15]:
# Values of the USA CO2 subset as a NumPy array; used as the histogram input below.
hist_data = stage['Value'].values
In [16]:
# Number of observations that go into the histogram.
len(hist_data)
Out[16]:
In [17]:
# Histogram of the USA CO2 values in 10 bins, showing raw counts.
# `density=False` replaces the `normed` keyword, which current matplotlib
# releases no longer accept.
plt.hist(hist_data, bins=10, density=False, facecolor='green')
plt.xlabel(stage['IndicatorName'].iloc[0])
plt.ylabel('# of Years')
plt.title('Histogram Example')
plt.grid(True)
plt.show()
In [18]:
# Select every row of the CO2 indicator for the year 2011.
# Raw string: the pattern is a regex with an escaped literal "(", and "\(" in
# a normal string literal is an invalid escape that triggers a warning.
hist_indicator = r'CO2 emissions \(metric'
hist_year = 2011
mask1 = data['IndicatorName'].str.contains(hist_indicator)
mask2 = data['Year'].isin([hist_year])
co2_2011 = data[mask1 & mask2]
co2_2011.head()
Out[18]:
In [19]:
# Number of rows in the 2011 CO2 subset.
len(co2_2011)
Out[19]:
In [20]:
# Histogram of 2011 CO2 values across all rows of the indicator, with an
# arrow marking the USA.
fig, ax = plt.subplots()
# Annotation coordinates are hand-picked data units; presumably they point at
# the bin that contains the USA value -- verify against the rendered figure.
ax.annotate("USA", xy=(18, 5), xycoords='data',
            xytext=(18, 30), textcoords='data',
            arrowprops=dict(arrowstyle='->',
                            connectionstyle='arc3'))
# `density=False` replaces the `normed` keyword, which current matplotlib
# releases no longer accept.
ax.hist(co2_2011['Value'], bins=10, density=False, facecolor='green')
# Label the x axis from the data, matching the other CO2 histograms in this notebook.
ax.set_xlabel(co2_2011['IndicatorName'].iloc[0])
ax.set_ylabel('# of Countries')
ax.set_title('Histogram of CO2 Emissions Per Capita')
ax.grid(True)
plt.show()
In [21]:
# Select the USA rows of the indicator whose name contains
# "GDP per capita (constant 2005".
# Raw string: the pattern is a regex with an escaped literal "(", and "\(" in
# a normal string literal is an invalid escape that triggers a warning.
hist_indicator = r'GDP per capita \(constant 2005'
hist_country = 'USA'
mask1 = data['IndicatorName'].str.contains(hist_indicator)
# Country codes are compared for equality rather than by regex substring search.
mask2 = data['CountryCode'] == hist_country
gdp_stage = data[mask1 & mask2]
In [22]:
# Preview the first five rows of the USA GDP subset.
gdp_stage.head(n = 5)
Out[22]:
In [23]:
# First five rows of the USA CO2 subset, presumably shown for comparison with the GDP preview.
stage.head(n = 5)
Out[23]:
In [24]:
# Line chart of USA GDP per capita over time, via the explicit Axes interface.
fig, ax = plt.subplots()
ax.plot(gdp_stage['Year'].values, gdp_stage['Value'].values)
ax.set_xlabel('Year')
# The y label is read from the data so it matches the plotted indicator.
ax.set_ylabel(gdp_stage['IndicatorName'].iloc[0])
ax.set_title('GDP Per Capita USA')
plt.show()
In [25]:
# Compare the year coverage of the two USA series before pairing them.
gdp_years = gdp_stage['Year']
co2_years = stage['Year']
print('GDP Min Year', gdp_years.min(), "Max: ", gdp_years.max())
print('CO2 Min Year', co2_years.min(), "Max: ", co2_years.max())
In [26]:
# Drop GDP rows from 2012 onwards, then print both row counts so they can be
# compared before the two series are paired.
before_2012 = gdp_stage['Year'] < 2012
gdp_stage_trunc = gdp_stage.loc[before_2012]
for frame in (gdp_stage_trunc, stage):
    print(len(frame))
In [27]:
%matplotlib inline
import matplotlib.pyplot as plt
fig, axis = plt.subplots()
axis.yaxis.grid(True)
axis.set_title('CO2 Emissions vs GDP \(per capita\)', fontsize=10)
axis.set_xlabel(gdp_stage_trunc['IndicatorName'].iloc[0], fontsize=10)
axis.set_ylabel(stage['IndicatorName'].iloc[0], fontsize=10)
X = gdp_stage_trunc['Value']
Y = stage['Value']
axis.scatter(X,Y)
plt.show()
In [28]:
# Correlation coefficient matrix between USA GDP per capita and CO2 values.
# Values are paired by position, so this assumes both series cover the same
# years in the same order.
np.corrcoef(gdp_stage_trunc['Value'], stage['Value'])
Out[28]:
In [29]:
# Preview the first ten rows of the full table again.
data.head(n=10)
Out[29]:
In [30]:
# Select the India (IND) rows of the indicator whose name contains
# "CO2 emissions (metric".
# Raw string: the pattern is a regex with an escaped literal "(", and "\(" in
# a normal string literal is an invalid escape that triggers a warning.
hist_indicator = r'CO2 emissions \(metric'
hist_country = 'IND'
mask1 = data['IndicatorName'].str.contains(hist_indicator)
# Country codes are compared for equality rather than by regex substring search.
mask2 = data['CountryCode'] == hist_country
ind_data = data[mask1 & mask2]
In [31]:
# Preview the first rows of the India CO2 subset.
ind_data.head()
Out[31]:
In [32]:
# Histogram of the India CO2 values in 10 bins, showing raw counts.
# `density=False` replaces the `normed` keyword, which current matplotlib
# releases no longer accept.
plt.hist(ind_data['Value'], bins=10, density=False, facecolor='green')
plt.xlabel(ind_data['IndicatorName'].iloc[0])
plt.ylabel('# of Years')
plt.title('Histogram of CO2 Emissions')
plt.grid()
plt.show()
In [33]:
# Select every row of the CO2 indicator for the year 2011.
# Raw string: the pattern is a regex with an escaped literal "(", and "\(" in
# a normal string literal is an invalid escape that triggers a warning.
hist_indicator = r'CO2 emissions \(metric'
hist_year = 2011
mask1 = data['IndicatorName'].str.contains(hist_indicator)
mask2 = data['Year'].isin([hist_year])
co2_2011_dev = data[mask1 & mask2]
In [34]:
# Preview the first rows of the 2011 CO2 subset.
co2_2011_dev.head()
Out[34]:
In [35]:
# Histogram of 2011 CO2 values across all rows of the indicator, with an
# arrow marking India.
# `density=False` replaces the `normed` keyword, which current matplotlib
# releases no longer accept.
plt.hist(co2_2011_dev['Value'], bins=10, density=False, facecolor='green')
plt.xlabel(co2_2011_dev['IndicatorName'].iloc[0])
plt.ylabel('# of Countries')
plt.title('Histogram of CO2 Emissions Per Capita')
# Annotation coordinates are hand-picked data units; presumably they point at
# the bin that contains India's value -- verify against the rendered figure.
plt.annotate("IND",
             xy=(0.3, 135), xycoords='data',
             xytext=(0.3, 160), textcoords='data',
             arrowprops=dict(arrowstyle='->',
                             connectionstyle='arc3'))
plt.grid(True)
plt.show()
In [36]:
# Select the India (IND) rows of the indicator whose name contains
# "GDP per capita (constant 2005".
# Raw string: the pattern is a regex with an escaped literal "(", and "\(" in
# a normal string literal is an invalid escape that triggers a warning.
hist_indicator = r'GDP per capita \(constant 2005'
hist_country = 'IND'
mask1 = data['IndicatorName'].str.contains(hist_indicator)
# Country codes are compared for equality rather than by regex substring search.
mask2 = data['CountryCode'] == hist_country
gdp_dev_stage = data[mask1 & mask2]
In [37]:
# Preview the first rows of the India GDP subset.
gdp_dev_stage.head()
Out[37]:
In [38]:
# Line chart of India's GDP per capita over time, via the explicit Axes interface.
fig, ax = plt.subplots()
ax.plot(gdp_dev_stage['Year'], gdp_dev_stage['Value'])
ax.set_xlabel('Year')
# The y label is read from the data so it matches the plotted indicator.
ax.set_ylabel(gdp_dev_stage['IndicatorName'].iloc[0])
ax.set_title('GDP Per Capita IND')
plt.show()
In [39]:
# Compare the year coverage of the two India series before pairing them.
print('GDP Min Year = ', gdp_dev_stage['Year'].min(), 'Max: ', gdp_dev_stage['Year'].max())
# This line reports the CO2 frame, so it is labelled CO2 (it was previously
# mislabelled as GDP).
print('CO2 Min Year = ', ind_data['Year'].min(), 'Max: ', ind_data['Year'].max())
In [40]:
# Keep only the India GDP rows from before 2012.
gdp_dev_stage_trunc = gdp_dev_stage.loc[gdp_dev_stage['Year'] < 2012]
In [41]:
# Row counts of the truncated GDP subset and the CO2 subset, one per line.
print(len(gdp_dev_stage_trunc), len(ind_data), sep='\n')
In [46]:
%matplotlib inline
import matplotlib.pyplot as plt
fig, axis = plt.subplots()
axis.yaxis.grid(True)
axis.set_xlabel(gdp_dev_stage_trunc['IndicatorName'].iloc[0], fontsize=12)
axis.set_ylabel(ind_data['IndicatorName'].iloc[0], fontsize=12)
axis.set_title('CO2 Emissions vs GDP (per capita)', fontsize = 12)
X = gdp_dev_stage_trunc['Value']
Y = ind_data['Value']
axis.scatter(X, Y)
plt.show()
In [43]:
# Correlation coefficient matrix between India's GDP per capita and CO2 values.
# Values are paired by position, so this assumes both series cover the same
# years in the same order.
np.corrcoef(gdp_dev_stage_trunc['Value'], ind_data['Value'])
Out[43]: