In [1]:
# numerical python package to do fast computations.
import numpy as np
import pandas as pd

# plotting library to visualise the data.
import matplotlib.pyplot as plt

# Instruction to matplotlib to display plots inside the Jupyter Notebook itself.
%matplotlib inline

In [ ]:
# Number of synthetic samples. The x grid below is derived from it, so the
# sample count and the grid can never drift apart.
data_points_count = 50

# Fixed seed: the random slope, intercept and noise (and therefore the plot)
# are identical on every Restart & Run All.
np.random.seed(0)

# Evenly spaced x values on [0, 10] (replaces the pasted 50-value literal,
# which was the rounded output of this same call).
x = np.linspace(0, 10, data_points_count)

# Random slope in [0, 5) and intercept in [0, 10), kept as 1-element arrays.
m = np.random.random(1) * 5
c = np.random.random(1) * 10

# Straight line plus uniform noise in [0, 3.5).
y = m * x + c + 3.5 * np.random.random(data_points_count)

# Non-linear variant: sine + linear + quadratic terms plus uniform noise in [0, 12).
y2 = 5 * np.sin(x) + 1 * x + 0.1 * x * x + 12 * np.random.random(data_points_count)

print(m, c)

# Draw the noisy linear samples as unconnected red star markers on a wide canvas.
plt.figure(figsize=(15, 8))
plt.plot(x, y, marker='*', color='r', linestyle='None')

In [ ]:
%%sh

# wget http://mospi.nic.in/sites/default/files/logo.png
# wget https://upload.wikimedia.org/wikipedia/commons/thumb/e/e7/G20.svg/863px-G20.svg.png

INDIA, G-20 AND THE WORLD - Statistical Year Book India 2016

Navigation Path: Home > Statistical Year Book India 2016 > INDIA, G-20 AND THE WORLD

The G20 (or G-20 or Group of Twenty) is an international forum for the governments and central bank governors from 20 major economies. It was founded in 1999 with the aim of studying, reviewing, and promoting high-level discussion of policy issues pertaining to the promotion of international financial stability.[3] It seeks to address issues that go beyond the responsibilities of any one organization.[3] The G20 heads of government or heads of state have periodically conferred at summits since their initial meeting in 2008, and the group also hosts separate meetings of finance ministers and central bank governors.

The members include 19 individual countries along with the European Union (EU). The EU is represented by the European Commission and by the European Central Bank. Collectively, the G20 economies account for around 85% of the gross world product (GWP), 80% of world trade (or, if excluding EU intra-trade, 75%), and two-thirds of the world population.[2]

Data Source:

References:

Data Gathering

  • Country
  • Area
  • Population (Millions)
  • GDP Billions (USD)
  • Gross Domestic Product Per Capita Income at Current Price (USD)
  • Gross domestic product based on Purchasing-Power-Parity (PPP) valuation of Country GDP in Billions (Current International Dollar)

In [ ]:
%%sh

# ls -l ~/Downloads/G20*csv

# mv ~/Downloads/G20*csv G20.csv

Data Cleanup


In [93]:
# Load the G20 table.
# NOTE(review): 'G20.csv' is expected in the working directory; the cell that
# would create it only contains commented-out shell commands — confirm its source.
data = pd.read_csv('G20.csv')

# Columns converted to float below: Area plus the 2010-2015 series of each metric,
# in the order Population, GDP, GDP_PCI, GDP_PPP.
cols = ['Area'] + [
    '%s_%d' % (prefix, year)
    for prefix in ('Population', 'GDP', 'GDP_PCI', 'GDP_PPP')
    for year in range(2010, 2016)
]


def _to_float(cell):
    """Return `cell` as a float after removing every ',' from its string form.

    The commas are presumably thousands separators in the CSV — TODO confirm.
    Raises ValueError when the remaining text is not a valid float.
    """
    return float(str(cell).replace(',', ''))


# Element-wise conversion done column by column with Series.map. This gives the
# same result as DataFrame.applymap, which newer pandas releases deprecate.
data[cols] = data[cols].apply(lambda column: column.map(_to_float))

# Sorted list of the distinct country names.
all_countries = sorted(data.Country.unique())

# Bound list.index: country_labler(name) returns the position of `name` in
# all_countries and raises ValueError for an unknown name.
country_labler = all_countries.index
# country_labler('India')
# data.Country = data.Country.map(country_labler)

In [103]:
def _metric_columns(prefix):
    """Return the yearly column names of one metric, e.g. 'GDP' -> ['GDP_2010', ..., 'GDP_2015']."""
    return ['%s_%d' % (prefix, year) for year in range(2010, 2016)]


def _metric_frame(prefix):
    """Return an independent copy of `data` with Area, Country and one metric's yearly columns."""
    return data[['Area', 'Country'] + _metric_columns(prefix)].copy()


# The original cell started with a bare `sorted(data.columns.tolist())`; its value
# was discarded (it was not the last expression of the cell), so it is dropped.

cols1 = _metric_columns('GDP')
cols2 = _metric_columns('GDP_PPP')

# NOTE(review): left empty in the original; presumably meant for the GDP_PCI
# columns — confirm before filling it in.
cols3 = []

# One frame per metric, each carrying Area and Country alongside the yearly values.
data1 = _metric_frame('GDP')
data2 = _metric_frame('GDP_PPP')
data3 = _metric_frame('GDP_PCI')
data4 = _metric_frame('Population')

Experiments


In [95]:
import sklearn.cluster

# Agglomerative clustering of the data1 rows into five groups, using the six
# yearly GDP columns as features.
clf = sklearn.cluster.AgglomerativeClustering(n_clusters=5)

pred = clf.fit_predict(
    data1[['GDP_2010', 'GDP_2011', 'GDP_2012', 'GDP_2013', 'GDP_2014', 'GDP_2015']]
)

# Cluster id assigned to each row, in row order.
pred


Out[95]:
array([3, 3, 0, 0, 2, 0, 0, 0, 3, 0, 4, 3, 0, 3, 3, 3, 3, 0, 1, 1])

In [ ]:
# Distinct metric names present in the frame.
# NOTE(review): `new_data` is not created in any cell visible here — confirm
# where it is defined before relying on Restart & Run All.
new_data['metric'].unique()

In [ ]:
# Independent copy of the first 20 rows, shown as the cell output.
new_data.iloc[:20].copy()

In [ ]:
# Separate each metric name into its parameter and its year.
# Presumably the names look like 'GDP_PCI_2010' (the wide table's column names) —
# TODO confirm against the cell that builds new_data.
#
# Only the LAST underscore is split on, so a multi-word parameter keeps its inner
# underscores. The previous rsplit('_') + ''.join(...) glued the pieces together
# (e.g. 'GDP_PCI_2010' -> 'GDPPCI'). Single-word names such as 'GDP_2010' are
# unaffected.
#
# The guard makes the cell safe to re-run: once 'metric' has been dropped there
# is nothing left to split.
if 'metric' in new_data.columns:
    _metric_parts = new_data['metric'].str.rsplit('_', n=1)
    new_data['year'] = _metric_parts.str[-1].astype(int)
    new_data['param'] = _metric_parts.str[0]

    # 'metric' is now redundant with year + param.
    new_data.drop('metric', axis=1, inplace=True)

# TODO: the original cell ended with an unfinished "converting data into integers" step.

In [ ]:
# Distinct countries and parameters available in new_data.
print('Country', new_data.country.unique())
# The label was 'Country' on this line too, although it lists the parameters.
print('Param', new_data.param.unique())

In [ ]:
# Independent copy of the rows where country is 'USA' and param is 'GDP'.
temp = new_data.loc[
    (new_data['country'] == 'USA') & (new_data['param'] == 'GDP')
].copy()

temp

In [ ]:
# Despite their names, X_Label holds the country and Y_Label the parameter
# being plotted; the names are kept because other cells assign them too.
Y_Label = 'GDP'

# One figure per country (replaces two copy-pasted blocks). After the loop the
# globals X_Label, temp, _x and _y hold the values of the last country, exactly
# as they did after the original second block.
for X_Label in ('USA', 'European Union'):
    plt.figure(figsize=(15, 5))

    temp = new_data[(new_data.country == X_Label) & (new_data.param == Y_Label)].copy(deep=True)
    _x, _y = temp.year.values, temp.value.values
    plt.plot(_x, _y)
    # One tick per year, labelled with the year itself.
    plt.xticks(_x, [str(year) for year in _x])

    # Title and axis labels so each figure can be read on its own.
    plt.title('%s - %s' % (X_Label, Y_Label))
    plt.xlabel('Year')
    plt.ylabel(Y_Label)

In [ ]:
# Plot one country's series for one parameter against year.
# X_Label carries the country name and Y_Label the parameter name.
X_Label, Y_Label = 'USA', 'GDP'
plt.figure(figsize=(15, 5))

temp = new_data.loc[
    (new_data['country'] == X_Label) & (new_data['param'] == Y_Label)
].copy()
_x = temp['year'].values
_y = temp['value'].values
plt.plot(_x, _y)
# One tick per year, labelled with the year's string form.
plt.xticks(_x, map(str, _x))

In [ ]:
# Show the y-values currently held in `_y` (assigned by the plotting cells above).
_y

In [ ]:
# Shift the series so its smallest value becomes 0. Only the result is displayed;
# `_y` itself is left unchanged.
_y - _y.min()

In [ ]:
# Despite their names, Y_Label holds the parameter and X_Label the country.
Y_Label = 'Population'
plt.figure(figsize=(15, 8))

# NOTE(review): this rebinds `all_countries`, which the data-cleanup cell set to
# the sorted list of every country; here it becomes the first five unique values
# of new_data.country.
all_countries = new_data.country.unique()[:5]

for X_Label in all_countries:
    temp = new_data[(new_data.country == X_Label) & (new_data.param == Y_Label)].copy(deep=True)
    if temp.empty:
        # No rows for this country/parameter: skip it, since min() of an empty
        # array would raise ValueError.
        continue
    _x, _y = temp.year.values, temp.value.values
    # Plot each series relative to its own minimum, presumably so that countries
    # of very different size can share one axis.
    _y = _y - _y.min()
    # label= ties every legend entry to its own line instead of to plot order.
    plt.plot(_x, _y, label=X_Label)
    plt.xticks(_x, [str(year) for year in _x])

plt.title('%s by country' % Y_Label)
plt.xlabel('Year')
plt.ylabel("%s (difference from each country's minimum)" % Y_Label)
plt.legend()

Ideas

  • Show top 5 countries
  • Show only comparable countries

In [164]:
# Country name -> three-letter code. The codes are later stored in
# data1['Country_Codes'] and used as the choropleth `locations`.
country_codes = {
    'Argentina': 'ARG',
    'Australia': 'AUS',
    'Brazil': 'BRA',
    'Canada': 'CAN',
    'China': 'CHN',
    # Was 'USA', which gave the EU row the same map location as the USA.
    # The EU has no ISO 3166-1 alpha-3 country code; 'EUU' is the World Bank
    # aggregate code. NOTE(review): confirm the map simply skips this entry.
    'European Union': 'EUU',
    'France': 'FRA',
    'Germany': 'DEU',
    'India': 'IND',
    'Indonesia': 'IDN',
    'Italy': 'ITA',
    'Japan': 'JPN',
    'Mexico': 'MEX',
    # Was 'USA'; the ISO 3166-1 alpha-3 code of the Republic of Korea is 'KOR'.
    'Republic of Korea': 'KOR',
    'Russia': 'RUS',
    'Saudi Arabia': 'SAU',
    'South Africa': 'ZAF',
    'Turkey': 'TUR',
    'USA': 'USA',
    'United Kingdom': 'GBR',
}

# Base palette of 16 RGB colour strings.
chart_colors = [
    "rgb(0,0,0)",
    "rgb(255,255,255)",
    "rgb(255,0,0)",
    "rgb(0,255,0)",
    "rgb(0,0,255)",
    "rgb(255,255,0)",
    "rgb(0,255,255)",
    "rgb(255,0,255)",
    "rgb(192,192,192)",
    "rgb(128,128,128)",
    "rgb(128,0,0)",
    "rgb(128,128,0)",
    "rgb(0,128,0)",
    "rgb(128,0,128)",
    "rgb(0,128,128)",
    "rgb(0,0,128)",
]

# Give every country one colour, cycling through the palette as often as needed.
# (The original doubled the list once, which covers at most 32 entries.)
chart_colors = [
    chart_colors[position % len(chart_colors)]
    for position in range(len(country_codes))
]

# Look up the code of every country in row order; a name missing from
# country_codes raises KeyError.
data1['Country_Codes'] = [country_codes[name] for name in data1['Country']]

In [140]:
import sklearn.cluster

# Five-cluster agglomerative fit on the GDP_2010 ... GDP_2015 columns of data1
# (the same computation as the cell under "Experiments").
clf = sklearn.cluster.AgglomerativeClustering(n_clusters=5)

pred = clf.fit_predict(data1[['GDP_%d' % year for year in range(2010, 2016)]])

# Cluster id per row of data1.
pred


Out[140]:
array([3, 3, 0, 0, 2, 0, 0, 0, 3, 0, 4, 3, 0, 3, 3, 3, 3, 0, 1, 1])

In [141]:
# Store the cluster id of each row (the `pred` array returned by fit_predict)
# as a new 'cluster' column.
data1['cluster'] = pred

In [112]:
# Label text per row, built from the cluster id. Two fixes over the original:
# it read a misspelled `cluser` attribute that no visible cell creates, and it
# added a str to an integer column, which raises TypeError.
data1['text'] = 'Cluster ID' + data1['cluster'].astype(str)

In [142]:
# Preview the first five rows, including the columns added above.
# NOTE(review): the recorded output also lists a 'cluser' column that no visible
# cell creates; it looks like leftover kernel state — confirm and clear.
data1.head()


Out[142]:
Area Country GDP_2010 GDP_2011 GDP_2012 GDP_2013 GDP_2014 GDP_2015 Country_Codes cluser cluster
0 2766890.0 Argentina 461.65 558.68 607.60 622.05 543.06 578.71 ARG 3 3
1 7686850.0 Australia 1244.97 1499.95 1555.26 1497.22 1442.72 1240.80 AUS 3 3
2 8511965.0 Brazil 2209.27 2613.06 2412.02 2391.03 2346.58 1799.61 BRA 0 0
3 9976140.0 Canada 1614.07 1788.74 1832.72 1838.96 1785.39 1572.78 CAN 0 0
4 9596960.0 China 6039.55 7492.53 8461.51 9490.85 10356.51 11384.76 CHN 2 2

In [121]:
# `plot` was called below without being imported in any visible cell. The warning
# recorded under this cell comes from plotly/offline/offline.py, so the intended
# function is plotly's offline plot().
# The unused `import plotly.plotly as py` (only referenced by a commented-out
# py.iplot call) and the repeated pandas import were dropped.
from plotly.offline import plot

# NOTE(review): this rebinds `data` (the G20 DataFrame loaded earlier) to a list
# of traces; the name is kept because the later scatter cell does the same.
data = [dict(
    type='choropleth',
    locations=data1['Country_Codes'],
    # Colour each country by its cluster id. The original read 'cluser', a
    # misspelled column that no visible cell creates.
    z=data1['cluster'],
    text=data1['Country_Codes'],
    colorbar=dict(
        # One tick per integer cluster id (replaces the legacy autotick=False flag).
        tickmode='linear',
        tick0=0,
        dtick=1,
        # z holds cluster ids, not dollar amounts, so the former '$' tick prefix
        # and 'GDP<br>Billions US$' title mislabelled the scale.
        title='Cluster ID'),
)]

# Map layout; the later scatter cell reuses this `layout` object as well.
layout = dict(
    # Apostrophe fixed: the title used to read G-20"s GDP.
    title="G-20's GDP",
    geo=dict(
        showframe=False,
        showcoastlines=False,
        projection=dict(
            type='Mercator'
        )
    )
)

fig = dict(data=data, layout=layout)
# Writes an HTML file; per the recorded warning, '.html' is appended to the name.
plot(fig, validate=False, filename='d3-world-map')


/Users/sampathm/miniconda3/lib/python3.5/site-packages/plotly/offline/offline.py:459: UserWarning:

Your filename `d3-world-map` didn't end with .html. Adding .html to the end of your file.

Out[121]:
'file:///Users/sampathm/devbox/d3-world-map.html'
from IPython.display import IFrame
IFrame('d3-world-map.html', width=900, height=500)

In [144]:
# Figure spec with two marker traces (GDP per capita on a log x-axis against
# life expectancy), one trace per frame.
# NOTE(review): df2007 and df1952 are not defined in any cell visible here —
# confirm where they come from.
fig = {
    'data': [
        {
            'x': frame.gdpPercap,
            'y': frame.lifeExp,
            'text': frame.country,
            'mode': 'markers',
            'name': label,
        }
        for label, frame in (('2007', df2007), ('1952', df1952))
    ],
    'layout': {
        'xaxis': {'title': 'GDP per Capita', 'type': 'log'},
        'yaxis': {'title': "Life Expectancy"},
    },
}

In [165]:
# Column plotted on the x-axis; it also names the trace.
year = 'GDP_2015'

# NOTE(review): like the choropleth cell, this rebinds `data` (originally the
# G20 DataFrame) to a list of traces.
data = [{
    'x': data1[year],
    'y': data1['cluster'],
    'mode': 'markers',
    'text': data1['Country'],
    'name': year,
    # Per-point colours belong under marker.color. The original top-level
    # 'colors' key is not a scatter trace attribute, so the palette had no effect.
    # chart_colors has one entry per country_codes entry — presumably one per row
    # of data1; TODO confirm.
    'marker': {'color': chart_colors},
}]

In [166]:
# Bundle the current traces with `layout` and write an offline HTML page.
# NOTE(review): `layout` and the 'd3-world-map' filename are the ones used by the
# choropleth cell, and both recorded outputs point at the same HTML file, so this
# run replaces the map's file — confirm that is intended.
fig = {'data': data, 'layout': layout}
plot(fig, filename='d3-world-map', validate=False)


/Users/sampathm/miniconda3/lib/python3.5/site-packages/plotly/offline/offline.py:459: UserWarning:

Your filename `d3-world-map` didn't end with .html. Adding .html to the end of your file.

Out[166]:
'file:///Users/sampathm/devbox/d3-world-map.html'

IRIS Dataset


In [174]:
from sklearn import datasets

# Load the iris data set bundled with scikit-learn. Every feature column is kept
# (the old "first two features" remark referred to a slice that is not applied).
iris = datasets.load_iris()
X, Y = iris.data, iris.target

In [175]:
# Peek at the first five feature rows.
X[:5]


Out[175]:
array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  0.2],
       [ 5. ,  3.6,  1.4,  0.2]])

In [190]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the samples for testing; the fixed random_state keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.25,
    random_state=0,
)

# Shapes of the four partitions, printed on one line.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))


(112, 4) (112,) (38, 4) (38,)

In [199]:
from sklearn.metrics import accuracy_score

Random Forest


In [205]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest: its bootstrap samples and feature choices are random, so
# without random_state the accuracies in the following cells change from run
# to run.
clf = RandomForestClassifier(random_state=0)
clf = clf.fit(X_train, y_train)

In [206]:
# Random-forest accuracy on the training split (true labels first, predictions second).
accuracy_score(y_train, clf.predict(X_train))


Out[206]:
1.0

In [207]:
# Random-forest accuracy on the held-out test split (true labels first, predictions second).
accuracy_score(y_test, clf.predict(X_test))


Out[207]:
0.97368421052631582

In [208]:
# Random-forest accuracy over the whole data set, i.e. training and test rows together.
accuracy_score(Y, clf.predict(X))


Out[208]:
0.99333333333333329

SVM


In [210]:
from sklearn import svm

# Linear-kernel support vector classifier with C=2, fitted on the training split.
# fit() returns the estimator itself, so `clf` is the fitted model.
clf = svm.SVC(C=2, kernel='linear').fit(X_train, y_train)

In [211]:
# SVM accuracy on the training split (true labels first, predictions second).
accuracy_score(y_train, clf.predict(X_train))


Out[211]:
0.9821428571428571

In [212]:
# SVM accuracy on the held-out test split (true labels first, predictions second).
accuracy_score(y_test, clf.predict(X_test))


Out[212]:
0.97368421052631582

In [213]:
# SVM accuracy over the whole data set, i.e. training and test rows together.
accuracy_score(Y, clf.predict(X))


Out[213]:
0.97999999999999998