In [1]:

    
# numerical python package to do fast computations.
import numpy as np
import pandas as pd

# plotting library to visulise the data.
import matplotlib.pyplot as plt

# Instruction to matplot to display plot in Jupyter Notebook itself.
%matplotlib inline



In [ ]:

    
data_points_count = 50

# x = np.linspace(0, 10, data_points_count)

x = np.array([  0.        ,   0.20408163,   0.40816327,   0.6122449 ,
         0.81632653,   1.02040816,   1.2244898 ,   1.42857143,
         1.63265306,   1.83673469,   2.04081633,   2.24489796,
         2.44897959,   2.65306122,   2.85714286,   3.06122449,
         3.26530612,   3.46938776,   3.67346939,   3.87755102,
         4.08163265,   4.28571429,   4.48979592,   4.69387755,
         4.89795918,   5.10204082,   5.30612245,   5.51020408,
         5.71428571,   5.91836735,   6.12244898,   6.32653061,
         6.53061224,   6.73469388,   6.93877551,   7.14285714,
         7.34693878,   7.55102041,   7.75510204,   7.95918367,
         8.16326531,   8.36734694,   8.57142857,   8.7755102 ,
         8.97959184,   9.18367347,   9.3877551 ,   9.59183673,
         9.79591837,  10.        ])

m = np.random.random(1) * 5
c = np.random.random(1) * 10

y = m * x + c + 3.5 * np.random.random(data_points_count)

y2 = 5 * np.sin(x) + 1 * x + 0.1 * x * x + 12 * np.random.random(data_points_count)

print(m, c)

plt.figure(figsize=(15, 8))
plt.plot(x, y, '*r')



In [ ]:

    
%%sh

# wget http://mospi.nic.in/sites/default/files/logo.png
# wget https://upload.wikimedia.org/wikipedia/commons/thumb/e/e7/G20.svg/863px-G20.svg.png

INDIA, G-20 AND THE WORLD - Statistical Year Book India 2016

Navigation Path: Home > Statistical Year Book India 2016 > INDIA, G-20 AND THE WORLD

The G20 (or G-20 or Group of Twenty) is an international forum for the governments and central bank governors from 20 major economies. It was founded in 1999 with the aim of studying, reviewing, and promoting high-level discussion of policy issues pertaining to the promotion of international financial stability.[3] It seeks to address issues that go beyond the responsibilities of any one organization.[3] The G20 heads of government or heads of state have periodically conferred at summits since their initial meeting in 2008, and the group also hosts separate meetings of finance ministers and central bank governors.

The members include 19 individual countries and along with the European Union (EU). The EU is represented by the European Commission and by the European Central Bank. Collectively, the G20 economies account for around 85% of the gross world product (GWP), 80% of world trade (or, if excluding EU intra-trade, 75%), and two-thirds of the world population.[2]

Data Source:

http://mospi.nic.in/statistical-year-book-india/2016/170

References:

Wikipedia G20

Data Gathering

wget http://mospi.nic.in/statistical-year-book-india/2016/170

Country
Area
Population (Millions)
GDP Billions (USD)
Gross Domestic Product Per Capita Income at Current Price (USD)
Gross domestic product based on Purchasing-Power-Parity (PPP) valuation of Country GDP in Billions ( Current International Dollar)

wget https://docs.google.com/a/imaginea.com/spreadsheets/d/1jbwyZsHy_SsJ-ANWlNVgMKOl5PkoMMcqkMiMJRXDXms/edit?usp=sharing



In [ ]:

    
%%sh

# ls -l ~/Downloads/G20*csv

# mv ~/Downloads/G20*csv G20.csv

Data Cleanup



In [93]:

    
data = pd.read_csv('G20.csv')

cols = ['Area', 'Population_2010', 'Population_2011',
       'Population_2012', 'Population_2013', 'Population_2014',
       'Population_2015', 'GDP_2010', 'GDP_2011', 'GDP_2012', 'GDP_2013',
       'GDP_2014', 'GDP_2015', 'GDP_PCI_2010', 'GDP_PCI_2011', 'GDP_PCI_2012',
       'GDP_PCI_2013', 'GDP_PCI_2014', 'GDP_PCI_2015', 'GDP_PPP_2010',
       'GDP_PPP_2011', 'GDP_PPP_2012', 'GDP_PPP_2013', 'GDP_PPP_2014',
       'GDP_PPP_2015']

data[cols] = data[cols].applymap(lambda x: float(str(x).replace(',', '')))
all_countries = sorted(data.Country.unique())
country_labler = all_countries.index
# country_labler('India')
# data.Country = data.Country.map(country_labler)



In [103]:

    
sorted(data.columns.tolist())

cols1 = ['GDP_2010',
 'GDP_2011',
 'GDP_2012',
 'GDP_2013',
 'GDP_2014',
 'GDP_2015',]

cols2 = [
 'GDP_PPP_2010',
 'GDP_PPP_2011',
 'GDP_PPP_2012',
 'GDP_PPP_2013',
 'GDP_PPP_2014',
 'GDP_PPP_2015']

cols3 = []

data1 = data[['Area',
 'Country',
 'GDP_2010',
 'GDP_2011',
 'GDP_2012',
 'GDP_2013',
 'GDP_2014',
 'GDP_2015',]].copy()

data2 = data[['Area',
 'Country',
 'GDP_PPP_2010',
 'GDP_PPP_2011',
 'GDP_PPP_2012',
 'GDP_PPP_2013',
 'GDP_PPP_2014',
 'GDP_PPP_2015',]].copy()

data3 = data[['Area',
 'Country',
 'GDP_PCI_2010',
 'GDP_PCI_2011',
 'GDP_PCI_2012',
 'GDP_PCI_2013',
 'GDP_PCI_2014',
 'GDP_PCI_2015',]].copy()


data4 = data[['Area',
 'Country',
 'Population_2010',
 'Population_2011',
 'Population_2012',
 'Population_2013',
 'Population_2014',
 'Population_2015']].copy()

Experiments



In [95]:

    
import sklearn.cluster

clf = sklearn.cluster.AgglomerativeClustering(5)

pred = clf.fit_predict(data1['GDP_2010	GDP_2011	GDP_2012	GDP_2013	GDP_2014	GDP_2015'.split()])

pred









    Out[95]:





array([3, 3, 0, 0, 2, 0, 0, 0, 3, 0, 4, 3, 0, 3, 3, 3, 3, 0, 1, 1])



In [ ]:

    
new_data.metric.unique()



In [ ]:

    
new_data.head(20).copy(deep=True)



In [ ]:

    
# segregating year & param
new_data['year'] = new_data.metric.map(lambda x: int(x.rsplit('_')[-1]))
new_data['param'] = new_data.metric.map(lambda x: ''.join(x.rsplit('_')[:-1]))

# drop metric column
new_data.drop('metric', axis=1, inplace=True)

# converting data into integers



In [ ]:

    
# Key values to check how the world 
print('Country', new_data.country.unique())
print('Country', new_data.param.unique())



In [ ]:

    
temp = new_data[(new_data.country == 'USA') & (new_data.param ==  'GDP')].copy(deep=True)

temp



In [ ]:

    
X_Label = 'USA'
Y_Label = 'GDP'
plt.figure(figsize=(15, 5))

temp = new_data[(new_data.country == X_Label) & (new_data.param == Y_Label)].copy(deep=True)
_x,  _y = temp.year.values, temp.value.values
plt.plot(_x, _y)
plt.xticks(_x, map(str, _x))


X_Label = 'European Union'
Y_Label = 'GDP'
plt.figure(figsize=(15, 5))

temp = new_data[(new_data.country == X_Label) & (new_data.param == Y_Label)].copy(deep=True)
_x,  _y = temp.year.values, temp.value.values
plt.plot(_x, _y)
plt.xticks(_x, map(str, _x))



In [ ]:

    
X_Label = 'USA'
Y_Label = 'GDP'
plt.figure(figsize=(15, 5))

temp = new_data[(new_data.country == X_Label) & (new_data.param == Y_Label)].copy(deep=True)
_x,  _y = temp.year.values, temp.value.values
plt.plot(_x, _y)
plt.xticks(_x, map(str, _x))



In [ ]:

    
_y



In [ ]:

    
_y - _y.min()



In [ ]:

    
Y_Label = 'Population'
plt.figure(figsize=(15, 8))

all_countries = new_data.country.unique()[:5]

for X_Label in all_countries:
    temp = new_data[(new_data.country == X_Label) & (new_data.param == Y_Label)].copy(deep=True)
    _x,  _y = temp.year.values, temp.value.values
    _y = _y - _y.min()
    plt.plot(_x, _y)
    plt.xticks(_x, map(str, _x))
    
plt.legend(all_countries)

Ideas

Show top 5 countries
Show only comparable countries



In [164]:

    
country_codes = {'Argentina': 'ARG',
 'Australia': 'AUS',
 'Brazil': 'BRA',
 'Canada': 'CAN',
 'China': 'CHN',
 'European Union': 'USA',
 'France': 'FRA',
 'Germany': 'DEU',
 'India': 'IND',
 'Indonesia': 'IDN',
 'Italy': 'ITA',
 'Japan': 'JPN',
 'Mexico': 'MEX',
 'Republic of Korea': 'USA',
 'Russia': 'RUS',
 'Saudi Arabia': 'SAU',
 'South Africa': 'ZAF',
 'Turkey': 'TUR',
 'USA': 'USA',
 'United Kingdom': 'GBR'}

chart_colors = ["rgb(0,0,0)",
"rgb(255,255,255)",
"rgb(255,0,0)",
"rgb(0,255,0)",
"rgb(0,0,255)",
"rgb(255,255,0)",
"rgb(0,255,255)",
"rgb(255,0,255)",
"rgb(192,192,192)",
"rgb(128,128,128)",
"rgb(128,0,0)",
"rgb(128,128,0)",
"rgb(0,128,0)",
"rgb(128,0,128)",
"rgb(0,128,128)",
"rgb(0,0,128)",]

chart_colors += chart_colors

chart_colors = chart_colors[:len(country_codes)]

data1['Country_Codes'] = data1['Country'].map(lambda x: country_codes[x])



In [140]:

    
import sklearn.cluster

clf = sklearn.cluster.AgglomerativeClustering(5)

pred = clf.fit_predict(data1['GDP_2010	GDP_2011	GDP_2012	GDP_2013	GDP_2014	GDP_2015'.split()])

pred









    Out[140]:





array([3, 3, 0, 0, 2, 0, 0, 0, 3, 0, 4, 3, 0, 3, 3, 3, 3, 0, 1, 1])



In [141]:

    
data1['cluster'] = pred



In [112]:

    
data1['text'] = 'Cluster ID' + data1.cluser



In [142]:

    
data1.head()









    Out[142]:






  
    
      
      Area
      Country
      GDP_2010
      GDP_2011
      GDP_2012
      GDP_2013
      GDP_2014
      GDP_2015
      Country_Codes
      cluser
      cluster
    
  
  
    
      0
      2766890.0
      Argentina
      461.65
      558.68
      607.60
      622.05
      543.06
      578.71
      ARG
      3
      3
    
    
      1
      7686850.0
      Australia
      1244.97
      1499.95
      1555.26
      1497.22
      1442.72
      1240.80
      AUS
      3
      3
    
    
      2
      8511965.0
      Brazil
      2209.27
      2613.06
      2412.02
      2391.03
      2346.58
      1799.61
      BRA
      0
      0
    
    
      3
      9976140.0
      Canada
      1614.07
      1788.74
      1832.72
      1838.96
      1785.39
      1572.78
      CAN
      0
      0
    
    
      4
      9596960.0
      China
      6039.55
      7492.53
      8461.51
      9490.85
      10356.51
      11384.76
      CHN
      2
      2



In [121]:

    
import plotly.plotly as py
import pandas as pd

# df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2014_world_gdp_with_codes.csv')

data = [ dict(
        type = 'choropleth',
        locations = data1['Country_Codes'],
        z = data1['cluser'],
        text = data1['Country_Codes'],
#         colorscale = [[0,"rgb(5, 10, 172)"],[0.35,"rgb(40, 60, 190)"],[0.5,"rgb(70, 100, 245)"],\
#             [0.6,"rgb(90, 120, 245)"],[0.7,"rgb(106, 137, 247)"],[1,"rgb(220, 220, 220)"]],
#         autocolorscale = True,
#         reversescale = True,
#         marker = dict(
#             line = dict (
#                 color = 'rgb(180,180,180)',
#                 width = 0.5
#             ) ),
        colorbar = dict(
            autotick = False,
            tickprefix = '$',
            title = 'GDP<br>Billions US$'),
      ) ]

layout = dict(
    title = 'G-20"s GDP',
    geo = dict(
        showframe = False,
        showcoastlines = False,
        projection = dict(
            type = 'Mercator'
        )
    )
)

fig = dict(data=data, layout=layout)
# py.iplot( fig, validate=False, filename='d3-world-map' )
plot( fig, validate=False, filename='d3-world-map')









    



/Users/sampathm/miniconda3/lib/python3.5/site-packages/plotly/offline/offline.py:459: UserWarning:

Your filename `d3-world-map` didn't end with .html. Adding .html to the end of your file.







    Out[121]:





'file:///Users/sampathm/devbox/d3-world-map.html'

from IPython.display import IFrame IFrame('d3-world-map.html', width=900, height=500)



In [144]:

    
fig = {
    'data': [
  		{
  			'x': df2007.gdpPercap, 
        	'y': df2007.lifeExp, 
        	'text': df2007.country, 
        	'mode': 'markers', 
        	'name': '2007'},
        {
        	'x': df1952.gdpPercap, 
        	'y': df1952.lifeExp, 
        	'text': df1952.country, 
        	'mode': 'markers', 
        	'name': '1952'}
    ],
    'layout': {
        'xaxis': {'title': 'GDP per Capita', 'type': 'log'},
        'yaxis': {'title': "Life Expectancy"}
    }
}



In [165]:

    
data = []

year = 'GDP_2015'
data.append({
    'x': data1[year],
    'y': data1['cluster'],
    'mode': 'markers',
    'text': data1['Country'],
    'name': year,
    'colors': chart_colors
})



In [166]:

    
fig = dict(data=data, layout=layout)
# py.iplot( fig, validate=False, filename='d3-world-map' )
plot( fig, validate=False, filename='d3-world-map')









    



/Users/sampathm/miniconda3/lib/python3.5/site-packages/plotly/offline/offline.py:459: UserWarning:

Your filename `d3-world-map` didn't end with .html. Adding .html to the end of your file.







    Out[166]:





'file:///Users/sampathm/devbox/d3-world-map.html'

IRIS Dataset



In [174]:

    
from sklearn import datasets

# import some data to play with
iris = datasets.load_iris()
X = iris.data # [:, :2]  # we only take the first two features.
Y = iris.target



In [175]:

    
X[:5]









    Out[175]:





array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  0.2],
       [ 5. ,  3.6,  1.4,  0.2]])



In [190]:

    
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.25, random_state=0)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)









    



(112, 4) (112,) (38, 4) (38,)



In [199]:

    
from sklearn.metrics import accuracy_score

Random Forest



In [205]:

    
from  sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf = clf.fit(X_train, y_train)



In [206]:

    
accuracy_score(clf.predict(X_train), y_train)









    Out[206]:





1.0



In [207]:

    
accuracy_score(clf.predict(X_test), y_test)









    Out[207]:





0.97368421052631582



In [208]:

    
accuracy_score(clf.predict(X), Y)









    Out[208]:





0.99333333333333329

SVM



In [210]:

    
from sklearn import svm
clf = svm.SVC(kernel='linear', C=2)

clf = clf.fit(X_train, y_train)



In [211]:

    
accuracy_score(clf.predict(X_train), y_train)









    Out[211]:





0.9821428571428571



In [212]:

    
accuracy_score(clf.predict(X_test), y_test)









    Out[212]:





0.97368421052631582



In [213]:

    
accuracy_score(clf.predict(X), Y)









    Out[213]:





0.97999999999999998

	Area	Country	GDP_2010	GDP_2011	GDP_2012	GDP_2013	GDP_2014	GDP_2015	Country_Codes	cluser	cluster
0	2766890.0	Argentina	461.65	558.68	607.60	622.05	543.06	578.71	ARG	3	3
1	7686850.0	Australia	1244.97	1499.95	1555.26	1497.22	1442.72	1240.80	AUS	3	3
2	8511965.0	Brazil	2209.27	2613.06	2412.02	2391.03	2346.58	1799.61	BRA	0	0
3	9976140.0	Canada	1614.07	1788.74	1832.72	1838.96	1785.39	1572.78	CAN	0	0
4	9596960.0	China	6039.55	7492.53	8461.51	9490.85	10356.51	11384.76	CHN	2	2