In [ ]:
# The goal of this coding project is to analyze various statistics about the nations of the world.
# There are approximately 20 attributes recorded for each country.
# I have used pairwise correlation, mapping, a linear regression model, ranking functions,
# distance functions, and a K nearest neighbors classifier to find meaningful patterns.
# If you want to contribute to the project, contact Kris Pan via kriskwpan@gmail.com
In [1]:
import numpy as np
import pandas as pd
In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [3]:
fort = pd.read_csv('fort.csv')
In [4]:
# Checking that the CSV was loaded correctly
fort
Out[4]:
In [5]:
# Inspecting column types and missing values to plan data cleaning
fort.info()
In [7]:
# Basic statistical summary for each numeric column
fort.describe()
Out[7]:
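In [ ]:
# A small optional check to support the cleaning steps: count missing values per column.
fort.isnull().sum()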
In [9]:
import plotly
plotly.tools.set_credentials_file(username='pandamic', api_key='cg2kpwsuW7kMWub4tUhW')
# Note: plotly.plotly is the legacy online (Chart Studio) interface used by older plotly versions
import plotly.plotly as py
In [10]:
# The choropleth map reveals that renewable water resources are concentrated in a few countries.
# Coastline, land boundaries, and location
# do not appear to have a meaningful correlation with renewable water resources.
data = [dict(
    type = 'choropleth',
    locations = fort['Country Code'],
    z = fort['Renewable Water Resources (cu km)'],
    text = fort['Country Name'],
    colorscale = [[0, "rgb(5, 10, 172)"], [0.35, "rgb(40, 60, 190)"], [0.5, "rgb(70, 100, 245)"],
                  [0.6, "rgb(90, 120, 245)"], [0.7, "rgb(106, 137, 247)"], [1, "rgb(220, 220, 220)"]],
    autocolorscale = False,
    reversescale = True,
    marker = dict(
        line = dict(
            color = 'rgb(180,180,180)',
            width = 0.5
        )
    ),
    colorbar = dict(
        autotick = False,
        ticksuffix = ' cu km',
        title = 'Renewable Water Resources'),
)]
layout = dict(
    title = 'Global Renewable Water Resources',
    geo = dict(
        showframe = False,
        showcoastlines = False,
        projection = dict(
            type = 'Mercator'
        )
    )
)
fig = dict(data=data, layout=layout)
py.iplot(fig, validate=False, filename='d3-world-map')
Out[10]:
In [209]:
# Top 10 countries with the highest life expectancy.
# A joint plot will be used to determine whether GDP (PPP) in US $ correlates with life expectancy by country.
fort.nlargest(10, 'Life Expectancy')
Out[209]:
In [211]:
# The histogram and scatterplot reveal no meaningful correlation between
# the two variables. Correlations with other variables must be tested to determine
# which factors influence Life Expectancy and GDP (PPP) in US $.
sns.jointplot(x='GDP (PPP) in US $',y='Life Expectancy',data=fort,marginal_kws=dict(bins=20, rug=True))
Out[211]:
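In [ ]:
# Optional check (assuming scipy is available): quantify the jointplot's visual impression
# with a Pearson r and p-value. pearsonr requires complete data, so missing rows are dropped.
from scipy.stats import pearsonr
pair = fort[['GDP (PPP) in US $', 'Life Expectancy']].dropna()
r, p = pearsonr(pair['GDP (PPP) in US $'], pair['Life Expectancy'])
print('Pearson r = %.3f, p-value = %.3g' % (r, p))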
In [212]:
# The correlation heatmap reveals strong correlations between Life Expectancy/Literacy Rate, Internet Users/GDP (PPP) in US $,
# Internet Users/Population, Labor Force/Population, GDP (PPP) in US $/Population, Land Area/Renewable Water Resources, and Land Area/Land Boundaries.
# The weakest correlation was between Commercial Prime Lending Rate and Life Expectancy.
In [213]:
sns.heatmap(fort.corr())
Out[213]:
In [282]:
fort.corr(method='pearson')
Out[282]:
In [216]:
# A random subsample of 60 countries (drawn via permutation) is used to check that the correlations found above are stable
fort_perm=fort.take(np.random.permutation(len(fort))[:60])
fort_perm.corr(method='pearson')
Out[216]:
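In [ ]:
# A sketch extending the subsample check above: repeat the 60-country draw many times and
# look at the spread of one correlation (Life Expectancy vs Literacy Rate is an illustrative choice).
subsample_corrs = []
for _ in range(200):
    sample = fort.take(np.random.permutation(len(fort))[:60])
    subsample_corrs.append(sample['Life Expectancy'].corr(sample['Literacy Rate (%)']))
print('mean r = %.3f, std = %.3f' % (np.mean(subsample_corrs), np.std(subsample_corrs)))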
In [217]:
# A linear regression model is used to predict GDP per capita from Internet Users and Life Expectancy.
# These two variables were chosen because they have the strongest correlation with GDP per capita.
In [218]:
fort.columns
Out[218]:
In [281]:
# Creating a GDP per capita column
fort['GDP per capita'] = fort['GDP (PPP) in US $'] / fort['Population']
In [220]:
# Training Linear Regression Model
In [283]:
LRsample = fort[['GDP per capita','Internet Users (circa 2009)','Life Expectancy']].copy()
LR = LRsample.dropna(axis=0)
In [284]:
X = LR[['Internet Users (circa 2009)','Life Expectancy']]
In [285]:
y = LR['GDP per capita']
In [286]:
from sklearn.model_selection import train_test_split
In [287]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
In [288]:
from sklearn.linear_model import LinearRegression
In [289]:
lm = LinearRegression()
In [290]:
lm.fit(X_train,y_train)
Out[290]:
In [291]:
# The coefficients
print('Coefficients: \n', lm.coef_)
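In [ ]:
# Optional readability aid: pair each coefficient with its feature name and show the intercept,
# making the fitted equation explicit.
coef_df = pd.DataFrame(lm.coef_, index=X.columns, columns=['Coefficient'])
print(coef_df)
print('Intercept:', lm.intercept_)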
In [292]:
# Predicting Test Data
In [293]:
predictions = lm.predict( X_test)
In [294]:
plt.scatter(y_test,predictions)
plt.xlabel('Y Test')
plt.ylabel('Predicted Y')
Out[294]:
In [295]:
# Evaluation shows that the linear regression model explains only about 27% of the variance in GDP per capita.
# Other factors must be included to create a more precise model.
## TODO: find a method to label each scatter point with its country name (one approach is sketched in the next cell)
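In [ ]:
# A possible answer to the TODO above, sketched rather than definitive. It assumes the
# DataFrame index survives dropna and train_test_split (it does for pandas objects), so the
# test indices can be mapped back to fort to recover country names.
pred_s = pd.Series(predictions, index=y_test.index)
names = fort.loc[y_test.index, 'Country Name']
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, pred_s)
for idx in y_test.index:
    ax.annotate(names.loc[idx], (y_test.loc[idx], pred_s.loc[idx]), fontsize=7)
ax.set_xlabel('Y Test')
ax.set_ylabel('Predicted Y')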
In [300]:
from sklearn import metrics
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))
In [301]:
metrics.explained_variance_score(y_test,predictions)
Out[301]:
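In [ ]:
# Optional cross-check: explained_variance_score equals R^2 only when the residuals have zero
# mean, so comparing the two is a cheap check on prediction bias.
print('R^2:', metrics.r2_score(y_test, predictions))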
In [234]:
# Ranking General Social Welfare
In [235]:
# Creating rankings for the variables that influence social welfare
fort['Literacy Rank'] = fort['Literacy Rate (%)'].rank(ascending=False)
fort['Health Expenditure Rank'] = fort['Health Expenditure/GDP'].rank(ascending=False)
fort['Renewable Water Rank'] = fort['Renewable Water Resources (cu km)'].rank(ascending=False)
In [236]:
# Social Welfare Ranking based on the three contributing rankings
fort['Social Welfare Score'] = fort['Literacy Rank'] + fort['Health Expenditure Rank'] + fort['Renewable Water Rank']
fort['Social Welfare Rank'] = fort['Social Welfare Score'].rank(ascending=True)
In [237]:
SocialWelfare = fort[['Country Name', 'Social Welfare Rank', 'Literacy Rank', 'Health Expenditure Rank', 'Renewable Water Rank']].copy()
In [238]:
# Based on the Social Welfare Ranking Model below, the best country for social welfare is the United States and the worst
# is Eritrea. This evaluation is flawed for multiple reasons. First, the data available to this model
# is too limited to capture the many dimensions of social welfare. Factors such as quality of environment, level of crime, availability
# of essential social services, and many more are omitted. Second, the data for certain criteria are unknown,
# so the model was not able to rank countries with missing data. Third, each criterion used to evaluate the Social
# Welfare Rank has a different level of impact. For example, the United States ranks third on Health Expenditure/GDP,
# but its Life Expectancy does not even rank in the top 20. It is uncertain how much each criterion contributes
# to general social welfare. (A sketch of a weighted alternative follows the rankings below.)
In [239]:
SocialWelfare.nsmallest(20, 'Social Welfare Rank')
Out[239]:
In [240]:
SocialWelfare.nlargest(20, 'Social Welfare Rank')
Out[240]:
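In [ ]:
# A sketch addressing the third criticism: weight each criterion before summing the ranks.
# The weights below are arbitrary placeholders, not estimates of each factor's true impact.
weights = {'Literacy Rank': 0.5, 'Health Expenditure Rank': 0.3, 'Renewable Water Rank': 0.2}
fort['Weighted Welfare Score'] = sum(fort[col] * w for col, w in weights.items())
fort['Weighted Welfare Rank'] = fort['Weighted Welfare Score'].rank(ascending=True)
fort[['Country Name', 'Weighted Welfare Rank']].nsmallest(5, 'Weighted Welfare Rank')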
In [241]:
# A K Nearest Neighbors model will be used to classify countries by government type.
# Countries with Democratic, Parliamentary, or Constitutional systems are assigned code 0.
# Countries with Republic, Communist, or Military systems are assigned code 1.
# The model shows that countries with higher GDP per capita and Life Expectancy tend to be code 0.
In [242]:
# Drop columns that contain missing values so the remaining features are complete
KNN = fort.dropna(axis=1)
In [243]:
df=KNN[['Life Expectancy','GDP per capita','Population','Gov Type Code']]
In [244]:
df
Out[244]:
In [245]:
sns.pairplot(df,hue='Gov Type Code',palette='coolwarm')
Out[245]:
In [246]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
In [247]:
scaler.fit(df.drop('Gov Type Code',axis=1))
Out[247]:
In [ ]:
# standardize the features (zero mean, unit variance)
In [248]:
scaled_features = scaler.transform(df.drop('Gov Type Code',axis=1))
In [249]:
df_feat = pd.DataFrame(scaled_features,columns=df.columns[:-1])
df_feat.head()
Out[249]:
In [250]:
# Train Test Split
In [251]:
from sklearn.model_selection import train_test_split
In [252]:
X_train, X_test, y_train, y_test = train_test_split(scaled_features, df['Gov Type Code'],
                                                    test_size=0.30)
In [253]:
# Using KNN
In [254]:
from sklearn.neighbors import KNeighborsClassifier
In [255]:
knn = KNeighborsClassifier(n_neighbors=1)
In [256]:
knn.fit(X_train,y_train)
Out[256]:
In [257]:
# Predictions and Evaluations
In [258]:
pred = knn.predict(X_test)
In [259]:
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test,pred))
In [260]:
print(classification_report(y_test,pred))
In [261]:
# Sweep K from 1 to 79 and record the test error rate for each value
error_rate = []
for i in range(1, 80):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    error_rate.append(np.mean(pred_i != y_test))
In [262]:
plt.figure(figsize=(10, 6))
plt.plot(range(1, 80), error_rate, color='blue', linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
Out[262]:
In [263]:
# Now with K=30, which has the lowest error rate
knn = KNeighborsClassifier(n_neighbors=30)
knn.fit(X_train,y_train)
pred = knn.predict(X_test)
print('WITH K=30')
print('\n')
print(confusion_matrix(y_test,pred))
print('\n')
print(classification_report(y_test,pred))
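In [ ]:
# Choosing K by test-set error (as above) risks tuning to this particular split. A sketch of
# a more robust alternative: pick K by 5-fold cross-validation on the training data.
from sklearn.model_selection import cross_val_score
cv_scores = [cross_val_score(KNeighborsClassifier(n_neighbors=k), X_train, y_train, cv=5).mean()
             for k in range(1, 80)]
print('Best K by 5-fold CV:', 1 + int(np.argmax(cv_scores)))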
In [307]:
fort.columns
Out[307]:
In [314]:
fort2=fort[['Country Name','Population', 'Life Expectancy',
'Area (sq km)', 'Land Boundaries (km)',
'Coastline (km)', 'Literacy Rate (%)', 'Health Expenditure/GDP',
'Military Expenditure/GDP', 'Renewable Water Resources (cu km)',
'Net Migration Rate/1000 Population', 'Labor Force',
'Internet Users (circa 2009)', 'Commercial Prime Lending Rate',
]].copy()
fort3=fort[['Population', 'Life Expectancy',
'Area (sq km)', 'Land Boundaries (km)',
'Coastline (km)', 'Literacy Rate (%)', 'Health Expenditure/GDP',
'Military Expenditure/GDP', 'Renewable Water Resources (cu km)',
'Net Migration Rate/1000 Population', 'Labor Force',
'Internet Users (circa 2009)', 'Commercial Prime Lending Rate',
]].copy()
In [ ]:
# Distance functions are used to evaluate the relationships between countries.
# The rows and columns of the matrix below still need country-name labels
# (one possible approach is sketched after the matrix).
In [320]:
from sklearn.metrics.pairwise import euclidean_distances
# Replace missing values with 0 before computing pairwise distances (a crude imputation)
zero_data = fort3.fillna(0)
pairwise_dist_mat = euclidean_distances(zero_data)
pairwise_dist_mat
Out[320]:
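In [ ]:
# A sketch for the labeling note above: wrap the distance matrix in a DataFrame indexed by
# country name. This assumes fort2['Country Name'] aligns row-for-row with fort3, which holds
# because both were sliced from fort without dropping rows.
dist_df = pd.DataFrame(pairwise_dist_mat,
                       index=fort2['Country Name'],
                       columns=fort2['Country Name'])
# Mask the zero diagonal, then read off each country's nearest neighbor
masked = dist_df.copy()
np.fill_diagonal(masked.values, np.nan)
masked.idxmin(axis=1).head()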