第三期教育の入学率を被説明変数とし、GDP、人口、ジニ係数、教育に対する政府支出割合を説明変数として重回帰分析を行う。
出所 :
OECD (2015), Gross domestic product (GDP) (indicator). doi: 10.1787/dc2f7aec-en (Accessed on 10 October 2015)
UNESCO Institute for Statistics(2015), data extracted on 10 Oct 2015 09:13 UTC (GMT) from UIS/ISU
World Bank, Development Research Group(2015), Data from database: Poverty and Equity Database. (Last Updated: 07/08/2015)
より算出
In [1]:
%matplotlib inline
In [2]:
# -*- coding:utf-8 -*-
from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy import stats
In [3]:
# Load the raw indicator tables (rows indexed by country).
def _read_indicator(csv_path):
    """Read one indicator CSV indexed by 'Country', keeping cells as objects.

    Cells equal to the string '..' are replaced with NaN so that later
    ``astype(float)`` / ``dropna`` calls treat them as missing.
    """
    table = pd.read_csv(csv_path, index_col='Country', dtype='O')
    table[table == '..'] = np.nan
    return table


# Tertiary-education enrolment rate
# http://data.uis.unesco.org/
data_enroll = _read_indicator("tertiary.csv")
# GDP
# https://data.oecd.org/gdp/gross-domestic-product-gdp.htm
data_gdp = _read_indicator("gdp.csv")
# Population
# http://databank.worldbank.org/data/reports.aspx?Code=SP.POP.TOTL&id=af3ce82b&report_name=Popular_indicators&populartype=series&ispopular=y#
data_pop = _read_indicator("population.csv")
# Gini coefficient
# http://databank.worldbank.org/data/reports.aspx?Code=SI.POV.GINI&id=af3ce82b&report_name=Popular_indicators&populartype=series&ispopular=y#
data_gini = _read_indicator("gini.csv")
# Government spending on tertiary education
# https://data.oecd.org/eduresource/public-spending-on-education.htm
data_public = _read_indicator("public_spending.csv")
In [4]:
# Preview the first rows of the tertiary enrolment table.
data_enroll.head()
Out[4]:
In [5]:
# Preview the first rows of the GDP table.
data_gdp.head()
Out[5]:
In [6]:
# Preview the first rows of the population table.
data_pop.head()
Out[6]:
In [7]:
# Preview the first rows of the Gini coefficient table.
data_gini.head()
Out[7]:
In [8]:
# Preview the first rows of the public spending table.
data_public.head()
Out[8]:
In [9]:
# Find the countries that appear in all five datasets (enrolment, GDP,
# population, Gini, public spending), in the order of the enrolment table.
# The other indexes are turned into sets once, so each membership test is a
# hash lookup instead of a scan over a freshly built array.
_other_country_sets = [
    set(data_gdp.index),
    set(data_pop.index),
    set(data_gini.index),
    set(data_public.index),
]
country_list = [
    country for country in data_enroll.index
    if all(country in members for members in _other_country_sets)
]
print(country_list)
print(len(country_list))
In [10]:
# For every country, keep the most recent year in 2000-2012 for which all
# indicators are available.
def _complete_rows(year):
    """Return the rows of `country_list` that have no missing value for `year`.

    GDP and population are log-transformed; every column is converted to
    float before use because the tables were loaded with object dtype.
    """
    column = str(year)
    frame = pd.DataFrame({
        'tertiary': data_enroll.loc[country_list, column].astype(float),
        # Convert to float *before* taking the log: np.log cannot cast an
        # object/string Series itself under NumPy's default casting rules.
        'log_gdp': np.log(data_gdp.loc[country_list, column].astype(float)),
        'log_pop': np.log(data_pop.loc[country_list, column].astype(float)),
        'gini': data_gini.loc[country_list, column].astype(float),
        # NOTE(review): public spending is always read from the '2009' column,
        # regardless of `year` -- confirm this is intentional.
        'public': data_public.loc[country_list, '2009'].astype(float),
        'year': year,
    })
    return frame.dropna()


# Start from the newest year, then walk backwards and add only the countries
# that have not been picked up by a more recent year.
df = _complete_rows(2012)
for year in reversed(range(2000, 2012)):
    older = _complete_rows(year)
    unseen = older.loc[~older.index.isin(df.index)]
    if not unseen.empty:
        df = pd.concat([df, unseen])
print(len(df))
df[['tertiary', 'log_gdp', 'log_pop', 'gini', 'public']].describe()
Out[10]:
In [11]:
# Discard outliers: keep only rows whose tertiary enrolment is at least 30.
df = df.loc[df['tertiary'].ge(30)]
print(len(df))
df.loc[:, ['tertiary', 'log_gdp', 'log_pop', 'gini', 'public']].describe()
Out[11]:
In [12]:
# Pairwise correlation matrix of the four explanatory variables.
df.loc[:, ['log_gdp', 'log_pop', 'gini', 'public']].corr()
Out[12]:
In [13]:
# Simple regression: tertiary enrolment on log GDP.
# Explanatory variable, with a constant column added for the intercept.
X = sm.add_constant(df[['log_gdp']])
# Dependent variable.
Y = df['tertiary']
# Fit by OLS (Ordinary Least Squares) and print the summary table.
model = sm.OLS(Y, X)
results = model.fit()
print(results.summary())
In [14]:
# Simple regression: tertiary enrolment on log population.
# Explanatory variable, with a constant column added for the intercept.
X = sm.add_constant(df[['log_pop']])
# Dependent variable.
Y = df['tertiary']
# Fit by OLS (Ordinary Least Squares) and print the summary table.
model = sm.OLS(Y, X)
results = model.fit()
print(results.summary())
In [15]:
# Simple regression: tertiary enrolment on the Gini coefficient.
# Explanatory variable, with a constant column added for the intercept.
X = sm.add_constant(df[['gini']])
# Dependent variable.
Y = df['tertiary']
# Fit by OLS (Ordinary Least Squares) and print the summary table.
model = sm.OLS(Y, X)
results = model.fit()
print(results.summary())
In [16]:
# Simple regression: tertiary enrolment on government spending.
# Explanatory variable, with a constant column added for the intercept.
X = sm.add_constant(df[['public']])
# Dependent variable.
Y = df['tertiary']
# Fit by OLS (Ordinary Least Squares) and print the summary table.
model = sm.OLS(Y, X)
results = model.fit()
print(results.summary())
In [17]:
# Multiple regression: tertiary enrolment on all four explanatory variables.
# Explanatory variables, with a constant column added for the intercept.
X = sm.add_constant(df[['log_gdp', 'log_pop', 'gini', 'public']])
# Dependent variable.
Y = df['tertiary']
# Fit by OLS (Ordinary Least Squares) and print the summary table.
model = sm.OLS(Y, X)
results = model.fit()
print(results.summary())
In [18]:
# Multiple regression with `public` left out of the explanatory variables.
# Explanatory variables, with a constant column added for the intercept.
X = sm.add_constant(df[['log_gdp', 'log_pop', 'gini']])
# Dependent variable.
Y = df['tertiary']
# Fit by OLS (Ordinary Least Squares) and print the summary table.
model = sm.OLS(Y, X)
results = model.fit()
print(results.summary())
In [19]:
# Multiple regression on log GDP and log population only.
# Explanatory variables, with a constant column added for the intercept.
X = sm.add_constant(df[['log_gdp', 'log_pop']])
# Dependent variable.
Y = df['tertiary']
# Fit by OLS (Ordinary Least Squares) and print the summary table.
model = sm.OLS(Y, X)
results = model.fit()
print(results.summary())