In [1]:
%matplotlib inline
In [2]:
# -*- coding:utf-8 -*-
from __future__ import print_function
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy import stats
In [3]:
# Load the data set: rows are indexed by the 'year' column and every
# column is parsed as float.
df = pd.read_csv("domestic.csv", dtype='float', index_col='year')
# Append the natural log of each raw series as a new `log_*` column.
for log_column, raw_column in (('log_income', 'income'),
                               ('log_pay', 'pay'),
                               ('log_pop', 'pop')):
    df[log_column] = np.log(df[raw_column])
In [4]:
# Summary statistics for the enrollment rate and the three
# log-transformed columns.
summary_columns = ['enroll', 'log_income', 'log_pay', 'log_pop']
df.loc[:, summary_columns].describe()
Out[4]:
In [5]:
# Pairwise correlation matrix of the three log-transformed columns.
regressor_columns = ['log_income', 'log_pay', 'log_pop']
df.loc[:, regressor_columns].corr()
Out[5]:
In [6]:
# Simple regression: university enrollment rate on log disposable income.
# Explanatory variable, with a constant column added for the intercept.
X = sm.add_constant(df[['log_income']])
# Dependent variable.
Y = df['enroll']
# Fit by OLS (Ordinary Least Squares).
model1 = sm.OLS(Y, X)
results1 = model1.fit()
print(results1.summary())
# Observed points, with the model's predictions drawn over them.
plt.scatter(df['log_income'], df['enroll'])
plt.plot(df['log_income'], results1.predict())
Out[6]:
In [7]:
# Simple regression: university enrollment rate on log first-year fees.
# Dependent variable.
Y = df['enroll']
# Explanatory variable, with a constant column added for the intercept.
X = sm.add_constant(df[['log_pay']])
# Fit by OLS (Ordinary Least Squares) and print the regression table.
model2 = sm.OLS(Y, X)
results2 = model2.fit()
print(results2.summary())
# Observed points, with the model's predictions drawn over them.
log_pay = df['log_pay']
plt.scatter(log_pay, df['enroll'])
plt.plot(log_pay, results2.predict())
Out[7]:
In [8]:
# Simple regression: university enrollment rate on `log_pop`
# (presumably log population -- confirm against the data dictionary).
# `log_pop` is already the natural log of `pop`, so it is used as-is;
# applying np.log again would regress on log(log(pop)) and disagree with
# the multiple regression, which uses `log_pop` directly.
# Explanatory variable, with a constant column added for the intercept.
X = df[['log_pop']]
X = sm.add_constant(X)
# Dependent variable.
Y = df['enroll']
# Fit by OLS (Ordinary Least Squares) and print the regression table.
model3 = sm.OLS(Y, X)
results3 = model3.fit()
print(results3.summary())
# Observed points, with the model's predictions drawn over them.
plt.scatter(df['log_pop'], df['enroll'])
plt.plot(df['log_pop'], results3.predict())
Out[8]:
In [9]:
# Multiple regression: university enrollment rate on all three
# log-transformed columns.
# Explanatory variables, with a constant column added for the intercept.
X = sm.add_constant(df[['log_income', 'log_pay', 'log_pop']])
# Dependent variable.
Y = df['enroll']
# Fit by OLS (Ordinary Least Squares) and print the regression table.
model4 = sm.OLS(Y, X)
results4 = model4.fit()
print(results4.summary())
In [10]:
# Actual enrollment rate versus the predictions of model 1, plotted by year.
years = np.asarray(df.index).astype(int)  # float index -> integer years
for values, legend_name in ((df['enroll'], 'actual'),
                            (results1.predict(), 'predict1')):
    plt.plot(years, values, label=legend_name)
plt.ylabel('Enrollment Rate')
plt.xlabel('year')
plt.legend(loc=2)  # location code 2 is 'upper left'
plt.savefig('predict1.png')
In [11]:
# Actual enrollment rate versus the predictions of model 2, plotted by year.
years = np.asarray(df.index).astype(int)  # float index -> integer years
for values, legend_name in ((df['enroll'], 'actual'),
                            (results2.predict(), 'predict2')):
    plt.plot(years, values, label=legend_name)
plt.ylabel('Enrollment Rate')
plt.xlabel('year')
plt.legend(loc=2)  # location code 2 is 'upper left'
plt.savefig('predict2.png')
In [12]:
# Actual enrollment rate versus the predictions of model 3, plotted by year.
years = np.asarray(df.index).astype(int)  # float index -> integer years
for values, legend_name in ((df['enroll'], 'actual'),
                            (results3.predict(), 'predict3')):
    plt.plot(years, values, label=legend_name)
plt.ylabel('Enrollment Rate')
plt.xlabel('year')
plt.legend(loc=2)  # location code 2 is 'upper left'
plt.savefig('predict3.png')
In [13]:
# Actual enrollment rate versus the predictions of model 4, plotted by year.
years = np.asarray(df.index).astype(int)  # float index -> integer years
for values, legend_name in ((df['enroll'], 'actual'),
                            (results4.predict(), 'predict4')):
    plt.plot(years, values, label=legend_name)
plt.ylabel('Enrollment Rate')
plt.xlabel('year')
plt.legend(loc=2)  # location code 2 is 'upper left'
plt.savefig('predict4.png')