In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import Series, DataFrame
from numpy.random import normal
In [2]:
def create_dataset(num):
    """Generate ``num`` noisy samples of sin(2*pi*x), one sample at a time.

    The x values are evenly spaced over [0, 1] (``i / (num - 1)``) and each
    y value is ``sin(2*pi*x)`` plus Gaussian noise drawn with ``scale=0.3``.

    Args:
        num: Number of samples to generate.

    Returns:
        A DataFrame with float columns ``'x'`` and ``'y'`` and ``num`` rows
        (an empty frame with the same columns when ``num`` is 0).
    """
    xs = []
    ys = []
    for i in range(num):
        # With a single sample there is no spacing to compute; place it at
        # x = 0 (what np.linspace(0, 1, 1) yields) instead of dividing by zero.
        x = float(i) / float(num - 1) if num > 1 else 0.0
        xs.append(x)
        ys.append(np.sin(2 * np.pi * x) + normal(scale=0.3))
    # Build the frame once: appending row by row copies the whole frame on
    # every iteration, and DataFrame.append no longer exists in pandas 2.x.
    return DataFrame({'x': xs, 'y': ys}, columns=['x', 'y'], dtype=float)
In [3]:
# Generate a 10-sample training set with the create_dataset defined above.
train_set = create_dataset(10)
In [4]:
# Display the generated training samples (columns 'x' and 'y').
train_set
Out[4]:
In [5]:
# Scatter plot of the training samples; the axis limits leave a small margin
# around x in [0, 1] and fix the y range to [-1.5, 1.5].
train_set.plot(kind='scatter', x='x', y='y',
xlim=[-0.1,1.1], ylim=[-1.5,1.5])
Out[5]:
In [6]:
def create_dataset(num):
    """Generate ``num`` noisy samples of sin(2*pi*x) in one vectorised step.

    Args:
        num: Number of samples; x is ``np.linspace(0, 1, num)``.

    Returns:
        A DataFrame with columns ``'x'`` and ``'y'``, where ``y`` is
        ``sin(2*pi*x)`` plus Gaussian noise (mean 0, standard deviation 0.3).
    """
    xs = np.linspace(0, 1, num)
    # One draw per sample, all taken in a single call.
    noise = normal(loc=0, scale=0.3, size=num)
    ys = np.sin(2 * np.pi * xs) + noise
    return DataFrame({'x': xs, 'y': ys})
In [7]:
def resolve(dataset, m):
    """Fit a degree-``m`` polynomial to ``dataset`` by least squares.

    Args:
        dataset: Object exposing numeric ``x`` and ``y`` attributes of equal
            length (e.g. a DataFrame with columns ``'x'`` and ``'y'``).
        m: Degree of the polynomial; ``m + 1`` coefficients are fitted.

    Returns:
        A function ``f(x)`` evaluating ``sum(w_i * x**i for i in 0..m)`` with
        the fitted coefficients. ``x`` may be a scalar, an ndarray or a
        Series; the result has the matching shape.
    """
    t = np.asarray(dataset.y, dtype=float)
    xs = np.asarray(dataset.x, dtype=float)
    # Design matrix built in one step: column i holds x**i for i = 0..m.
    phi = np.vander(xs, m + 1, increasing=True)
    # Solve min ||phi.w - t|| directly. For a full-column-rank phi this is
    # the same solution as inv(phi.T phi) phi.T t, but it avoids forming and
    # inverting the normal matrix, whose condition number is the square of
    # phi's (a real problem at m = 9). When there are fewer points than
    # coefficients it returns the minimum-norm solution instead of failing.
    # rcond=-1 selects the machine-precision cutoff on every numpy version.
    ws = np.linalg.lstsq(phi, t, rcond=-1)[0]

    def f(x):
        y = 0
        for i, w in enumerate(ws):
            y += w * (x ** i)
        return y

    return f
In [8]:
def resolve_debug(dataset, m):
    """Fit a degree-``m`` polynomial by least squares, printing intermediates.

    Prints the target vector ``t``, the design matrix ``phi`` (columns named
    ``x**0`` .. ``x**m``) and the fitted coefficients ``ws``.

    Args:
        dataset: DataFrame with numeric columns ``'x'`` and ``'y'``.
        m: Degree of the polynomial; ``m + 1`` coefficients are fitted.

    Returns:
        A function ``f(x)`` evaluating ``sum(w_i * x**i for i in 0..m)`` with
        the fitted coefficients.
    """
    # Single-argument print(...) calls produce the same output on Python 2
    # and are valid syntax on Python 3.
    t = dataset.y
    print("\nt:")
    print(t)

    # Collect the power columns first and concatenate once, rather than
    # re-copying a growing frame on every iteration.
    columns = []
    for i in range(0, m + 1):
        p = dataset.x ** i
        p.name = "x**%d" % i
        columns.append(p)
    phi = pd.concat(columns, axis=1)
    print("\nphi:")
    print(phi)

    # Least-squares solve instead of inv(phi.T phi) phi.T t: same solution
    # for a full-column-rank phi, without inverting the ill-conditioned
    # normal matrix. rcond=-1 selects the machine-precision cutoff on every
    # numpy version.
    ws = np.linalg.lstsq(np.asarray(phi, dtype=float),
                         np.asarray(t, dtype=float), rcond=-1)[0]
    print("\nws:")
    print(ws)

    def f(x):
        y = 0
        for i, w in enumerate(ws):
            y += w * (x ** i)
        return y

    return f
In [9]:
# Fit a degree-3 polynomial to the training set, printing t, phi and ws.
f = resolve_debug(train_set, 3)
In [10]:
def rms_error(dataset, f):
    """Root-mean-square error of ``f`` over ``dataset``, computed row by row.

    Args:
        dataset: DataFrame with columns ``'x'`` and ``'y'``.
        f: Callable mapping a single x value to a predicted y value.

    Returns:
        ``sqrt(2 * E / N)`` where ``E`` is half the sum of squared residuals
        and ``N`` is the number of rows.
    """
    half_square_sum = 0.0
    for _, row in dataset.iterrows():
        residual = row['y'] - f(row['x'])
        half_square_sum += 0.5 * residual ** 2
    return np.sqrt(2 * half_square_sum / len(dataset))
In [11]:
def rms_error(dataset, f):
    """Root-mean-square error of ``f`` over ``dataset``, vectorised.

    Args:
        dataset: DataFrame with columns ``'x'`` and ``'y'``.
        f: Callable applied to the whole ``x`` column at once.

    Returns:
        The square root of the mean squared residual ``y - f(x)``.
    """
    residuals = dataset.y - f(dataset.x)
    mean_square = np.sum(residuals ** 2) / len(dataset)
    return np.sqrt(mean_square)
In [12]:
def show_result(subplot, train_set, m):
    """Draw the training data, the true curve and the degree-``m`` fit.

    Args:
        subplot: Matplotlib axes-like object to draw on.
        train_set: DataFrame with columns ``'x'`` and ``'y'``.
        m: Degree of the polynomial fitted with ``resolve``.

    The fitted curve's legend entry shows its RMS error on ``train_set``.
    """
    fitted = resolve(train_set, m)
    grid = np.linspace(0, 1, 101)  # shared x grid for both curves

    subplot.set_xlim(-0.05, 1.05)
    subplot.set_ylim(-1.5, 1.5)
    subplot.set_title("M=%d" % m)

    # Training set
    subplot.scatter(train_set.x, train_set.y,
                    marker='o', color='blue', label=None)

    # True curve
    true_curve = np.sin(2 * np.pi * grid)
    subplot.plot(grid, true_curve, color='green', linestyle='--')

    # Polynomial approximation
    fitted_curve = fitted(grid)
    legend_text = "E(RMS)=%.2f" % rms_error(train_set, fitted)
    subplot.plot(grid, fitted_curve, color='red', label=legend_text)
    subplot.legend(loc=1)
In [17]:
# Draw a 2x2 grid of fits to train_set, one panel per polynomial degree
# (0, 1, 3 and 9).
fig = plt.figure(figsize=(8, 6))
for i, m in enumerate([0,1,3,9]):
# Subplot positions are 1-based, hence i+1.
subplot = fig.add_subplot(2,2,i+1)
show_result(subplot, train_set, m)
In [14]:
def show_rms_trend(train_set, test_set):
    """Plot training and test RMS error against polynomial degree 0..9.

    For each degree a polynomial is fitted to ``train_set`` with ``resolve``
    and its RMS error is measured on both sets with ``rms_error``.

    Args:
        train_set: DataFrame with columns ``'x'`` and ``'y'`` used for fitting.
        test_set: DataFrame with columns ``'x'`` and ``'y'`` used only for
            evaluation.
    """
    records = []
    for m in range(0, 10):  # polynomial degree
        f = resolve(train_set, m)
        train_error = rms_error(train_set, f)
        test_error = rms_error(test_set, f)
        records.append((train_error, test_error))
    # Build the frame once: DataFrame.append copied the whole frame on every
    # iteration and no longer exists in pandas 2.x.
    df = DataFrame(records, columns=['Training set', 'Test set'])
    df.plot(title='RMS Error', style=['-', '--'], grid=True, ylim=(0, 0.9))
In [15]:
def show_rms_trend_debug(train_set, test_set):
    """Print and plot training and test RMS error for degrees 0..9.

    For each degree a polynomial is fitted to ``train_set`` with ``resolve``
    and its RMS error is measured on both sets with ``rms_error``. The table
    of errors is printed before it is plotted.

    Args:
        train_set: DataFrame with columns ``'x'`` and ``'y'`` used for fitting.
        test_set: DataFrame with columns ``'x'`` and ``'y'`` used only for
            evaluation.
    """
    records = []
    for m in range(0, 10):  # polynomial degree
        f = resolve(train_set, m)
        train_error = rms_error(train_set, f)
        test_error = rms_error(test_set, f)
        records.append((train_error, test_error))
    # Build the frame once: DataFrame.append copied the whole frame on every
    # iteration and no longer exists in pandas 2.x.
    df = DataFrame(records, columns=['Training set', 'Test set'])
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print(df)
    df.plot(title='RMS Error', style=['-', '--'], grid=True, ylim=(0, 0.9))
In [16]:
# Generate separate 10-sample training and test sets, then print and plot the
# RMS error on each for polynomial degrees 0..9.
# NOTE: this rebinds train_set, so earlier cells re-run after this one will
# see these samples. No random seed is set in the visible cells, so the
# values differ from run to run.
train_set = create_dataset(10)
test_set = create_dataset(10)
show_rms_trend_debug(train_set, test_set)