In [2]:
# -*- coding: utf8 -*-

import scipy as sp

data_path = './ch01/web_traffic.tsv'

# 预处理数据
data = sp.genfromtxt(data_path, delimiter="\t")
x = data[:, 0]
y = data[:, 1]
x = x[~sp.isnan(y)]
y = y[~sp.isnan(y)]

# 通过图像观察数据
import matplotlib.pyplot as plt
plt1 = plt.subplot(211)
plt2 = plt.subplot(212)


def plt_org(p, x, y):
    p.scatter(x, y, s=10)
    # p.title("Web traffic over the last month")
    # p.xlabel("Time")
    # p.ylabel("Hits/Hour")
    # plt.xticks([w*7*24 for w in range(10)], ['week %i' % w for w in range(10)])
    p.set_xticks([w*7*24 for w in range(10)])
    p.set_xticklabels(['week %i' % w for w in range(10)])
    p.autoscale(tight=True)
    p.grid(True, linestyle='-', color='0.75')

plt_org(plt1, x, y)
# plt.show()


# ==================================
# 第一个模型: 一阶多项式,通过 polyfit 求出对于多项式的参数(误差最小)
fp1, residuals, rank, sv, rcond = sp.polyfit(x, y, 1, full=True)
print "Model parameters: %s" % fp1  # 模型对于的参数
print "the error of approximation: %s" % residuals
# Model parameters: [   2.59619213  989.02487106]
# the error of approximation: [  3.17389767e+08]
# 即对于一阶多项式,最接近的模型函数为: fx = 2.59 * x + 989

# 使用 poly1d() 构建函数模型
f1 = sp.poly1d(fp1)


def error(f, x, y):
    """误差函数"""
    return sp.sum((f(x) - y) ** 2)

print 'f1 error: %s' % error(f1, x, y)

# 画出模型 f1
fx = sp.linspace(0, x[-1], 1000)
plt1.plot(fx, f1(fx), linewidth=4)
# plt.legend(["d=%i" % f1.order], loc="upper left")
# plt.show()


# ====================================
# 第二个模型: 二阶多项式
fp2 = sp.polyfit(x, y, 2)
print "fp2: %s" % fp2
f2 = sp.poly1d(fp2)
print 'f2 error: %s' % error(f2, x, y)
plt1.plot(fx, f2(fx), linewidth=4)
# plt.legend(["d=%i" % f2.order], loc="upper left")
# plt.show()

# ===================================
# 高阶多项式
fp3 = sp.polyfit(x, y, 3)
print "fp3: %s" % fp3
f3 = sp.poly1d(fp3)
print 'f3 error: %s' % error(f3, x, y)
plt1.plot(fx, f3(fx), linewidth=4)

fp10 = sp.polyfit(x, y, 10)
print "fp10: %s" % fp10
f10 = sp.poly1d(fp10)
print 'f10 error: %s' % error(f10, x, y)
plt1.plot(fx, f10(fx), linewidth=4)


fp53 = sp.polyfit(x, y, 53)
print "fp53: %s" % fp53
f53 = sp.poly1d(fp53)
print 'f53 error: %s' % error(f53, x, y)
plt1.plot(fx, f53(fx), linewidth=4)
plt1.legend(["d=%i" % i for i in [f1.order, f2.order, f3.order, f10.order, f53.order]], loc="upper left")
# plt1.show()


# ==================================
# 通过观察,发现多项式的模型并不好,回来重新分析原始数据
# 发现在 3.5 week 的时候有个较大变化
inflection = 3.5 * 7 * 24
xa = x[:inflection]
ya = y[:inflection]
xb = x[inflection:]
yb = y[inflection:]

fa = sp.poly1d(sp.polyfit(xa, ya, 1))
fb = sp.poly1d(sp.polyfit(xb, yb, 1))

fa_error = error(fa, xa, ya)
fb_error = error(fb, xb, yb)
print "Errors: %s" % (fa_error + fb_error)
plt_org(plt2, x, y)
plt2.plot(fx, fa(fx), linewidth=4)
fx_b = fx[fb(fx) > 0]
plt2.plot(fx_b, fb(fx_b), linewidth=4)

plt.show()


/usr/local/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')
Model parameters: [   2.59619213  989.02487106]
the error of approximation: [  3.17389767e+08]
f1 error: 317389767.34
fp2: [  1.05322215e-02  -5.26545650e+00   1.97476082e+03]
f2 error: 179983507.878
fp3: [  3.04960132e-05  -2.35893797e-02   4.94226019e+00   1.33328797e+03]
f3 error: 139350144.032
fp10: [ -3.73981969e-22   1.36473757e-18  -2.14294407e-15   1.89884971e-12
  -1.04570108e-09   3.70867732e-07  -8.45604589e-05   1.19167041e-02
  -9.41618608e-01   3.33703840e+01   1.26421204e+03]
f10 error: 121942326.364
fp53: [ -6.72739871e-140   1.19706771e-136  -4.66249925e-135  -4.66521249e-131
  -2.54199679e-128   2.26217122e-126   1.39006833e-122   1.16089558e-119
   4.32209353e-117  -1.50805718e-114  -3.76642325e-111  -3.20501324e-108
  -1.48577318e-105   6.29562287e-104   8.53557543e-100   9.10937328e-097
   5.56791223e-094   1.31788577e-091  -1.52860186e-088  -2.41364190e-085
  -1.84979297e-082  -7.30150895e-080   2.07963586e-077   6.26248966e-074
   5.61005011e-071   2.52828198e-068  -4.42064058e-066  -1.85469460e-062
  -1.62963159e-059  -5.76751807e-057   3.54986921e-054   6.50812164e-051
   3.79811829e-048  -4.37696633e-046  -2.48071307e-042  -1.57853533e-039
   3.46180014e-037   1.09208140e-033   3.34265576e-031  -4.93695896e-028
  -2.97749093e-025   2.64737703e-022   1.18865563e-019  -2.02249177e-016
   1.03620080e-013  -2.90526737e-011   4.79704158e-009  -4.20993694e-007
   5.97626292e-006   2.59243705e-003  -2.60200785e-001   1.03508668e+001
  -1.60104499e+002   2.14972274e+003]
f53 error: 109452413.887
Errors: 132950348.198
/usr/local/lib/python2.7/site-packages/numpy/lib/polynomial.py:595: RankWarning: Polyfit may be poorly conditioned
  warnings.warn(msg, RankWarning)
/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:99: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:100: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:101: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:102: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future

In [ ]: