In [1]:
import scipy as sp
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Load the web-traffic data set: a tab-separated file whose column 0 and column 1
# are read as X and y by the cells below.
# np.genfromtxt replaces the former sp.genfromtxt alias: SciPy deprecated its
# top-level re-exports of NumPy functions and recent releases no longer provide them.
# Fields that cannot be parsed as numbers are loaded as NaN and filtered out later.
data = np.genfromtxt("data/web_traffic.tsv", delimiter="\t")
data
Out[2]:
In [3]:
# Separate the two columns: column 0 becomes the predictor X, column 1 the target y.
print(data.dtype)
X = data[:, 0]
y = data[:, 1]
print("total X :", len(X))
# This label previously read "total X :" although the value printed is len(y).
print("total y :", len(y))
In [4]:
# Count missing values (NaN) in X and in y before fitting anything.
# The `== np.nan` comparison is shown on purpose as the WRONG way: NaN never
# compares equal to any value, so that count is always 0. np.isnan is the
# correct element-wise test.
for column in (X, y):
    print(np.sum(column == np.nan))  # naive comparison -- cannot detect NaN
    print(np.sum(np.isnan(column)))  # actual number of NaN entries
In [5]:
# y contains 8 NaN entries, so drop those rows from both X and y.
# Both masks on the right-hand side are evaluated against the unfiltered y
# before either name is rebound.
X, y = X[~np.isnan(y)], y[~np.isnan(y)]
print(len(X), len(y))
In [6]:
## Scatter plot of the cleaned data
# figure and a single axes
fig, ax = plt.subplots(figsize=(8, 6), dpi=80)
# raw observations
ax.scatter(X, y, color='green')
# axis labels and title
ax.set_xlabel("X (week) ")
ax.set_ylabel("y (traffic in thousands) ")
ax.set_title("Weekly web traffic data")
# light grey grid, autoscaling switched on
ax.grid(True, linestyle='-', color='0.75')
ax.autoscale(True)
# x ticks: one every 7*24 X units, labelled "week 0" .. "week 9"
week_ticks = [7 * 24 * week for week in range(10)]
week_labels = ["week %i" % week for week in range(10)]
ax.set_xticks(week_ticks)
ax.set_xticklabels(week_labels, rotation='vertical')
# y ticks: one every 1000, labelled 0 .. 9 (i.e. shown in thousands)
ax.set_yticks([1000 * k for k in range(10)])
ax.set_yticklabels(["%i" % k for k in range(10)])
# pad the data limits so that markers are not clipped by the axes
ax.margins(0.2)
# render the figure
plt.show()
In [7]:
# Choosing the model
# Residual sum of squares (RSS): the error measure used to compare the fitted models.
def rssErr(f, x, y):
    """Return the residual sum of squares of model ``f`` on the points (x, y).

    Parameters
    ----------
    f : callable
        Model to evaluate; it is called once as ``f(x)`` and must return one
        prediction per element of ``y`` (or something broadcastable to it).
    x : array-like
        Inputs, handed to ``f`` unchanged.
    y : array-like
        Observed values.

    Returns
    -------
    NumPy scalar
        ``sum((y - f(x)) ** 2)``.
    """
    # Coerce both sides so plain Python sequences work as well as ndarrays.
    residuals = np.asarray(y) - np.asarray(f(x))
    return np.sum(residuals ** 2)
Fitting a linear model fp1
In [8]:
# Start with a straight line (polynomial of degree 1).
# np.polyfit returns the least-squares coefficients, highest power first; with
# full=True it additionally returns the residual sum of squares, the rank and
# singular values of the underlying matrix, and the rcond cut-off that was used.
# np.polyfit replaces the sp.polyfit alias, which SciPy deprecated and recent
# releases no longer provide.
fp1, residuals, rank, sv, rcond = np.polyfit(X, y, 1, full=True)
fp1
Out[8]:
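Linear function (`polyfit` returns the coefficients highest power first, so the first value is the slope and the second the intercept):
f(x) = 2.59619213 * x + 989.02487106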
In [9]:
# Diagnostics unpacked from the full=True fit: first the residuals value on its
# own line, then rank, singular values and rcond together on one line.
print(residuals)
print(*(rank, sv, rcond))
In [10]:
# Wrap the fitted coefficients in a callable polynomial, so that f1(x) evaluates the model.
# np.poly1d replaces the sp.poly1d alias (SciPy deprecated its top-level re-exports
# of NumPy functions and recent releases no longer provide them).
f1 = np.poly1d(fp1)
f1
Out[10]:
In [11]:
# checking the RSS error of the degree-1 model f1 on the full data set
rssErr(f1, X, y)
Out[11]:
In [12]:
# Plot the data together with the degree-1 fit
# figure and a single axes
fig, ax = plt.subplots(figsize=(8, 6), dpi=80)
# observations first, then the fitted line drawn over them
ax.scatter(X, y, color='green')
line_d1, = ax.plot(X, f1(X), color='blue', linewidth=3)
# legend entry for the fitted line, labelled with its polynomial degree
ax.legend([line_d1], ["d=%i" % f1.order], loc="upper left")
# axis labels and title
ax.set_xlabel("X (week) ")
ax.set_ylabel("y (traffic in thousands) ")
ax.set_title("Weekly web traffic data")
# light grey grid, autoscaling switched on
ax.grid(True, linestyle='-', color='0.75')
ax.autoscale(True)
# x ticks: one every 7*24 X units, labelled "week 0" .. "week 9"
week_ticks = [7 * 24 * week for week in range(10)]
week_labels = ["week %i" % week for week in range(10)]
ax.set_xticks(week_ticks)
ax.set_xticklabels(week_labels, rotation='vertical')
# y ticks: one every 1000, labelled 0 .. 9 (i.e. shown in thousands)
ax.set_yticks([1000 * k for k in range(10)])
ax.set_yticklabels(["%i" % k for k in range(10)])
# pad the data limits so that markers are not clipped by the axes
ax.margins(0.2)
# render the figure
plt.show()
Fitting the model with polynomial degree of 2
In [13]:
# Least-squares fit of a degree-2 polynomial; coefficients come back highest power first.
# np.polyfit replaces the sp.polyfit alias, which SciPy deprecated and recent
# releases no longer provide.
fp2 = np.polyfit(X, y, 2)
fp2
Out[13]:
In [14]:
# Wrap the degree-2 coefficients in a callable polynomial, so that f2(x) evaluates the model.
# np.poly1d replaces the sp.poly1d alias (no longer provided by recent SciPy releases).
f2 = np.poly1d(fp2)
f2
Out[14]:
In [15]:
# checking the RSS error of the degree-2 model f2 on the full data set
rssErr(f2, X, y)
Out[15]:
In [37]:
# Plot the data together with the degree-1 and degree-2 fits
# figure and a single axes
fig, ax = plt.subplots(figsize=(8, 6), dpi=80)
# observations first, then one line per fitted model
ax.scatter(X, y, color='green')
line_d1, = ax.plot(X, f1(X), color='blue', linewidth=3)
line_d2, = ax.plot(X, f2(X), color='red', linewidth=3)
# one legend entry per model, labelled with its polynomial degree
degree_labels = ["d=%i" % f1.order, "d=%i" % f2.order]
ax.legend([line_d1, line_d2], degree_labels, loc="upper left")
# axis labels and title
ax.set_xlabel("X (week) ")
ax.set_ylabel("y (traffic in thousands) ")
ax.set_title("Weekly web traffic data")
# light grey grid, autoscaling switched on
ax.grid(True, linestyle='-', color='0.75')
ax.autoscale(True)
# x ticks: one every 7*24 X units, labelled "week 0" .. "week 9"
week_ticks = [7 * 24 * week for week in range(10)]
week_labels = ["week %i" % week for week in range(10)]
ax.set_xticks(week_ticks)
ax.set_xticklabels(week_labels, rotation='vertical')
# y ticks: one every 1000, labelled 0 .. 9 (i.e. shown in thousands)
ax.set_yticks([1000 * k for k in range(10)])
ax.set_yticklabels(["%i" % k for k in range(10)])
# pad the data limits so that markers are not clipped by the axes
ax.margins(0.2)
# render the figure
plt.show()
In [20]:
# Least-squares fit of a degree-53 polynomial, used below to show what a very
# flexible model does on this data.
# NOTE: NumPy may emit a RankWarning here, since a fit of this degree is poorly conditioned.
# np.polyfit replaces the sp.polyfit alias, which SciPy deprecated and recent
# releases no longer provide.
fp53 = np.polyfit(X, y, 53)
fp53
Out[20]:
In [21]:
# Wrap the degree-53 coefficients in a callable polynomial, so that f53(x) evaluates the model.
# np.poly1d replaces the sp.poly1d alias (no longer provided by recent SciPy releases).
f53 = np.poly1d(fp53)
f53
Out[21]:
In [23]:
# checking the RSS error of the degree-53 model f53 on the full data set
rssErr(f53, X, y)
Out[23]:
In [40]:
# Plot the data together with the degree-1 and degree-53 fits
# larger figure with a single axes
fig, ax = plt.subplots(figsize=(12, 9), dpi=80)
# observations first, then one line per fitted model
ax.scatter(X, y, color='green')
line_d1, = ax.plot(X, f1(X), color='blue', linewidth=3)
line_d53, = ax.plot(X, f53(X), color='red', linewidth=3)
# one legend entry per model, labelled with its polynomial degree
degree_labels = ["d=%i" % f1.order, "d=%i" % f53.order]
ax.legend([line_d1, line_d53], degree_labels, loc="upper left")
# axis labels and title
ax.set_xlabel("X (week) ")
ax.set_ylabel("y (traffic in thousands) ")
ax.set_title("Weekly web traffic data")
# light grey grid, autoscaling switched on
ax.grid(True, linestyle='-', color='0.75')
ax.autoscale(True)
# x ticks: one every 7*24 X units, labelled "week 0" .. "week 9"
week_ticks = [7 * 24 * week for week in range(10)]
week_labels = ["week %i" % week for week in range(10)]
ax.set_xticks(week_ticks)
ax.set_xticklabels(week_labels, rotation='vertical')
# y ticks: one every 1000, labelled 0 .. 9 (i.e. shown in thousands)
ax.set_yticks([1000 * k for k in range(10)])
ax.set_yticklabels(["%i" % k for k in range(10)])
# pad the data limits so that markers are not clipped by the axes
ax.margins(0.2)
# render the figure
plt.show()
As we can see, the behaviour of the data changes drastically between week 3 and week 4. So instead of fitting a single line to all of the data, we split the data into two parts at that point and fit each part separately.
In [47]:
# Split point: 3.5 weeks, expressed in the same 7*24-per-week units used for the x ticks.
# NOTE(review): the value is applied as a positional index into X / y; that lines up
# with the X value only if rows are contiguous -- confirm after the NaN filtering.
split = int(3.5 * 7 * 24)
# everything before the split index goes to part 1, the rest to part 2
X1, X2 = np.split(X, [split])
y1, y2 = np.split(y, [split])
Fitting a linear model to each of these 2 datasets
In [48]:
# Fit a separate straight line (degree 1) to each part of the split data and wrap
# each coefficient vector in a callable polynomial.
# np.polyfit / np.poly1d replace the sp.polyfit / sp.poly1d aliases, which SciPy
# deprecated and recent releases no longer provide.
fps1 = np.polyfit(X1, y1, 1)
fs1 = np.poly1d(fps1)
fps2 = np.polyfit(X2, y2, 1)
fs2 = np.poly1d(fps2)
In [49]:
# RSS error of each piecewise model, evaluated only on the part it was fitted to
for model, xs, ys in ((fs1, X1, y1), (fs2, X2, y2)):
    print(rssErr(model, xs, ys))
In [90]:
# Plot both piecewise linear fits over the full data set
fig, ax = plt.subplots(figsize=(12, 9), dpi=120)
# grid, autoscaling and padding around the data limits
ax.grid(True)
ax.autoscale(True)
ax.margins(0.15)
# title and axis labels
ax.set_title("Web traffic data")
ax.set_xlabel("Week")
ax.set_ylabel("Hits")
# one tick every 7*24 X units, labelled "week 0" .. "week 8"
weeks = range(0, 9)
ax.set_xticks([7 * 24 * w for w in weeks])
ax.set_xticklabels(["week %d" % w for w in weeks])
# observations
ax.scatter(X, y, color='green')
# part-1 model: solid over its own data, dashed where it is extended over part 2
ls1, = ax.plot(X1, fs1(X1), color='blue', linewidth=3)
ax.plot(X2, fs1(X2), color='blue', linewidth=3, linestyle='--')
# part-2 model: solid over its own data, dashed over the slice of part 1 just before the split
ls2, = ax.plot(X2, fs2(X2), color='red', linewidth=3)
tail_of_part1 = X1[split - 40:split]
ax.plot(tail_of_part1, fs2(tail_of_part1), color='red', linewidth=3, linestyle='--')
# legend: one entry per solid line, labelled with the polynomial degree
piece_labels = ["sub-dataset1 d=%d " % fs1.order, "subdataset2 d=%d" % fs2.order]
ax.legend([ls1, ls2], piece_labels, loc="upper left")
# render the figure
plt.show()
In [ ]:
In [ ]: