notebook.community

Edit and run



In [1]:

    
import scipy as sp
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline



In [2]:

    
# reading the web Traffic data
data = sp.genfromtxt("data/web_traffic.tsv", delimiter="\t")
data









    Out[2]:





array([[  1.00000000e+00,   2.27200000e+03],
       [  2.00000000e+00,              nan],
       [  3.00000000e+00,   1.38600000e+03],
       ..., 
       [  7.41000000e+02,   5.39200000e+03],
       [  7.42000000e+02,   5.90600000e+03],
       [  7.43000000e+02,   4.88100000e+03]])



In [3]:

    
# splitting the data into X and y
print(data.dtype)

X = data[:,0]
y = data[:,1]

print("total X :", len(X))
print("total X :", len(y))









    



float64
total X : 743
total X : 743



In [4]:

    
# checking if any value in y is NA or nan which is not required for us to calculation
print(np.sum(X==np.nan))  ## this calc is wrong as we can not compare the NA or nan with any value
print(np.sum(np.isnan(X)))

print(np.sum(y==np.nan))  ## 
print(np.sum(np.isnan(y)))



In [5]:

    
# so y is having 8 NaN or NA in the data so filtering the
X = X[~np.isnan(y)]
y = y[~np.isnan(y)]

print(len(X), len(y))



In [6]:

    
## Plotting the data 

#  setting the plot
plt.figure(figsize=(8,6), dpi=80)

# plot type and color
plt.scatter(X,y, color='green')

# plot labels
plt.xlabel("X (week) ")
plt.ylabel("y (traffic in thousands) ")
plt.title("Weekly web traffic data")

# grid and autoscale
plt.grid(True, linestyle='-', color='0.75')
plt.autoscale(True)

# ticks renaming
# plt.xticks(x, labels, rotation='vertical')
v1 = [w*7*24 for w in range(10)]
lbl = ["week %i" % i for i in range(10) ]
plt.xticks(v1, lbl, rotation='vertical')
plt.yticks([i*1000 for i in range(10)], ["%i" % i for i in range(10)])

# Pad margins so that markers don't get clipped by the axes
plt.margins(0.2)

# display plot
plt.show()



In [7]:

    
# Chossing the model

# creating the rss error func
def rssErr(f, x, y):
    return(np.sum((y-f(x))**2))

Fitting a linear model fp1



In [8]:

    
# starting with linear model where degree is 1
# polyfit() - best put that line into the chart so that it results in the smallest
# approximation error

fp1, residuals, rank, sv, rcond = sp.polyfit(X, y, 1, full=True)
fp1









    Out[8]:





array([   2.59619213,  989.02487106])

Linear Function :
f(x) = 2.59619213 + 989.02487106 * x



In [9]:

    
print(residuals)
print(rank, sv, rcond)









    



[  3.17389767e+08]
2 [ 1.36699071  0.36240365] 1.6320278462e-13



In [10]:

    
# fitting these value in a linear model
f1 = sp.poly1d(fp1)
f1









    Out[10]:





poly1d([   2.59619213,  989.02487106])



In [11]:

    
# checking put the error for fp1 model
rssErr(f1, X, y)









    Out[11]:





317389767.33977801



In [12]:

    
# plotting it
#  setting the plot
plt.figure(figsize=(8,6), dpi=80)

# plot type and color
plt.scatter(X,y, color='green')
plt.plot(X, f1(X), color='blue', linewidth=3)   #plotting the fucntions

# adding legends
plt.legend(["d=%i" % f1.order], loc="upper left")

# plot labels
plt.xlabel("X (week) ")
plt.ylabel("y (traffic in thousands) ")
plt.title("Weekly web traffic data")

# grid and autoscale
plt.grid(True, linestyle='-', color='0.75')
plt.autoscale(True)

# ticks renaming
# plt.xticks(x, labels, rotation='vertical')
v1 = [w*7*24 for w in range(10)]
lbl = ["week %i" % i for i in range(10) ]
plt.xticks(v1, lbl, rotation='vertical')
plt.yticks([i*1000 for i in range(10)], ["%i" % i for i in range(10)])

# Pad margins so that markers don't get clipped by the axes
plt.margins(0.2)

# display plot
plt.show()

Fitting the model with polynomial degree of 2



In [13]:

    
fp2 = sp.polyfit(X, y, 2)
fp2









    Out[13]:





array([  1.05322215e-02,  -5.26545650e+00,   1.97476082e+03])



In [14]:

    
# fitting the model
f2 = sp.poly1d(fp2)
f2









    Out[14]:





poly1d([  1.05322215e-02,  -5.26545650e+00,   1.97476082e+03])



In [15]:

    
# checking put the error for fp1 model
rssErr(f2, X, y)









    Out[15]:





179983507.87817919



In [37]:

    
# plotting it
#  setting the plot
plt.figure(figsize=(8,6), dpi=80)

# plot type and color
plt.scatter(X,y, color='green')
l1, = plt.plot(X, f1(X), color='blue', linewidth=3)   #plotting the fucntions
l2, = plt.plot(X, f2(X), color='red', linewidth=3)   #plotting the fucntions

# adding legends
plt.legend([l1, l2], ["d=%i" % f1.order, "d=%i" % f2.order], loc="upper left")
#plt.legend(["d=%i" % f1.order], loc="upper left")
#plt.legend(["d=%i" % f2.order], loc="upper left")

# plot labels
plt.xlabel("X (week) ")
plt.ylabel("y (traffic in thousands) ")
plt.title("Weekly web traffic data")

# grid and autoscale
plt.grid(True, linestyle='-', color='0.75')
plt.autoscale(True)

# ticks renaming
# plt.xticks(x, labels, rotation='vertical')
v1 = [w*7*24 for w in range(10)]
lbl = ["week %i" % i for i in range(10) ]
plt.xticks(v1, lbl, rotation='vertical')
plt.yticks([i*1000 for i in range(10)], ["%i" % i for i in range(10)])

# Pad margins so that markers don't get clipped by the axes
plt.margins(0.2)

# display plot
plt.show()

Trying to fir the model with 53 polynomial



In [20]:

    
fp53 = sp.polyfit(X, y, 53)
fp53









    



C:\Anaconda3\lib\site-packages\numpy\lib\polynomial.py:594: RankWarning: Polyfit may be poorly conditioned
  warnings.warn(msg, RankWarning)






    Out[20]:





array([ -6.72757712e-140,   1.19710645e-136,  -4.66409475e-135,
        -4.66529661e-131,  -2.54198136e-128,   2.26156478e-126,
         1.39014762e-122,   1.16093890e-119,   4.32182384e-117,
        -1.50838091e-114,  -3.76625193e-111,  -3.20489362e-108,
        -1.48599404e-105,   6.30299226e-104,   8.53578286e-100,
         9.10930904e-097,   5.56783977e-094,   1.31815161e-091,
        -1.52871800e-088,  -2.41369625e-085,  -1.84967143e-082,
        -7.30316898e-080,   2.08030615e-077,   6.26218660e-074,
         5.61037248e-071,   2.52820927e-068,  -4.41898626e-066,
        -1.85478081e-062,  -1.62962217e-059,  -5.76873382e-057,
         3.55020117e-054,   6.50872665e-051,   3.79794068e-048,
        -4.37595886e-046,  -2.48093022e-042,  -1.57847617e-039,
         3.46199499e-037,   1.09207054e-033,   3.34298349e-031,
        -4.93730909e-028,  -2.97732759e-025,   2.64726514e-022,
         1.18878653e-019,  -2.02259085e-016,   1.03624647e-013,
        -2.90540411e-011,   4.79731673e-009,  -4.21031124e-007,
         5.97963908e-006,   2.59224493e-003,  -2.60194569e-001,
         1.03507793e+001,  -1.60104587e+002,   2.14973092e+003])



In [21]:

    
# fitting the model
f53 = sp.poly1d(fp53)
f53









    Out[21]:





poly1d([ -6.72757712e-140,   1.19710645e-136,  -4.66409475e-135,
        -4.66529661e-131,  -2.54198136e-128,   2.26156478e-126,
         1.39014762e-122,   1.16093890e-119,   4.32182384e-117,
        -1.50838091e-114,  -3.76625193e-111,  -3.20489362e-108,
        -1.48599404e-105,   6.30299226e-104,   8.53578286e-100,
         9.10930904e-097,   5.56783977e-094,   1.31815161e-091,
        -1.52871800e-088,  -2.41369625e-085,  -1.84967143e-082,
        -7.30316898e-080,   2.08030615e-077,   6.26218660e-074,
         5.61037248e-071,   2.52820927e-068,  -4.41898626e-066,
        -1.85478081e-062,  -1.62962217e-059,  -5.76873382e-057,
         3.55020117e-054,   6.50872665e-051,   3.79794068e-048,
        -4.37595886e-046,  -2.48093022e-042,  -1.57847617e-039,
         3.46199499e-037,   1.09207054e-033,   3.34298349e-031,
        -4.93730909e-028,  -2.97732759e-025,   2.64726514e-022,
         1.18878653e-019,  -2.02259085e-016,   1.03624647e-013,
        -2.90540411e-011,   4.79731673e-009,  -4.21031124e-007,
         5.97963908e-006,   2.59224493e-003,  -2.60194569e-001,
         1.03507793e+001,  -1.60104587e+002,   2.14973092e+003])



In [23]:

    
# checking put the error for fp1 model
rssErr(f53, X, y)









    Out[23]:





109452401.96765307



In [40]:

    
# plotting it
#  setting the plot
plt.figure(figsize=(12,9), dpi=80)

# plot type and color
plt.scatter(X,y, color='green')
l1, = plt.plot(X, f1(X), color='blue', linewidth=3)   #plotting the fucntions
l53, = plt.plot(X, f53(X), color='red', linewidth=3)   #plotting the fucntions

# adding legends
plt.legend([l1, l53],["d=%i" % f1.order, "d=%i" % f53.order], loc="upper left")


# plot labels
plt.xlabel("X (week) ")
plt.ylabel("y (traffic in thousands) ")
plt.title("Weekly web traffic data")

# grid and autoscale
plt.grid(True, linestyle='-', color='0.75')
plt.autoscale(True)

# ticks renaming
# plt.xticks(x, labels, rotation='vertical')
v1 = [w*7*24 for w in range(10)]
lbl = ["week %i" % i for i in range(10) ]
plt.xticks(v1, lbl, rotation='vertical')
plt.yticks([i*1000 for i in range(10)], ["%i" % i for i in range(10)])

# Pad margins so that markers don't get clipped by the axes
plt.margins(0.2)

# display plot
plt.show()

As we can see that between week3 and week4, there is drastic change in data behaviour, so instead of plotting one line for all the data, we are going to split the data into two parts based on data behaviour



In [47]:

    
# taking n=3.5
split=int(3.5*7*24)
X1 = X[:split]
X2 = X[split:]
y1 = y[:split]
y2 = y[split:]

Plotting the linear model with these 2 datasets



In [48]:

    
fps1 = sp.polyfit(X1, y1, 1)
fs1 = sp.poly1d(fps1)

fps2 = sp.polyfit(X2, y2, 1)
fs2 = sp.poly1d(fps2)



In [49]:

    
# error in both
print(rssErr(fs1, X1, y1))
print(rssErr(fs2, X2, y2))









    



110806407.09
22143941.1076



In [90]:

    
# plotting the data into 
plt.figure(figsize=(12,9), dpi=120)

# grid style and autoscale properties
plt.grid(True)
plt.autoscale(True)
plt.margins(0.15)

# assigning labels
plt.title("Web traffic data")
plt.xlabel("Week")
plt.ylabel("Hits")

# re-marking ticks
plt.xticks([w*7*24 for w in range(0, 9)], ["week %d" %i for i in range(0, 9)])

# plotting the data
plt.scatter(X, y, color='green')
ls1, = plt.plot(X1, fs1(X1), color='blue', linewidth=3)
plt.plot(X2, fs1(X2), color='blue', linewidth=3, linestyle='--')
ls2, = plt.plot(X2, fs2(X2), color='red', linewidth=3)
plt.plot(X1[split-40:split], fs2(X1[split-40:split]), color='red', linewidth=3, linestyle='--')

# adding legeds
plt.legend([ls1, ls2], ["sub-dataset1 d=%d " %fs1.order, "subdataset2 d=%d" %fs2.order], loc="upper left")

# display plot
plt.show()



In [ ]:



In [ ]: