In [1]:
# import
import numpy as np
import scipy as sp
import timeit
import matplotlib.pyplot as plt
%matplotlib inline
Comparing the time: plain Python vs. NumPy
In [2]:
# time the pure-Python sum of squares using wall-clock timestamps
start = timeit.default_timer()
X = range(1000)
pySum = sum([n*n for n in X])
end = timeit.default_timer()
print("Total time taken: ", end - start)
Learning SciPy
In [20]:
# read the web traffic data (genfromtxt is a NumPy function; older SciPy merely re-exported it)
data = np.genfromtxt("data/web_traffic.tsv", delimiter="\t")
print(data[:3])
print(len(data))
Preprocessing and Cleaning the data
In [4]:
X = data[:, 0]
y = data[:, 1]
# checking for nan values
print(sum(np.isnan(X)))
print(sum(np.isnan(y)))
Filtering out the NaN data
In [5]:
# note the order: filter X first, while y still contains the NaNs
X = X[~np.isnan(y)]
y = y[~np.isnan(y)]
# checking for nan values
print(sum(np.isnan(X)))
print(sum(np.isnan(y)))
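As a quick illustration of the boolean masking used above, on a toy array of my own (not from the original data):
In [ ]:
a = np.array([1.0, np.nan, 3.0])
mask = ~np.isnan(a)  # True wherever the value is an actual number
print(mask)          # [ True False  True]
print(a[mask])       # [1. 3.]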
In [13]:
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(X, y, '.b')
ax.margins(0.2)
ax.set_xticks([w*7*24 for w in range(6)])
ax.set_xticklabels(["week %d" % w for w in range(6)])
ax.set_xlabel("Week")
ax.set_ylabel("Hits / Hour")
ax.set_title("Web Traffic over weeks")
Out[13]:
Choosing the right model and learning algorithm
In [7]:
# define an error function: the residual sum of squares
def error(f, x, y):
    return np.sum((f(x) - y)**2)
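In other words, error computes the residual sum of squares of a model f over the training points:
$$ E(f) = \sum_{i} \left( f(x_i) - y_i \right)^2 $$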
Linear 1-d model
In [14]:
# np.polyfit does a least-squares polynomial fit of the given degree
fp1, residuals, rank, sv, rcond = np.polyfit(X, y, 1, full=True)
print(fp1)
print(residuals)
# build the degree-1 model function from the fitted coefficients
f1 = np.poly1d(fp1)
# checking the error
print("Error : ", error(f1, X, y))
x1 = np.array([-100, np.max(X)+100])
y1 = f1(x1)
ax.plot(x1, y1, c='g', linewidth=2)
ax.legend(["data", "d = %i" % f1.order], loc='best')
fig
Out[14]:
$$ f(x) = 2.59619213 \cdot x + 989.02487106 $$
Polynomial 2-d model
In [15]:
# fit a degree-2 polynomial the same way
fp2 = np.polyfit(X, y, 2)
print(fp2)
# build the degree-2 model function
f2 = np.poly1d(fp2)
# checking the error
print("Error : ", error(f2, X, y))
x1 = np.linspace(-100, np.max(X)+100, 2000)
y2 = f2(x1)
ax.plot(x1, y2, c='r', linewidth=2)
ax.legend(["data", "d = %i" % f1.order, "d = %i" % f2.order], loc='best')
fig
Out[15]:
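To see how the training error keeps shrinking as the polynomial degree grows (a first hint of overfitting), here is a minimal sketch; the set of degrees tried is my own choice, not from the original:
In [ ]:
for d in [1, 2, 3, 10]:
    fd = np.poly1d(np.polyfit(X, y, d))
    print("d = %2i, error = %f" % (d, error(fd, X, y)))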
What if we fit two regression lines instead of one? The plot shows a steep change in the data between weeks 3 and 4, so let's draw two response lines: one for the data between week 0 and week 3.5, and a second for week 3.5 to week 5.
In [10]:
# split the data at week 3.5 (x is measured in hours)
div = 3.5*7*24
X1 = X[X<=div]
Y1 = y[X<=div]
X2 = X[X>div]
Y2 = y[X>div]
In [11]:
# fit a separate straight line to each half of the data
fa = np.poly1d(np.polyfit(X1, Y1, 1))
fb = np.poly1d(np.polyfit(X2, Y2, 1))
fa_error = error(fa, X1, Y1)
fb_error = error(fb, X2, Y2)
print("Error inflection = %f" % (fa_error + fb_error))
In [16]:
x1 = np.linspace(-100, X1[-1]+100, 1000)
x2 = np.linspace(X1[-1], X2[-1]+100, 1000)
ya = fa(x1)
yb = fb(x2)
ax.plot(x1, ya, c='#800000', linewidth=2) # brown
ax.plot(x2, yb, c='#FFA500', linewidth=2) # orange
ax.grid(True)
fig
Out[16]:
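To judge the two-line model against the single-model fits, we can put all three errors side by side (reusing f1, f2, fa_error, and fb_error from above):
In [ ]:
print("Error d=1 (one line):  %f" % error(f1, X, y))
print("Error d=2 (one curve): %f" % error(f2, X, y))
print("Error inflection (two lines): %f" % (fa_error + fb_error))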
Suppose we decide that the degree-2 polynomial is the best fit for our data. If the trend continues unchanged, when will we reach 100,000 hits per hour?
$$ 0 = f(x) - 100000 = 0.0105322215 \cdot x^2 - 5.26545650 \cdot x + 1974.6082 - 100000 $$
SciPy's optimize module provides the function fsolve, which finds such a root given an initial starting position via the parameter x0. Every entry in our input data file corresponds to one hour, and we have 743 of them, so we set the starting position to some value after that. Here f2 is our winning polynomial of degree 2.
In [17]:
print(f2)
In [19]:
print(f2 - 100000)
# fsolve finds a root of f2 - 100000, starting the search at x0 (x is in hours)
from scipy.optimize import fsolve
reached_max = fsolve(f2 - 100000, x0=800) / (7*24)  # convert hours to weeks
print("100,000 hits/hour expected at week %f" % reached_max[0])