In [1]:
# import
import numpy as np
import scipy as sp
import timeit
import matplotlib.pyplot as plt

%matplotlib inline

Comparing the time


In [2]:
start = timeit.timeit()

X = range(1000)

pySum = sum([n*n for n in X])

end = timeit.timeit()

print("Total time taken: ", end-start)


Total time taken:  -0.006210096431396868

Learning Scipy


In [20]:
# reading the web data 

data = sp.genfromtxt("data/web_traffic.tsv", delimiter="\t")
print(data[:3])
print(len(data))


[[  1.00000000e+00   2.27200000e+03]
 [  2.00000000e+00              nan]
 [  3.00000000e+00   1.38600000e+03]]
743

Preprocessing and Cleaning the data


In [4]:
X = data[:, 0]
y = data[:, 1]

# checking for nan values
print(sum(np.isnan(X)))
print(sum(np.isnan(y)))


0
8

Filtering the nan data


In [5]:
X = X[~np.isnan(y)]
y = y[~np.isnan(y)]

# checking for nan values
print(sum(np.isnan(X)))
print(sum(np.isnan(y)))


0
0

In [13]:
fig, ax = plt.subplots(figsize=(8,6))

ax.plot(X, y, '.b')
ax.margins(0.2)
plt.xticks([w*24*7 for w in range(0, 6)], ["week %d" %w for w in range(0, 6)])
ax.set_xlabel("Week")
ax.set_ylabel("Hits / Week")
ax.set_title("Web Traffic over weeks")


Out[13]:
<matplotlib.text.Text at 0x297739d8710>

Choosing the right model and learning algorithm


In [7]:
# creating a error calc fuction
def error(f, x, y):
    return np.sum((f(x) - y)**2)

Linear 1-d model


In [14]:
# sp's polyfit func do the same
fp1, residuals, rank, sv, rcond = sp.polyfit(X, y, 1, full=True)

print(fp1)
print(residuals)

# generating the one order function
f1 = sp.poly1d(fp1)

# checking error
print("Error : ",error(f1, X, y))

x1 = np.array([-100, np.max(X)+100])
y1 = f1(x1)

ax.plot(x1, y1, c='g', linewidth=2)
ax.legend(["data", "d = %i" % f1.order], loc='best')
fig


[   2.59619213  989.02487106]
[  3.17389767e+08]
Error :  317389767.34
Out[14]:

$$ f(x) = 2.59619213 * x + 989.02487106 $$

Polynomial 2-d


In [15]:
# sp's polyfit func do the same
fp2 = sp.polyfit(X, y, 2)

print(fp2)

# generating the 2 order function
f2= sp.poly1d(fp2)

# checking error
print("Error : ",error(f2, X, y))

x1= np.linspace(-100, np.max(X)+100, 2000)
y2= f2(x1)

ax.plot(x1, y2, c='r', linewidth=2)
ax.legend(["data", "d = %i" % f1.order, "d = %i" % f2.order], loc='best')
fig


[  1.05322215e-02  -5.26545650e+00   1.97476082e+03]
Error :  179983507.878
Out[15]:
$$ f(x) = 0.0105322215 * x^2 - 5.26545650 * x + 1974.6082 $$

What if we want to regress two response output instead of one, As we can see in the graph that there is a steep change in data between week 3 and 4, so let's draw two reponses line, one for the data between week0 and week3.5 and second for week3.5 to week5


In [10]:
# we are going to divide the data on time so
div = 3.5*7*24

X1 = X[X<=div]
Y1 = y[X<=div]

X2 = X[X>div]
Y2 = y[X>div]

In [11]:
# now plotting the both data

fa = sp.poly1d(sp.polyfit(X1, Y1, 1))
fb = sp.poly1d(sp.polyfit(X2, Y2, 1))

fa_error = error(fa, X1, Y1)
fb_error = error(fb, X2, Y2)
print("Error inflection = %f" % (fa_error + fb_error))


Error inflection = 135015350.586215

In [16]:
x1 = np.linspace(-100, X1[-1]+100, 1000)
x2 = np.linspace(X1[-10], X2[-1]+100, 1000)

ya = fa(x1)
yb = fb(x2)

ax.plot(x1, ya, c='#800000', linewidth=2)  # brown
ax.plot(x2, yb, c='#FFA500', linewidth=2)  # orange
ax.grid(True)


fig


Out[16]:

Suppose we choose that function with degree 2 is best fit for our data and want to predict that if everything will go same then when we will hit the 100000 count ??

$$ 0 = f(x) - 100000 = 0.0105322215 * x^2 - 5.26545650 * x + 1974.6082 - 100000 $$

SciPy's optimize module has the function fsolve that achieves this, when providing an initial starting position with parameter x0. As every entry in our input data file corresponds to one hour, and we have 743 of them, we set the starting position to some value after that. Let fbt2 be the winning polynomial of degree 2.


In [17]:
print(f2)


         2
0.01053 x - 5.265 x + 1975

In [19]:
print(f2 - 100000)

# import 
from scipy.optimize import fsolve

reached_max = fsolve(f2-100000, x0=800)/(7*24)
print("100,000 hits/hour expected at week %f" % reached_max[0])


         2
0.01053 x - 5.265 x - 9.803e+04
100,000 hits/hour expected at week 19.708090

In [ ]: