In [33]:
import matplotlib
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
%matplotlib inline

In [34]:
print("Pandas version:{}\nNumpy version:{}\n".format(pd.__version__, np.__version__))


Pandas version:0.17.1
Numpy version:1.10.1


In [35]:
data_file = open("data/ta_feng_dataset/D02")
data = pd.read_csv(data_file,';',skiprows=[2,3,4,5,6,7,8],names=['Date','Customer_Id','Age','Area','Product_Class','Product_Id','Amount','Assets','Price'])
data.head()


Out[35]:
Date Customer_Id Age Area Product_Class Product_Id Amount Assets Price
0 2001-02-01 00:00:00 557818 H E 500210 4710114105046 1 123 135
1 2001-02-01 00:00:00 1677683 C B 711310 4902520163103 6 840 894
2 2001-02-03 00:00:00 1543483 C F 500307 4901422038939 1 110 119
3 2001-02-03 00:00:00 2120010 G F 130207 4710105011011 1 24 29
4 2001-02-04 00:00:00 1431131 K G 120103 4710011405133 1 23 29

In [36]:
data.shape


Out[36]:
(199032, 9)

In [37]:
unrelated_columns = range(2,9)
data = data.drop(data.columns[unrelated_columns],axis=1)
data.head()
# data.plot(kind='scatter', y='Amount', x='Customer_Id')


Out[37]:
Date Customer_Id
0 2001-02-01 00:00:00 557818
1 2001-02-01 00:00:00 1677683
2 2001-02-03 00:00:00 1543483
3 2001-02-03 00:00:00 2120010
4 2001-02-04 00:00:00 1431131

In [38]:
data['Date'] = pd.to_datetime(data['Date'])
data.head()


Out[38]:
Date Customer_Id
0 2001-02-01 557818
1 2001-02-01 1677683
2 2001-02-03 1543483
3 2001-02-03 2120010
4 2001-02-04 1431131

In [39]:
data_count = data.groupby('Date').count()
data_count.rename(columns={'Customer_Id':'People_count','Date':'Date'},inplace=True)
data_count


Out[39]:
People_count
Date
2001-02-01 4282
2001-02-02 4615
2001-02-03 10497
2001-02-04 11820
2001-02-05 5578
2001-02-06 5684
2001-02-07 4931
2001-02-08 5448
2001-02-09 5309
2001-02-10 9971
2001-02-11 11993
2001-02-12 4601
2001-02-13 5632
2001-02-14 6002
2001-02-15 6719
2001-02-16 5372
2001-02-17 8089
2001-02-18 9849
2001-02-19 6513
2001-02-20 6021
2001-02-21 6501
2001-02-22 6014
2001-02-23 6202
2001-02-24 9612
2001-02-25 12914
2001-02-26 4680
2001-02-27 4808
2001-02-28 9375

In [40]:
data_count['Date'] = data_count.index
data_count['Index'] = range(0, len(data_count))
data_count = data_count.set_index('Index')
data_count


Out[40]:
People_count Date
Index
0 4282 2001-02-01
1 4615 2001-02-02
2 10497 2001-02-03
3 11820 2001-02-04
4 5578 2001-02-05
5 5684 2001-02-06
6 4931 2001-02-07
7 5448 2001-02-08
8 5309 2001-02-09
9 9971 2001-02-10
10 11993 2001-02-11
11 4601 2001-02-12
12 5632 2001-02-13
13 6002 2001-02-14
14 6719 2001-02-15
15 5372 2001-02-16
16 8089 2001-02-17
17 9849 2001-02-18
18 6513 2001-02-19
19 6021 2001-02-20
20 6501 2001-02-21
21 6014 2001-02-22
22 6202 2001-02-23
23 9612 2001-02-24
24 12914 2001-02-25
25 4680 2001-02-26
26 4808 2001-02-27
27 9375 2001-02-28

In [41]:
data_count.plot(x='Date',y='People_count',style=".")


Out[41]:
<matplotlib.axes._subplots.AxesSubplot at 0x11516a5d0>

In [42]:
data_count['from_beg'] = range(1,len(data_count)+1)
data_count


Out[42]:
People_count Date from_beg
Index
0 4282 2001-02-01 1
1 4615 2001-02-02 2
2 10497 2001-02-03 3
3 11820 2001-02-04 4
4 5578 2001-02-05 5
5 5684 2001-02-06 6
6 4931 2001-02-07 7
7 5448 2001-02-08 8
8 5309 2001-02-09 9
9 9971 2001-02-10 10
10 11993 2001-02-11 11
11 4601 2001-02-12 12
12 5632 2001-02-13 13
13 6002 2001-02-14 14
14 6719 2001-02-15 15
15 5372 2001-02-16 16
16 8089 2001-02-17 17
17 9849 2001-02-18 18
18 6513 2001-02-19 19
19 6021 2001-02-20 20
20 6501 2001-02-21 21
21 6014 2001-02-22 22
22 6202 2001-02-23 23
23 9612 2001-02-24 24
24 12914 2001-02-25 25
25 4680 2001-02-26 26
26 4808 2001-02-27 27
27 9375 2001-02-28 28

In [43]:
lm = smf.ols(formula='People_count ~ from_beg', data=data_count).fit()
lm.params


Out[43]:
Intercept    6608.349206
from_beg       34.478380
dtype: float64

In [44]:
x_new = pd.DataFrame({'from_beg': [data_count['from_beg'].min(), data_count['from_beg'].max()]})
x_new


Out[44]:
from_beg
0 1
1 28

In [45]:
preds = lm.predict(x_new)
preds


Out[45]:
array([ 6642.82758621,  7573.74384236])

In [46]:
# first, plot the observed data
data_count.plot(kind='scatter', x='from_beg', y='People_count')

# then, plot the least squares line
plt.plot(x_new, preds, c='red', linewidth=2)


Out[46]:
[<matplotlib.lines.Line2D at 0x11182fa50>]

In [47]:
lm.summary()


Out[47]:
OLS Regression Results
Dep. Variable: People_count R-squared: 0.012
Model: OLS Adj. R-squared: -0.026
Method: Least Squares F-statistic: 0.3237
Date: Tue, 15 Dec 2015 Prob (F-statistic): 0.574
Time: 11:13:13 Log-Likelihood: -258.76
No. Observations: 28 AIC: 521.5
Df Residuals: 26 BIC: 524.2
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
Intercept 6608.3492 1005.811 6.570 0.000 4540.874 8675.824
from_beg 34.4784 60.598 0.569 0.574 -90.082 159.039
Omnibus: 5.013 Durbin-Watson: 1.683
Prob(Omnibus): 0.082 Jarque-Bera (JB): 4.563
Skew: 0.965 Prob(JB): 0.102
Kurtosis: 2.571 Cond. No. 34.2

In [48]:
lm.summary2()


Out[48]:
Model: OLS Adj. R-squared: -0.026
Dependent Variable: People_count AIC: 521.5159
Date: 2015-12-15 11:13 BIC: 524.1803
No. Observations: 28 Log-Likelihood: -258.76
Df Model: 1 F-statistic: 0.3237
Df Residuals: 26 Prob (F-statistic): 0.574
R-squared: 0.012 Scale: 6.7089e+06
Coef. Std.Err. t P>|t| [0.025 0.975]
Intercept 6608.3492 1005.8114 6.5702 0.0000 4540.8743 8675.8241
from_beg 34.4784 60.5976 0.5690 0.5743 -90.0819 159.0386
Omnibus: 5.013 Durbin-Watson: 1.683
Prob(Omnibus): 0.082 Jarque-Bera (JB): 4.563
Skew: 0.965 Prob(JB): 0.102
Kurtosis: 2.571 Condition No.: 34

In [52]:
x = data_count['from_beg']
y = data_count['People_count']
x = x.reshape((x.shape[0], 1))
y = y.reshape((y.shape[0], 1))
lm = LinearRegression()
lm.fit(x, y)
print(lm.coef_)
print(lm.intercept_)


[[ 34.47837986]]
[ 6608.34920635]

In [63]:
data_count["is_monday"] = data_count["Date"].apply(lambda x: 1 if x.weekday() == 0 else 0) #The day of the week with Monday=0, Sunday=6

In [64]:
data_count['is_monday']


Out[64]:
Index
0     0
1     0
2     0
3     0
4     1
5     0
6     0
7     0
8     0
9     0
10    0
11    1
12    0
13    0
14    0
15    0
16    0
17    0
18    1
19    0
20    0
21    0
22    0
23    0
24    0
25    1
26    0
27    0
Name: is_monday, dtype: int64

In [67]:
data_count["is_tuesday"] = data_count["Date"].apply(lambda x: 1 if x.weekday() == 1 else 0)
data_count["is_wednesday"] = data_count["Date"].apply(lambda x: 1 if x.weekday() == 2 else 0)
data_count["is_thursday"] = data_count["Date"].apply(lambda x: 1 if x.weekday() == 3 else 0)
data_count["is_friday"] = data_count["Date"].apply(lambda x: 1 if x.weekday() == 4 else 0)
data_count["is_saturday"] = data_count["Date"].apply(lambda x: 1 if x.weekday() == 5 else 0)

In [68]:
data_count


Out[68]:
People_count Date from_beg is_monday is_tuesday is_wednesday is_thursday is_friday is_saturday is_sunday
Index
0 4282 2001-02-01 1 0 0 0 1 0 0 0
1 4615 2001-02-02 2 0 0 0 0 1 0 0
2 10497 2001-02-03 3 0 0 0 0 0 1 0
3 11820 2001-02-04 4 0 0 0 0 0 0 1
4 5578 2001-02-05 5 1 0 0 0 0 0 0
5 5684 2001-02-06 6 0 1 0 0 0 0 0
6 4931 2001-02-07 7 0 0 1 0 0 0 0
7 5448 2001-02-08 8 0 0 0 1 0 0 0
8 5309 2001-02-09 9 0 0 0 0 1 0 0
9 9971 2001-02-10 10 0 0 0 0 0 1 0
10 11993 2001-02-11 11 0 0 0 0 0 0 1
11 4601 2001-02-12 12 1 0 0 0 0 0 0
12 5632 2001-02-13 13 0 1 0 0 0 0 0
13 6002 2001-02-14 14 0 0 1 0 0 0 0
14 6719 2001-02-15 15 0 0 0 1 0 0 0
15 5372 2001-02-16 16 0 0 0 0 1 0 0
16 8089 2001-02-17 17 0 0 0 0 0 1 0
17 9849 2001-02-18 18 0 0 0 0 0 0 1
18 6513 2001-02-19 19 1 0 0 0 0 0 0
19 6021 2001-02-20 20 0 1 0 0 0 0 0
20 6501 2001-02-21 21 0 0 1 0 0 0 0
21 6014 2001-02-22 22 0 0 0 1 0 0 0
22 6202 2001-02-23 23 0 0 0 0 1 0 0
23 9612 2001-02-24 24 0 0 0 0 0 1 0
24 12914 2001-02-25 25 0 0 0 0 0 0 1
25 4680 2001-02-26 26 1 0 0 0 0 0 0
26 4808 2001-02-27 27 0 1 0 0 0 0 0
27 9375 2001-02-28 28 0 0 1 0 0 0 0

In [ ]: