notebook.community

Edit and run



In [1]:

    
import matplotlib
matplotlib.use("Agg")
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
%matplotlib inline



In [2]:

    
print("Pandas version:{}\nNumpy version:{}\n".format(pd.__version__, np.__version__))









    



Pandas version:0.17.1
Numpy version:1.10.1



In [3]:

    
data_file = open("data/ta_feng_dataset/D02")
data = pd.read_csv(data_file,';',skiprows=[2,3,4,5,6,7,8],names=['Date','Customer_Id','Age','Area','Product_Class','Product_Id','Amount','Assets','Price'])
data.head()









    Out[3]:






  
    
      
      Date
      Customer_Id
      Age
      Area
      Product_Class
      Product_Id
      Amount
      Assets
      Price
    
  
  
    
      0
      2001-02-01 00:00:00
      557818
      H
      E
      500210
      4710114105046
      1
      123
      135
    
    
      1
      2001-02-01 00:00:00
      1677683
      C
      B
      711310
      4902520163103
      6
      840
      894
    
    
      2
      2001-02-03 00:00:00
      1543483
      C
      F
      500307
      4901422038939
      1
      110
      119
    
    
      3
      2001-02-03 00:00:00
      2120010
      G
      F
      130207
      4710105011011
      1
      24
      29
    
    
      4
      2001-02-04 00:00:00
      1431131
      K
      G
      120103
      4710011405133
      1
      23
      29



In [4]:

    
data.shape









    Out[4]:





(199032, 9)



In [5]:

    
unrelated_columns = range(2,9)
data = data.drop(data.columns[unrelated_columns],axis=1)
data.head()
# data.plot(kind='scatter', y='Amount', x='Customer_Id')









    Out[5]:






  
    
      
      Date
      Customer_Id
    
  
  
    
      0
      2001-02-01 00:00:00
      557818
    
    
      1
      2001-02-01 00:00:00
      1677683
    
    
      2
      2001-02-03 00:00:00
      1543483
    
    
      3
      2001-02-03 00:00:00
      2120010
    
    
      4
      2001-02-04 00:00:00
      1431131



In [6]:

    
data['Date'] = pd.to_datetime(data['Date'])
data.head()









    Out[6]:






  
    
      
      Date
      Customer_Id
    
  
  
    
      0
      2001-02-01
      557818
    
    
      1
      2001-02-01
      1677683
    
    
      2
      2001-02-03
      1543483
    
    
      3
      2001-02-03
      2120010
    
    
      4
      2001-02-04
      1431131



In [7]:

    
data_count = data.groupby('Date').count()
data_count.rename(columns={'Customer_Id':'People_count','Date':'Date'},inplace=True)
data_count









    Out[7]:






  
    
      
      People_count
    
    
      Date
      
    
  
  
    
      2001-02-01
      4282
    
    
      2001-02-02
      4615
    
    
      2001-02-03
      10497
    
    
      2001-02-04
      11820
    
    
      2001-02-05
      5578
    
    
      2001-02-06
      5684
    
    
      2001-02-07
      4931
    
    
      2001-02-08
      5448
    
    
      2001-02-09
      5309
    
    
      2001-02-10
      9971
    
    
      2001-02-11
      11993
    
    
      2001-02-12
      4601
    
    
      2001-02-13
      5632
    
    
      2001-02-14
      6002
    
    
      2001-02-15
      6719
    
    
      2001-02-16
      5372
    
    
      2001-02-17
      8089
    
    
      2001-02-18
      9849
    
    
      2001-02-19
      6513
    
    
      2001-02-20
      6021
    
    
      2001-02-21
      6501
    
    
      2001-02-22
      6014
    
    
      2001-02-23
      6202
    
    
      2001-02-24
      9612
    
    
      2001-02-25
      12914
    
    
      2001-02-26
      4680
    
    
      2001-02-27
      4808
    
    
      2001-02-28
      9375



In [8]:

    
data_count['Date'] = data_count.index
data_count['Index'] = range(0, len(data_count))
data_count = data_count.set_index('Index')
data_count









    Out[8]:






  
    
      
      People_count
      Date
    
    
      Index
      
      
    
  
  
    
      0
      4282
      2001-02-01
    
    
      1
      4615
      2001-02-02
    
    
      2
      10497
      2001-02-03
    
    
      3
      11820
      2001-02-04
    
    
      4
      5578
      2001-02-05
    
    
      5
      5684
      2001-02-06
    
    
      6
      4931
      2001-02-07
    
    
      7
      5448
      2001-02-08
    
    
      8
      5309
      2001-02-09
    
    
      9
      9971
      2001-02-10
    
    
      10
      11993
      2001-02-11
    
    
      11
      4601
      2001-02-12
    
    
      12
      5632
      2001-02-13
    
    
      13
      6002
      2001-02-14
    
    
      14
      6719
      2001-02-15
    
    
      15
      5372
      2001-02-16
    
    
      16
      8089
      2001-02-17
    
    
      17
      9849
      2001-02-18
    
    
      18
      6513
      2001-02-19
    
    
      19
      6021
      2001-02-20
    
    
      20
      6501
      2001-02-21
    
    
      21
      6014
      2001-02-22
    
    
      22
      6202
      2001-02-23
    
    
      23
      9612
      2001-02-24
    
    
      24
      12914
      2001-02-25
    
    
      25
      4680
      2001-02-26
    
    
      26
      4808
      2001-02-27
    
    
      27
      9375
      2001-02-28



In [9]:

    
data_count.plot(x='Date',y='People_count',style=".")









    Out[9]:





<matplotlib.axes._subplots.AxesSubplot at 0x1134dbb10>



In [10]:

    
data_count['from_beg'] = range(1,len(data_count)+1)
data_count









    Out[10]:






  
    
      
      People_count
      Date
      from_beg
    
    
      Index
      
      
      
    
  
  
    
      0
      4282
      2001-02-01
      1
    
    
      1
      4615
      2001-02-02
      2
    
    
      2
      10497
      2001-02-03
      3
    
    
      3
      11820
      2001-02-04
      4
    
    
      4
      5578
      2001-02-05
      5
    
    
      5
      5684
      2001-02-06
      6
    
    
      6
      4931
      2001-02-07
      7
    
    
      7
      5448
      2001-02-08
      8
    
    
      8
      5309
      2001-02-09
      9
    
    
      9
      9971
      2001-02-10
      10
    
    
      10
      11993
      2001-02-11
      11
    
    
      11
      4601
      2001-02-12
      12
    
    
      12
      5632
      2001-02-13
      13
    
    
      13
      6002
      2001-02-14
      14
    
    
      14
      6719
      2001-02-15
      15
    
    
      15
      5372
      2001-02-16
      16
    
    
      16
      8089
      2001-02-17
      17
    
    
      17
      9849
      2001-02-18
      18
    
    
      18
      6513
      2001-02-19
      19
    
    
      19
      6021
      2001-02-20
      20
    
    
      20
      6501
      2001-02-21
      21
    
    
      21
      6014
      2001-02-22
      22
    
    
      22
      6202
      2001-02-23
      23
    
    
      23
      9612
      2001-02-24
      24
    
    
      24
      12914
      2001-02-25
      25
    
    
      25
      4680
      2001-02-26
      26
    
    
      26
      4808
      2001-02-27
      27
    
    
      27
      9375
      2001-02-28
      28



In [11]:

    
lm = smf.ols(formula='People_count ~ from_beg', data=data_count).fit()
lm.params









    Out[11]:





Intercept    6608.349206
from_beg       34.478380
dtype: float64



In [12]:

    
x_new = pd.DataFrame({'from_beg': [data_count['from_beg'].min(), data_count['from_beg'].max()]})
x_new



In [13]:

    
preds = lm.predict(x_new)
preds









    Out[13]:





array([ 6642.82758621,  7573.74384236])



In [14]:

    
# first, plot the observed data
data_count.plot(kind='scatter', x='from_beg', y='People_count')

# then, plot the least squares line
plt.plot(x_new, preds, c='red', linewidth=2)









    Out[14]:





[<matplotlib.lines.Line2D at 0x113636c90>]



In [15]:

    
lm.summary()









    Out[15]:





OLS Regression Results

  Dep. Variable:       People_count      R-squared:             0.012


  Model:                    OLS          Adj. R-squared:       -0.026


  Method:              Least Squares     F-statistic:          0.3237


  Date:              Tue, 15 Dec 2015    Prob (F-statistic):    0.574 


  Time:                  11:31:06        Log-Likelihood:      -258.76


  No. Observations:           28         AIC:                   521.5


  Df Residuals:               26         BIC:                   524.2


  Df Model:                    1                                     


  Covariance Type:       nonrobust                                   




               coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  Intercept   6608.3492   1005.811      6.570   0.000   4540.874  8675.824


  from_beg      34.4784     60.598      0.569   0.574    -90.082   159.039




  Omnibus:         5.013    Durbin-Watson:         1.683


  Prob(Omnibus):   0.082    Jarque-Bera (JB):      4.563


  Skew:            0.965    Prob(JB):              0.102


  Kurtosis:        2.571    Cond. No.               34.2



In [16]:

    
lm.summary2()









    Out[16]:






        Model:                OLS          Adj. R-squared:      -0.026  


  Dependent Variable:    People_count           AIC:           521.5159 


         Date:         2015-12-15 11:31         BIC:           524.1803 


   No. Observations:          28           Log-Likelihood:      -258.76 


       Df Model:               1            F-statistic:        0.3237  


     Df Residuals:            26         Prob (F-statistic):     0.574  


      R-squared:             0.012             Scale:         6.7089e+06




              Coef.    Std.Err.      t     P>|t|   [0.025     0.975]  


  Intercept  6608.3492  1005.8114  6.5702  0.0000  4540.8743  8675.8241


  from_beg    34.4784    60.5976   0.5690  0.5743  -90.0819   159.0386 




     Omnibus:     5.013   Durbin-Watson:    1.683


  Prob(Omnibus):  0.082  Jarque-Bera (JB):  4.563


       Skew:      0.965      Prob(JB):      0.102


     Kurtosis:    2.571   Condition No.:     34



In [47]:

    
x = data_count['from_beg']
y = data_count['People_count']
x = x.reshape((x.shape[0], 1))
y = y.reshape((y.shape[0], 1))
lm = LinearRegression()
lm.fit(x, y)
print(lm.coef_)
print(lm.intercept_)
lm.predict([[23]])









    



[[ 34.47837986]]
[ 6608.34920635]






    Out[47]:





array([[ 7401.35194308]])



In [18]:

    
data_count["is_monday"] = data_count["Date"].apply(lambda x: 1 if x.weekday() == 0 else 0) #The day of the week with Monday=0, Sunday=6



In [19]:

    
data_count['is_monday']









    Out[19]:





Index
0     0
1     0
2     0
3     0
4     1
5     0
6     0
7     0
8     0
9     0
10    0
11    1
12    0
13    0
14    0
15    0
16    0
17    0
18    1
19    0
20    0
21    0
22    0
23    0
24    0
25    1
26    0
27    0
Name: is_monday, dtype: int64



In [20]:

    
data_count["is_tuesday"] = data_count["Date"].apply(lambda x: 1 if x.weekday() == 1 else 0)
data_count["is_wednesday"] = data_count["Date"].apply(lambda x: 1 if x.weekday() == 2 else 0)
data_count["is_thursday"] = data_count["Date"].apply(lambda x: 1 if x.weekday() == 3 else 0)
data_count["is_friday"] = data_count["Date"].apply(lambda x: 1 if x.weekday() == 4 else 0)
data_count["is_saturday"] = data_count["Date"].apply(lambda x: 1 if x.weekday() == 5 else 0)



In [25]:

    
data_count









    Out[25]:






  
    
      
      People_count
      Date
      from_beg
      is_monday
      is_tuesday
      is_wednesday
      is_thursday
      is_friday
      is_saturday
    
    
      Index
      
      
      
      
      
      
      
      
      
    
  
  
    
      0
      4282
      2001-02-01
      1
      0
      0
      0
      1
      0
      0
    
    
      1
      4615
      2001-02-02
      2
      0
      0
      0
      0
      1
      0
    
    
      2
      10497
      2001-02-03
      3
      0
      0
      0
      0
      0
      1
    
    
      3
      11820
      2001-02-04
      4
      0
      0
      0
      0
      0
      0
    
    
      4
      5578
      2001-02-05
      5
      1
      0
      0
      0
      0
      0
    
    
      5
      5684
      2001-02-06
      6
      0
      1
      0
      0
      0
      0
    
    
      6
      4931
      2001-02-07
      7
      0
      0
      1
      0
      0
      0
    
    
      7
      5448
      2001-02-08
      8
      0
      0
      0
      1
      0
      0
    
    
      8
      5309
      2001-02-09
      9
      0
      0
      0
      0
      1
      0
    
    
      9
      9971
      2001-02-10
      10
      0
      0
      0
      0
      0
      1
    
    
      10
      11993
      2001-02-11
      11
      0
      0
      0
      0
      0
      0
    
    
      11
      4601
      2001-02-12
      12
      1
      0
      0
      0
      0
      0
    
    
      12
      5632
      2001-02-13
      13
      0
      1
      0
      0
      0
      0
    
    
      13
      6002
      2001-02-14
      14
      0
      0
      1
      0
      0
      0
    
    
      14
      6719
      2001-02-15
      15
      0
      0
      0
      1
      0
      0
    
    
      15
      5372
      2001-02-16
      16
      0
      0
      0
      0
      1
      0
    
    
      16
      8089
      2001-02-17
      17
      0
      0
      0
      0
      0
      1
    
    
      17
      9849
      2001-02-18
      18
      0
      0
      0
      0
      0
      0
    
    
      18
      6513
      2001-02-19
      19
      1
      0
      0
      0
      0
      0
    
    
      19
      6021
      2001-02-20
      20
      0
      1
      0
      0
      0
      0
    
    
      20
      6501
      2001-02-21
      21
      0
      0
      1
      0
      0
      0
    
    
      21
      6014
      2001-02-22
      22
      0
      0
      0
      1
      0
      0
    
    
      22
      6202
      2001-02-23
      23
      0
      0
      0
      0
      1
      0
    
    
      23
      9612
      2001-02-24
      24
      0
      0
      0
      0
      0
      1
    
    
      24
      12914
      2001-02-25
      25
      0
      0
      0
      0
      0
      0
    
    
      25
      4680
      2001-02-26
      26
      1
      0
      0
      0
      0
      0
    
    
      26
      4808
      2001-02-27
      27
      0
      1
      0
      0
      0
      0
    
    
      27
      9375
      2001-02-28
      28
      0
      0
      1
      0
      0
      0



In [23]:

    
feature_cols = ['from_beg', 'is_monday', 'is_tuesday', 'is_wednesday', 'is_thursday', 'is_friday','is_saturday']
X = data_count[feature_cols]
y = data_count['People_count']
x = x.reshape((x.shape[0], 1))
y = y.reshape((y.shape[0], 1))
# instantiate, fit
lm = LinearRegression()
lm.fit(X, y)









    Out[23]:





LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)



In [26]:

    
lm.coef_
lm.intercept_









    Out[26]:





array([ 11090.57346939])



In [32]:

    
lm.predict([[20,0,0,0,0,1,0]])









    Out[32]:





array([[ 5660.75510204]])



In [46]:

    
a = data_count['Date'].min()
from datetime import datetime
datetime.combine(a.date(), datetime.min.time())









    Out[46]:





datetime.datetime(2001, 2, 1, 0, 0)

	Date	Customer_Id	Age	Area	Product_Class	Product_Id	Amount	Assets	Price
0	2001-02-01 00:00:00	557818	H	E	500210	4710114105046	1	123	135
1	2001-02-01 00:00:00	1677683	C	B	711310	4902520163103	6	840	894
2	2001-02-03 00:00:00	1543483	C	F	500307	4901422038939	1	110	119
3	2001-02-03 00:00:00	2120010	G	F	130207	4710105011011	1	24	29
4	2001-02-04 00:00:00	1431131	K	G	120103	4710011405133	1	23	29

	Date	Customer_Id
0	2001-02-01	557818
1	2001-02-01	1677683
2	2001-02-03	1543483
3	2001-02-03	2120010
4	2001-02-04	1431131

	People_count
Date
2001-02-01	4282
2001-02-02	4615
2001-02-03	10497
2001-02-04	11820
2001-02-05	5578
2001-02-06	5684
2001-02-07	4931
2001-02-08	5448
2001-02-09	5309
2001-02-10	9971
2001-02-11	11993
2001-02-12	4601
2001-02-13	5632
2001-02-14	6002
2001-02-15	6719
2001-02-16	5372
2001-02-17	8089
2001-02-18	9849
2001-02-19	6513
2001-02-20	6021
2001-02-21	6501
2001-02-22	6014
2001-02-23	6202
2001-02-24	9612
2001-02-25	12914
2001-02-26	4680
2001-02-27	4808
2001-02-28	9375

	People_count	Date
Index
0	4282	2001-02-01
1	4615	2001-02-02
2	10497	2001-02-03
3	11820	2001-02-04
4	5578	2001-02-05
5	5684	2001-02-06
6	4931	2001-02-07
7	5448	2001-02-08
8	5309	2001-02-09
9	9971	2001-02-10
10	11993	2001-02-11
11	4601	2001-02-12
12	5632	2001-02-13
13	6002	2001-02-14
14	6719	2001-02-15
15	5372	2001-02-16
16	8089	2001-02-17
17	9849	2001-02-18
18	6513	2001-02-19
19	6021	2001-02-20
20	6501	2001-02-21
21	6014	2001-02-22
22	6202	2001-02-23
23	9612	2001-02-24
24	12914	2001-02-25
25	4680	2001-02-26
26	4808	2001-02-27
27	9375	2001-02-28

Dep. Variable:	People_count	R-squared:	0.012
Model:	OLS	Adj. R-squared:	-0.026
Method:	Least Squares	F-statistic:	0.3237
Date:	Tue, 15 Dec 2015	Prob (F-statistic):	0.574
Time:	11:31:06	Log-Likelihood:	-258.76
No. Observations:	28	AIC:	521.5
Df Residuals:	26	BIC:	524.2
Df Model:	1
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
Intercept	6608.3492	1005.811	6.570	0.000	4540.874 8675.824
from_beg	34.4784	60.598	0.569	0.574	-90.082 159.039

Omnibus:	5.013	Durbin-Watson:	1.683
Prob(Omnibus):	0.082	Jarque-Bera (JB):	4.563
Skew:	0.965	Prob(JB):	0.102
Kurtosis:	2.571	Cond. No.	34.2

	Coef.	Std.Err.	t	P>\|t\|	[0.025	0.975]
Intercept	6608.3492	1005.8114	6.5702	0.0000	4540.8743	8675.8241
from_beg	34.4784	60.5976	0.5690	0.5743	-90.0819	159.0386