In [33]:
import matplotlib
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
%matplotlib inline
In [34]:
# Record the library versions this notebook was executed with.
version_template = "Pandas version:{}\nNumpy version:{}\n"
print(version_template.format(pd.__version__, np.__version__))
In [35]:
# Load the raw ';'-separated file into a DataFrame with explicit column names.
# The file is opened in a `with` block so the handle is closed once parsing
# is done (the name `data_file` stays bound, but to a closed file).
# The separator is passed as `sep=` because newer pandas releases no longer
# accept it positionally.
# NOTE(review): file lines 2-8 (0-based) are skipped without explanation, and
# because `names=` is given the first line is read as data — confirm both are
# intended for this file.
with open("data/ta_feng_dataset/D02") as data_file:
    data = pd.read_csv(
        data_file,
        sep=';',
        skiprows=[2, 3, 4, 5, 6, 7, 8],
        names=['Date', 'Customer_Id', 'Age', 'Area', 'Product_Class',
               'Product_Id', 'Amount', 'Assets', 'Price'],
    )
data.head()
Out[35]:
In [36]:
# Show the (rows, columns) dimensions of the loaded frame.
data.shape
Out[36]:
In [37]:
# Keep only the two columns the rest of the notebook uses.
# Selecting by name (instead of dropping positions 2..8) gives the same
# frame on the first run and, unlike the positional drop, does not fail if
# this cell is executed again after `data` has already been narrowed.
# `.copy()` makes the result an independent frame so later column
# assignments do not act on a slice of the original.
data = data[['Date', 'Customer_Id']].copy()
data.head()
Out[37]:
In [38]:
# Convert the Date column to pandas datetimes so it can be grouped and
# queried for weekday information further down.
parsed_dates = pd.to_datetime(data['Date'])
data['Date'] = parsed_dates
data.head()
Out[38]:
In [39]:
# One row per date, holding the number of non-null Customer_Id entries
# recorded on that date, exposed under the name People_count.
# NOTE(review): count() counts rows, not distinct customers; if a customer
# can appear on several rows of the same day this over-counts people —
# confirm whether nunique() was intended.
data_count = (
    data.groupby('Date')
        .count()
        .rename(columns={'Customer_Id': 'People_count'})
)
data_count
Out[39]:
In [40]:
# Move the dates out of the index into a regular 'Date' column and replace
# the index with a 0-based counter named 'Index'.
data_count = (
    data_count
    .assign(Date=data_count.index, Index=range(len(data_count)))
    .set_index('Index')
)
data_count
Out[40]:
In [41]:
# Dot plot of the daily count against the calendar date.
daily_plot_options = {'x': 'Date', 'y': 'People_count', 'style': "."}
data_count.plot(**daily_plot_options)
Out[41]:
In [42]:
# Number the rows 1..N; 'from_beg' is the regressor used by the trend
# models fitted below.
n_days = len(data_count)
data_count['from_beg'] = range(1, n_days + 1)
data_count
Out[42]:
In [43]:
# Ordinary least squares: People_count explained by from_beg (with the
# intercept the formula interface adds). The last line shows the fitted
# coefficients.
trend_model = smf.ols(formula='People_count ~ from_beg', data=data_count)
lm = trend_model.fit()
lm.params
Out[43]:
In [44]:
# Two-row frame holding the smallest and largest from_beg values; the
# model's predictions at these two points define the fitted line's ends.
first_day = data_count['from_beg'].min()
last_day = data_count['from_beg'].max()
x_new = pd.DataFrame({'from_beg': [first_day, last_day]})
x_new
Out[44]:
In [45]:
# Evaluate the fitted model at the two from_beg values held in x_new.
preds = lm.predict(x_new)
preds
Out[45]:
In [46]:
# Scatter the observed daily counts and keep the Axes pandas returns ...
fit_axes = data_count.plot(kind='scatter', x='from_beg', y='People_count')
# ... then draw the least-squares line on that same Axes.
fit_axes.plot(x_new, preds, c='red', linewidth=2)
Out[46]:
In [47]:
# Display the summary report of the fitted model held in `lm`.
lm.summary()
Out[47]:
In [48]:
# Display the alternative summary report (summary2) of the same fitted model.
lm.summary2()
Out[48]:
In [52]:
# Fit the same People_count ~ from_beg relationship with scikit-learn and
# print its slope and intercept.
# pandas Series no longer provide .reshape(), so the underlying NumPy
# arrays are reshaped instead; both end up as (n_samples, 1) column vectors.
x = data_count['from_beg'].values.reshape(-1, 1)
y = data_count['People_count'].values.reshape(-1, 1)
# NOTE: this rebinds `lm`, which earlier cells use for the statsmodels
# result; re-running those cells after this one will hit the sklearn object.
lm = LinearRegression()
lm.fit(x, y)
print(lm.coef_)
print(lm.intercept_)
In [63]:
# Indicator column: 1 when the date falls on a Monday, otherwise 0.
# Uses the vectorized .dt.weekday accessor (Monday=0, Sunday=6) instead of a
# per-row Python lambda; the result is stored as int64 like before.
data_count["is_monday"] = (data_count["Date"].dt.weekday == 0).astype("int64")
In [64]:
# Inspect the newly added is_monday column.
data_count['is_monday']
Out[64]:
In [67]:
# Indicator columns for Tuesday..Saturday: is_<day> is 1 when the date
# falls on that weekday, otherwise 0. Weekday numbers follow
# Series.dt.weekday (Monday=0, Sunday=6), so Tuesday starts at 1.
# No is_sunday column is created here.
# NOTE(review): presumably Sunday is left out as the baseline category for
# a later regression — confirm.
weekday_numbers = data_count["Date"].dt.weekday
weekday_names = ["tuesday", "wednesday", "thursday", "friday", "saturday"]
for day_number, day_name in enumerate(weekday_names, start=1):
    data_count["is_" + day_name] = (weekday_numbers == day_number).astype("int64")
In [68]:
# Display the frame with the weekday indicator columns added.
data_count
Out[68]:
In [ ]: