In [1]:
from datetime import datetime

import matplotlib
matplotlib.use("Agg")
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
%matplotlib inline
In [2]:
# Record the library versions this notebook was run with.
version_report = "Pandas version:{}\nNumpy version:{}\n".format(
    pd.__version__,
    np.__version__,
)
print(version_report)
In [3]:
# Load the semicolon-separated transaction file into a DataFrame.
# - The file is opened in a `with` block so the handle is closed after parsing
#   (it was previously left open for the life of the kernel).
# - `sep` is passed by keyword: recent pandas accepts only the file argument
#   positionally.
# - `names` supplies the nine column labels; `skiprows` skips the listed
#   0-based line numbers.
# NOTE(review): why lines 2-8 are skipped is not documented here - confirm.
with open("data/ta_feng_dataset/D02") as data_file:
    data = pd.read_csv(
        data_file,
        sep=';',
        skiprows=[2, 3, 4, 5, 6, 7, 8],
        names=[
            'Date', 'Customer_Id', 'Age', 'Area', 'Product_Class',
            'Product_Id', 'Amount', 'Assets', 'Price',
        ],
    )
data.head()
Out[3]:
In [4]:
# (rows, columns) of the frame as loaded, before any columns are dropped.
data.shape
Out[4]:
In [5]:
# Keep only Date and Customer_Id; the other seven columns are not used below.
# Columns are dropped by label with errors='ignore' so the cell can be re-run
# safely: dropping by position (columns 2..8) raised IndexError on a second
# run, once those columns were already gone.
unrelated_columns = [
    'Age', 'Area', 'Product_Class', 'Product_Id', 'Amount', 'Assets', 'Price',
]
data = data.drop(unrelated_columns, axis=1, errors='ignore')
data.head()
Out[5]:
In [6]:
# Convert the Date column to pandas datetimes; later cells call .weekday()
# on these values.
data = data.assign(Date=pd.to_datetime(data['Date']))
data.head()
Out[6]:
In [7]:
# Number of rows recorded on each date, indexed by Date, with the single
# remaining column relabelled People_count.
# NOTE(review): count() tallies rows with a non-null Customer_Id, so a customer
# who appears on several rows for one date is counted several times. If unique
# visitors per day is what is wanted, nunique() would be needed - confirm intent.
rows_per_date = data.groupby('Date').count()
data_count = rows_per_date.rename(columns={'Customer_Id': 'People_count'})
data_count
Out[7]:
In [8]:
# Copy the date index into an ordinary Date column and replace the index with
# a 0-based integer index named 'Index'.
# Caution: this cell is not safe to run twice - on a second run the index is
# already the integer one, so Date would be overwritten with integers.
data_count = (
    data_count
    .assign(Date=data_count.index, Index=range(0, len(data_count)))
    .set_index('Index')
)
data_count
Out[8]:
In [9]:
# Daily counts over time: Date on the x axis, People_count on the y axis,
# drawn with point markers (style=".").
data_count.plot(x='Date',y='People_count',style=".")
Out[9]:
In [10]:
# from_beg numbers the rows 1..N in their current order; it is the numeric
# regressor used by the models below.
row_total = len(data_count)
data_count = data_count.assign(from_beg=range(1, row_total + 1))
data_count
Out[10]:
In [11]:
# Ordinary least squares of People_count on from_beg (statsmodels formula API).
ols_spec = smf.ols(formula='People_count ~ from_beg', data=data_count)
lm = ols_spec.fit()
# Fitted intercept and slope.
lm.params
Out[11]:
In [12]:
# Two-row frame holding the smallest and largest from_beg values - the two
# endpoints needed to draw the fitted line.
day_numbers = data_count['from_beg']
x_new = pd.DataFrame({'from_beg': [day_numbers.min(), day_numbers.max()]})
x_new
Out[12]:
In [13]:
# Model predictions of People_count at the two from_beg values in x_new.
preds = lm.predict(x_new)
preds
Out[13]:
In [14]:
# Observed daily counts as a scatter plot; keep the axes it was drawn on.
scatter_ax = data_count.plot(kind='scatter', x='from_beg', y='People_count')
# Overlay the least-squares line through the two predicted endpoints.
scatter_ax.plot(x_new, preds, c='red', linewidth=2)
Out[14]:
In [15]:
# Regression report for the statsmodels fit held in `lm`.
# `lm` is rebound to a scikit-learn estimator (which has no summary()) in a
# later cell, so this cell must run before that one.
lm.summary()
Out[15]:
In [16]:
# Alternative summary rendering of the same statsmodels fit.
lm.summary2()
Out[16]:
In [47]:
# Same single-feature regression, this time with scikit-learn.
# scikit-learn expects 2-D inputs, so both columns become (n, 1) NumPy arrays.
# The reshape is done on `.values` because Series.reshape was deprecated and
# later removed from pandas; calling it on a Series raises AttributeError on
# current versions.
x = data_count['from_beg'].values.reshape(-1, 1)
y = data_count['People_count'].values.reshape(-1, 1)

# Note: this rebinds `lm`, which earlier cells use for the statsmodels result.
lm = LinearRegression()
lm.fit(x, y)
print(lm.coef_)
print(lm.intercept_)

# Predicted count for from_beg = 23.
lm.predict([[23]])
Out[47]:
In [18]:
# 1 where the date falls on a Monday, else 0 (weekday numbering: Monday=0, Sunday=6).
data_count["is_monday"] = (data_count["Date"].dt.weekday == 0).astype("int64")
In [19]:
# Inspect the Monday indicator column.
data_count['is_monday']
Out[19]:
In [20]:
# 0/1 indicator columns for Tuesday through Saturday
# (weekday numbering: Monday=0, Sunday=6). No Sunday column is created here.
weekday_of_date = data_count["Date"].dt.weekday
for flag_column, weekday_number in [
    ("is_tuesday", 1),
    ("is_wednesday", 2),
    ("is_thursday", 3),
    ("is_friday", 4),
    ("is_saturday", 5),
]:
    data_count[flag_column] = (weekday_of_date == weekday_number).astype("int64")
In [25]:
# Show the frame, now including the weekday indicator columns.
data_count
Out[25]:
In [23]:
# Multi-feature regression: day number plus Monday-Saturday indicators.
feature_cols = ['from_beg', 'is_monday', 'is_tuesday', 'is_wednesday', 'is_thursday', 'is_friday','is_saturday']
X = data_count[feature_cols]

# Target as an (n, 1) NumPy array. The reshape is done on `.values` because
# Series.reshape no longer exists in pandas. A leftover line that re-reshaped
# the lowercase `x` from another cell has been removed: its result was never
# used here, and it made this cell depend on that cell having run first.
y = data_count['People_count'].values.reshape(-1, 1)

# instantiate, fit
lm = LinearRegression()
lm.fit(X, y)
Out[23]:
In [26]:
# Show both the coefficients and the intercept. Written as two separate
# expression lines, only the last one (the intercept) was displayed and the
# coefficients were silently discarded.
lm.coef_, lm.intercept_
Out[26]:
In [32]:
# Predict for from_beg = 20 with the is_friday indicator set.
# The values are wrapped in a DataFrame labelled with feature_cols so each
# number is tied to its column name, matching how the model was fitted; a bare
# positional list triggers a feature-name warning in recent scikit-learn.
friday_day_20 = pd.DataFrame([[20, 0, 0, 0, 0, 1, 0]], columns=feature_cols)
lm.predict(friday_day_20)
Out[32]:
In [46]:
# Earliest date in the data, as a plain datetime.datetime at midnight
# (datetime.min.time() is 00:00:00).
# `datetime` comes from the notebook's top import cell rather than a
# mid-notebook import.
a = data_count['Date'].min()
datetime.combine(a.date(), datetime.min.time())
Out[46]: