In [47]:
%matplotlib inline
In [48]:
# pandas
import pandas as pd
from pandas import Series,DataFrame
# numpy
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#sns.set_style('whitegrid')
%matplotlib inline
# machine learning
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC,LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
In [49]:
# dataFrame
rossmann_df = pd.read_csv('./input/rossman/train.csv')
store_df = pd.read_csv('./input/rossman/store.csv')
test_df = pd.read_csv('./input/rossman/test.csv')
# Preview
rossmann_df.head()
Out[49]:
In [50]:
rossmann_df[rossmann_df['Store']==1115]
Out[50]:
In [51]:
test_df[test_df['Store']==1115]
Out[51]:
In [52]:
store_df.head()
Out[52]:
In [53]:
test_df.head()
Out[53]:
In [54]:
rossmann_df.info()
print('-'*20)
store_df.info()
print('-'*20)
test_df.info()
In [55]:
fig,(axis1)= plt.subplots(1,1,figsize=(15,4))
sns.countplot(x='Open',hue='DayOfWeek',data=rossmann_df,palette='husl',ax=axis1)
Out[55]:
In [56]:
test_df["DayOfWeek"] != 7
Out[56]:
In [57]:
test_df.describe()
Out[57]:
In [58]:
test_df["Open"][test_df["Open"]!=test_df["Open"]]
Out[58]:
In [59]:
test_df["Open"][test_df["Open"]!=test_df["Open"]]=(test_df['DayOfWeek'] != 7).astype(int)
In [60]:
test_df["Open"][test_df["Open"]!=test_df["Open"]]
Out[60]:
In [61]:
test_df["Open"][479]
Out[61]:
In [62]:
# Date
# Create new date column
rossmann_df['Year'] = rossmann_df['Date'].apply(lambda x : int(str(x)[:4]))
rossmann_df['Month'] = rossmann_df['Date'].apply(lambda x : int(str(x)[5:7]))
test_df['Year'] = test_df['Date'].apply(lambda x : int(str(x)[:4]))
test_df['Month'] = test_df['Date'].apply(lambda x : int(str(x)[5:7]))
# 简化Date 方便可视化
rossmann_df['Date'] = rossmann_df['Date'].apply(lambda x : str(x)[:7])
test_df['Date'] = test_df['Date'].apply(lambda x : str(x)[:7])
average_sales = rossmann_df.groupby('Date')['Sales'].mean()
pct_changle_sales = rossmann_df.groupby('Date')['Sales'].sum().pct_change()
print(average_sales)
print('='*20)
print(pct_changle_sales)
In [63]:
fig,(axis1,axis2) = plt.subplots(2,1,sharex=True,figsize=(15,8))
ax1 = average_sales.plot(legend=True,ax=axis1,marker='o',title='Average Sales')
ax1.set_xticks(range(len(average_sales)))
ax1.set_xticklabels(average_sales.index.tolist(),rotation=90)
ax2 = pct_changle_sales.plot(legend=True,ax=axis2,marker='o',rot=90,colormap='summer',title='Sales Percent Change')
In [64]:
rossmann_df.head()
Out[64]:
In [65]:
test_df.head()
Out[65]:
In [66]:
fig,(axis1,axis2) = plt.subplots(1,2,figsize=(15,4))
sns.barplot(ax=axis1,data=rossmann_df,x='Year',y='Sales')
sns.barplot(ax=axis2,data=rossmann_df,x='Year',y='Customers')
Out[66]:
In [67]:
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))
sns.barplot(x='Year', y='Sales', data=rossmann_df, ax=axis1)
sns.barplot(x='Year', y='Customers', data=rossmann_df, ax=axis2)
Out[67]:
In [ ]:
fig ,(axis1,axis2) = plt.subplots(2,1,figsize=(15,8))
sns.boxplot([rossmann_df['Customers']],whis=np.inf,ax=axis1)
average_customers = rossmann_df.groupby('Date')['Customers'].mean()
ax = average_customers.plot(legend=True,marker='o',ax=axis2)
ax.set_xticks(range(len(average_customers)))
xlabels = ax.set_xticklabels(average_customers.index.tolist(),rotation=90)
In [ ]:
fig,(axis1,axis2) = plt.subplots(1,2,figsize=(15,4))
sns.barplot(x='DayOfWeek',y='Sales',data=rossmann_df,order=[1,2,3,4,5,6,7],ax=axis1)
sns.barplot(x='DayOfWeek',y='Customers',data=rossmann_df,order=[1,2,3,4,5,6,7],ax=axis2)
In [ ]:
# Promo
fig,(axis1,axis2) = plt.subplots(1,2,figsize=(15,4))
sns.barplot(x='Promo',y='Sales',data=rossmann_df,ax=axis1)
sns.barplot(x='Promo',y='Customers',data=rossmann_df,ax=axis2)
In [ ]:
# StateHoliday
# StateHoliday column has values 0 & "0", So, we need to merge values with 0 to "0"
rossmann_df["StateHoliday"].loc[rossmann_df["StateHoliday"] == 0] = "0"
# test_df["StateHoliday"].loc[test_df["StateHoliday"] == 0] = "0"
# Plot
sns.countplot(x='StateHoliday', data=rossmann_df)
# Before
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))
sns.barplot(x='StateHoliday', y='Sales', data=rossmann_df, ax=axis1)
mask = (rossmann_df["StateHoliday"] != "0") & (rossmann_df["Sales"] > 0)
sns.barplot(x='StateHoliday', y='Sales', data=rossmann_df[mask], ax=axis2)# .... continue with StateHoliday
In [ ]:
# After
rossmann_df["StateHoliday"] = rossmann_df["StateHoliday"].map({0: 0, "0": 0, "a": 1, "b": 1, "c": 1})
test_df["StateHoliday"] = test_df["StateHoliday"].map({0: 0, "0": 0, "a": 1, "b": 1, "c": 1})
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))
sns.barplot(x='StateHoliday', y='Sales', data=rossmann_df, ax=axis1)
sns.barplot(x='StateHoliday', y='Customers', data=rossmann_df, ax=axis2)
In [ ]:
sns.countplot(x='SchoolHoliday', data=rossmann_df)
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))
sns.barplot(x='SchoolHoliday', y='Sales', data=rossmann_df, ax=axis1)
sns.barplot(x='SchoolHoliday', y='Customers', data=rossmann_df, ax=axis2)
In [ ]:
# Sales
fig, (axis1,axis2) = plt.subplots(2,1,figsize=(15,8))
# Plot max, min values, & 2nd, 3rd quartile
sns.boxplot([rossmann_df["Customers"]], whis=np.inf, ax=axis1)
# Plot sales values
# Notice that values with 0 is mostly because the store was closed
rossmann_df["Sales"].plot(kind='hist',bins=70,xlim=(0,15000),ax=axis2)
In [ ]:
# Using store_df
# Merge store_df with average store sales & customers
average_sales_customers = rossmann_df.groupby('Store')[["Sales", "Customers"]].mean()
sales_customers_df = DataFrame({'Store':average_sales_customers.index,
'Sales':average_sales_customers["Sales"], 'Customers': average_sales_customers["Customers"]},
columns=['Store', 'Sales', 'Customers'])
store_df = pd.merge(sales_customers_df, store_df, on='Store')
store_df.head()
In [ ]:
# StoreType
# Plot StoreType, & StoreType Vs average sales and customers
sns.countplot(x='StoreType', data=store_df, order=['a','b','c', 'd'])
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))
sns.barplot(x='StoreType', y='Sales', data=store_df, order=['a','b','c', 'd'],ax=axis1)
sns.barplot(x='StoreType', y='Customers', data=store_df, order=['a','b','c', 'd'], ax=axis2)
In [ ]:
# Assortment
# Plot Assortment, & Assortment Vs average sales and customers
sns.countplot(x='Assortment', data=store_df, order=['a','b','c'])
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))
sns.barplot(x='Assortment', y='Sales', data=store_df, order=['a','b','c'], ax=axis1)
sns.barplot(x='Assortment', y='Customers', data=store_df, order=['a','b','c'], ax=axis2)
In [ ]:
# Promo2
# Plot Promo2, & Promo2 Vs average sales and customers
sns.countplot(x='Promo2', data=store_df)
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))
sns.barplot(x='Promo2', y='Sales', data=store_df, ax=axis1)
sns.barplot(x='Promo2', y='Customers', data=store_df, ax=axis2)
In [ ]:
store_df.head()
In [ ]:
# CompetitionDistance
# fill NaN values
store_df["CompetitionDistance"].fillna(store_df["CompetitionDistance"].median())
# Plot CompetitionDistance Vs Sales
store_df.plot(kind='scatter',x='CompetitionDistance',y='Sales',figsize=(15,4))
store_df.plot(kind='kde',x='CompetitionDistance',y='Sales',figsize=(15,4))
In [ ]:
# CompetitionDistance
# fill NaN values
store_df["CompetitionDistance"].fillna(store_df["CompetitionDistance"].median())
# Plot CompetitionDistance Vs Sales
store_df.plot(kind='scatter',x='CompetitionDistance',y='Sales',figsize=(15,4))
store_df.plot(kind='kde',x='CompetitionDistance',y='Sales',figsize=(15,4))
In [ ]:
# What happened to the average sales of a store over time when competition started?
# Example: the average sales for store_id = 6 has dramatically decreased since the competition started
store_id = 6
store_data = rossmann_df[rossmann_df["Store"] == store_id]
average_store_sales = store_data.groupby('Date')["Sales"].mean()
# Get year, and month when Competition started
y = store_df["CompetitionOpenSinceYear"].loc[store_df["Store"] == store_id].values[0]
m = store_df["CompetitionOpenSinceMonth"].loc[store_df["Store"] == store_id].values[0]
# Plot
ax = average_store_sales.plot(legend=True,figsize=(15,4),marker='o')
ax.set_xticks(range(len(average_store_sales)))
ax.set_xticklabels(average_store_sales.index.tolist(), rotation=90)
# Since all data of store sales given in rossmann_df starts with year=2013 till 2015,
# So, we need to check if year>=2013 and y & m aren't NaN values.
if y >= 2013 and y == y and m == m:
plt.axvline(x=((y-2013) * 12) + (m - 1), linewidth=3, color='grey')
In [ ]:
# Risk Analysis
# Analyze the risk of a store; Risk(std) Vs Expected(mean)
# .... countiue using store_data
store_average = store_data["Sales"].mean()
store_std = store_data["Sales"].std()
# Plot
plt.scatter(store_average, store_std,alpha = 0.5,s =np.pi*20)
# Get min & max mean and std of store sales
# Remember that store_df["Sales"] has the average sales for a store
std_sales = rossmann_df.groupby('Store')["Sales"].std()
min_average = store_df["Sales"].min()
max_average = store_df["Sales"].max()
min_std = std_sales.min()
max_std = std_sales.max()
# Set the x and y limits of the plot
plt.ylim([min_std, max_std])
plt.xlim([min_average, max_average])
# Set the plot axis titles
plt.xlabel('Expected Sales')
plt.ylabel('Risk')
# Set label
label, x, y = "Store {}".format(store_id), store_average, store_std
plt.annotate(
label,
xy = (x, y), xytext = (50, 50),
textcoords = 'offset points', ha = 'right', va = 'bottom',
arrowprops = dict(arrowstyle = '-', connectionstyle = 'arc3,rad=-0.3'))
In [ ]:
# Correlation
# Visualize the Correlation between stores
store_piv = pd.pivot_table(rossmann_df,values='Sales', index='Date', columns=['Store'],aggfunc='sum')
store_pct_chage = store_piv.pct_change().dropna()
store_piv.head()
In [ ]:
# .... continue Correlation
# Plot correlation between range of stores
start_store = 1
end_store = 5
fig, (axis1) = plt.subplots(1,1,figsize=(15,5))
# using summation of sales values for each store
sns.heatmap(store_piv[list(range(start_store, end_store+1))].corr(),annot=True,linewidths=2)
In [ ]:
# using percent change for each store
# sns.heatmap(store_pct_chage[list(range(start_store, end_store+1))].corr(),annot=True,linewidths=2)# Notice that test_df has only year=2015, and months 8 & 9
# drop Year and Month
rossmann_df.drop(["Year", "Month"], axis=1, inplace=True)
test_df.drop(["Year", "Month"], axis=1, inplace=True)
# Create dummy varibales for DayOfWeek
day_dummies_rossmann = pd.get_dummies(rossmann_df['DayOfWeek'], prefix='Day')
day_dummies_rossmann.drop(['Day_7'], axis=1, inplace=True)
day_dummies_test = pd.get_dummies(test_df['DayOfWeek'],prefix='Day')
day_dummies_test.drop(['Day_7'], axis=1, inplace=True)
rossmann_df = rossmann_df.join(day_dummies_rossmann)
test_df = test_df.join(day_dummies_test)
rossmann_df.drop(['DayOfWeek'], axis=1,inplace=True)
test_df.drop(['DayOfWeek'], axis=1,inplace=True)
In [ ]:
# remove all rows(store,date) that were closed
rossmann_df = rossmann_df[rossmann_df["Open"] != 0]
# drop unnecessary columns, these columns won't be useful in prediction
rossmann_df.drop(["Open","Customers", "Date"], axis=1, inplace=True)
In [ ]:
# save ids of closed stores, because we will assign their sales value to 0 later(see below)
closed_store_ids = test_df["Id"][test_df["Open"] == 0].values
# remove all rows(store,date) that were closed
test_df = test_df[test_df["Open"] != 0]
# drop unnecessary columns, these columns won't be useful in prediction
test_df.drop(['Open', 'Date'], axis=1,inplace=True)
In [ ]:
# Loop through each store,
# train the model using the data of current store, and predict it's sales values.
rossmann_dic = dict(list(rossmann_df.groupby('Store')))
print('rossmann_dict:',rossmann_dic)
test_dic = dict(list(test_df.groupby('Store')))
print('test_dict:',test_dic)
submission = Series()
scores = []
for i in test_dic:
# current store
store = rossmann_dic[i]
# define training and testing sets
X_train = store.drop(["Sales","Store"],axis=1)
Y_train = store["Sales"]
X_test = test_dic[i].copy()
store_ids = X_test["Id"]
X_test.drop(["Id","Store"], axis=1,inplace=True)
# Linear Regression
lreg = LinearRegression()
lreg.fit(X_train, Y_train)
Y_pred = lreg.predict(X_test)
scores.append(lreg.score(X_train, Y_train))
# Xgboost
# params = {"objective": "reg:linear", "max_depth": 10}
# T_train_xgb = xgb.DMatrix(X_train, Y_train)
# X_test_xgb = xgb.DMatrix(X_test)
# gbm = xgb.train(params, T_train_xgb, 100)
# Y_pred = gbm.predict(X_test_xgb)
# append predicted values of current store to submission
submission = submission.append(Series(Y_pred, index=store_ids))
# append rows(store,date) that were closed, and assign their sales value to 0
submission = submission.append(Series(0, index=closed_store_ids))
# save to csv file
submission = pd.DataFrame({ "Id": submission.index, "Sales": submission.values})
submission.to_csv('rossmann.csv', index=False)