In [14]:
# Imports

# pandas
import pandas as pd
from pandas import Series,DataFrame

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

# machine learning
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
# dataFrame
rossmann_df = pd.read_csv('./input/rossman/train.csv')
store_df = pd.read_csv('./input/rossman/store.csv')
test_df = pd.read_csv('./input/rossman/test.csv')
# Preview

test_df["Open"][test_df["Open"] != test_df["Open"]] = (test_df["DayOfWeek"] != 7).astype(int)

# Date

# Create Year and Month columns
rossmann_df['Year']  = rossmann_df['Date'].apply(lambda x: int(str(x)[:4]))
rossmann_df['Month'] = rossmann_df['Date'].apply(lambda x: int(str(x)[5:7]))

test_df['Year']  = test_df['Date'].apply(lambda x: int(str(x)[:4]))
test_df['Month'] = test_df['Date'].apply(lambda x: int(str(x)[5:7]))

# Assign Date column to Date(Year-Month) instead of (Year-Month-Day)
# this column will be useful in analysis and visualization
rossmann_df['Date'] = rossmann_df['Date'].apply(lambda x: (str(x)[:7]))
test_df['Date']     = test_df['Date'].apply(lambda x: (str(x)[:7]))
# StateHoliday


rossmann_df["StateHoliday"].loc[rossmann_df["StateHoliday"] == 0] = "0"
# .... continue with StateHoliday

# After
rossmann_df["StateHoliday"] = rossmann_df["StateHoliday"].map({0: 0, "0": 0, "a": 1, "b": 1, "c": 1})
test_df["StateHoliday"]     = test_df["StateHoliday"].map({0: 0, "0": 0, "a": 1, "b": 1, "c": 1})

# Using store_df

# Merge store_df with average store sales & customers
average_sales_customers = rossmann_df.groupby('Store')[["Sales", "Customers"]].mean()

sales_customers_df = DataFrame({'Store':average_sales_customers.index,
                      'Sales':average_sales_customers["Sales"], 'Customers': average_sales_customers["Customers"]}, 
                      columns=['Store', 'Sales', 'Customers'])
store_df = pd.merge(sales_customers_df, store_df, on='Store')

#print(store_df["CompetitionDistance"])

store_df["CompetitionDistance"].fillna(store_df["CompetitionDistance"].median())

# Notice that test_df has only year=2015, and months 8 & 9

# drop Year and Month
rossmann_df.drop(["Year", "Month"], axis=1, inplace=True)
test_df.drop(["Year", "Month"], axis=1, inplace=True)

# Create dummy varibales for DayOfWeek
day_dummies_rossmann  = pd.get_dummies(rossmann_df['DayOfWeek'], prefix='Day')
print('day_dummies_rossmann\n',day_dummies_rossmann.head())
day_dummies_rossmann.drop(['Day_7'], axis=1, inplace=True)
print('day_dummies_rossmann\n',day_dummies_rossmann.head())
day_dummies_test  = pd.get_dummies(test_df['DayOfWeek'],prefix='Day')
day_dummies_test.drop(['Day_7'], axis=1, inplace=True)

rossmann_df = rossmann_df.join(day_dummies_rossmann)
print('rossmann_df\n',rossmann_df.head())
test_df     = test_df.join(day_dummies_test)

rossmann_df.drop(['DayOfWeek'], axis=1,inplace=True)
test_df.drop(['DayOfWeek'], axis=1,inplace=True)

# remove all rows(store,date) that were closed
rossmann_df = rossmann_df[rossmann_df["Open"] != 0]

# drop unnecessary columns, these columns won't be useful in prediction
rossmann_df.drop(["Open","Customers", "Date"], axis=1, inplace=True)

# save ids of closed stores, because we will assign their sales value to 0 later(see below)
closed_store_ids = test_df["Id"][test_df["Open"] == 0].values

# remove all rows(store,date) that were closed
test_df = test_df[test_df["Open"] != 0]

# drop unnecessary columns, these columns won't be useful in prediction
test_df.drop(['Open', 'Date'], axis=1,inplace=True)


# Loop through each store, 
# train the model using the data of current store, and predict it's sales values.

rossmann_dic = dict(list(rossmann_df.groupby('Store')))
test_dic     = dict(list(test_df.groupby('Store')))
submission   = Series()
scores       = []

for i in test_dic:
    
    # current store
    store = rossmann_dic[i]
    
    # define training and testing sets
    X_train = store.drop(["Sales","Store"],axis=1)
    Y_train = store["Sales"]
    X_test  = test_dic[i].copy()
    
    store_ids = X_test["Id"]
    X_test.drop(["Id","Store"], axis=1,inplace=True)
    
    # Linear Regression
    lreg = LinearRegression()
    lreg.fit(X_train, Y_train)
    Y_pred = lreg.predict(X_test)
    scores.append(lreg.score(X_train, Y_train))

    # Xgboost
    # params = {"objective": "reg:linear",  "max_depth": 10}
    # T_train_xgb = xgb.DMatrix(X_train, Y_train)
    # X_test_xgb  = xgb.DMatrix(X_test)
    # gbm = xgb.train(params, T_train_xgb, 100)
    # Y_pred = gbm.predict(X_test_xgb)
    
    # append predicted values of current store to submission
    submission = submission.append(Series(Y_pred, index=store_ids))

# append rows(store,date) that were closed, and assign their sales value to 0
submission = submission.append(Series(0, index=closed_store_ids))

# save to csv file
submission = pd.DataFrame({ "Id": submission.index, "Sales": submission.values})
submission.to_csv('rossmann.csv', index=False)


D:\soft\anaconda3\lib\site-packages\IPython\core\interactiveshell.py:2717: DtypeWarning: Columns (7) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
D:\soft\anaconda3\lib\site-packages\ipykernel_launcher.py:27: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
D:\soft\anaconda3\lib\site-packages\pandas\core\indexing.py:179: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
day_dummies_rossmann
    Day_1  Day_2  Day_3  Day_4  Day_5  Day_6  Day_7
0      0      0      0      0      1      0      0
1      0      0      0      0      1      0      0
2      0      0      0      0      1      0      0
3      0      0      0      0      1      0      0
4      0      0      0      0      1      0      0
day_dummies_rossmann
    Day_1  Day_2  Day_3  Day_4  Day_5  Day_6
0      0      0      0      0      1      0
1      0      0      0      0      1      0
2      0      0      0      0      1      0
3      0      0      0      0      1      0
4      0      0      0      0      1      0
rossmann_df
    Store  DayOfWeek     Date  Sales  Customers  Open  Promo  StateHoliday  \
0      1          5  2015-07   5263        555     1      1             0   
1      2          5  2015-07   6064        625     1      1             0   
2      3          5  2015-07   8314        821     1      1             0   
3      4          5  2015-07  13995       1498     1      1             0   
4      5          5  2015-07   4822        559     1      1             0   

   SchoolHoliday  Day_1  Day_2  Day_3  Day_4  Day_5  Day_6  
0              1      0      0      0      0      1      0  
1              1      0      0      0      0      1      0  
2              1      0      0      0      0      1      0  
3              1      0      0      0      0      1      0  
4              1      0      0      0      0      1      0  
              Sales    Customers
Store                           
1       3945.704883   467.646497
2       4122.991507   486.045648
3       5741.253715   620.286624
4       8021.769639  1100.057325
5       3867.110403   444.360934
6       4562.375796   525.990446
7       7356.902335   791.474522
8       4610.251592   547.799363
9       5426.816348   479.487261
10      4634.439490   494.332272
11      6683.955414   940.543524
12      6316.608280   746.508493
13      4124.773087   313.641161
14      4555.386412   522.200637
15      5553.619958   552.130573
16      6335.340764   703.158174
17      5259.541401   657.208068
18      5446.005308   592.128450
19      5346.251592   516.579618
20      6334.489446   692.952507
21      4522.041401   425.916136
22      3695.918206   411.779683
23      4565.073248   398.957537
24      7763.773885   690.870488
25      8892.311040  1250.288747
26      5538.498938   461.623142
27      7838.438429   888.786624
28      4332.218684   475.791932
29      6090.403397   540.587049
30      4367.352442   641.006369
...             ...          ...
1086    6365.240977   695.715499
1087    5228.500000   512.247346
1088    4204.866242   401.199575
1089    8402.582803   829.093418
1090    5343.307856   650.546709
1091    6888.168790   580.252654
1092    9814.584433   955.994723
1093    6904.761146   719.926752
1094    3753.922164   306.767810
1095    3969.285563   562.701699
1096    4512.348195   523.520170
1097    9744.599788  2420.921444
1098    4354.997877   587.133758
1099    7340.338641   928.975584
1100    4423.963907   546.181529
1101    8299.635881   822.659236
1102    5744.054090   579.721636
1103    4161.639066   319.749469
1104    4481.546174   353.845646
1105    3774.736730   421.781316
1106    4073.217622   464.085987
1107    5121.849604   549.465699
1108    4758.597665   488.755839
1109    4043.167546   371.179420
1110    3766.970276   449.000000
1111    4342.968153   373.548832
1112    8465.280255   693.498938
1113    5516.180467   596.763270
1114   17200.196391  2664.057325
1115    5225.296178   358.687898

[1115 rows x 2 columns]

In [15]:
store_df.head()


Out[15]:
Store Sales Customers StoreType Assortment CompetitionDistance CompetitionOpenSinceMonth CompetitionOpenSinceYear Promo2 Promo2SinceWeek Promo2SinceYear PromoInterval
0 1 3945.704883 467.646497 c a 1270.0 9.0 2008.0 0 NaN NaN NaN
1 2 4122.991507 486.045648 a a 570.0 11.0 2007.0 1 13.0 2010.0 Jan,Apr,Jul,Oct
2 3 5741.253715 620.286624 a a 14130.0 12.0 2006.0 1 14.0 2011.0 Jan,Apr,Jul,Oct
3 4 8021.769639 1100.057325 c c 620.0 9.0 2009.0 0 NaN NaN NaN
4 5 3867.110403 444.360934 a a 29910.0 4.0 2015.0 0 NaN NaN NaN

In [16]:
rossmann_df.head()


Out[16]:
Store Sales Promo StateHoliday SchoolHoliday Day_1 Day_2 Day_3 Day_4 Day_5 Day_6
0 1 5263 1 0 1 0 0 0 0 1 0
1 2 6064 1 0 1 0 0 0 0 1 0
2 3 8314 1 0 1 0 0 0 0 1 0
3 4 13995 1 0 1 0 0 0 0 1 0
4 5 4822 1 0 1 0 0 0 0 1 0