notebook.community

Edit and run



In [47]:

    
%matplotlib inline



In [48]:

    
# pandas
import pandas as pd
from pandas import Series,DataFrame
# numpy 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#sns.set_style('whitegrid')
%matplotlib inline

# machine learning
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC,LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.naive_bayes import GaussianNB



In [49]:

    
# dataFrame
rossmann_df = pd.read_csv('./input/rossman/train.csv')
store_df = pd.read_csv('./input/rossman/store.csv')
test_df = pd.read_csv('./input/rossman/test.csv')
# Preview
rossmann_df.head()









    



D:\soft\anaconda3\lib\site-packages\IPython\core\interactiveshell.py:2717: DtypeWarning: Columns (7) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)






    Out[49]:







  
    
      
      Store
      DayOfWeek
      Date
      Sales
      Customers
      Open
      Promo
      StateHoliday
      SchoolHoliday
    
  
  
    
      0
      1
      5
      2015-07-31
      5263
      555
      1
      1
      0
      1
    
    
      1
      2
      5
      2015-07-31
      6064
      625
      1
      1
      0
      1
    
    
      2
      3
      5
      2015-07-31
      8314
      821
      1
      1
      0
      1
    
    
      3
      4
      5
      2015-07-31
      13995
      1498
      1
      1
      0
      1
    
    
      4
      5
      5
      2015-07-31
      4822
      559
      1
      1
      0
      1



In [50]:

    
rossmann_df[rossmann_df['Store']==1115]









    Out[50]:







  
    
      
      Store
      DayOfWeek
      Date
      Sales
      Customers
      Open
      Promo
      StateHoliday
      SchoolHoliday
    
  
  
    
      1114
      1115
      5
      2015-07-31
      8680
      538
      1
      1
      0
      1
    
    
      2229
      1115
      4
      2015-07-30
      8405
      502
      1
      1
      0
      1
    
    
      3344
      1115
      3
      2015-07-29
      7661
      473
      1
      1
      0
      1
    
    
      4459
      1115
      2
      2015-07-28
      8093
      500
      1
      1
      0
      1
    
    
      5574
      1115
      1
      2015-07-27
      10712
      608
      1
      1
      0
      1
    
    
      6689
      1115
      7
      2015-07-26
      0
      0
      0
      0
      0
      0
    
    
      7804
      1115
      6
      2015-07-25
      6897
      478
      1
      0
      0
      0
    
    
      8919
      1115
      5
      2015-07-24
      5816
      417
      1
      0
      0
      0
    
    
      10034
      1115
      4
      2015-07-23
      6150
      435
      1
      0
      0
      0
    
    
      11149
      1115
      3
      2015-07-22
      5342
      382
      1
      0
      0
      0
    
    
      12264
      1115
      2
      2015-07-21
      5074
      352
      1
      0
      0
      0
    
    
      13379
      1115
      1
      2015-07-20
      6083
      440
      1
      0
      0
      0
    
    
      14494
      1115
      7
      2015-07-19
      0
      0
      0
      0
      0
      0
    
    
      15609
      1115
      6
      2015-07-18
      7264
      504
      1
      0
      0
      0
    
    
      16724
      1115
      5
      2015-07-17
      7874
      519
      1
      1
      0
      0
    
    
      17839
      1115
      4
      2015-07-16
      6590
      455
      1
      1
      0
      0
    
    
      18954
      1115
      3
      2015-07-15
      6039
      450
      1
      1
      0
      0
    
    
      20069
      1115
      2
      2015-07-14
      7562
      488
      1
      1
      0
      0
    
    
      21184
      1115
      1
      2015-07-13
      10598
      641
      1
      1
      0
      0
    
    
      22299
      1115
      7
      2015-07-12
      0
      0
      0
      0
      0
      0
    
    
      23414
      1115
      6
      2015-07-11
      7164
      483
      1
      0
      0
      0
    
    
      24529
      1115
      5
      2015-07-10
      5844
      404
      1
      0
      0
      0
    
    
      25644
      1115
      4
      2015-07-09
      5686
      416
      1
      0
      0
      0
    
    
      26759
      1115
      3
      2015-07-08
      5900
      413
      1
      0
      0
      0
    
    
      27874
      1115
      2
      2015-07-07
      5138
      362
      1
      0
      0
      0
    
    
      28989
      1115
      1
      2015-07-06
      6501
      453
      1
      0
      0
      0
    
    
      30104
      1115
      7
      2015-07-05
      0
      0
      0
      0
      0
      0
    
    
      31219
      1115
      6
      2015-07-04
      6543
      466
      1
      0
      0
      0
    
    
      32334
      1115
      5
      2015-07-03
      7412
      478
      1
      1
      0
      0
    
    
      33449
      1115
      4
      2015-07-02
      6858
      452
      1
      1
      0
      0
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      984874
      1115
      3
      2013-01-30
      3813
      287
      1
      0
      0
      0
    
    
      985989
      1115
      2
      2013-01-29
      3397
      277
      1
      0
      0
      0
    
    
      987104
      1115
      1
      2013-01-28
      3615
      271
      1
      0
      0
      0
    
    
      988219
      1115
      7
      2013-01-27
      0
      0
      0
      0
      0
      0
    
    
      989334
      1115
      6
      2013-01-26
      4896
      394
      1
      0
      0
      0
    
    
      990449
      1115
      5
      2013-01-25
      5082
      373
      1
      1
      0
      0
    
    
      991564
      1115
      4
      2013-01-24
      5650
      366
      1
      1
      0
      0
    
    
      992679
      1115
      3
      2013-01-23
      5176
      336
      1
      1
      0
      0
    
    
      993794
      1115
      2
      2013-01-22
      5388
      329
      1
      1
      0
      0
    
    
      994909
      1115
      1
      2013-01-21
      3723
      243
      1
      1
      0
      0
    
    
      996024
      1115
      7
      2013-01-20
      0
      0
      0
      0
      0
      0
    
    
      997139
      1115
      6
      2013-01-19
      5207
      369
      1
      0
      0
      0
    
    
      998254
      1115
      5
      2013-01-18
      4287
      296
      1
      0
      0
      0
    
    
      999369
      1115
      4
      2013-01-17
      3039
      244
      1
      0
      0
      0
    
    
      1000484
      1115
      3
      2013-01-16
      3064
      237
      1
      0
      0
      0
    
    
      1001599
      1115
      2
      2013-01-15
      3528
      277
      1
      0
      0
      0
    
    
      1002714
      1115
      1
      2013-01-14
      3158
      252
      1
      0
      0
      0
    
    
      1003829
      1115
      7
      2013-01-13
      0
      0
      0
      0
      0
      0
    
    
      1004944
      1115
      6
      2013-01-12
      4497
      350
      1
      0
      0
      0
    
    
      1006059
      1115
      5
      2013-01-11
      5142
      351
      1
      1
      0
      1
    
    
      1007174
      1115
      4
      2013-01-10
      5007
      339
      1
      1
      0
      1
    
    
      1008289
      1115
      3
      2013-01-09
      4649
      324
      1
      1
      0
      1
    
    
      1009404
      1115
      2
      2013-01-08
      5243
      341
      1
      1
      0
      1
    
    
      1010519
      1115
      1
      2013-01-07
      6905
      471
      1
      1
      0
      1
    
    
      1011634
      1115
      7
      2013-01-06
      0
      0
      0
      0
      0
      1
    
    
      1012749
      1115
      6
      2013-01-05
      4771
      339
      1
      0
      0
      1
    
    
      1013864
      1115
      5
      2013-01-04
      4540
      326
      1
      0
      0
      1
    
    
      1014979
      1115
      4
      2013-01-03
      4297
      300
      1
      0
      0
      1
    
    
      1016094
      1115
      3
      2013-01-02
      3697
      305
      1
      0
      0
      1
    
    
      1017208
      1115
      2
      2013-01-01
      0
      0
      0
      0
      a
      1
    
  

942 rows × 9 columns



In [51]:

    
test_df[test_df['Store']==1115]









    Out[51]:







  
    
      
      Id
      Store
      DayOfWeek
      Date
      Open
      Promo
      StateHoliday
      SchoolHoliday
    
  
  
    
      855
      856
      1115
      4
      2015-09-17
      1.0
      1
      0
      0
    
    
      1711
      1712
      1115
      3
      2015-09-16
      1.0
      1
      0
      0
    
    
      2567
      2568
      1115
      2
      2015-09-15
      1.0
      1
      0
      0
    
    
      3423
      3424
      1115
      1
      2015-09-14
      1.0
      1
      0
      0
    
    
      4279
      4280
      1115
      7
      2015-09-13
      0.0
      0
      0
      0
    
    
      5135
      5136
      1115
      6
      2015-09-12
      1.0
      0
      0
      0
    
    
      5991
      5992
      1115
      5
      2015-09-11
      1.0
      0
      0
      0
    
    
      6847
      6848
      1115
      4
      2015-09-10
      1.0
      0
      0
      0
    
    
      7703
      7704
      1115
      3
      2015-09-09
      1.0
      0
      0
      0
    
    
      8559
      8560
      1115
      2
      2015-09-08
      1.0
      0
      0
      0
    
    
      9415
      9416
      1115
      1
      2015-09-07
      1.0
      0
      0
      0
    
    
      10271
      10272
      1115
      7
      2015-09-06
      0.0
      0
      0
      0
    
    
      11127
      11128
      1115
      6
      2015-09-05
      1.0
      0
      0
      0
    
    
      11983
      11984
      1115
      5
      2015-09-04
      1.0
      1
      0
      1
    
    
      12839
      12840
      1115
      4
      2015-09-03
      1.0
      1
      0
      1
    
    
      13695
      13696
      1115
      3
      2015-09-02
      1.0
      1
      0
      1
    
    
      14551
      14552
      1115
      2
      2015-09-01
      1.0
      1
      0
      1
    
    
      15407
      15408
      1115
      1
      2015-08-31
      1.0
      1
      0
      1
    
    
      16263
      16264
      1115
      7
      2015-08-30
      0.0
      0
      0
      1
    
    
      17119
      17120
      1115
      6
      2015-08-29
      1.0
      0
      0
      1
    
    
      17975
      17976
      1115
      5
      2015-08-28
      1.0
      0
      0
      1
    
    
      18831
      18832
      1115
      4
      2015-08-27
      1.0
      0
      0
      1
    
    
      19687
      19688
      1115
      3
      2015-08-26
      1.0
      0
      0
      1
    
    
      20543
      20544
      1115
      2
      2015-08-25
      1.0
      0
      0
      1
    
    
      21399
      21400
      1115
      1
      2015-08-24
      1.0
      0
      0
      1
    
    
      22255
      22256
      1115
      7
      2015-08-23
      0.0
      0
      0
      1
    
    
      23111
      23112
      1115
      6
      2015-08-22
      1.0
      0
      0
      1
    
    
      23967
      23968
      1115
      5
      2015-08-21
      1.0
      1
      0
      1
    
    
      24823
      24824
      1115
      4
      2015-08-20
      1.0
      1
      0
      1
    
    
      25679
      25680
      1115
      3
      2015-08-19
      1.0
      1
      0
      1
    
    
      26535
      26536
      1115
      2
      2015-08-18
      1.0
      1
      0
      1
    
    
      27391
      27392
      1115
      1
      2015-08-17
      1.0
      1
      0
      1
    
    
      28247
      28248
      1115
      7
      2015-08-16
      0.0
      0
      0
      1
    
    
      29103
      29104
      1115
      6
      2015-08-15
      1.0
      0
      0
      1
    
    
      29959
      29960
      1115
      5
      2015-08-14
      1.0
      0
      0
      1
    
    
      30815
      30816
      1115
      4
      2015-08-13
      1.0
      0
      0
      1
    
    
      31671
      31672
      1115
      3
      2015-08-12
      1.0
      0
      0
      1
    
    
      32527
      32528
      1115
      2
      2015-08-11
      1.0
      0
      0
      1
    
    
      33383
      33384
      1115
      1
      2015-08-10
      1.0
      0
      0
      1
    
    
      34239
      34240
      1115
      7
      2015-08-09
      0.0
      0
      0
      1
    
    
      35095
      35096
      1115
      6
      2015-08-08
      1.0
      0
      0
      1
    
    
      35951
      35952
      1115
      5
      2015-08-07
      1.0
      1
      0
      1
    
    
      36807
      36808
      1115
      4
      2015-08-06
      1.0
      1
      0
      1
    
    
      37663
      37664
      1115
      3
      2015-08-05
      1.0
      1
      0
      1
    
    
      38519
      38520
      1115
      2
      2015-08-04
      1.0
      1
      0
      1
    
    
      39375
      39376
      1115
      1
      2015-08-03
      1.0
      1
      0
      1
    
    
      40231
      40232
      1115
      7
      2015-08-02
      0.0
      0
      0
      1
    
    
      41087
      41088
      1115
      6
      2015-08-01
      1.0
      0
      0
      1



In [52]:

    
store_df.head()









    Out[52]:







  
    
      
      Store
      StoreType
      Assortment
      CompetitionDistance
      CompetitionOpenSinceMonth
      CompetitionOpenSinceYear
      Promo2
      Promo2SinceWeek
      Promo2SinceYear
      PromoInterval
    
  
  
    
      0
      1
      c
      a
      1270.0
      9.0
      2008.0
      0
      NaN
      NaN
      NaN
    
    
      1
      2
      a
      a
      570.0
      11.0
      2007.0
      1
      13.0
      2010.0
      Jan,Apr,Jul,Oct
    
    
      2
      3
      a
      a
      14130.0
      12.0
      2006.0
      1
      14.0
      2011.0
      Jan,Apr,Jul,Oct
    
    
      3
      4
      c
      c
      620.0
      9.0
      2009.0
      0
      NaN
      NaN
      NaN
    
    
      4
      5
      a
      a
      29910.0
      4.0
      2015.0
      0
      NaN
      NaN
      NaN



In [53]:

    
test_df.head()









    Out[53]:







  
    
      
      Id
      Store
      DayOfWeek
      Date
      Open
      Promo
      StateHoliday
      SchoolHoliday
    
  
  
    
      0
      1
      1
      4
      2015-09-17
      1.0
      1
      0
      0
    
    
      1
      2
      3
      4
      2015-09-17
      1.0
      1
      0
      0
    
    
      2
      3
      7
      4
      2015-09-17
      1.0
      1
      0
      0
    
    
      3
      4
      8
      4
      2015-09-17
      1.0
      1
      0
      0
    
    
      4
      5
      9
      4
      2015-09-17
      1.0
      1
      0
      0



In [54]:

    
rossmann_df.info()
print('-'*20)
store_df.info()
print('-'*20)
test_df.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 9 columns):
Store            1017209 non-null int64
DayOfWeek        1017209 non-null int64
Date             1017209 non-null object
Sales            1017209 non-null int64
Customers        1017209 non-null int64
Open             1017209 non-null int64
Promo            1017209 non-null int64
StateHoliday     1017209 non-null object
SchoolHoliday    1017209 non-null int64
dtypes: int64(7), object(2)
memory usage: 69.8+ MB
--------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1115 entries, 0 to 1114
Data columns (total 10 columns):
Store                        1115 non-null int64
StoreType                    1115 non-null object
Assortment                   1115 non-null object
CompetitionDistance          1112 non-null float64
CompetitionOpenSinceMonth    761 non-null float64
CompetitionOpenSinceYear     761 non-null float64
Promo2                       1115 non-null int64
Promo2SinceWeek              571 non-null float64
Promo2SinceYear              571 non-null float64
PromoInterval                571 non-null object
dtypes: float64(5), int64(2), object(3)
memory usage: 87.2+ KB
--------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41088 entries, 0 to 41087
Data columns (total 8 columns):
Id               41088 non-null int64
Store            41088 non-null int64
DayOfWeek        41088 non-null int64
Date             41088 non-null object
Open             41077 non-null float64
Promo            41088 non-null int64
StateHoliday     41088 non-null object
SchoolHoliday    41088 non-null int64
dtypes: float64(1), int64(5), object(2)
memory usage: 2.5+ MB



In [55]:

    
fig,(axis1)= plt.subplots(1,1,figsize=(15,4))
sns.countplot(x='Open',hue='DayOfWeek',data=rossmann_df,palette='husl',ax=axis1)









    Out[55]:





<matplotlib.axes._subplots.AxesSubplot at 0x237363c8>



In [56]:

    
test_df["DayOfWeek"] != 7









    Out[56]:





0        True
1        True
2        True
3        True
4        True
5        True
6        True
7        True
8        True
9        True
10       True
11       True
12       True
13       True
14       True
15       True
16       True
17       True
18       True
19       True
20       True
21       True
22       True
23       True
24       True
25       True
26       True
27       True
28       True
29       True
         ... 
41058    True
41059    True
41060    True
41061    True
41062    True
41063    True
41064    True
41065    True
41066    True
41067    True
41068    True
41069    True
41070    True
41071    True
41072    True
41073    True
41074    True
41075    True
41076    True
41077    True
41078    True
41079    True
41080    True
41081    True
41082    True
41083    True
41084    True
41085    True
41086    True
41087    True
Name: DayOfWeek, Length: 41088, dtype: bool



In [57]:

    
test_df.describe()









    Out[57]:







  
    
      
      Id
      Store
      DayOfWeek
      Open
      Promo
      SchoolHoliday
    
  
  
    
      count
      41088.000000
      41088.000000
      41088.000000
      41077.000000
      41088.000000
      41088.000000
    
    
      mean
      20544.500000
      555.899533
      3.979167
      0.854322
      0.395833
      0.443487
    
    
      std
      11861.228267
      320.274496
      2.015481
      0.352787
      0.489035
      0.496802
    
    
      min
      1.000000
      1.000000
      1.000000
      0.000000
      0.000000
      0.000000
    
    
      25%
      10272.750000
      279.750000
      2.000000
      1.000000
      0.000000
      0.000000
    
    
      50%
      20544.500000
      553.500000
      4.000000
      1.000000
      0.000000
      0.000000
    
    
      75%
      30816.250000
      832.250000
      6.000000
      1.000000
      1.000000
      1.000000
    
    
      max
      41088.000000
      1115.000000
      7.000000
      1.000000
      1.000000
      1.000000



In [58]:

    
test_df["Open"][test_df["Open"]!=test_df["Open"]]









    Out[58]:





479     NaN
1335    NaN
2191    NaN
3047    NaN
4759    NaN
5615    NaN
6471    NaN
7327    NaN
8183    NaN
9039    NaN
10751   NaN
Name: Open, dtype: float64



In [59]:

    
test_df["Open"][test_df["Open"]!=test_df["Open"]]=(test_df['DayOfWeek'] != 7).astype(int)









    



D:\soft\anaconda3\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.



In [60]:

    
test_df["Open"][test_df["Open"]!=test_df["Open"]]









    Out[60]:





Series([], Name: Open, dtype: float64)



In [61]:

    
test_df["Open"][479]









    Out[61]:





1.0



In [62]:

    
# Date 
# Create new date column
rossmann_df['Year'] = rossmann_df['Date'].apply(lambda x : int(str(x)[:4]))
rossmann_df['Month'] = rossmann_df['Date'].apply(lambda x : int(str(x)[5:7]))
test_df['Year'] = test_df['Date'].apply(lambda x : int(str(x)[:4]))
test_df['Month'] = test_df['Date'].apply(lambda x : int(str(x)[5:7]))

# 简化Date 方便可视化
rossmann_df['Date'] = rossmann_df['Date'].apply(lambda x : str(x)[:7])
test_df['Date'] = test_df['Date'].apply(lambda x : str(x)[:7])

average_sales = rossmann_df.groupby('Date')['Sales'].mean()
pct_changle_sales = rossmann_df.groupby('Date')['Sales'].sum().pct_change()
print(average_sales)
print('='*20)
print(pct_changle_sales)









    



Date
2013-01    5211.555578
2013-02    5494.371397
2013-03    5820.349168
2013-04    5483.749836
2013-05    5364.127383
2013-06    5402.162960
2013-07    6042.062260
2013-08    5729.574049
2013-09    5322.988430
2013-10    5429.258788
2013-11    5864.601614
2013-12    6703.618140
2014-01    5431.875799
2014-02    5731.091512
2014-03    5584.257312
2014-04    5815.993333
2014-05    5632.670534
2014-06    5681.526188
2014-07    5999.403381
2014-08    5649.421252
2014-09    5865.104029
2014-10    5665.564878
2014-11    6179.249661
2014-12    6973.282456
2015-01    5752.747866
2015-02    5710.296541
2015-03    5949.130132
2015-04    5916.857578
2015-05    5472.122002
2015-06    6199.203976
2015-07    6142.705511
Name: Sales, dtype: float64
====================
Date
2013-01         NaN
2013-02   -0.047731
2013-03    0.172829
2013-04   -0.088224
2013-05    0.010792
2013-06   -0.025396
2013-07    0.155734
2013-08   -0.051719
2013-09   -0.100932
2013-10    0.053963
2013-11    0.045340
2013-12    0.181167
2014-01   -0.189710
2014-02   -0.047020
2014-03    0.078777
2014-04    0.007901
2014-05    0.000762
2014-06   -0.023864
2014-07   -0.085002
2014-08   -0.058336
2014-09    0.004688
2014-10   -0.001822
2014-11    0.055485
2014-12    0.166117
2015-01   -0.016212
2015-02   -0.103439
2015-03    0.153449
2015-04   -0.037508
2015-05   -0.044336
2015-06    0.096326
2015-07    0.023916
Name: Sales, dtype: float64



In [63]:

    
fig,(axis1,axis2) = plt.subplots(2,1,sharex=True,figsize=(15,8))
ax1 = average_sales.plot(legend=True,ax=axis1,marker='o',title='Average Sales')
ax1.set_xticks(range(len(average_sales)))
ax1.set_xticklabels(average_sales.index.tolist(),rotation=90)
ax2 = pct_changle_sales.plot(legend=True,ax=axis2,marker='o',rot=90,colormap='summer',title='Sales Percent Change')



In [64]:

    
rossmann_df.head()









    Out[64]:







  
    
      
      Store
      DayOfWeek
      Date
      Sales
      Customers
      Open
      Promo
      StateHoliday
      SchoolHoliday
      Year
      Month
    
  
  
    
      0
      1
      5
      2015-07
      5263
      555
      1
      1
      0
      1
      2015
      7
    
    
      1
      2
      5
      2015-07
      6064
      625
      1
      1
      0
      1
      2015
      7
    
    
      2
      3
      5
      2015-07
      8314
      821
      1
      1
      0
      1
      2015
      7
    
    
      3
      4
      5
      2015-07
      13995
      1498
      1
      1
      0
      1
      2015
      7
    
    
      4
      5
      5
      2015-07
      4822
      559
      1
      1
      0
      1
      2015
      7



In [65]:

    
test_df.head()









    Out[65]:







  
    
      
      Id
      Store
      DayOfWeek
      Date
      Open
      Promo
      StateHoliday
      SchoolHoliday
      Year
      Month
    
  
  
    
      0
      1
      1
      4
      2015-09
      1.0
      1
      0
      0
      2015
      9
    
    
      1
      2
      3
      4
      2015-09
      1.0
      1
      0
      0
      2015
      9
    
    
      2
      3
      7
      4
      2015-09
      1.0
      1
      0
      0
      2015
      9
    
    
      3
      4
      8
      4
      2015-09
      1.0
      1
      0
      0
      2015
      9
    
    
      4
      5
      9
      4
      2015-09
      1.0
      1
      0
      0
      2015
      9



In [66]:

    
fig,(axis1,axis2) = plt.subplots(1,2,figsize=(15,4))
sns.barplot(ax=axis1,data=rossmann_df,x='Year',y='Sales')
sns.barplot(ax=axis2,data=rossmann_df,x='Year',y='Customers')









    Out[66]:





<matplotlib.axes._subplots.AxesSubplot at 0x2559f0f0>



In [67]:

    
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))

sns.barplot(x='Year', y='Sales', data=rossmann_df, ax=axis1)
sns.barplot(x='Year', y='Customers', data=rossmann_df, ax=axis2)









    Out[67]:





<matplotlib.axes._subplots.AxesSubplot at 0x253c70b8>



In [ ]:

    
fig ,(axis1,axis2) = plt.subplots(2,1,figsize=(15,8))
sns.boxplot([rossmann_df['Customers']],whis=np.inf,ax=axis1)
average_customers = rossmann_df.groupby('Date')['Customers'].mean()
ax = average_customers.plot(legend=True,marker='o',ax=axis2)
ax.set_xticks(range(len(average_customers)))
xlabels = ax.set_xticklabels(average_customers.index.tolist(),rotation=90)



In [ ]:

    
fig,(axis1,axis2) = plt.subplots(1,2,figsize=(15,4))
sns.barplot(x='DayOfWeek',y='Sales',data=rossmann_df,order=[1,2,3,4,5,6,7],ax=axis1)
sns.barplot(x='DayOfWeek',y='Customers',data=rossmann_df,order=[1,2,3,4,5,6,7],ax=axis2)



In [ ]:

    
# Promo
fig,(axis1,axis2) = plt.subplots(1,2,figsize=(15,4))
sns.barplot(x='Promo',y='Sales',data=rossmann_df,ax=axis1)
sns.barplot(x='Promo',y='Customers',data=rossmann_df,ax=axis2)



In [ ]:

    
# StateHoliday

# StateHoliday column has values 0 & "0", So, we need to merge values with 0 to "0"
rossmann_df["StateHoliday"].loc[rossmann_df["StateHoliday"] == 0] = "0"
# test_df["StateHoliday"].loc[test_df["StateHoliday"] == 0] = "0"

# Plot
sns.countplot(x='StateHoliday', data=rossmann_df)

# Before
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))

sns.barplot(x='StateHoliday', y='Sales', data=rossmann_df, ax=axis1)

mask = (rossmann_df["StateHoliday"] != "0") & (rossmann_df["Sales"] > 0)
sns.barplot(x='StateHoliday', y='Sales', data=rossmann_df[mask], ax=axis2)# .... continue with StateHoliday



In [ ]:

    
# After
rossmann_df["StateHoliday"] = rossmann_df["StateHoliday"].map({0: 0, "0": 0, "a": 1, "b": 1, "c": 1})
test_df["StateHoliday"]     = test_df["StateHoliday"].map({0: 0, "0": 0, "a": 1, "b": 1, "c": 1})

fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))

sns.barplot(x='StateHoliday', y='Sales', data=rossmann_df, ax=axis1)
sns.barplot(x='StateHoliday', y='Customers', data=rossmann_df, ax=axis2)



In [ ]:

    
sns.countplot(x='SchoolHoliday', data=rossmann_df)

fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))

sns.barplot(x='SchoolHoliday', y='Sales', data=rossmann_df, ax=axis1)
sns.barplot(x='SchoolHoliday', y='Customers', data=rossmann_df, ax=axis2)



In [ ]:

    
# Sales

fig, (axis1,axis2) = plt.subplots(2,1,figsize=(15,8))

# Plot max, min values, & 2nd, 3rd quartile
sns.boxplot([rossmann_df["Customers"]], whis=np.inf, ax=axis1)

# Plot sales values 
# Notice that values with 0 is mostly because the store was closed
rossmann_df["Sales"].plot(kind='hist',bins=70,xlim=(0,15000),ax=axis2)



In [ ]:

    
# Using store_df

# Merge store_df with average store sales & customers
average_sales_customers = rossmann_df.groupby('Store')[["Sales", "Customers"]].mean()
sales_customers_df = DataFrame({'Store':average_sales_customers.index,
                      'Sales':average_sales_customers["Sales"], 'Customers': average_sales_customers["Customers"]}, 
                      columns=['Store', 'Sales', 'Customers'])
store_df = pd.merge(sales_customers_df, store_df, on='Store')

store_df.head()



In [ ]:

    
# StoreType 

# Plot StoreType, & StoreType Vs average sales and customers

sns.countplot(x='StoreType', data=store_df, order=['a','b','c', 'd'])

fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))

sns.barplot(x='StoreType', y='Sales', data=store_df, order=['a','b','c', 'd'],ax=axis1)
sns.barplot(x='StoreType', y='Customers', data=store_df, order=['a','b','c', 'd'], ax=axis2)



In [ ]:

    
# Assortment 

# Plot Assortment, & Assortment Vs average sales and customers

sns.countplot(x='Assortment', data=store_df, order=['a','b','c'])

fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))

sns.barplot(x='Assortment', y='Sales', data=store_df, order=['a','b','c'], ax=axis1)
sns.barplot(x='Assortment', y='Customers', data=store_df, order=['a','b','c'], ax=axis2)



In [ ]:

    
# Promo2

# Plot Promo2, & Promo2 Vs average sales and customers

sns.countplot(x='Promo2', data=store_df)

fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))

sns.barplot(x='Promo2', y='Sales', data=store_df, ax=axis1)
sns.barplot(x='Promo2', y='Customers', data=store_df, ax=axis2)



In [ ]:

    
store_df.head()



In [ ]:

    
# CompetitionDistance

# fill NaN values
store_df["CompetitionDistance"].fillna(store_df["CompetitionDistance"].median())

# Plot CompetitionDistance Vs Sales
store_df.plot(kind='scatter',x='CompetitionDistance',y='Sales',figsize=(15,4))
store_df.plot(kind='kde',x='CompetitionDistance',y='Sales',figsize=(15,4))



In [ ]:

    
# CompetitionDistance

# fill NaN values
store_df["CompetitionDistance"].fillna(store_df["CompetitionDistance"].median())

# Plot CompetitionDistance Vs Sales
store_df.plot(kind='scatter',x='CompetitionDistance',y='Sales',figsize=(15,4))
store_df.plot(kind='kde',x='CompetitionDistance',y='Sales',figsize=(15,4))



In [ ]:

    
# What happened to the average sales of a store over time when competition started?
# Example: the average sales for store_id = 6 has dramatically decreased since the competition started

store_id = 6
store_data = rossmann_df[rossmann_df["Store"] == store_id]

average_store_sales = store_data.groupby('Date')["Sales"].mean()

# Get year, and month when Competition started
y = store_df["CompetitionOpenSinceYear"].loc[store_df["Store"]  == store_id].values[0]
m = store_df["CompetitionOpenSinceMonth"].loc[store_df["Store"] == store_id].values[0]

# Plot 
ax = average_store_sales.plot(legend=True,figsize=(15,4),marker='o')
ax.set_xticks(range(len(average_store_sales)))
ax.set_xticklabels(average_store_sales.index.tolist(), rotation=90)

# Since all data of store sales given in rossmann_df starts with year=2013 till 2015,
# So, we need to check if year>=2013 and y & m aren't NaN values.
if y >= 2013 and y == y and m == m:
    plt.axvline(x=((y-2013) * 12) + (m - 1), linewidth=3, color='grey')



In [ ]:

    
# Risk Analysis
# Analyze the risk of a store; Risk(std) Vs Expected(mean)

# .... countiue using store_data
store_average = store_data["Sales"].mean()
store_std     = store_data["Sales"].std()

# Plot
plt.scatter(store_average, store_std,alpha = 0.5,s =np.pi*20)

# Get min & max mean and std of store sales
# Remember that store_df["Sales"] has the average sales for a store
std_sales = rossmann_df.groupby('Store')["Sales"].std()

min_average = store_df["Sales"].min()
max_average = store_df["Sales"].max()
min_std     = std_sales.min()
max_std     = std_sales.max()

# Set the x and y limits of the plot
plt.ylim([min_std, max_std])
plt.xlim([min_average, max_average])

# Set the plot axis titles
plt.xlabel('Expected Sales')
plt.ylabel('Risk')

# Set label
label, x, y = "Store {}".format(store_id), store_average, store_std
plt.annotate(
        label, 
        xy = (x, y), xytext = (50, 50),
        textcoords = 'offset points', ha = 'right', va = 'bottom',
        arrowprops = dict(arrowstyle = '-', connectionstyle = 'arc3,rad=-0.3'))



In [ ]:

    
# Correlation
# Visualize the Correlation between stores

store_piv       = pd.pivot_table(rossmann_df,values='Sales', index='Date', columns=['Store'],aggfunc='sum')
store_pct_chage = store_piv.pct_change().dropna()
store_piv.head()



In [ ]:

    
# .... continue Correlation

# Plot correlation between range of stores
start_store = 1
end_store   = 5

fig, (axis1) = plt.subplots(1,1,figsize=(15,5))

# using summation of sales values for each store 
sns.heatmap(store_piv[list(range(start_store, end_store+1))].corr(),annot=True,linewidths=2)



In [ ]:

    
# using percent change for each store
# sns.heatmap(store_pct_chage[list(range(start_store, end_store+1))].corr(),annot=True,linewidths=2)# Notice that test_df has only year=2015, and months 8 & 9

# drop Year and Month
rossmann_df.drop(["Year", "Month"], axis=1, inplace=True)
test_df.drop(["Year", "Month"], axis=1, inplace=True)

# Create dummy varibales for DayOfWeek
day_dummies_rossmann  = pd.get_dummies(rossmann_df['DayOfWeek'], prefix='Day')
day_dummies_rossmann.drop(['Day_7'], axis=1, inplace=True)

day_dummies_test  = pd.get_dummies(test_df['DayOfWeek'],prefix='Day')
day_dummies_test.drop(['Day_7'], axis=1, inplace=True)

rossmann_df = rossmann_df.join(day_dummies_rossmann)
test_df     = test_df.join(day_dummies_test)

rossmann_df.drop(['DayOfWeek'], axis=1,inplace=True)
test_df.drop(['DayOfWeek'], axis=1,inplace=True)



In [ ]:

    
# remove all rows(store,date) that were closed
rossmann_df = rossmann_df[rossmann_df["Open"] != 0]

# drop unnecessary columns, these columns won't be useful in prediction
rossmann_df.drop(["Open","Customers", "Date"], axis=1, inplace=True)



In [ ]:

    
# save ids of closed stores, because we will assign their sales value to 0 later(see below)
closed_store_ids = test_df["Id"][test_df["Open"] == 0].values

# remove all rows(store,date) that were closed
test_df = test_df[test_df["Open"] != 0]

# drop unnecessary columns, these columns won't be useful in prediction
test_df.drop(['Open', 'Date'], axis=1,inplace=True)



In [ ]:

    
# Loop through each store, 
# train the model using the data of current store, and predict it's sales values.

rossmann_dic = dict(list(rossmann_df.groupby('Store')))
print('rossmann_dict:',rossmann_dic)
test_dic     = dict(list(test_df.groupby('Store')))
print('test_dict:',test_dic)
submission   = Series()
scores       = []

for i in test_dic:
    
    # current store
    store = rossmann_dic[i]
    
    # define training and testing sets
    X_train = store.drop(["Sales","Store"],axis=1)
    Y_train = store["Sales"]
    X_test  = test_dic[i].copy()
    
    store_ids = X_test["Id"]
    X_test.drop(["Id","Store"], axis=1,inplace=True)
    
    # Linear Regression
    lreg = LinearRegression()
    lreg.fit(X_train, Y_train)
    Y_pred = lreg.predict(X_test)
    scores.append(lreg.score(X_train, Y_train))

    # Xgboost
    # params = {"objective": "reg:linear",  "max_depth": 10}
    # T_train_xgb = xgb.DMatrix(X_train, Y_train)
    # X_test_xgb  = xgb.DMatrix(X_test)
    # gbm = xgb.train(params, T_train_xgb, 100)
    # Y_pred = gbm.predict(X_test_xgb)
    
    # append predicted values of current store to submission
    submission = submission.append(Series(Y_pred, index=store_ids))

# append rows(store,date) that were closed, and assign their sales value to 0
submission = submission.append(Series(0, index=closed_store_ids))

# save to csv file
submission = pd.DataFrame({ "Id": submission.index, "Sales": submission.values})
submission.to_csv('rossmann.csv', index=False)

	Store	DayOfWeek	Date	Sales	Customers	Open	Promo	SchoolHoliday
0	1	5	2015-07-31	5263	555	1	1	1
1	2	5	2015-07-31	6064	625	1	1	1
2	3	5	2015-07-31	8314	821	1	1	1
3	4	5	2015-07-31	13995	1498	1	1	1
4	5	5	2015-07-31	4822	559	1	1	1

	Store	DayOfWeek	Date	Sales	Customers	Open	Promo	StateHoliday	SchoolHoliday
1114	1115	5	2015-07-31	8680	538	1	1	0	1
2229	1115	4	2015-07-30	8405	502	1	1	0	1
3344	1115	3	2015-07-29	7661	473	1	1	0	1
4459	1115	2	2015-07-28	8093	500	1	1	0	1
5574	1115	1	2015-07-27	10712	608	1	1	0	1
6689	1115	7	2015-07-26	0	0	0	0	0	0
7804	1115	6	2015-07-25	6897	478	1	0	0	0
8919	1115	5	2015-07-24	5816	417	1	0	0	0
10034	1115	4	2015-07-23	6150	435	1	0	0	0
11149	1115	3	2015-07-22	5342	382	1	0	0	0
12264	1115	2	2015-07-21	5074	352	1	0	0	0
13379	1115	1	2015-07-20	6083	440	1	0	0	0
14494	1115	7	2015-07-19	0	0	0	0	0	0
15609	1115	6	2015-07-18	7264	504	1	0	0	0
16724	1115	5	2015-07-17	7874	519	1	1	0	0
17839	1115	4	2015-07-16	6590	455	1	1	0	0
18954	1115	3	2015-07-15	6039	450	1	1	0	0
20069	1115	2	2015-07-14	7562	488	1	1	0	0
21184	1115	1	2015-07-13	10598	641	1	1	0	0
22299	1115	7	2015-07-12	0	0	0	0	0	0
23414	1115	6	2015-07-11	7164	483	1	0	0	0
24529	1115	5	2015-07-10	5844	404	1	0	0	0
25644	1115	4	2015-07-09	5686	416	1	0	0	0
26759	1115	3	2015-07-08	5900	413	1	0	0	0
27874	1115	2	2015-07-07	5138	362	1	0	0	0
28989	1115	1	2015-07-06	6501	453	1	0	0	0
30104	1115	7	2015-07-05	0	0	0	0	0	0
31219	1115	6	2015-07-04	6543	466	1	0	0	0
32334	1115	5	2015-07-03	7412	478	1	1	0	0
33449	1115	4	2015-07-02	6858	452	1	1	0	0
...	...	...	...	...	...	...	...	...	...
984874	1115	3	2013-01-30	3813	287	1	0	0	0
985989	1115	2	2013-01-29	3397	277	1	0	0	0
987104	1115	1	2013-01-28	3615	271	1	0	0	0
988219	1115	7	2013-01-27	0	0	0	0	0	0
989334	1115	6	2013-01-26	4896	394	1	0	0	0
990449	1115	5	2013-01-25	5082	373	1	1	0	0
991564	1115	4	2013-01-24	5650	366	1	1	0	0
992679	1115	3	2013-01-23	5176	336	1	1	0	0
993794	1115	2	2013-01-22	5388	329	1	1	0	0
994909	1115	1	2013-01-21	3723	243	1	1	0	0
996024	1115	7	2013-01-20	0	0	0	0	0	0
997139	1115	6	2013-01-19	5207	369	1	0	0	0
998254	1115	5	2013-01-18	4287	296	1	0	0	0
999369	1115	4	2013-01-17	3039	244	1	0	0	0
1000484	1115	3	2013-01-16	3064	237	1	0	0	0
1001599	1115	2	2013-01-15	3528	277	1	0	0	0
1002714	1115	1	2013-01-14	3158	252	1	0	0	0
1003829	1115	7	2013-01-13	0	0	0	0	0	0
1004944	1115	6	2013-01-12	4497	350	1	0	0	0
1006059	1115	5	2013-01-11	5142	351	1	1	0	1
1007174	1115	4	2013-01-10	5007	339	1	1	0	1
1008289	1115	3	2013-01-09	4649	324	1	1	0	1
1009404	1115	2	2013-01-08	5243	341	1	1	0	1
1010519	1115	1	2013-01-07	6905	471	1	1	0	1
1011634	1115	7	2013-01-06	0	0	0	0	0	1
1012749	1115	6	2013-01-05	4771	339	1	0	0	1
1013864	1115	5	2013-01-04	4540	326	1	0	0	1
1014979	1115	4	2013-01-03	4297	300	1	0	0	1
1016094	1115	3	2013-01-02	3697	305	1	0	0	1
1017208	1115	2	2013-01-01	0	0	0	0	a	1

	Id	Store	DayOfWeek	Date	Open	Promo	SchoolHoliday
855	856	1115	4	2015-09-17	1.0	1	0
1711	1712	1115	3	2015-09-16	1.0	1	0
2567	2568	1115	2	2015-09-15	1.0	1	0
3423	3424	1115	1	2015-09-14	1.0	1	0
4279	4280	1115	7	2015-09-13	0.0	0	0
5135	5136	1115	6	2015-09-12	1.0	0	0
5991	5992	1115	5	2015-09-11	1.0	0	0
6847	6848	1115	4	2015-09-10	1.0	0	0
7703	7704	1115	3	2015-09-09	1.0	0	0
8559	8560	1115	2	2015-09-08	1.0	0	0
9415	9416	1115	1	2015-09-07	1.0	0	0
10271	10272	1115	7	2015-09-06	0.0	0	0
11127	11128	1115	6	2015-09-05	1.0	0	0
11983	11984	1115	5	2015-09-04	1.0	1	1
12839	12840	1115	4	2015-09-03	1.0	1	1
13695	13696	1115	3	2015-09-02	1.0	1	1
14551	14552	1115	2	2015-09-01	1.0	1	1
15407	15408	1115	1	2015-08-31	1.0	1	1
16263	16264	1115	7	2015-08-30	0.0	0	1
17119	17120	1115	6	2015-08-29	1.0	0	1
17975	17976	1115	5	2015-08-28	1.0	0	1
18831	18832	1115	4	2015-08-27	1.0	0	1
19687	19688	1115	3	2015-08-26	1.0	0	1
20543	20544	1115	2	2015-08-25	1.0	0	1
21399	21400	1115	1	2015-08-24	1.0	0	1
22255	22256	1115	7	2015-08-23	0.0	0	1
23111	23112	1115	6	2015-08-22	1.0	0	1
23967	23968	1115	5	2015-08-21	1.0	1	1
24823	24824	1115	4	2015-08-20	1.0	1	1
25679	25680	1115	3	2015-08-19	1.0	1	1
26535	26536	1115	2	2015-08-18	1.0	1	1
27391	27392	1115	1	2015-08-17	1.0	1	1
28247	28248	1115	7	2015-08-16	0.0	0	1
29103	29104	1115	6	2015-08-15	1.0	0	1
29959	29960	1115	5	2015-08-14	1.0	0	1
30815	30816	1115	4	2015-08-13	1.0	0	1
31671	31672	1115	3	2015-08-12	1.0	0	1
32527	32528	1115	2	2015-08-11	1.0	0	1
33383	33384	1115	1	2015-08-10	1.0	0	1
34239	34240	1115	7	2015-08-09	0.0	0	1
35095	35096	1115	6	2015-08-08	1.0	0	1
35951	35952	1115	5	2015-08-07	1.0	1	1
36807	36808	1115	4	2015-08-06	1.0	1	1
37663	37664	1115	3	2015-08-05	1.0	1	1
38519	38520	1115	2	2015-08-04	1.0	1	1
39375	39376	1115	1	2015-08-03	1.0	1	1
40231	40232	1115	7	2015-08-02	0.0	0	1
41087	41088	1115	6	2015-08-01	1.0	0	1

	Store	StoreType	Assortment	CompetitionDistance	CompetitionOpenSinceMonth	CompetitionOpenSinceYear	Promo2	Promo2SinceWeek	Promo2SinceYear	PromoInterval
0	1	c	a	1270.0	9.0	2008.0	0	NaN	NaN	NaN
1	2	a	a	570.0	11.0	2007.0	1	13.0	2010.0	Jan,Apr,Jul,Oct
2	3	a	a	14130.0	12.0	2006.0	1	14.0	2011.0	Jan,Apr,Jul,Oct
3	4	c	c	620.0	9.0	2009.0	0	NaN	NaN	NaN
4	5	a	a	29910.0	4.0	2015.0	0	NaN	NaN	NaN

	Id	Store	DayOfWeek	Open	Promo	SchoolHoliday
count	41088.000000	41088.000000	41088.000000	41077.000000	41088.000000	41088.000000
mean	20544.500000	555.899533	3.979167	0.854322	0.395833	0.443487
std	11861.228267	320.274496	2.015481	0.352787	0.489035	0.496802
min	1.000000	1.000000	1.000000	0.000000	0.000000	0.000000
25%	10272.750000	279.750000	2.000000	1.000000	0.000000	0.000000
50%	20544.500000	553.500000	4.000000	1.000000	0.000000	0.000000
75%	30816.250000	832.250000	6.000000	1.000000	1.000000	1.000000
max	41088.000000	1115.000000	7.000000	1.000000	1.000000	1.000000

	Store	DayOfWeek	Date	Sales	Customers	Open	Promo	SchoolHoliday	Year	Month
0	1	5	2015-07	5263	555	1	1	1	2015	7
1	2	5	2015-07	6064	625	1	1	1	2015	7
2	3	5	2015-07	8314	821	1	1	1	2015	7
3	4	5	2015-07	13995	1498	1	1	1	2015	7
4	5	5	2015-07	4822	559	1	1	1	2015	7

	Id	Store	DayOfWeek	Date	Open	Promo	Year	Month
0	1	1	4	2015-09	1.0	1	2015	9
1	2	3	4	2015-09	1.0	1	2015	9
2	3	7	4	2015-09	1.0	1	2015	9
3	4	8	4	2015-09	1.0	1	2015	9
4	5	9	4	2015-09	1.0	1	2015	9