In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from scipy.stats import probplot
from matplotlib import pyplot as plt
Most of the fields are self-explanatory. The following are descriptions for those that aren't.
In [2]:
!unzip -o '*.csv.zip'
!rm *.zip
In [3]:
# List the working directory (presumably to confirm the CSV files were extracted).
ls
In [4]:
# Peek at the first lines of train.csv (header plus a few rows).
!head train.csv
In [5]:
# Survey the raw training data column by column: print the sorted distinct
# values and whether the column contains any missing entries.
df = pd.read_csv('train.csv')
for col in df.columns:
    if col == 'Date':
        # Show a fixed placeholder instead of computing the distinct dates.
        distinct = ["2015-07-31", 'etc.']
    else:
        distinct = np.sort(df[col].unique())
    any_missing = df[col].isnull().any()
    print("{}: {}\n has NA values: {}\n".format(col, distinct, any_missing))
In [74]:
# Explicit per-column dtypes for train.csv, so pandas does not have to infer
# them while parsing.
data_types = dict(
    Store=int,
    DayOfWeek=int,        # {1, 2, 3, 4, 5, 6, 7}
    Date=object,          # e.g. "2015-07-31"
    Sales=int,
    Customers=int,
    Open=bool,
    Promo=bool,
    StateHoliday=str,     # {'0', 'a', 'b', 'c'}
    SchoolHoliday=bool,
)
# Load the training set with the dtypes declared above.
df_train = pd.read_csv('train.csv', dtype=data_types)
In [6]:
# Peek at the first lines of test.csv (header plus a few rows).
!head test.csv
In [93]:
# Survey the raw test data column by column: sorted distinct values, the dtype
# pandas inferred, and whether the column contains any missing entries.
df = pd.read_csv('test.csv')
for col in df.columns:
    column = df[col]
    distinct = np.sort(column.unique())
    any_missing = column.isnull().any()
    print("{}: {}\n dtype: {}\n has NA values: {}\n".format(
        col, distinct, column.dtype, any_missing))
In [61]:
# Peek at the first lines of store.csv (header plus a few rows).
!head store.csv
In [98]:
# Survey the raw store metadata column by column: sorted distinct values, the
# dtype pandas inferred, and whether the column contains any missing entries.
# `df` is left bound to the store table; the consistency checks that follow
# read it.
df = pd.read_csv('store.csv')
for col in df.columns:
    column = df[col]
    if col == 'CompetitionDistance':
        # Show a fixed placeholder instead of computing the distinct distances.
        distinct = [20, 30, 'etc.', np.nan]
    else:
        distinct = np.sort(column.unique())
    print("{}: {}\n dtype: {}\n has NA values: {}\n".format(
        col, distinct, column.dtype, column.isnull().any()))
In [120]:
# Check consistency of the 'Promo2' flag: for each Promo2 detail column, rows
# where the detail is missing should all have Promo2 == 0 and rows where it is
# present should all have Promo2 == 1. Prints one True/False per column.
#
# The distinct flag values are compared as sets rather than as a tuple of numpy
# arrays: comparing arrays inside a tuple relies on array truthiness, which
# raises ValueError as soon as a group holds more than one distinct value.
# The whole comparison is wrapped in a single print(...) call so the boolean
# (not the tuple) is printed under both Python 2 and Python 3.
for c in ('PromoInterval', 'Promo2SinceWeek', 'Promo2SinceYear'):
    detail_missing = df[c].isnull()
    flags_when_missing = set(df.Promo2[detail_missing].unique())
    flags_when_present = set(df.Promo2[~detail_missing].unique())
    print((flags_when_missing, flags_when_present) == ({0}, {1}))
In [121]:
# Groundwork for an analogous 'Competition' flag (the flag itself is not created
# here): check, pair by pair, whether the competition columns are missing in
# exactly the same rows. Prints one True/False per pair.
distance_na = df['CompetitionDistance'].isnull()
since_year_na = df['CompetitionOpenSinceYear'].isnull()
since_month_na = df['CompetitionOpenSinceMonth'].isnull()
for left_na, right_na in (
    (distance_na, since_year_na),
    (distance_na, since_month_na),
    (since_month_na, since_year_na),
):
    print(all(left_na == right_na))
In [122]:
# Select the stores where exactly one of CompetitionDistance and
# CompetitionOpenSinceYear is missing, and display them.
distance_missing = df['CompetitionDistance'].isnull()
since_year_missing = df['CompetitionOpenSinceYear'].isnull()
mask = distance_missing ^ since_year_missing
df.loc[mask]
Out[122]:
In [77]:
# Explicit per-column dtypes for store.csv.
# NOTE(review): the "since" columns and CompetitionDistance are presumably float
# because they can hold NaN (see the null checks on these columns) -- confirm.
data_types = dict(
    Store=int,
    StoreType=str,
    Assortment=str,
    CompetitionDistance=float,
    CompetitionOpenSinceMonth=float,
    CompetitionOpenSinceYear=float,
    Promo2=bool,
    Promo2SinceWeek=float,
    Promo2SinceYear=float,
    PromoInterval=object,
)
# Load the store metadata with the dtypes declared above.
df_store = pd.read_csv('store.csv', dtype=data_types)
In [45]:
# Histogram of sales over rows with non-zero sales. `mask` is left bound for the
# cells that follow, which reuse it.
mask = df_train['Sales'] != 0
nonzero_sales = df_train.loc[mask, 'Sales']
nonzero_sales.hist(bins=100)
Out[45]:
The histogram is right-skewed and looks roughly log-normal. If that is the case, the logarithm of the sales should look approximately Gaussian:
In [51]:
# Histogram of log10(sales) over the non-zero-sales rows selected by `mask`.
# np.log10 is applied to the whole Series at once (vectorized) instead of
# element by element through .map, which is much slower on a large column.
np.log10(df_train[mask].Sales).hist(bins=100)
Out[51]:
Check a Q-Q plot to see how well the log10-transformed sales are modeled by a Gaussian distribution:
In [60]:
# Normal probability (Q-Q) plot of log10(sales) over the non-zero-sales rows
# selected by `mask`, drawn through pyplot. np.log10 is applied to the whole
# Series at once (vectorized) instead of element by element through .map.
probplot(np.log10(df_train[mask].Sales), dist="norm", plot=plt)
Out[60]:
In [ ]: