In [ ]:
# TPOT uses genetic programming (a genetic algorithm over pipelines) to automate feature
# preprocessing, model selection and hyperparameter tuning; this notebook tries it out
# on the Big Mart Sales data. The data can be downloaded here (sign-in required):
## https://datahack.analyticsvidhya.com/contest/practice-problem-big-mart-sales-iii/
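A rough sense of the search budget helps when choosing `generations` and `population_size` later: per the TPOT documentation, roughly population_size + generations * offspring_size pipelines are evaluated, with offspring_size defaulting to population_size (exact behaviour may vary by TPOT version). A small sketch for the settings used at the end of this notebook:
In [ ]:
# rough search budget for the run below (sketch; assumes offspring_size == population_size)
generations, population_size = 5, 50
print(population_size + generations * population_size)  # ~300 pipelines evaluated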
In [97]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tpot import TPOTRegressor
%matplotlib inline
In [74]:
train = pd.read_csv("train_GA.csv")
test = pd.read_csv("test_GA.csv")
In [44]:
train.head()
Out[44]:
In [45]:
pd.isnull(train).sum() > 0
Out[45]:
In [29]:
pd.isnull(test).sum() > 0
Out[29]:
In [75]:
train.Item_Weight = train.Item_Weight.fillna(np.nanmedian(train.Item_Weight))
test.Item_Weight = test.Item_Weight.fillna(np.nanmedian(test.Item_Weight))
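A quick sanity check (not in the original flow) that the imputation left no missing weights; pandas' `Series.median()` would give the same fill value, since it skips NaN by default:
In [ ]:
# sanity check (sketch): no missing Item_Weight remains after imputation
print(train.Item_Weight.isnull().sum(), test.Item_Weight.isnull().sum())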
In [62]:
print(train.Outlet_Size.unique())
print(test.Outlet_Size.unique())
In [76]:
train.Outlet_Size = train.Outlet_Size.fillna(train.Outlet_Size.mode().iloc[0])
test.Outlet_Size = test.Outlet_Size.fillna(train.Outlet_Size.mode().iloc[0])
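A side note on the `.iloc[0]`: `Series.mode()` returns a Series (ties are possible), so the first entry is taken as the fill value. A small check illustrating this:
In [ ]:
# mode() returns a Series (ties are possible), hence the .iloc[0] above
print(train.Outlet_Size.mode())
print(train.Outlet_Size.mode().iloc[0])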
In [77]:
train.dtypes
Out[77]:
In [78]:
print(train.Item_Fat_Content.unique())
print(test.Item_Fat_Content.unique())
print(train.Item_Type.unique())
print(test.Item_Type.unique())
print(train.Outlet_Identifier.unique())
print(test.Outlet_Identifier.unique())
print(train.Outlet_Size.unique())
print(test.Outlet_Size.unique())
print(train.Outlet_Location_Type.unique())
print(test.Outlet_Location_Type.unique())
print(train.Outlet_Type.unique())
print(test.Outlet_Type.unique())
In [79]:
train.Item_Fat_Content = train.Item_Fat_Content.replace(['low fat', 'LF'], ['Low Fat', 'Low Fat'])
test.Item_Fat_Content = test.Item_Fat_Content.replace(['low fat', 'LF'], ['Low Fat', 'Low Fat'])
train.Item_Fat_Content = train.Item_Fat_Content.replace(['reg'], ['Regular'])
test.Item_Fat_Content = test.Item_Fat_Content.replace(['reg'], ['Regular'])
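The same cleanup can also be written as a single mapping; a sketch of an equivalent dict-based version:
In [ ]:
# equivalent one-step cleanup (sketch): map the inconsistent spellings to canonical labels
fat_map = {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
train.Item_Fat_Content = train.Item_Fat_Content.replace(fat_map)
test.Item_Fat_Content = test.Item_Fat_Content.replace(fat_map)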
In [80]:
print(train.Item_Fat_Content.unique())
print(test.Item_Fat_Content.unique())
In [81]:
print(train.Outlet_Establishment_Year.max())
print(train.Outlet_Establishment_Year.min())
In [82]:
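# convert the establishment year into outlet age in years (relative to 2017)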
train.Outlet_Establishment_Year = 2017 - train.Outlet_Establishment_Year
test.Outlet_Establishment_Year = 2017 - test.Outlet_Establishment_Year
In [83]:
train.shape
Out[83]:
In [84]:
# label encoding; fit the encoder on train and test combined so the codes stay consistent
test['Item_Outlet_Sales'] = 0
combi = train.append(test)
number = LabelEncoder()
for i in combi.columns:
    if combi[i].dtype == 'object':
        combi[i] = number.fit_transform(combi[i].astype('str'))
        combi[i] = combi[i].astype('object')
train = combi[:train.shape[0]]
test = combi[train.shape[0]:]
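Fitting the encoder on the combined frame keeps the integer codes consistent between train and test. As a variation (a sketch, meant to replace the loop above rather than run after it), keeping one fitted encoder per column lets `inverse_transform` recover the original category names later:
In [ ]:
# variation on the encoding loop above (use instead of it, not after it):
# keep each fitted encoder so the integer codes can be mapped back via inverse_transform
encoders = {}
for col in combi.columns:
    if combi[col].dtype == 'object':
        encoders[col] = LabelEncoder()
        combi[col] = encoders[col].fit_transform(combi[col].astype('str'))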
In [86]:
test.head()
Out[86]:
In [88]:
test = test.drop('Item_Outlet_Sales',axis=1)
# drop the identifier column (too many distinct levels to be useful after label encoding)
tpot_train = train.drop('Item_Identifier',axis=1)
tpot_test = test.drop('Item_Identifier',axis=1)
target = tpot_train['Item_Outlet_Sales']
tpot_train = tpot_train.drop('Item_Outlet_Sales',axis=1)
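A quick shape check (sketch) before handing the frames to TPOT, to confirm the target lines up with the features and the test set kept the same columns:
In [ ]:
# sanity check (sketch): feature/target shapes before modelling
print(tpot_train.shape, target.shape, tpot_test.shape)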
In [98]:
# build the model with TPOT
X_train, X_test, y_train, y_test = train_test_split(tpot_train, target,
                                                    train_size=0.75, test_size=0.25)
tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))  # hold-out score (negative MSE under TPOT's default scoring)
tpot.export('tpot_bigmart_pipeline.py')  # export the best pipeline found as a standalone script
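Once the run finishes, the fitted pipeline can score unseen data directly. A possible next step (a sketch, not in the original) is to predict on the competition test set and write a submission file; the identifier columns were label-encoded above, so they are re-read from test_GA.csv here, the column names follow the Big Mart submission format, and the output file name is just an example:
In [ ]:
# sketch: predict on the competition test set with the best pipeline found
# (Item_Identifier / Outlet_Identifier were label-encoded above, so take them
#  from a fresh read of test_GA.csv; rows are still in the original order)
preds = tpot.predict(tpot_test)
ids = pd.read_csv("test_GA.csv")[['Item_Identifier', 'Outlet_Identifier']]
submission = ids.copy()
submission['Item_Outlet_Sales'] = preds
submission.to_csv('tpot_submission.csv', index=False)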