In [ ]:
# Pseudo labeling is a semi-supervised learning technique

# 1. Train a model on the labeled data first
# 2. Use the trained model to predict the unlabeled data, generating pseudo labels
# 3. Combine both datasets into one training set and train again
## In theory, adding more data via pseudo labels can improve accuracy
## (a minimal sketch of this workflow is in the next cell)

# Data can be downloaded here (sign-in and registration required):
## https://datahack.analyticsvidhya.com/contest/practice-problem-big-mart-sales-iii/
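
In [ ]:
# A minimal, generic sketch of the three steps above, using a plain scikit-learn
# regressor on synthetic data. The estimator and the toy arrays are illustrative
# assumptions; the actual Big Mart pipeline with TPOT follows below.

import numpy as np
from sklearn.ensemble import RandomForestRegressor

rng = np.random.RandomState(0)
X_labeled = rng.rand(100, 5)          # features with known targets
y_labeled = X_labeled.sum(axis=1)     # toy target
X_unlabeled = rng.rand(50, 5)         # features without targets

# 1. train on the labeled data
model = RandomForestRegressor(n_estimators=50, random_state=0)
model.fit(X_labeled, y_labeled)

# 2. predict the unlabeled data to generate pseudo labels
pseudo_labels = model.predict(X_unlabeled)

# 3. combine both sets and retrain on labeled + pseudo-labeled data
X_combined = np.vstack([X_labeled, X_unlabeled])
y_combined = np.concatenate([y_labeled, pseudo_labels])
model.fit(X_combined, y_combined)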

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

from sklearn import preprocessing 
from sklearn.metrics import mean_squared_error 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from tpot import TPOTRegressor
%matplotlib inline

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
train.head()


Out[3]:
Item_Identifier Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type Item_Outlet_Sales
0 FDA15 9.30 Low Fat 0.016047 Dairy 249.8092 OUT049 1999 Medium Tier 1 Supermarket Type1 3735.1380
1 DRC01 5.92 Regular 0.019278 Soft Drinks 48.2692 OUT018 2009 Medium Tier 3 Supermarket Type2 443.4228
2 FDN15 17.50 Low Fat 0.016760 Meat 141.6180 OUT049 1999 Medium Tier 1 Supermarket Type1 2097.2700
3 FDX07 19.20 Regular 0.000000 Fruits and Vegetables 182.0950 OUT010 1998 NaN Tier 3 Grocery Store 732.3800
4 NCD19 8.93 Low Fat 0.000000 Household 53.8614 OUT013 1987 High Tier 3 Supermarket Type1 994.7052

In [4]:
pd.isnull(train).sum()


Out[4]:
Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [5]:
pd.isnull(test).sum()


Out[5]:
Item_Identifier                 0
Item_Weight                   976
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  1606
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

In [6]:
# impute missing Item_Weight with the (NaN-aware) median
train.Item_Weight = train.Item_Weight.fillna(np.nanmedian(train.Item_Weight))
test.Item_Weight = test.Item_Weight.fillna(np.nanmedian(test.Item_Weight))

In [7]:
print(train.Outlet_Size.unique())
print(test.Outlet_Size.unique())


['Medium' nan 'High' 'Small']
['Medium' nan 'Small' 'High']

In [8]:
# impute missing Outlet_Size with the most frequent value from the training set
train.Outlet_Size = train.Outlet_Size.fillna(train.Outlet_Size.mode().iloc[0])
test.Outlet_Size = test.Outlet_Size.fillna(train.Outlet_Size.mode().iloc[0])

In [9]:
train.dtypes


Out[9]:
Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

In [10]:
print(train.Item_Fat_Content.unique())
print(test.Item_Fat_Content.unique())
print(train.Item_Type.unique())
print(test.Item_Type.unique())
print(train.Outlet_Identifier.unique())
print(test.Outlet_Identifier.unique())
print(train.Outlet_Size.unique())
print(test.Outlet_Size.unique())
print(train.Outlet_Location_Type.unique())
print(test.Outlet_Location_Type.unique())
print(train.Outlet_Type.unique())
print(test.Outlet_Type.unique())


['Low Fat' 'Regular' 'low fat' 'LF' 'reg']
['Low Fat' 'reg' 'Regular' 'LF' 'low fat']
['Dairy' 'Soft Drinks' 'Meat' 'Fruits and Vegetables' 'Household'
 'Baking Goods' 'Snack Foods' 'Frozen Foods' 'Breakfast'
 'Health and Hygiene' 'Hard Drinks' 'Canned' 'Breads' 'Starchy Foods'
 'Others' 'Seafood']
['Snack Foods' 'Dairy' 'Others' 'Fruits and Vegetables' 'Baking Goods'
 'Health and Hygiene' 'Breads' 'Hard Drinks' 'Seafood' 'Soft Drinks'
 'Household' 'Frozen Foods' 'Meat' 'Canned' 'Starchy Foods' 'Breakfast']
['OUT049' 'OUT018' 'OUT010' 'OUT013' 'OUT027' 'OUT045' 'OUT017' 'OUT046'
 'OUT035' 'OUT019']
['OUT049' 'OUT017' 'OUT010' 'OUT027' 'OUT046' 'OUT018' 'OUT045' 'OUT019'
 'OUT013' 'OUT035']
['Medium' 'High' 'Small']
['Medium' 'Small' 'High']
['Tier 1' 'Tier 3' 'Tier 2']
['Tier 1' 'Tier 2' 'Tier 3']
['Supermarket Type1' 'Supermarket Type2' 'Grocery Store'
 'Supermarket Type3']
['Supermarket Type1' 'Grocery Store' 'Supermarket Type3'
 'Supermarket Type2']

In [11]:
# standardize the Item_Fat_Content spellings
fat_map = {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
train.Item_Fat_Content = train.Item_Fat_Content.replace(fat_map)
test.Item_Fat_Content = test.Item_Fat_Content.replace(fat_map)

In [12]:
print(train.Item_Fat_Content.unique())
print(test.Item_Fat_Content.unique())


['Low Fat' 'Regular']
['Low Fat' 'Regular']

In [13]:
print(train.Outlet_Establishment_Year.max())
print(train.Outlet_Establishment_Year.min())


2009
1985

In [14]:
# convert establishment year into outlet age in years (relative to 2017)
train.Outlet_Establishment_Year = 2017 - train.Outlet_Establishment_Year
test.Outlet_Establishment_Year = 2017 - test.Outlet_Establishment_Year

In [15]:
train.shape


Out[15]:
(8523, 12)

In [16]:
# label encoding: combine train and test so both share the same category encoding

test['Item_Outlet_Sales'] = 0          # placeholder target so the columns line up
combi = pd.concat([train, test])       # DataFrame.append is deprecated in newer pandas
number = LabelEncoder()

for i in combi.columns:
    if (combi[i].dtype == 'object'):
        combi[i] = number.fit_transform(combi[i].astype('str'))
        combi[i] = combi[i].astype('object')

# split back into the original train/test rows
train = combi[:train.shape[0]]
test = combi[train.shape[0]:]

In [17]:
test.head()


Out[17]:
Item_Identifier Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type Item_Outlet_Sales
0 1114 20.750 0 0.007565 13 107.8622 9 18 1 0 1 0.0
1 1078 8.300 1 0.038428 4 87.3198 2 10 1 1 1 0.0
2 1420 14.600 0 0.099575 11 241.7538 0 19 1 2 0 0.0
3 817 7.315 0 0.015388 13 155.0340 2 10 1 1 1 0.0
4 1197 12.500 1 0.118599 4 234.2300 5 32 1 2 3 0.0

In [18]:
test = test.drop('Item_Outlet_Sales', axis=1)   # drop the placeholder target added before encoding

# remove the identifier column, which has too many levels to be a useful feature
tpot_train = train.drop('Item_Identifier', axis=1)
tpot_test = test.drop('Item_Identifier', axis=1)
target = tpot_train['Item_Outlet_Sales']
tpot_train = tpot_train.drop('Item_Outlet_Sales', axis=1)

In [20]:
# build the baseline model with TPOT, using only the original labeled data

X_train, X_test, y_train, y_test = train_test_split(tpot_train, target,
                                                    train_size=0.7, test_size=0.3)

tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)
tpot.fit(X_train, y_train)  # uses cross-validation and a genetic algorithm to find the best pipeline and hyperparameters
print(tpot.score(X_test, y_test))   # mean squared error (MSE)


Optimization Progress:  32%|███▏      | 97/300 [02:29<07:38,  2.26s/pipeline]
Generation 1 - Current best internal CV score: 1162697.67194
Optimization Progress:  47%|████▋     | 141/300 [05:33<11:03,  4.17s/pipeline]
Generation 2 - Current best internal CV score: 1162697.67194
Optimization Progress:  63%|██████▎   | 189/300 [09:45<05:46,  3.12s/pipeline]
Generation 3 - Current best internal CV score: 1162697.67194
Optimization Progress:  79%|███████▊  | 236/300 [11:52<02:03,  1.93s/pipeline]
Generation 4 - Current best internal CV score: 1162697.67194
                                                                              
Generation 5 - Current best internal CV score: 1161965.17685

Best pipeline: ExtraTreesRegressor(MinMaxScaler(input_matrix), ExtraTreesRegressor__bootstrap=True, ExtraTreesRegressor__max_features=DEFAULT, ExtraTreesRegressor__min_samples_leaf=16, ExtraTreesRegressor__min_samples_split=3, ExtraTreesRegressor__n_estimators=100)
1167703.86599

In [39]:
tpot_test.head()
tpot_test.index.values


Out[39]:
array([   0,    1,    2, ..., 5678, 5679, 5680])

In [34]:
pseudo_label = tpot.predict(tpot_test)
pseudo_label


Out[34]:
array([ 1670.7869921 ,  1374.11673036,   601.84154656, ...,  1891.80030285,
        3955.74660333,  1273.38562306])

In [44]:
pseudo_label_df = pd.DataFrame(data=pseudo_label, 
                        index=tpot_test.index.values,
                              columns=['Item_Outlet_Sales'])

In [45]:
pseudo_label_df.head()


Out[45]:
Item_Outlet_Sales
0 1670.786992
1 1374.116730
2 601.841547
3 2396.554574
4 5707.000035

In [49]:
# attach the pseudo labels to the unlabeled dataset (the test data in this case)
# note: pseudo_test is the same object as tpot_test (assignment does not copy),
# so tpot_test also gains the Item_Outlet_Sales column (seen a few cells below)
pseudo_test = tpot_test
pseudo_test['Item_Outlet_Sales'] = pd.Series(pseudo_label_df['Item_Outlet_Sales'], index=tpot_test.index)
pseudo_test.head()


Out[49]:
Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type Item_Outlet_Sales
0 20.750 0 0.007565 13 107.8622 9 18 1 0 1 1670.786992
1 8.300 1 0.038428 4 87.3198 2 10 1 1 1 1374.116730
2 14.600 0 0.099575 11 241.7538 0 19 1 2 0 601.841547
3 7.315 0 0.015388 13 155.0340 2 10 1 1 1 2396.554574
4 12.500 1 0.118599 4 234.2300 5 32 1 2 3 5707.000035

In [63]:
# append the pseudo-labeled dataset to the original labeled training data
tpot_train = train.drop('Item_Identifier', axis=1)
pseudo_train = pd.concat([tpot_train, pseudo_test])   # DataFrame.append is deprecated in newer pandas

In [64]:
pseudo_train.head()


Out[64]:
Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type Item_Outlet_Sales
0 9.30 0 0.016047 4 249.8092 9 18 1 0 1 3735.1380
1 5.92 1 0.019278 14 48.2692 3 8 1 2 2 443.4228
2 17.50 0 0.016760 10 141.6180 9 18 1 0 1 2097.2700
3 19.20 1 0.000000 6 182.0950 0 19 1 2 0 732.3800
4 8.93 0 0.000000 9 53.8614 1 30 0 2 1 994.7052

In [60]:
tpot_test.head()


Out[60]:
Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type Item_Outlet_Sales
0 20.750 0 0.007565 13 107.8622 9 18 1 0 1 1670.786992
1 8.300 1 0.038428 4 87.3198 2 10 1 1 1 1374.116730
2 14.600 0 0.099575 11 241.7538 0 19 1 2 0 601.841547
3 7.315 0 0.015388 13 155.0340 2 10 1 1 1 2396.554574
4 12.500 1 0.118599 4 234.2300 5 32 1 2 3 5707.000035

In [65]:
pseudo_target = pseudo_train['Item_Outlet_Sales']
pseudo_train = pseudo_train.drop('Item_Outlet_Sales',axis=1)

X_train, X_test, y_train, y_test = train_test_split(pseudo_train, pseudo_target,
                                                    train_size=0.7, test_size=0.3)

tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)
tpot.fit(X_train, y_train)  # uses cross-validation and a genetic algorithm to find the best pipeline and hyperparameters
print(tpot.score(X_test, y_test))   # mean squared error (MSE)


Optimization Progress:  29%|██▉       | 87/300 [02:50<08:07,  2.29s/pipeline]
Generation 1 - Current best internal CV score: 702816.414279
Optimization Progress:  43%|████▎     | 130/300 [04:36<08:35,  3.03s/pipeline]
Generation 2 - Current best internal CV score: 700376.767819
Optimization Progress:  59%|█████▉    | 178/300 [06:26<05:00,  2.46s/pipeline]
Generation 3 - Current best internal CV score: 700376.767819
Optimization Progress:  75%|███████▌  | 225/300 [08:54<02:05,  1.68s/pipeline]
Generation 4 - Current best internal CV score: 699585.72371
                                                                              
Generation 5 - Current best internal CV score: 697140.36341

Best pipeline: GradientBoostingRegressor(input_matrix, GradientBoostingRegressor__alpha=0.85, GradientBoostingRegressor__learning_rate=DEFAULT, GradientBoostingRegressor__loss=lad, GradientBoostingRegressor__max_depth=5, GradientBoostingRegressor__max_features=0.6, GradientBoostingRegressor__min_samples_leaf=9, GradientBoostingRegressor__min_samples_split=16, GradientBoostingRegressor__n_estimators=DEFAULT, GradientBoostingRegressor__subsample=0.7)
674055.06549

In [ ]:
# Here we don't have ground truth for the test data,
# but the validation MSE is much lower than in the earlier run without pseudo labeling
# (see the caveat about this comparison in the next cell)
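
In [ ]:
# Caveat: the lower MSE above is not a like-for-like comparison with the first run,
# because the second validation split contains pseudo-labeled rows whose targets are
# themselves model predictions. A fairer check (a sketch only; the names X_fit, X_hold,
# base, aug and the random_state are illustrative, not from the original notebook) is to
# hold out a slice of the labeled data *before* pseudo labeling and score both models on it.

# reuse tpot_train, target and tpot_test from the cells above
feats = tpot_test.drop('Item_Outlet_Sales', axis=1, errors='ignore')  # pseudo column was added above

X_fit, X_hold, y_fit, y_hold = train_test_split(tpot_train, target,
                                                train_size=0.7, test_size=0.3,
                                                random_state=42)

# baseline: labeled data only
base = TPOTRegressor(generations=5, population_size=50, verbosity=2)
base.fit(X_fit, y_fit)
print(mean_squared_error(y_hold, base.predict(X_hold)))

# pseudo labeling: predict the unlabeled set, append it, and retrain
pl = base.predict(feats)
X_aug = pd.concat([X_fit, feats])
y_aug = pd.concat([y_fit, pd.Series(pl, index=feats.index)])

aug = TPOTRegressor(generations=5, population_size=50, verbosity=2)
aug.fit(X_aug, y_aug)
print(mean_squared_error(y_hold, aug.predict(X_hold)))  # scored on the same held-out labeled rows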