In [ ]:
# Pseudo labeling is a semi-supervised learning technique

# 1. Train a model on the labeled data first
# 2. Use the trained model to predict the unlabeled data, generating pseudo labels
# 3. Combine both datasets into one training set and train again
## In theory, adding more data via pseudo labels can improve accuracy
## (a minimal sketch of this workflow is in the next cell)

# Data can be downloaded here (sign-in and registration required):
## https://datahack.analyticsvidhya.com/contest/practice-problem-big-mart-sales-iii/
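
In [ ]:
# A minimal, generic sketch of the three steps above, using a plain scikit-learn
# regressor on synthetic data. The estimator and the toy arrays are illustrative
# assumptions; the actual Big Mart pipeline with TPOT follows below.

import numpy as np
from sklearn.ensemble import RandomForestRegressor

rng = np.random.RandomState(0)
X_labeled = rng.rand(100, 5)          # features with known targets
y_labeled = X_labeled.sum(axis=1)     # toy target
X_unlabeled = rng.rand(50, 5)         # features without targets

# 1. train on the labeled data
model = RandomForestRegressor(n_estimators=50, random_state=0)
model.fit(X_labeled, y_labeled)

# 2. predict the unlabeled data to generate pseudo labels
pseudo_labels = model.predict(X_unlabeled)

# 3. combine both sets and retrain on labeled + pseudo-labeled data
X_combined = np.vstack([X_labeled, X_unlabeled])
y_combined = np.concatenate([y_labeled, pseudo_labels])
model.fit(X_combined, y_combined)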

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

from sklearn import preprocessing 
from sklearn.metrics import mean_squared_error 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from tpot import TPOTRegressor
%matplotlib inline

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
train.head()


Out[3]:
Item_Identifier Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type Item_Outlet_Sales
0 FDA15 9.30 Low Fat 0.016047 Dairy 249.8092 OUT049 1999 Medium Tier 1 Supermarket Type1 3735.1380
1 DRC01 5.92 Regular 0.019278 Soft Drinks 48.2692 OUT018 2009 Medium Tier 3 Supermarket Type2 443.4228
2 FDN15 17.50 Low Fat 0.016760 Meat 141.6180 OUT049 1999 Medium Tier 1 Supermarket Type1 2097.2700
3 FDX07 19.20 Regular 0.000000 Fruits and Vegetables 182.0950 OUT010 1998 NaN Tier 3 Grocery Store 732.3800
4 NCD19 8.93 Low Fat 0.000000 Household 53.8614 OUT013 1987 High Tier 3 Supermarket Type1 994.7052

In [4]:
pd.isnull(train).sum()


Out[4]:
Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [5]:
pd.isnull(test).sum()


Out[5]:
Item_Identifier                 0
Item_Weight                   976
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  1606
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

In [6]:
# impute missing Item_Weight with the (NaN-aware) median
train.Item_Weight = train.Item_Weight.fillna(np.nanmedian(train.Item_Weight))
test.Item_Weight = test.Item_Weight.fillna(np.nanmedian(test.Item_Weight))

In [7]:
print(train.Outlet_Size.unique())
print(test.Outlet_Size.unique())


['Medium' nan 'High' 'Small']
['Medium' nan 'Small' 'High']

In [8]:
# impute missing Outlet_Size with the most frequent value from the training set
train.Outlet_Size = train.Outlet_Size.fillna(train.Outlet_Size.mode().iloc[0])
test.Outlet_Size = test.Outlet_Size.fillna(train.Outlet_Size.mode().iloc[0])

In [9]:
train.dtypes


Out[9]:
Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

In [10]:
print(train.Item_Fat_Content.unique())
print(test.Item_Fat_Content.unique())
print(train.Item_Type.unique())
print(test.Item_Type.unique())
print(train.Outlet_Identifier.unique())
print(test.Outlet_Identifier.unique())
print(train.Outlet_Size.unique())
print(test.Outlet_Size.unique())
print(train.Outlet_Location_Type.unique())
print(test.Outlet_Location_Type.unique())
print(train.Outlet_Type.unique())
print(test.Outlet_Type.unique())


['Low Fat' 'Regular' 'low fat' 'LF' 'reg']
['Low Fat' 'reg' 'Regular' 'LF' 'low fat']
['Dairy' 'Soft Drinks' 'Meat' 'Fruits and Vegetables' 'Household'
 'Baking Goods' 'Snack Foods' 'Frozen Foods' 'Breakfast'
 'Health and Hygiene' 'Hard Drinks' 'Canned' 'Breads' 'Starchy Foods'
 'Others' 'Seafood']
['Snack Foods' 'Dairy' 'Others' 'Fruits and Vegetables' 'Baking Goods'
 'Health and Hygiene' 'Breads' 'Hard Drinks' 'Seafood' 'Soft Drinks'
 'Household' 'Frozen Foods' 'Meat' 'Canned' 'Starchy Foods' 'Breakfast']
['OUT049' 'OUT018' 'OUT010' 'OUT013' 'OUT027' 'OUT045' 'OUT017' 'OUT046'
 'OUT035' 'OUT019']
['OUT049' 'OUT017' 'OUT010' 'OUT027' 'OUT046' 'OUT018' 'OUT045' 'OUT019'
 'OUT013' 'OUT035']
['Medium' 'High' 'Small']
['Medium' 'Small' 'High']
['Tier 1' 'Tier 3' 'Tier 2']
['Tier 1' 'Tier 2' 'Tier 3']
['Supermarket Type1' 'Supermarket Type2' 'Grocery Store'
 'Supermarket Type3']
['Supermarket Type1' 'Grocery Store' 'Supermarket Type3'
 'Supermarket Type2']

In [11]:
# standardize the Item_Fat_Content spellings
fat_map = {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
train.Item_Fat_Content = train.Item_Fat_Content.replace(fat_map)
test.Item_Fat_Content = test.Item_Fat_Content.replace(fat_map)

In [12]:
print(train.Item_Fat_Content.unique())
print(test.Item_Fat_Content.unique())


['Low Fat' 'Regular']
['Low Fat' 'Regular']

In [13]:
print(train.Outlet_Establishment_Year.max())
print(train.Outlet_Establishment_Year.min())


2009
1985

In [14]:
# convert establishment year into outlet age in years (relative to 2017)
train.Outlet_Establishment_Year = 2017 - train.Outlet_Establishment_Year
test.Outlet_Establishment_Year = 2017 - test.Outlet_Establishment_Year

In [15]:
train.shape


Out[15]:
(8523, 12)

In [16]:
# label encoding: combine train and test so both share the same category encoding

test['Item_Outlet_Sales'] = 0          # placeholder target so the columns line up
combi = pd.concat([train, test])       # DataFrame.append is deprecated in newer pandas
number = LabelEncoder()

for i in combi.columns:
    if (combi[i].dtype == 'object'):
        combi[i] = number.fit_transform(combi[i].astype('str'))
        combi[i] = combi[i].astype('object')

# split back into the original train/test rows
train = combi[:train.shape[0]]
test = combi[train.shape[0]:]

In [17]:
test.head()


Out[17]:
Item_Identifier Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type Item_Outlet_Sales
0 1114 20.750 0 0.007565 13 107.8622 9 18 1 0 1 0.0
1 1078 8.300 1 0.038428 4 87.3198 2 10 1 1 1 0.0
2 1420 14.600 0 0.099575 11 241.7538 0 19 1 2 0 0.0
3 817 7.315 0 0.015388 13 155.0340 2 10 1 1 1 0.0
4 1197 12.500 1 0.118599 4 234.2300 5 32 1 2 3 0.0

In [18]:
test = test.drop('Item_Outlet_Sales', axis=1)   # drop the placeholder target added before encoding

# remove the identifier column, which has too many levels to be a useful feature
tpot_train = train.drop('Item_Identifier', axis=1)
tpot_test = test.drop('Item_Identifier', axis=1)
target = tpot_train['Item_Outlet_Sales']
tpot_train = tpot_train.drop('Item_Outlet_Sales', axis=1)

In [20]:
# build the baseline model with TPOT, using only the original labeled data

X_train, X_test, y_train, y_test = train_test_split(tpot_train, target,
                                                    train_size=0.7, test_size=0.3)

tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)
tpot.fit(X_train, y_train)  # uses cross-validation and a genetic algorithm to find the best pipeline and hyperparameters
print(tpot.score(X_test, y_test))   # mean squared error (MSE)


Optimization Progress:  32%|███▏      | 97/300 [02:29<07:38,  2.26s/pipeline]
Generation 1 - Current best internal CV score: 1162697.67194
Optimization Progress:  47%|████▋     | 141/300 [05:33<11:03,  4.17s/pipeline]
Generation 2 - Current best internal CV score: 1162697.67194
Optimization Progress:  63%|██████▎   | 189/300 [09:45<05:46,  3.12s/pipeline]
Generation 3 - Current best internal CV score: 1162697.67194
Optimization Progress:  79%|███████▊  | 236/300 [11:52<02:03,  1.93s/pipeline]
Generation 4 - Current best internal CV score: 1162697.67194
                                                                              
Generation 5 - Current best internal CV score: 1161965.17685

Best pipeline: ExtraTreesRegressor(MinMaxScaler(input_matrix), ExtraTreesRegressor__bootstrap=True, ExtraTreesRegressor__max_features=DEFAULT, ExtraTreesRegressor__min_samples_leaf=16, ExtraTreesRegressor__min_samples_split=3, ExtraTreesRegressor__n_estimators=100)
1167703.86599

In [39]:
tpot_test.head()
tpot_test.index.values


Out[39]:
array([   0,    1,    2, ..., 5678, 5679, 5680])

In [34]:
pseudo_label = tpot.predict(tpot_test)
pseudo_label


Out[34]:
array([ 1670.7869921 ,  1374.11673036,   601.84154656, ...,  1891.80030285,
        3955.74660333,  1273.38562306])

In [44]:
pseudo_label_df = pd.DataFrame(data=pseudo_label, 
                        index=tpot_test.index.values,
                              columns=['Item_Outlet_Sales'])

In [45]:
pseudo_label_df.head()


Out[45]:
Item_Outlet_Sales
0 1670.786992
1 1374.116730
2 601.841547
3 2396.554574
4 5707.000035

In [49]:
# attach the pseudo labels to the unlabeled dataset (the test data in this case)
# note: pseudo_test is the same object as tpot_test (assignment does not copy),
# so tpot_test also gains the Item_Outlet_Sales column (seen a few cells below)
pseudo_test = tpot_test
pseudo_test['Item_Outlet_Sales'] = pd.Series(pseudo_label_df['Item_Outlet_Sales'], index=tpot_test.index)
pseudo_test.head()


Out[49]:
Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type Item_Outlet_Sales
0 20.750 0 0.007565 13 107.8622 9 18 1 0 1 1670.786992
1 8.300 1 0.038428 4 87.3198 2 10 1 1 1 1374.116730
2 14.600 0 0.099575 11 241.7538 0 19 1 2 0 601.841547
3 7.315 0 0.015388 13 155.0340 2 10 1 1 1 2396.554574
4 12.500 1 0.118599 4 234.2300 5 32 1 2 3 5707.000035

In [63]:
# append the pseudo-labeled dataset to the original labeled training data
tpot_train = train.drop('Item_Identifier', axis=1)
pseudo_train = pd.concat([tpot_train, pseudo_test])   # DataFrame.append is deprecated in newer pandas

In [64]:
pseudo_train.head()


Out[64]:
Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type Item_Outlet_Sales
0 9.30 0 0.016047 4 249.8092 9 18 1 0 1 3735.1380
1 5.92 1 0.019278 14 48.2692 3 8 1 2 2 443.4228
2 17.50 0 0.016760 10 141.6180 9 18 1 0 1 2097.2700
3 19.20 1 0.000000 6 182.0950 0 19 1 2 0 732.3800
4 8.93 0 0.000000 9 53.8614 1 30 0 2 1 994.7052

In [60]:
tpot_test.head()


Out[60]:
Item_Weight Item_Fat_Content Item_Visibility Item_Type Item_MRP Outlet_Identifier Outlet_Establishment_Year Outlet_Size Outlet_Location_Type Outlet_Type Item_Outlet_Sales
0 20.750 0 0.007565 13 107.8622 9 18 1 0 1 1670.786992
1 8.300 1 0.038428 4 87.3198 2 10 1 1 1 1374.116730
2 14.600 0 0.099575 11 241.7538 0 19 1 2 0 601.841547
3 7.315 0 0.015388 13 155.0340 2 10 1 1 1 2396.554574
4 12.500 1 0.118599 4 234.2300 5 32 1 2 3 5707.000035

In [65]:
pseudo_target = pseudo_train['Item_Outlet_Sales']
pseudo_train = pseudo_train.drop('Item_Outlet_Sales',axis=1)

X_train, X_test, y_train, y_test = train_test_split(pseudo_train, pseudo_target,
                                                    train_size=0.7, test_size=0.3)

tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)
tpot.fit(X_train, y_train)  # uses cross-validation and a genetic algorithm to find the best pipeline and hyperparameters
print(tpot.score(X_test, y_test))   # mean squared error (MSE)


Optimization Progress:  29%|██▉       | 87/300 [02:50<08:07,  2.29s/pipeline]
Generation 1 - Current best internal CV score: 702816.414279
Optimization Progress:  43%|████▎     | 130/300 [04:36<08:35,  3.03s/pipeline]
Generation 2 - Current best internal CV score: 700376.767819
Optimization Progress:  59%|█████▉    | 178/300 [06:26<05:00,  2.46s/pipeline]
Generation 3 - Current best internal CV score: 700376.767819
Optimization Progress:  75%|███████▌  | 225/300 [08:54<02:05,  1.68s/pipeline]
Generation 4 - Current best internal CV score: 699585.72371
                                                                              
Generation 5 - Current best internal CV score: 697140.36341

Best pipeline: GradientBoostingRegressor(input_matrix, GradientBoostingRegressor__alpha=0.85, GradientBoostingRegressor__learning_rate=DEFAULT, GradientBoostingRegressor__loss=lad, GradientBoostingRegressor__max_depth=5, GradientBoostingRegressor__max_features=0.6, GradientBoostingRegressor__min_samples_leaf=9, GradientBoostingRegressor__min_samples_split=16, GradientBoostingRegressor__n_estimators=DEFAULT, GradientBoostingRegressor__subsample=0.7)
674055.06549

In [ ]:
# Here we don't have ground truth for the test data,
# but the validation MSE is much lower than in the earlier run without pseudo labeling
# (see the caveat about this comparison in the next cell)
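
In [ ]:
# Caveat: the lower MSE above is not a like-for-like comparison with the first run,
# because the second validation split contains pseudo-labeled rows whose targets are
# themselves model predictions. A fairer check (a sketch only; the names X_fit, X_hold,
# base, aug and the random_state are illustrative, not from the original notebook) is to
# hold out a slice of the labeled data *before* pseudo labeling and score both models on it.

# reuse tpot_train, target and tpot_test from the cells above
feats = tpot_test.drop('Item_Outlet_Sales', axis=1, errors='ignore')  # pseudo column was added above

X_fit, X_hold, y_fit, y_hold = train_test_split(tpot_train, target,
                                                train_size=0.7, test_size=0.3,
                                                random_state=42)

# baseline: labeled data only
base = TPOTRegressor(generations=5, population_size=50, verbosity=2)
base.fit(X_fit, y_fit)
print(mean_squared_error(y_hold, base.predict(X_hold)))

# pseudo labeling: predict the unlabeled set, append it, and retrain
pl = base.predict(feats)
X_aug = pd.concat([X_fit, feats])
y_aug = pd.concat([y_fit, pd.Series(pl, index=feats.index)])

aug = TPOTRegressor(generations=5, population_size=50, verbosity=2)
aug.fit(X_aug, y_aug)
print(mean_squared_error(y_hold, aug.predict(X_hold)))  # scored on the same held-out labeled rows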