In [ ]:
# Pseudo labeling is a semi-supervised learning method:
# 1. train a model on the labeled data first
# 2. use the trained model to predict the unlabeled data, generating pseudo labels
# 3. combine both datasets as training data and train/predict again
## in theory, adding more data with pseudo labels can increase accuracy
# Data can be downloaded here (requires registration/sign-in):
## https://datahack.analyticsvidhya.com/contest/practice-problem-big-mart-sales-iii/
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tpot import TPOTRegressor
%matplotlib inline
In [2]:
# Big Mart sales data (download link in the header cell);
# the CSVs are expected next to this notebook.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
In [3]:
train.head()
Out[3]:
In [4]:
pd.isnull(train).sum()
Out[4]:
In [5]:
pd.isnull(test).sum()
Out[5]:
In [6]:
# Impute missing Item_Weight with the median weight.
# Use the TRAIN median for both sets — consistent with the Outlet_Size
# imputation below (which reuses train's mode for test) and avoids
# computing imputation statistics from the test set.
# Series.median() skips NaN by default, so np.nanmedian is unnecessary.
item_weight_median = train.Item_Weight.median()
train.Item_Weight = train.Item_Weight.fillna(item_weight_median)
test.Item_Weight = test.Item_Weight.fillna(item_weight_median)
In [7]:
# Inspect Outlet_Size levels (train has NaN before imputation).
# print() function syntax works on both Python 2 and 3 and matches the
# print(...) calls used later in this notebook.
print(train.Outlet_Size.unique())
print(test.Outlet_Size.unique())
In [8]:
# Impute missing Outlet_Size with the most frequent category from the
# TRAINING data (train's mode is deliberately reused for the test set).
outlet_size_mode = train.Outlet_Size.mode().iloc[0]
train.Outlet_Size = train.Outlet_Size.fillna(outlet_size_mode)
test.Outlet_Size = test.Outlet_Size.fillna(outlet_size_mode)
In [9]:
train.dtypes
Out[9]:
In [10]:
# Inspect the levels of each categorical column in both sets, checking
# that train and test share the same categories.
# (Loop + print() replaces twelve copy-pasted Python-2 print statements;
# output order is unchanged: train then test, column by column.)
categorical_cols = ['Item_Fat_Content', 'Item_Type', 'Outlet_Identifier',
                    'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
for col in categorical_cols:
    print(train[col].unique())
    print(test[col].unique())
In [11]:
# Normalize the inconsistent Item_Fat_Content spellings down to the two
# real categories ('Low Fat' and 'Regular') in a single dict-based pass
# instead of two separate replace() calls per frame.
fat_content_map = {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
train.Item_Fat_Content = train.Item_Fat_Content.replace(fat_content_map)
test.Item_Fat_Content = test.Item_Fat_Content.replace(fat_content_map)
In [12]:
# Confirm the fat-content categories are now consistent across both sets.
print(train.Item_Fat_Content.unique())
print(test.Item_Fat_Content.unique())
In [13]:
# Range of establishment years (used to derive outlet age next).
print(train.Outlet_Establishment_Year.max())
print(train.Outlet_Establishment_Year.min())
In [14]:
# Convert establishment year into outlet age in years.
# 2017 is presumably the year this analysis was done — TODO confirm.
train.Outlet_Establishment_Year = 2017 - train.Outlet_Establishment_Year
test.Outlet_Establishment_Year = 2017 - test.Outlet_Establishment_Year
In [15]:
train.shape
Out[15]:
In [16]:
# label encoding, do this by combining train and test together
# Label-encode categoricals consistently by encoding train and test together.
test['Item_Outlet_Sales'] = 0  # placeholder target so the columns line up
n_train = train.shape[0]
combi = pd.concat([train, test])  # DataFrame.append is deprecated (removed in pandas 2.0)
number = LabelEncoder()
for col in combi.columns:
    if combi[col].dtype == 'object':
        combi[col] = number.fit_transform(combi[col].astype('str'))
        combi[col] = combi[col].astype('object')
# Split back into the original train/test rows by position.
train = combi.iloc[:n_train]
test = combi.iloc[n_train:]
In [17]:
test.head()
Out[17]:
In [18]:
# Remove the placeholder target from test, then build the modelling
# matrices. Item_Identifier is an ID with many levels, so it is excluded
# from the features on both sides.
test = test.drop(columns='Item_Outlet_Sales')
tpot_train = train.drop(columns='Item_Identifier')
tpot_test = test.drop(columns='Item_Identifier')
# pop() extracts the target Series and drops the column in one step.
target = tpot_train.pop('Item_Outlet_Sales')
In [20]:
# build the model with tpot
# finally building model using tpot library
# Build the baseline model with TPOT on the labeled data only.
# random_state pins both the split and TPOT's genetic search so the
# before/after pseudo-labeling comparison is reproducible.
X_train, X_test, y_train, y_test = train_test_split(tpot_train, target,
                                                    train_size=0.7, test_size=0.3,
                                                    random_state=42)
tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2,
                     random_state=42)
tpot.fit(X_train, y_train)  # use cross validation and genetic alg to find the best model with optimized params
print(tpot.score(X_test, y_test))  # mean squared error (MSE)
In [39]:
# Peek at the features about to be pseudo-labeled. Note only the last
# expression (the index array) is displayed; head() output is discarded.
tpot_test.head()
tpot_test.index.values
Out[39]:
In [34]:
# Predict targets for the unlabeled test rows — these become the pseudo labels.
pseudo_label = tpot.predict(tpot_test)
pseudo_label
Out[34]:
In [44]:
# Wrap the pseudo-label predictions in a DataFrame aligned to the test rows.
pseudo_label_df = pd.DataFrame(
    {'Item_Outlet_Sales': pseudo_label},
    index=tpot_test.index.values,
)
In [45]:
pseudo_label_df.head()
Out[45]:
In [49]:
# add pseudo label in this new dataset(test data in this case)
pseudo_test = tpot_test
pseudo_test['Item_Outlet_Sales'] = pd.Series(pseudo_label_df['Item_Outlet_Sales'], index=tpot_test.index)
pseudo_test.head()
Out[49]:
In [63]:
# append new dataset with pseudo label
tpot_train = train.drop('Item_Identifier',axis=1)
pseudo_train = tpot_train.append(pseudo_test)
In [64]:
pseudo_train.head()
Out[64]:
In [60]:
tpot_test.head()
Out[60]:
In [65]:
# Retrain on the labeled + pseudo-labeled data.
pseudo_target = pseudo_train['Item_Outlet_Sales']
pseudo_train = pseudo_train.drop('Item_Outlet_Sales', axis=1)
# Same random_state as the baseline run so the two scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(pseudo_train, pseudo_target,
                                                    train_size=0.7, test_size=0.3,
                                                    random_state=42)
tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2,
                     random_state=42)
tpot.fit(X_train, y_train)  # use cross validation and genetic alg to find the best model with optimized params
print(tpot.score(X_test, y_test))  # mean squared error (MSE)
# NOTE(review): the validation split now contains pseudo-labeled rows whose
# targets came from the first model, so this score is optimistically biased
# and not directly comparable to the baseline score above.
In [ ]:
# Here we don't have ground truth for the test data.
# The validation MSE is much lower than in the run above without pseudo labeling —
# but note the validation split now contains pseudo-labeled rows, so the comparison is optimistic.