P1_Data_Preprocessing


In [4]:
# Import the libraries

In [1]:
import numpy as np

In [2]:
import matplotlib.pyplot as plt


/opt/conda/lib/python3.5/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

In [3]:
import pandas as pd

In [5]:
# Import the dataset

In [6]:
dataset = pd.read_csv('datasets/Data.csv')

In [7]:
# Overview of the dataset
dataset.head()


Out[7]:
   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes

So there are 3 independent variables (Country, Age, Salary) and 1 dependent variable (Purchased).
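
A quick check of the DataFrame's dimensions confirms this:

In [ ]:
dataset.shape
# -> (10, 4): 10 observations, 4 columns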


In [9]:
# Now create matrices of the independent and dependent variables.
X = dataset.iloc[:, :-1].values
# The ':' left of the comma selects all rows
# ':-1' selects all columns except the last

In [10]:
X


Out[10]:
array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [11]:
Y = dataset.iloc[:, 3].values

In [12]:
Y


Out[12]:
array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'], dtype=object)

Handle Missing Data
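
Before imputing, it helps to confirm which columns actually contain missing values:

In [ ]:
dataset.isnull().sum()
# Age and Salary each have one missing value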


In [13]:
# A very simple way to handle missing data is to replace it with the mean of the column

In [14]:
from sklearn.preprocessing import Imputer
# From preprocessing library import imputer class
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
# axis=0 imputes along columns (the mean of each column); axis=1 would use row means

In [15]:
imputer.fit(X[:, 1:3])
# We do not impute the whole dataset, since missing values
# occur only in the 2nd and 3rd columns (Age and Salary).
# With zero-based indexing, those are columns 1 and 2.


Out[15]:
Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)

In [16]:
imputer = imputer.fit(X[:, 1:3])
# (This refit is redundant; the imputer was already fitted in the previous cell.)

In [19]:
X[:, 1:3] = imputer.transform(X[:, 1:3])
# Now replace the missing data with the mean

In [20]:
# Now check X
X


Out[20]:
array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)
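
Note: the Imputer class was deprecated in scikit-learn 0.20 and later removed. A minimal equivalent sketch with the newer SimpleImputer:

In [ ]:
# Equivalent imputation in scikit-learn >= 0.20
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])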

Handle Categorical Data

As seen earlier, Country and Purchased are categorical variables.

Encode the categorical variables into numbers


In [23]:
from sklearn.preprocessing import LabelEncoder
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])

In [24]:
X


Out[24]:
array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)
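
As the output shows, LabelEncoder assigns integer codes alphabetically (France = 0, Germany = 1, Spain = 2). The mapping can be inspected via the encoder's classes:

In [ ]:
labelencoder_X.classes_
# -> array(['France', 'Germany', 'Spain'], dtype=object)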

But there is a problem here: a mathematical model may interpret 2 > 1 as an ordering instead of treating the values as plain labels. So we are going to use a one-hot encoder.


In [25]:
from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder(categorical_features=[0])
# categorical_features specifies which columns hold categories;
# here the first column, i.e. column 0

In [26]:
# Now our encoder object is ready; let's transform X

In [27]:
X = onehotencoder.fit_transform(X).toarray()

In [28]:
X


Out[28]:
array([[  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.40000000e+01,   7.20000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          2.70000000e+01,   4.80000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          3.00000000e+01,   5.40000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          3.80000000e+01,   6.10000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          4.00000000e+01,   6.37777778e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          3.50000000e+01,   5.80000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          3.87777778e+01,   5.20000000e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.80000000e+01,   7.90000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          5.00000000e+01,   8.30000000e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          3.70000000e+01,   6.70000000e+04]])
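
Note: the categorical_features argument was deprecated in scikit-learn 0.20 and later removed. In newer versions, column selection is done with ColumnTransformer; a minimal sketch:

In [ ]:
# Equivalent one-hot encoding in scikit-learn >= 0.20
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer([('country', OneHotEncoder(), [0])],
                       remainder='passthrough')  # keep Age and Salary as-is
X = np.array(ct.fit_transform(X))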

Now we will encode the dependent variable. We don't need to one-hot encode it, since there are only two labels.


In [29]:
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)

In [30]:
Y


Out[30]:
array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])
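
Here 'No' maps to 0 and 'Yes' to 1, again in alphabetical order:

In [ ]:
labelencoder_Y.classes_
# -> array(['No', 'Yes'], dtype=object)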

Split the dataset into train and test


In [32]:
from sklearn.cross_validation import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)
# Use random_state=0 if you want exactly the same result as mine
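
Note: the cross_validation module was removed in scikit-learn 0.20; in newer versions the same function is imported from model_selection:

In [ ]:
from sklearn.model_selection import train_test_split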

In [33]:
X_train


Out[33]:
array([[  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          4.00000000e+01,   6.37777778e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          3.70000000e+01,   6.70000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          2.70000000e+01,   4.80000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          3.87777778e+01,   5.20000000e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.80000000e+01,   7.90000000e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          3.80000000e+01,   6.10000000e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.40000000e+01,   7.20000000e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          3.50000000e+01,   5.80000000e+04]])

In [34]:
X_test


Out[34]:
array([[  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          3.00000000e+01,   5.40000000e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          5.00000000e+01,   8.30000000e+04]])

In [35]:
Y_train


Out[35]:
array([1, 1, 1, 0, 1, 0, 0, 1])

In [36]:
Y_test


Out[36]:
array([0, 0])

Feature Scaling

The Age and Salary columns are not on the same scale, which can cause problems for many ML models. Two common techniques are standardisation and normalisation.
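
Standardisation rescales each feature to zero mean and unit variance, x' = (x - mean) / std, while normalisation squashes values into [0, 1], x' = (x - min) / (max - min). We use standardisation below; a sketch of the normalisation alternative with MinMaxScaler (the _norm names are just for illustration):

In [ ]:
# Normalisation alternative: x' = (x - min) / (max - min)
from sklearn.preprocessing import MinMaxScaler
mm_X = MinMaxScaler()
X_train_norm = mm_X.fit_transform(X_train)  # fit on the training set only
X_test_norm = mm_X.transform(X_test)        # reuse the training min/max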


In [38]:
from sklearn.preprocessing import StandardScaler

In [39]:
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
# Fit and transform the training set; the test set is
# only transformed, using the parameters learned from training
X_test = sc_X.transform(X_test)

In [40]:
X_train


Out[40]:
array([[-1.        ,  2.64575131, -0.77459667,  0.26306757,  0.12381479],
       [ 1.        , -0.37796447, -0.77459667, -0.25350148,  0.46175632],
       [-1.        , -0.37796447,  1.29099445, -1.97539832, -1.53093341],
       [-1.        , -0.37796447,  1.29099445,  0.05261351, -1.11141978],
       [ 1.        , -0.37796447, -0.77459667,  1.64058505,  1.7202972 ],
       [-1.        , -0.37796447,  1.29099445, -0.0813118 , -0.16751412],
       [ 1.        , -0.37796447, -0.77459667,  0.95182631,  0.98614835],
       [ 1.        , -0.37796447, -0.77459667, -0.59788085, -0.48214934]])

In [41]:
X_test


Out[41]:
array([[-1.        ,  2.64575131, -0.77459667, -1.45882927, -0.90166297],
       [-1.        ,  2.64575131, -0.77459667,  1.98496442,  2.13981082]])

There is no need to apply feature scaling to the dependent variable Y here, since this is a classification problem with categorical labels. For regression, where the target takes a wide range of values, scaling Y can also be necessary.
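
For such a regression case, scaling the target would look like this sketch, with a hypothetical continuous target y_reg:

In [ ]:
# Hypothetical continuous target (not part of this dataset)
y_reg = np.array([72000.0, 48000.0, 54000.0])
sc_Y = StandardScaler()
# StandardScaler expects a 2-D array, hence the reshape
y_reg_scaled = sc_Y.fit_transform(y_reg.reshape(-1, 1))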

