In [301]:
import numpy as np
import pandas as pd

In [302]:
# Print floats with three decimals and without scientific notation in array reprs.
np.set_printoptions(suppress=True, precision=3)

Importing the dataset


In [303]:
# Load the raw table; the relative path is resolved against the current working directory.
data_path = 'Data.csv'
dataset = pd.read_csv(data_path)

In [304]:
# Feature matrix: every column except the last one.
X = dataset.iloc[:, :-1].values
# Target vector: the last column, addressed relative to the end so it always
# stays in step with the feature slice above instead of relying on the table
# being exactly four columns wide.
y = dataset.iloc[:, -1].values

Taking care of missing data


In [305]:
from sklearn.preprocessing import Imputer

In [306]:
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X[:, 1:3])

In [307]:
# Overwrite columns 1 and 2 of X in place with their imputed values.
filled_columns = imputer.transform(X[:, 1:3])
X[:, 1:3] = filled_columns

In [308]:
# Display the feature matrix after the imputed values were written into columns 1 and 2.
X


Out[308]:
array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

Encoding the Independent Variable


In [309]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [310]:
X[:, 0] = LabelEncoder().fit_transform(X[:, 0])

In [311]:
X = OneHotEncoder(categorical_features=[0]).fit_transform(X).toarray()

In [312]:
# Display the feature matrix after one-hot encoding of the first column.
X


Out[312]:
array([[     1.   ,      0.   ,      0.   ,     44.   ,  72000.   ],
       [     0.   ,      0.   ,      1.   ,     27.   ,  48000.   ],
       [     0.   ,      1.   ,      0.   ,     30.   ,  54000.   ],
       [     0.   ,      0.   ,      1.   ,     38.   ,  61000.   ],
       [     0.   ,      1.   ,      0.   ,     40.   ,  63777.778],
       [     1.   ,      0.   ,      0.   ,     35.   ,  58000.   ],
       [     0.   ,      0.   ,      1.   ,     38.778,  52000.   ],
       [     1.   ,      0.   ,      0.   ,     48.   ,  79000.   ],
       [     0.   ,      1.   ,      0.   ,     50.   ,  83000.   ],
       [     1.   ,      0.   ,      0.   ,     37.   ,  67000.   ]])

Encoding the Dependent Variable


In [313]:
# Replace the class labels in y with integer codes.
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(y)

In [314]:
# Display the integer-encoded target vector.
y


Out[314]:
array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

Splitting the dataset into the Training set and Test set


In [315]:
from sklearn.model_selection import train_test_split

In [316]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [317]:
# Display the four arrays produced by the split as a single tuple.
X_train, X_test, y_train, y_test


Out[317]:
(array([[     0.   ,      1.   ,      0.   ,     40.   ,  63777.778],
        [     1.   ,      0.   ,      0.   ,     37.   ,  67000.   ],
        [     0.   ,      0.   ,      1.   ,     27.   ,  48000.   ],
        [     0.   ,      0.   ,      1.   ,     38.778,  52000.   ],
        [     1.   ,      0.   ,      0.   ,     48.   ,  79000.   ],
        [     0.   ,      0.   ,      1.   ,     38.   ,  61000.   ],
        [     1.   ,      0.   ,      0.   ,     44.   ,  72000.   ],
        [     1.   ,      0.   ,      0.   ,     35.   ,  58000.   ]]),
 array([[     0.,      1.,      0.,     30.,  54000.],
        [     0.,      1.,      0.,     50.,  83000.]]),
 array([1, 1, 1, 0, 1, 0, 0, 1]),
 array([0, 0]))

Feature scaling


In [318]:
from sklearn.preprocessing import StandardScaler

In [319]:
# Learn each column's mean and standard deviation from the training rows only,
# then apply that same scaling to both splits so the test rows do not
# influence the fitted statistics.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [320]:
# Display the standardized training and test feature matrices.
X_train, X_test


Out[320]:
(array([[-1.   ,  2.646, -0.775,  0.263,  0.124],
        [ 1.   , -0.378, -0.775, -0.254,  0.462],
        [-1.   , -0.378,  1.291, -1.975, -1.531],
        [-1.   , -0.378,  1.291,  0.053, -1.111],
        [ 1.   , -0.378, -0.775,  1.641,  1.72 ],
        [-1.   , -0.378,  1.291, -0.081, -0.168],
        [ 1.   , -0.378, -0.775,  0.952,  0.986],
        [ 1.   , -0.378, -0.775, -0.598, -0.482]]),
 array([[-1.   ,  2.646, -0.775, -1.459, -0.902],
        [-1.   ,  2.646, -0.775,  1.985,  2.14 ]]))