In [4]:
# Import the libraries
In [1]:
import numpy as np
In [2]:
import matplotlib.pyplot as plt
In [3]:
import pandas as pd
In [5]:
# import the dataset
In [6]:
# Load the raw CSV into a DataFrame; the path is relative to the
# notebook's working directory.
dataset = pd.read_csv('datasets/Data.csv')
In [7]:
# Overview of the dataset: display the first five rows.
dataset.head()
Out[7]:
So there are 3 independent variables and 1 dependent variable.
In [9]:
# Now create the matrix of independent variables (features).
X = dataset.iloc[:, :-1].values
# The first ':' left of ',' means we take all the rows;
# ':-1' means we take all the columns except the last one.
# .values returns the underlying NumPy array rather than a DataFrame.
In [10]:
# Inspect the feature matrix (missing values are handled below).
X
Out[10]:
In [11]:
# Dependent variable: the last column (Purchased). Using index -1 instead
# of a hard-coded 3 mirrors the ':-1' slice used for X above and keeps
# this line correct even if feature columns are added before the target.
Y = dataset.iloc[:, -1].values
In [12]:
# Inspect the target vector.
Y
Out[12]:
In [13]:
# very simple missing data handling methode is mean of columns
In [14]:
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; its
# replacement is SimpleImputer (sklearn.impute). SimpleImputer always
# works column-wise, so the old `axis=0` argument is gone, and missing
# values are specified as np.nan rather than the string 'NaN'.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
In [15]:
imputer.fit(X[:, 1:3])
# We fit only columns 1 and 2 (Age, Salary) rather than the whole
# dataset, because those are the only columns with missing values.
# With zero-based indexing the 2nd and 3rd columns are indices 1 and 2,
# hence the slice 1:3 (the stop index is exclusive).
Out[15]:
In [16]:
# NOTE(review): redundant — the imputer was already fitted in the cell
# above, and .fit() returns the estimator itself, so this reassignment
# is a no-op. It is harmless but could be deleted.
imputer = imputer.fit(X[:, 1:3])
In [19]:
X[:, 1:3] = imputer.transform(X[:, 1:3])
# Replace the missing entries with the per-column means learned by fit().
In [20]:
# Now check X: the NaN entries should be replaced by the column means.
X
Out[20]:
As seen earlier Country and Purchased are categorical variables
Encode the categorical variables into numbers
In [23]:
from sklearn.preprocessing import LabelEncoder
# Encode the Country column (column 0) from strings to integer labels.
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
In [24]:
# Inspect X: Country is now represented by integer labels.
X
Out[24]:
But there is problem here, the mathmatical model may see 2>1 instead of taking them as labels. So we are going to use one hot encoder.
In [25]:
# OneHotEncoder's `categorical_features` argument was removed in
# scikit-learn 0.22; ColumnTransformer is the supported way to one-hot
# encode a subset of columns. Here only column 0 (Country) is encoded;
# `remainder="passthrough"` keeps the other columns unchanged, and the
# encoded dummy columns come first, as with the old API.
# sparse_threshold=0 forces a dense ndarray, matching the original
# `.toarray()` call.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

onehotencoder = ColumnTransformer(
    transformers=[("country", OneHotEncoder(), [0])],
    remainder="passthrough",
    sparse_threshold=0,
)
X = onehotencoder.fit_transform(X)
In [28]:
# Inspect X after one-hot encoding.
X
Out[28]:
Now we will encode the dependent variable. We dont need to hot encode this
In [29]:
# Encode the dependent variable (Purchased) as integers. Being the
# target, it does not need one-hot encoding.
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)
In [30]:
# Inspect the encoded target.
Y
Out[30]:
In [32]:
# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# 80/20 train/test split. Fixing random_state makes the split
# reproducible — use random_state=0 to get exactly the same result.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)
In [33]:
# Inspect the four arrays produced by the split.
X_train
Out[33]:
In [34]:
X_test
Out[34]:
In [35]:
Y_train
Out[35]:
In [36]:
Y_test
Out[36]:
Columns Age and Salary are not in same scale which may create problems in ML models. Types- Standardisation and Normalisation
In [38]:
from sklearn.preprocessing import StandardScaler
In [39]:
# Standardise features to zero mean and unit variance.
sc_X = StandardScaler()
# fit_transform on the training set learns the scaling parameters and
# applies them; the test set is only transform()ed with those same
# parameters, so no information from the test set leaks into the scaling.
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
# NOTE(review): this also scales the one-hot dummy columns; whether to
# scale dummies is a judgment call — confirm this is intended.
In [40]:
# Inspect the scaled feature matrices.
X_train
Out[40]:
In [41]:
X_test
Out[41]:
For dependent variable Y no need to apply feature scaling, as this is binary classification. For multiple classification feature scaling required.
In [ ]: