Data preprocessing

Data preprocessing using R.


In [1]:
# Importing the dataset
dataset=read.csv('Data.csv')
dataset


CountryAgeSalaryPurchased
France 44 72000 No
Spain 27 48000 Yes
Germany30 54000 No
Spain 38 61000 No
Germany40 NA Yes
France 35 58000 Yes
Spain NA 52000 No
France 48 79000 Yes
Germany50 83000 No
France 37 67000 Yes

In [2]:
# Taking care of missing values
dataset$Age=ifelse(is.na(dataset$Age),
                  ave(dataset$Age,FUN=function(x) mean(x,na.rm=TRUE)),
                      dataset$Age)
dataset$Salary= ifelse(is.na(dataset$Salary),
                  ave(dataset$Salary,FUN=function(x) mean(x,na.rm=TRUE)),
                      dataset$Salary)
dataset


CountryAgeSalaryPurchased
France 44.0000072000.00No
Spain 27.0000048000.00Yes
Germany 30.0000054000.00No
Spain 38.0000061000.00No
Germany 40.0000063777.78Yes
France 35.0000058000.00Yes
Spain 38.7777852000.00No
France 48.0000079000.00Yes
Germany 50.0000083000.00No
France 37.0000067000.00Yes

In [3]:
# Encoding Categorical variables
dataset$Country=factor(dataset$Country,
                      levels=c('France','Spain','Germany'),
                      labels=c(1,2,3))
dataset


CountryAgeSalaryPurchased
1 44.0000072000.00No
2 27.0000048000.00Yes
3 30.0000054000.00No
2 38.0000061000.00No
3 40.0000063777.78Yes
1 35.0000058000.00Yes
2 38.7777852000.00No
1 48.0000079000.00Yes
3 50.0000083000.00No
1 37.0000067000.00Yes

In [5]:
dataset$Purchased=factor(dataset$Purchased,levels=c('No','Yes'),labels=c(0,1))
dataset


CountryAgeSalaryPurchased
1 44.0000072000.000
2 27.0000048000.001
3 30.0000054000.000
2 38.0000061000.000
3 40.0000063777.781
1 35.0000058000.001
2 38.7777852000.000
1 48.0000079000.001
3 50.0000083000.000
1 37.0000067000.001

In [12]:
# Splitting train set and test set
library(caTools)
set.seed(2001)
split=sample.split(dataset$Purchased,SplitRatio=0.8)
#help(sample.split)
split
training_set=subset(dataset,split==TRUE)
test_set=subset(dataset,split==FALSE)


  1. FALSE
  2. TRUE
  3. TRUE
  4. TRUE
  5. TRUE
  6. TRUE
  7. TRUE
  8. FALSE
  9. TRUE
  10. TRUE
CountryAgeSalaryPurchased
11 44 720000
81 48 790001

In [15]:
# Feature scaling (optional)
## Note : A factor is not a numeric variable
training_set[,2:3]=scale(training_set[,2:3])
test_set[,2:3]=scale(test_set[,2:3])
test_set


CountryAgeSalaryPurchased
11 -0.7071068-0.70710680
81 0.7071068 0.70710681

In [ ]: