In [4]:
# Attach the h2o package and initialise the H2O session
library(h2o)
h2o.init()

# Upload the wholesale customers CSV into an H2O frame
data <- h2o.uploadFile(path = "../data/wholesale_customers_data.csv")
In [11]:
# Show the first rows of the uploaded frame
h2o.head(x = data)
In [12]:
# Per-column description of the frame as uploaded (before any type changes)
h2o.describe(frame = data)
In [7]:
### Convert Numeric to Categorical ###
# Positions of the columns that should be treated as factors
to_factors <- c(1, 2)
for (i in to_factors) {
  data[, i] <- h2o.asfactor(data[, i])
}
In [8]:
# Describe the frame again, now that columns 1 and 2 have been converted
h2o.describe(frame = data)
In [13]:
# Summary of every column in the frame
h2o.summary(object = data)
In [15]:
# Compact structure display of the frame
h2o.str(object = data)
In [16]:
# Row count for each level of Channel
h2o.group_by(data = data, by = "Channel", nrow("Channel"))
In [17]:
# Histogram of the Fresh column
h2o.hist(x = data[, "Fresh"])
# h2o.hist(data[, 14])
In [18]:
# Name of the response column used for modelling
target <- "Channel"
In [19]:
# Echo the chosen response column
print(x = target)
In [20]:
# Predictor columns are taken by position from the frame's column names.
# NOTE(review): the range 2:7 is hard-coded; if the frame has more than seven
# columns the trailing ones are left out of the model - confirm this is intended.
a <- colnames(data)
features <- a[seq(2, 7)]
print(features)
In [21]:
# Split the frame into a training set (about 75% of rows) and a test set.
# The seed is passed to h2o.splitFrame() itself: the row assignment is drawn
# on the H2O cluster, so R's set.seed() alone does not make the split
# repeatable between runs.
set.seed(102) # kept so any R-side randomness in the session stays seeded
data.split <- h2o.splitFrame(data = data, ratios = 0.75, seed = 102)
# Training set: first frame returned by the split
data.train <- data.split[[1]]
# Testing set: second frame returned by the split
data.test <- data.split[[2]]
In [22]:
# Number of rows that landed in the training frame
nrow(x = data.train)
In [23]:
# Fit a binomial GLM predicting `target` from `features` on the training frame
glm_model1 <- h2o.glm(
  x = features,
  y = target,
  training_frame = data.train,
  model_id = "glm_model1",
  family = "binomial"
)
In [24]:
# Print the summary of the fitted GLM
print(x = summary(object = glm_model1))
In [25]:
# Score the model on the held-out test frame
perf_obj <- h2o.performance(model = glm_model1, newdata = data.test)
In [26]:
# Print the test-set performance metrics
print(x = perf_obj)
In [32]:
# Report accuracy at the threshold that maximises accuracy on perf_obj.
# The threshold is looked up at run time rather than hard-coded: the previous
# literal (0.946986888684036) was presumably copied from one run's output and
# would not match the thresholds computed for a different split or model fit.
best_threshold <- h2o.find_threshold_by_max_metric(perf_obj, "accuracy")
h2o.accuracy(perf_obj, thresholds = best_threshold)
In [30]:
# Predict on the test frame with the fitted GLM, then display the result
pred_channel <- h2o.predict(object = glm_model1, newdata = data.test)
pred_channel
In [ ]: