In [1]:
library(caret)


Loading required package: lattice
Loading required package: ggplot2

In [2]:
library(data.table)

In [31]:
# Directory that holds the feature files.
path <- '/home/zongyi/bimbo_data/'

# Load only the columns the later cells actually use. The chi-squared test
# reads `train$lag1` and `train$lag_sum`, so `lag1` must be selected here;
# selecting `prior_sum` instead would leave `train$lag1` as NULL.
# NOTE(review): the table is named `train` but is read from `test_fs.csv` --
# confirm this is the intended file.
train <- fread(
  paste0(path, 'test_fs.csv'),
  select = c('lag1', 'lag_sum')
)


Read 6999251 rows and 2 (of 30) columns from 0.971 GB file in 00:00:08

In [24]:
# Replace every missing value in `train` with 0.
# NOTE(review): presumably done so the correlation computed later is not NA;
# confirm that 0 is a sensible fill value for these features.
train[is.na(train)] <- 0

In [26]:
# Keep at most the first 1000 rows. `head()` is used instead of
# `train[1:1000]` because indexing a data.table past its last row pads the
# result with all-NA rows when the table has fewer than 1000 rows.
train <- head(train, 1000L)

In [37]:
# Pearson chi-squared test between the `lag_sum` and `lag1` columns of `train`;
# the fitted test object is kept in `c2` and printed.
# NOTE(review): the recorded run warned that the chi-squared approximation may
# be incorrect. chisq.test() cross-tabulates its two vector arguments, so
# confirm that a test meant for categorical data is appropriate here.
c2 <- chisq.test(train$lag_sum, train$lag1)
print(c2)


Warning message:
In chisq.test(train$lag_sum, train$lag1): Chi-squared approximation may be incorrect
	Pearson's Chi-squared test

data:  train$lag_sum and train$lag1
X-squared = 8679100000, df = 158040000, p-value < 2.2e-16


In [ ]:


In [32]:
# Pairwise Pearson correlation matrix across all columns of `train`,
# stored in `fcor` and displayed.
fcor <- cor(as.matrix(train), method = "pearson")
fcor


             lag1   lag_sum
lag1    1.0000000 0.5750333
lag_sum 0.5750333 1.0000000

In [24]:
# Total absolute correlation over the distinct column pairs: take the entries
# strictly above the diagonal of `fcor` so each pair is counted once.
pair_cors <- fcor[upper.tri(fcor, diag = FALSE)]
sum(abs(pair_cors))


1

In [29]:
# Count the distinct column pairs whose absolute correlation exceeds 0.995.
# Only entries strictly above the diagonal of `fcor` are considered.
pair_cors <- fcor[upper.tri(fcor, diag = FALSE)]
highCorr <- sum(abs(pair_cors) > 0.995)
highCorr


1

In [33]:
# Five-number summary (plus mean) of the correlations strictly above the
# diagonal of `fcor`, i.e. one value per distinct column pair.
pair_cors <- fcor[upper.tri(fcor, diag = FALSE)]
summary(pair_cors)


   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.575   0.575   0.575   0.575   0.575   0.575 

In [ ]: