In [1]:
# Show the current working directory; the relative "data/..." paths used when
# loading the CSV files are resolved against it.
getwd()
In [2]:
library(data.table)
library(FSelector)
library(rworldmap)
library(rworldxtra)
In [21]:
# Load the raw inputs from the local data directory.
# `properties` is read with data.table's fread(), while the training labels
# are read with base read.csv(), so the two objects are of different classes
# (data.table vs. plain data.frame).
properties <- fread(input = "data/properties_2016.csv")
training.set <- read.csv(file = "data/train_2016_v2.csv")
In [24]:
# List the column names of `properties` (ls() returns them sorted).
ls(properties)
In [25]:
# List the column names of `training.set` (ls() returns them sorted).
ls(training.set)
In [4]:
# Rescale both coordinate columns by a factor of one million.
# NOTE(review): presumably the raw file stores degrees multiplied by 1e6 --
# confirm against the data dictionary.
properties$latitude <- properties$latitude / 1e6
properties$longitude <- properties$longitude / 1e6
In [5]:
# Draw a high-resolution world map cropped to the bounding box of the
# properties (missing coordinates are ignored when computing the limits),
# then overlay every property as a small red point.
newmap <- getMap(resolution = "high")
plot(
  newmap,
  xlim = range(properties$longitude, na.rm = TRUE),
  ylim = range(properties$latitude, na.rm = TRUE),
  asp = 1
)
points(
  x = properties$longitude,
  y = properties$latitude,
  col = "red",
  cex = 0.1
)
In [8]:
# Replace every missing value in both tables with 0 before merging.
training.set[is.na(training.set)] <- 0
properties[is.na(properties)] <- 0
# Treat the census tract/block identifier as a categorical variable. This runs
# after the NA fill above, so any filled-in zeros become a factor level.
properties$censustractandblock <- factor(properties$censustractandblock)
# Left join on parcelid: every training row is kept (all.x = TRUE) and gains
# the attribute columns of its matching property.
training.set.merged <- merge(x = training.set, y = properties, by = "parcelid", all.x = TRUE)
In [9]:
# Score every other column of the merged training data against `logerror`
# using FSelector's gain ratio, then display the per-attribute weights.
# NOTE(review): `.` expands to all remaining columns, which includes the
# merge key `parcelid` and `transactiondate` -- confirm that is intended.
gain.ratio.feature.weights <- gain.ratio(
  formula = logerror ~ .,
  data = training.set.merged
)
print(gain.ratio.feature.weights)
In [10]:
# Keep the names of the 10 attributes with the highest gain-ratio weight.
gain.ratio.top.features <- cutoff.k(
  attrs = gain.ratio.feature.weights,
  k = 10
)
print(gain.ratio.top.features)
In [11]:
# Score every other column of the merged training data against `logerror`
# using FSelector's information gain, then display the per-attribute weights.
# NOTE(review): `.` expands to all remaining columns, which includes the
# merge key `parcelid` and `transactiondate` -- confirm that is intended.
information.gain.feature.weights <- information.gain(
  formula = logerror ~ .,
  data = training.set.merged
)
print(information.gain.feature.weights)
In [12]:
# Keep the names of the 10 attributes with the highest information-gain weight.
information.gain.top.features <- cutoff.k(
  attrs = information.gain.feature.weights,
  k = 10
)
print(information.gain.top.features)
In [13]:
# Break each transaction date into its calendar components. unclass() strips
# the POSIXlt class so the result can be handled as a plain list of
# component vectors.
date.info <- unclass(
  as.POSIXlt(x = training.set.merged$transactiondate)
)
# Show the names of the components available in the decomposed dates.
ls(name = date.info)
In [16]:
# Date components to expose as model features. Following the POSIXlt
# convention: mday = day of month, mon = month counted from 0,
# year = years since 1900, yday = day of year counted from 0.
date.feature.names <- c("mday", "mon", "year", "yday")
# Copy the chosen components into the merged training data as new columns
# (or overwrite columns of the same name if they already exist).
training.set.merged[date.feature.names] <- date.info[date.feature.names]
In [18]:
# Assemble the modelling tables from the features ranked by both selectors.

# Union of the two top-10 lists. A feature ranked by both methods must be
# requested only once, otherwise the same column is selected twice.
selected.features <- unique(
  c(gain.ratio.top.features, information.gain.top.features)
)

# Training data: selected features, the derived date components and the
# target. unique() guards against a date component or the target also
# appearing in the ranked list.
training.columns <- unique(c(selected.features, date.feature.names, "logerror"))
training.set <- subset(training.set.merged, select = training.columns)

# Test data is taken from `properties`, which carries only property
# attributes. Features that exist solely in the merged training data (for
# example `transactiondate`, if it was ranked) cannot be selected there, so
# restrict the request to columns that are actually present instead of
# failing on an unknown column name.
test.columns <- intersect(selected.features, names(properties))
test.set <- subset(properties, select = test.columns)

# Skeleton for the output: one row per property, identified by parcelid.
results <- subset(properties, select = "parcelid")
In [ ]: