In [1]:
    
# Show the current working directory; the relative "data/..." paths used
# when loading the CSV files are resolved against it.
getwd()
    
    
In [2]:
    
library(data.table)
library(FSelector)
library(rworldmap)
library(rworldxtra)
    
    
In [21]:
    
# Load the two input tables from the local data directory.
# `properties` is read with data.table's fread(), so it is a data.table;
# `training.set` is read with base read.csv(), so it is a plain data.frame.
properties <- fread(file = "data/properties_2016.csv")
training.set <- read.csv(file = "data/train_2016_v2.csv")
    
    
    
In [24]:
    
# List the column names of `properties` (ls() returns them sorted).
ls(properties)
    
    
In [25]:
    
# List the column names of `training.set` (ls() returns them sorted).
ls(training.set)
    
    
In [4]:
    
# Divide both coordinate columns by one million.
# NOTE(review): presumably the raw file stores degrees multiplied by 1e6 -
# confirm against the data dictionary. Re-running this cell divides again.
properties$latitude <- properties$latitude / 1e6
properties$longitude <- properties$longitude / 1e6
    
In [5]:
    
# Draw the high-resolution world map zoomed to the bounding box of the
# parcel coordinates (missing coordinates are ignored when computing the
# limits), then overlay every parcel as a small red point.
newmap <- getMap(resolution = "high")
plot(
  newmap,
  xlim = range(properties$longitude, na.rm = TRUE),
  ylim = range(properties$latitude, na.rm = TRUE),
  asp = 1
)
points(
  x = properties$longitude,
  y = properties$latitude,
  col = "red",
  cex = 0.1
)
    
    
In [8]:
    
# Replace every missing value in both tables with 0 before merging.
training.set[is.na(training.set)] <- 0
properties[is.na(properties)] <- 0
# Treat the census tract/block identifier as a categorical value, not a number.
properties$censustractandblock <- factor(properties$censustractandblock)
# Left join on parcelid: every training row is kept and gains the property
# columns.
# NOTE(review): a training row with no match in `properties` would get NA in
# the property columns again, because the zero-fill above runs before the
# merge - confirm every training parcelid is present in `properties`.
training.set.merged <- merge(x = training.set, y = properties, by = "parcelid", all.x = TRUE)
    
In [9]:
    
# Score every other column of the merged training data by its gain ratio
# with respect to logerror, then show the per-attribute weights.
gain.ratio.feature.weights <- gain.ratio(
  formula = logerror ~ .,
  data = training.set.merged
)
print(gain.ratio.feature.weights)
    
    
In [10]:
    
# Keep the names of the 10 attributes with the highest gain-ratio weight.
gain.ratio.top.features <- cutoff.k(
  attrs = gain.ratio.feature.weights,
  k = 10
)
print(gain.ratio.top.features)
    
    
In [11]:
    
# Score every other column of the merged training data by its information
# gain with respect to logerror, then show the per-attribute weights.
information.gain.feature.weights <- information.gain(
  formula = logerror ~ .,
  data = training.set.merged
)
print(information.gain.feature.weights)
    
    
In [12]:
    
# Keep the names of the 10 attributes with the highest information-gain
# weight.
information.gain.top.features <- cutoff.k(
  attrs = information.gain.feature.weights,
  k = 10
)
print(information.gain.top.features)
    
    
In [13]:
    
# Break the transaction dates into calendar components. unclass() strips
# the POSIXlt class so the result is a plain list, one vector per component.
date.info <- unclass(as.POSIXlt(training.set.merged[["transactiondate"]]))
# List the names of the available components.
ls(date.info)
    
    
In [16]:
    
# Calendar components to copy into the training data as extra columns.
# In POSIXlt, `mon` is 0-based (0-11), `year` counts years since 1900 and
# `yday` is 0-based (0-365); `mday` is the 1-based day of the month.
date.feature.names <- c("mday", "mon", "year", "yday")
# Add (or overwrite) one column per selected component.
training.set.merged[date.feature.names] <- date.info[date.feature.names]
    
In [18]:
    
# Build the final modelling tables from the two feature rankings.

# Union of the two top-10 lists; a feature ranked by both measures appears
# once, in order of first occurrence.
selected.features <- unique(
  c(gain.ratio.top.features, information.gain.top.features)
)

# Training data: ranked features, the derived date components and the
# target column. Note that this overwrites the raw `training.set`.
training.set <- subset(
  training.set.merged,
  select = unique(c(selected.features, date.feature.names, "logerror"))
)

# Test data: the same ranked features, taken from the full properties table.
# The rankings were computed on the merged table, so they can contain
# columns that only exist on the training side (e.g. transactiondate);
# selecting those from `properties` would fail, so they are left out and
# reported instead.
test.features <- intersect(selected.features, names(properties))
if (length(test.features) < length(selected.features)) {
  warning(
    "Selected features not present in properties: ",
    paste(setdiff(selected.features, test.features), collapse = ", "),
    call. = FALSE
  )
}
test.set <- subset(properties, select = test.features)

# One row per parcel, keyed by parcelid.
results <- subset(properties, select = "parcelid")
    
In [ ]: