Setup


In [1]:
getwd()


'/var/www/kaggle/zillow-forecast'

In [2]:
library(data.table)
library(FSelector)
library(rworldmap)
library(rworldxtra)


Loading required package: sp
### Welcome to rworldmap ###
For a short introduction type : 	 vignette('rworldmap')

Load Data


In [21]:
properties <- fread("data/properties_2016.csv")
training.set <- read.csv("data/train_2016_v2.csv")


Warning message in fread("data/properties_2016.csv"):
“Bumped column 50 to type character on data row 10354, field contains 'true'. Coercing previously read values in this column from logical, integer or numeric back to character which may not be lossless; e.g., if '00' and '000' occurred before they will now be just '0', and there may be inconsistencies with treatment of ',,' and ',NA,' too (if they occurred in this column before the bump). If this matters please rerun and set 'colClasses' to 'character' for this column. Please note that column type detection uses a sample of 1,000 rows (100 rows at 10 points) so hopefully this message should be very rare. If reporting to datatable-help, please rerun and include the output from verbose=TRUE.”
Read 2985217 rows and 58 (of 58) columns from 0.604 GB file in 00:00:08

In [24]:
ls(properties)


  1. 'airconditioningtypeid'
  2. 'architecturalstyletypeid'
  3. 'assessmentyear'
  4. 'basementsqft'
  5. 'bathroomcnt'
  6. 'bedroomcnt'
  7. 'buildingclasstypeid'
  8. 'buildingqualitytypeid'
  9. 'calculatedbathnbr'
  10. 'calculatedfinishedsquarefeet'
  11. 'censustractandblock'
  12. 'decktypeid'
  13. 'finishedfloor1squarefeet'
  14. 'finishedsquarefeet12'
  15. 'finishedsquarefeet13'
  16. 'finishedsquarefeet15'
  17. 'finishedsquarefeet50'
  18. 'finishedsquarefeet6'
  19. 'fips'
  20. 'fireplacecnt'
  21. 'fireplaceflag'
  22. 'fullbathcnt'
  23. 'garagecarcnt'
  24. 'garagetotalsqft'
  25. 'hashottuborspa'
  26. 'heatingorsystemtypeid'
  27. 'landtaxvaluedollarcnt'
  28. 'latitude'
  29. 'longitude'
  30. 'lotsizesquarefeet'
  31. 'numberofstories'
  32. 'parcelid'
  33. 'poolcnt'
  34. 'poolsizesum'
  35. 'pooltypeid10'
  36. 'pooltypeid2'
  37. 'pooltypeid7'
  38. 'propertycountylandusecode'
  39. 'propertylandusetypeid'
  40. 'propertyzoningdesc'
  41. 'rawcensustractandblock'
  42. 'regionidcity'
  43. 'regionidcounty'
  44. 'regionidneighborhood'
  45. 'regionidzip'
  46. 'roomcnt'
  47. 'storytypeid'
  48. 'structuretaxvaluedollarcnt'
  49. 'taxamount'
  50. 'taxdelinquencyflag'
  51. 'taxdelinquencyyear'
  52. 'taxvaluedollarcnt'
  53. 'threequarterbathnbr'
  54. 'typeconstructiontypeid'
  55. 'unitcnt'
  56. 'yardbuildingsqft17'
  57. 'yardbuildingsqft26'
  58. 'yearbuilt'

In [25]:
ls(training.set)


  1. 'logerror'
  2. 'parcelid'
  3. 'transactiondate'

Map


In [4]:
properties$longitude <- properties$longitude/1000000
properties$latitude <- properties$latitude/1000000

In [5]:
newmap <- getMap(resolution = "high")
plot(newmap, 
     xlim = c(min(properties$longitude, na.rm = TRUE), max(properties$longitude, na.rm = TRUE)), 
     ylim = c(min(properties$latitude, na.rm = TRUE), max(properties$latitude, na.rm = TRUE)), 
     asp = 1)

points(properties$longitude, properties$latitude, col = "red", cex = .1)


Prepare Data


In [8]:
training.set[is.na(training.set)] <- 0
properties[is.na(properties)] <- 0
properties$censustractandblock <- factor(properties$censustractandblock)
training.set.merged <- merge(x = training.set, y = properties, by = "parcelid", all.x = TRUE)

Gain Ratio


In [9]:
gain.ratio.feature.weights <- gain.ratio(logerror~., training.set.merged)
print(gain.ratio.feature.weights)


                             attr_importance
parcelid                        0.0081679029
transactiondate                 0.0019954063
airconditioningtypeid           0.0017371142
architecturalstyletypeid        0.0000000000
basementsqft                    0.0000000000
bathroomcnt                     0.0077297568
bedroomcnt                      0.0036417195
buildingclasstypeid             0.0000000000
buildingqualitytypeid           0.0068060060
calculatedbathnbr               0.0077361662
decktypeid                      0.0000000000
finishedfloor1squarefeet        0.0027157643
calculatedfinishedsquarefeet    0.0054131631
finishedsquarefeet12            0.0084911087
finishedsquarefeet13            0.0000000000
finishedsquarefeet15            0.0323885343
finishedsquarefeet50            0.0028002437
finishedsquarefeet6             0.0192842403
fips                            0.0091938293
fireplacecnt                    0.0035854440
fullbathcnt                     0.0062057334
garagecarcnt                    0.0116310934
garagetotalsqft                 0.0091929425
hashottuborspa                  0.0008760003
heatingorsystemtypeid           0.0035769473
latitude                        0.0048137716
longitude                       0.0060711891
lotsizesquarefeet               0.0030989798
poolcnt                         0.0008856146
poolsizesum                     0.0000000000
pooltypeid10                    0.0000000000
pooltypeid2                     0.0000000000
pooltypeid7                     0.0007496693
propertycountylandusecode       0.0083054618
propertylandusetypeid           0.0105565574
propertyzoningdesc              0.0166353789
rawcensustractandblock          0.0078470012
regionidcity                    0.0061329748
regionidcounty                  0.0091938293
regionidneighborhood            0.0052727717
regionidzip                     0.0067718511
roomcnt                         0.0045078419
storytypeid                     0.0000000000
threequarterbathnbr             0.0093419541
typeconstructiontypeid          0.0000000000
unitcnt                         0.0124810266
yardbuildingsqft17              0.0000000000
yardbuildingsqft26              0.0000000000
yearbuilt                       0.0108121228
numberofstories                 0.0030407561
fireplaceflag                   0.0003896122
structuretaxvaluedollarcnt      0.0049383435
taxvaluedollarcnt               0.0057151234
assessmentyear                  0.0000000000
landtaxvaluedollarcnt           0.0049008548
taxamount                       0.0049083295
taxdelinquencyflag              0.0106706175
taxdelinquencyyear              0.0106706175
censustractandblock             0.0894876804

In [10]:
gain.ratio.top.features <- cutoff.k(gain.ratio.feature.weights, 10)
print(gain.ratio.top.features)


 [1] "censustractandblock"   "finishedsquarefeet15"  "finishedsquarefeet6"  
 [4] "propertyzoningdesc"    "unitcnt"               "garagecarcnt"         
 [7] "yearbuilt"             "taxdelinquencyflag"    "taxdelinquencyyear"   
[10] "propertylandusetypeid"

Information Gain


In [11]:
information.gain.feature.weights <- information.gain(logerror~., training.set.merged)
print(information.gain.feature.weights)


                             attr_importance
parcelid                        2.030146e-02
transactiondate                 1.084085e-02
airconditioningtypeid           1.232653e-03
architecturalstyletypeid        0.000000e+00
basementsqft                    0.000000e+00
bathroomcnt                     1.184954e-02
bedroomcnt                      4.977731e-03
buildingclasstypeid             0.000000e+00
buildingqualitytypeid           8.566429e-03
calculatedbathnbr               1.186269e-02
decktypeid                      0.000000e+00
finishedfloor1squarefeet        7.565559e-04
calculatedfinishedsquarefeet    7.663731e-03
finishedsquarefeet12            1.375808e-02
finishedsquarefeet13            0.000000e+00
finishedsquarefeet15            5.385760e-03
finishedsquarefeet50            7.809896e-04
finishedsquarefeet6             5.724792e-04
fips                            5.959091e-03
fireplacecnt                    1.215332e-03
fullbathcnt                     8.052671e-03
garagecarcnt                    7.937533e-03
garagetotalsqft                 5.243388e-03
hashottuborspa                  1.062289e-04
heatingorsystemtypeid           3.909818e-03
latitude                        1.062611e-02
longitude                       1.160072e-02
lotsizesquarefeet               4.133842e-03
poolcnt                         4.410612e-04
poolsizesum                     0.000000e+00
pooltypeid10                    0.000000e+00
pooltypeid2                     0.000000e+00
pooltypeid7                     3.589623e-04
propertycountylandusecode       1.844089e-02
propertylandusetypeid           8.311071e-03
propertyzoningdesc              6.954989e-02
rawcensustractandblock          2.032236e-02
regionidcity                    1.351144e-02
regionidcounty                  5.959091e-03
regionidneighborhood            3.992435e-03
regionidzip                     1.916001e-02
roomcnt                         2.527056e-03
storytypeid                     0.000000e+00
threequarterbathnbr             3.662984e-03
typeconstructiontypeid          0.000000e+00
unitcnt                         1.006963e-02
yardbuildingsqft17              0.000000e+00
yardbuildingsqft26              0.000000e+00
yearbuilt                       1.604566e-02
numberofstories                 2.102300e-03
fireplaceflag                   6.713239e-06
structuretaxvaluedollarcnt      8.261606e-03
taxvaluedollarcnt               1.015723e-02
assessmentyear                  0.000000e+00
landtaxvaluedollarcnt           7.082297e-03
taxamount                       8.746810e-03
taxdelinquencyflag              1.035770e-03
taxdelinquencyyear              1.035770e-03
censustractandblock             9.191350e-01

In [12]:
information.gain.top.features <- cutoff.k(information.gain.feature.weights, 10)
print(information.gain.top.features)


 [1] "censustractandblock"       "propertyzoningdesc"       
 [3] "rawcensustractandblock"    "parcelid"                 
 [5] "regionidzip"               "propertycountylandusecode"
 [7] "yearbuilt"                 "finishedsquarefeet12"     
 [9] "regionidcity"              "calculatedbathnbr"        

Time features


In [13]:
date.info <- unclass(as.POSIXlt(training.set.merged$transactiondate))
ls(date.info)


  1. 'gmtoff'
  2. 'hour'
  3. 'isdst'
  4. 'mday'
  5. 'min'
  6. 'mon'
  7. 'sec'
  8. 'wday'
  9. 'yday'
  10. 'year'
  11. 'zone'

In [16]:
date.feature.names <- c("mday", "mon", "year", "yday")
training.set.merged[date.feature.names] <- date.info[date.feature.names]

All Features


In [18]:
training.set <- subset(training.set.merged, select = unique(c(gain.ratio.top.features, information.gain.top.features, date.feature.names, "logerror")))
test.set <- subset(properties, select = c(gain.ratio.top.features, information.gain.top.features))
results <- subset(properties, select= "parcelid")

In [ ]: