Library Imports

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.1.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.1.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine

Import CSV

# Load the feature-engineered resale data and split it 80/20 into train/test.
# NOTE: this was written before feature selection, so the column choices
# below may need to change a bit once the final feature set is decided.
resale <- read.csv("data/feature_engineering/resale_feature_engineering.csv")
total_rows <- nrow(resale)

# Seed the RNG *before* sampling so the train/test split (and every metric
# derived from it) is reproducible from run to run.
set.seed(1)
# floor() makes explicit the truncation sample() applies to a fractional size.
train_indices <- sample(total_rows, floor(0.8 * total_rows))

resale_train <- resale[train_indices, ]
resale_test <- resale[-train_indices, ]

dim(resale_train)
## [1] 3528   29
dim(resale_test)
## [1] 882  29
# Subset to the first 10 columns: the response (resale_price) plus nine
# predictors, used for the baseline model without engineered features.
# Selection is positional, so it depends on the CSV column order.
resale_train_ori <- resale_train[, 1:10]
resale_test_ori <- resale_test[, 1:10]

Bagging

# Baseline: bagged regression trees fitted on the 10-column subset
# (resale_price plus nine predictors), evaluated on the held-out test rows.
set.seed(1)

# mtry = 9 offers all nine predictors at every split, i.e. bagging.
# to be finetuned?
bag.resale <- randomForest(
  resale_price ~ .,
  data = resale_train_ori,
  mtry = 9,
  importance = TRUE
)
yhat.bag <- predict(bag.resale, newdata = resale_test_ori)

# Test-set RMSE.
sqrt(mean((yhat.bag - resale_test_ori[["resale_price"]])^2))
## [1] 85575.9
rss <- sum((yhat.bag - resale_test_ori[["resale_price"]])^2)  ## residual sum of squares
tss <- sum(
  (resale_test_ori[["resale_price"]] - mean(resale_test_ori[["resale_price"]]))^2
)  ## total sum of squares
rsq <- 1 - rss / tss
rsq
## [1] 0.7352996
# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1). With the response
# included in the column count, n - p - 1 == n - ncol(<model data>). The model
# here was fitted on resale_train_ori, so its column count (not that of the
# full resale_train) is the right penalty term.
a.rsq <- 1 - (1 - rsq) * (nrow(resale_test_ori) - 1) /
  (nrow(resale_test_ori) - ncol(resale_train_ori))
a.rsq
## [1] 0.7266107  (stale: recorded with ncol(resale_train) in the denominator; re-run to refresh)
summary(yhat.bag)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  264592  443537  581492  576046  685046 1140857
importance(bag.resale)
##                     %IncMSE IncNodePurity
## month            -0.3292501  2.095556e+11
## town            131.7478528  7.547361e+12
## flat_type        17.6255727  2.361648e+12
## block            66.6004454  4.550945e+12
## street_name      69.2436707  4.071712e+12
## storey_range     84.9109146  1.263418e+13
## floor_area_sqm  151.2414078  5.592235e+13
## flat_model       60.8847273  2.562239e+12
## remaining_lease 117.9976527  1.011919e+13
varImpPlot(bag.resale)

# Same model family, now fitted on every column of resale_train
# (resale_price plus the 28 predictors listed in the importance table below).
set.seed(1)

# NOTE(review): mtry = 9 samples 9 of the 28 predictors per split, so this is
# a random forest rather than strict bagging -- confirm that is intended.
# to be finetuned?
bag.resale <- randomForest(
  resale_price ~ .,
  data = resale_train,
  mtry = 9,
  importance = TRUE
)
yhat.bag <- predict(bag.resale, newdata = resale_test)

# Test-set RMSE.
sqrt(mean((yhat.bag - resale_test$resale_price)^2))
## [1] 55861.28
rss <- sum((yhat.bag - resale_test$resale_price)^2)  ## residual sum of squares
tss <- sum(
  (resale_test$resale_price - mean(resale_test$resale_price))^2
)  ## total sum of squares
rsq <- 1 - rss / tss
rsq
## [1] 0.8872093
# Adjusted R^2: penalise by the model's column count (predictors + response).
a.rsq <- 1 - (1 - rsq) * (nrow(resale_test) - 1) /
  (nrow(resale_test) - ncol(resale_train))
a.rsq
## [1] 0.8835069
summary(yhat.bag)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  282706  446428  562838  561678  646905 1155400
importance(bag.resale)
##                                          %IncMSE IncNodePurity
## month                                 -0.5816031  7.393828e+10
## town                                  31.4533182  2.333737e+12
## flat_type                             28.8605063  2.080316e+13
## block                                 26.8139165  1.226658e+12
## street_name                           27.8965708  1.316393e+12
## storey_range                          38.1138528  7.710507e+12
## floor_area_sqm                        45.8650967  2.914882e+13
## flat_model                            28.4260506  4.227536e+12
## remaining_lease                       69.0288556  1.037132e+13
## nearest_mrt_dist                      37.6604255  3.286390e+12
## nearest_mrt                           30.5895673  1.105061e+12
## nearest_mall_dist                     28.3909691  1.111826e+12
## nearest_mall                          28.8143284  1.362723e+12
## nearest_school_dist                   18.3318730  7.148938e+11
## nearest_school                        26.3600320  9.437424e+11
## nearest_bus_stop_dist                  7.7210729  6.007954e+11
## nearest_bus_stop                      18.5819876  7.527109e+11
## nearest_primary_school_dist           16.0758028  6.858093e+11
## nearest_primary_school                26.9207984  8.634954e+11
## within_1_km_to_nearest_primary_school  3.7553756  2.919901e+10
## total_nearby_mrt                      29.4229351  2.627241e+12
## total_nearby_mall                     18.1236066  3.511764e+11
## total_nearby_school                   15.4352427  7.058830e+11
## total_nearby_primary_school           17.5838842  6.852300e+11
## total_nearby_bus_stop                 12.5651143  3.806125e+11
## total_resales_in_town                 45.5223674  5.160484e+12
## total_resales_in_block                10.5077359  4.929537e+11
## total_resales_in_street               24.5070864  1.091400e+12
varImpPlot(bag.resale)