Libraries Import
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.1.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.1.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
Import CSV
# Load the feature-engineered resale data and carve out an 80/20 train/test
# split. This split was written before feature selection was done, so it may
# need a bit of rework once the selected feature set is final.
resale <- read.csv("data/feature_engineering/resale_feature_engineering.csv")
total_rows <- nrow(resale)
# Seed the RNG *before* sampling: previously only the model fits were seeded,
# so a different set of rows ended up in train/test on every run and none of
# the reported metrics could be reproduced.
set.seed(1)
# floor() makes the sample size an explicit whole number instead of relying on
# sample() to truncate a floating-point product.
train_indices <- sample(total_rows, floor(0.8 * total_rows))
resale_train <- resale[train_indices, ]
resale_test <- resale[-train_indices, ]
# Sanity-check the split sizes (rows, columns).
dim(resale_train)
## [1] 3528 29
dim(resale_test)
## [1] 882 29
# Baseline ("ori") frames: keep only the first ten columns of each split.
# These are the frames the first bagging model below is trained and scored on.
resale_train_ori <- resale_train[, seq_len(10)]
resale_test_ori <- resale_test[, seq_len(10)]
Bagging
# Fix the seed so the bootstrap samples drawn by randomForest are repeatable.
set.seed(1)
# Baseline model on the ten-column frame. mtry has not been tuned yet; it is
# set to 9 so that every predictor is considered at each split.
bag.resale <- randomForest(
  resale_price ~ .,
  data = resale_train_ori,
  mtry = 9,
  importance = TRUE
)
yhat.bag <- predict(bag.resale, newdata = resale_test_ori)
# Root-mean-squared error of the predictions on the held-out rows.
sqrt(mean((yhat.bag - resale_test_ori[["resale_price"]])^2))
## [1] 85575.9
# Test-set goodness of fit for the baseline (ten-column) bagging model.
# rss: residual sum of squares; tss: total sum of squares around the mean.
rss <- sum((yhat.bag - resale_test_ori[["resale_price"]])^2)
tss <- sum(
  (resale_test_ori[["resale_price"]] - mean(resale_test_ori[["resale_price"]]))^2
)
rsq <- 1 - rss / tss
rsq
## [1] 0.7352996
# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1).
# This model was fitted on resale_train_ori, so p + 1 is the column count of
# THAT frame (predictors + response). The earlier version used
# dim(resale_train)[2], i.e. the 29-column full frame, which over-penalised
# the baseline model.
# NOTE: the adjusted R^2 printed beneath this chunk (0.7266107) was produced
# with the old n - 29 denominator; after this correction expect roughly 0.7326
# for the same R^2.
a.rsq <- 1 - (1 - rsq) * (nrow(resale_test_ori) - 1) /
  (nrow(resale_test_ori) - ncol(resale_train_ori))
a.rsq
## [1] 0.7266107
# Distribution (min, quartiles, mean, max) of the baseline model's test-set
# predictions.
summary(yhat.bag)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 264592 443537 581492 576046 685046 1140857
# Per-predictor importance table for the baseline model (columns %IncMSE and
# IncNodePurity). In the captured run floor_area_sqm ranks highest on both
# columns, while month has a slightly negative %IncMSE.
importance(bag.resale)
## %IncMSE IncNodePurity
## month -0.3292501 2.095556e+11
## town 131.7478528 7.547361e+12
## flat_type 17.6255727 2.361648e+12
## block 66.6004454 4.550945e+12
## street_name 69.2436707 4.071712e+12
## storey_range 84.9109146 1.263418e+13
## floor_area_sqm 151.2414078 5.592235e+13
## flat_model 60.8847273 2.562239e+12
## remaining_lease 117.9976527 1.011919e+13
# Plot the importance measures of the fitted model.
varImpPlot(bag.resale)
# Reset the seed so this fit draws its bootstrap samples from the same RNG
# state as the baseline fit.
set.seed(1)
# Refit on the full engineered feature set; this overwrites bag.resale and
# yhat.bag from the baseline model. mtry has not been tuned yet.
# NOTE(review): resale_train has 29 columns (28 predictors), so mtry = 9 no
# longer considers every predictor at each split -- it coincides with
# randomForest's regression default of floor(p / 3). If true bagging is
# intended here, mtry should be ncol(resale_train) - 1; confirm with the author.
bag.resale <- randomForest(
  resale_price ~ .,
  data = resale_train,
  mtry = 9,
  importance = TRUE
)
yhat.bag <- predict(bag.resale, newdata = resale_test)
# Root-mean-squared error of the predictions on the held-out rows.
sqrt(mean((yhat.bag - resale_test[["resale_price"]])^2))
## [1] 55861.28
# Test-set goodness of fit for the full-feature model.
# rss: residual sum of squares; tss: total sum of squares around the mean.
rss <- sum((yhat.bag - resale_test[["resale_price"]])^2)
tss <- sum(
  (resale_test[["resale_price"]] - mean(resale_test[["resale_price"]]))^2
)
rsq <- 1 - rss / tss
rsq
## [1] 0.8872093
# Adjusted R^2: the column count of resale_train (predictors + response)
# serves as p + 1 in the n - p - 1 denominator.
a.rsq <- 1 - (1 - rsq) * (nrow(resale_test) - 1) /
  (nrow(resale_test) - ncol(resale_train))
a.rsq
## [1] 0.8835069
# Distribution (min, quartiles, mean, max) of the full-feature model's
# test-set predictions.
summary(yhat.bag)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 282706 446428 562838 561678 646905 1155400
# Per-predictor importance table for the full-feature model (columns %IncMSE
# and IncNodePurity). In the captured run remaining_lease has the highest
# %IncMSE, floor_area_sqm the highest IncNodePurity, and month again has a
# slightly negative %IncMSE.
importance(bag.resale)
## %IncMSE IncNodePurity
## month -0.5816031 7.393828e+10
## town 31.4533182 2.333737e+12
## flat_type 28.8605063 2.080316e+13
## block 26.8139165 1.226658e+12
## street_name 27.8965708 1.316393e+12
## storey_range 38.1138528 7.710507e+12
## floor_area_sqm 45.8650967 2.914882e+13
## flat_model 28.4260506 4.227536e+12
## remaining_lease 69.0288556 1.037132e+13
## nearest_mrt_dist 37.6604255 3.286390e+12
## nearest_mrt 30.5895673 1.105061e+12
## nearest_mall_dist 28.3909691 1.111826e+12
## nearest_mall 28.8143284 1.362723e+12
## nearest_school_dist 18.3318730 7.148938e+11
## nearest_school 26.3600320 9.437424e+11
## nearest_bus_stop_dist 7.7210729 6.007954e+11
## nearest_bus_stop 18.5819876 7.527109e+11
## nearest_primary_school_dist 16.0758028 6.858093e+11
## nearest_primary_school 26.9207984 8.634954e+11
## within_1_km_to_nearest_primary_school 3.7553756 2.919901e+10
## total_nearby_mrt 29.4229351 2.627241e+12
## total_nearby_mall 18.1236066 3.511764e+11
## total_nearby_school 15.4352427 7.058830e+11
## total_nearby_primary_school 17.5838842 6.852300e+11
## total_nearby_bus_stop 12.5651143 3.806125e+11
## total_resales_in_town 45.5223674 5.160484e+12
## total_resales_in_block 10.5077359 4.929537e+11
## total_resales_in_street 24.5070864 1.091400e+12
# Plot the importance measures of the fitted model.
varImpPlot(bag.resale)