Library Imports

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.1.3
library(ggplot2)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.1.3
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths

Dataset Import

# Read the 2023 resale CSV from the data_cleaning directory into a data frame.
# Uses `<-` for assignment, matching the other assignments in this file.
resale_2023 <- read.csv("data/data_cleaning/resale_2023.csv")

Exploratory Data Analysis (EDA)

# Total number of NA cells across every column of the data frame.
resale_2023 %>%
  is.na() %>%
  sum()
## [1] 0
# Per-column summary: quantiles and mean for numeric columns, length/class/mode
# for character columns.
resale_2023 %>%
  summary()
##     month               town            flat_type            block          
##  Length:4410        Length:4410        Length:4410        Length:4410       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  street_name        storey_range       floor_area_sqm    flat_model       
##  Length:4410        Length:4410        Min.   : 37.00   Length:4410       
##  Class :character   Class :character   1st Qu.: 73.00   Class :character  
##  Mode  :character   Mode  :character   Median : 93.00   Mode  :character  
##                                        Mean   : 94.82                     
##                                        3rd Qu.:111.75                     
##                                        Max.   :192.00                     
##   resale_price     remaining_lease
##  Min.   : 230000   Min.   :43.00  
##  1st Qu.: 430000   1st Qu.:61.17  
##  Median : 535000   Median :73.42  
##  Mean   : 555608   Mean   :73.79  
##  3rd Qu.: 648000   3rd Qu.:90.58  
##  Max.   :1340000   Max.   :95.50
# Compact structure listing: dimensions, column names, types, leading values.
resale_2023 %>%
  str()
## 'data.frame':    4410 obs. of  10 variables:
##  $ month          : chr  "2023-01" "2023-01" "2023-01" "2023-01" ...
##  $ town           : chr  "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" ...
##  $ flat_type      : chr  "2 ROOM" "2 ROOM" "2 ROOM" "2 ROOM" ...
##  $ block          : chr  "406" "323" "314" "314" ...
##  $ street_name    : chr  "ANG MO KIO AVE 10" "ANG MO KIO AVE 3" "ANG MO KIO AVE 3" "ANG MO KIO AVE 3" ...
##  $ storey_range   : chr  "01 TO 03" "04 TO 06" "04 TO 06" "07 TO 09" ...
##  $ floor_area_sqm : num  44 49 44 44 45 67 70 67 73 73 ...
##  $ flat_model     : chr  "Improved" "Improved" "Improved" "Improved" ...
##  $ resale_price   : num  267000 300000 280000 282000 289800 ...
##  $ remaining_lease: num  55.4 53.5 54.1 54.1 62.1 ...
# Frequency bar charts for month, town and flat type, drawn in that order.
invisible(lapply(
  c("month", "town", "flat_type"),
  function(column) barplot(table(resale_2023[[column]]))
))

# Number of distinct block identifiers: count the first occurrence of each.
sum(!duplicated(resale_2023$block))
## [1] 1650
# Frequency bar charts for the remaining columns, one chart per column, in the
# order listed. table() yields one bar per distinct value, including for the
# numeric columns (floor_area_sqm, remaining_lease, resale_price).
invisible(lapply(
  c(
    "street_name",
    "storey_range",
    "floor_area_sqm",
    "flat_model",
    "remaining_lease",
    "resale_price"
  ),
  function(column) barplot(table(resale_2023[[column]]))
))

# Box plots of resale price, one box per month.
ggplot(data = resale_2023, mapping = aes(x = month, y = resale_price)) +
  geom_boxplot()

# Box plots of resale price, one box per town.
resale_2023 %>%
  ggplot() +
  geom_boxplot(aes(x = town, y = resale_price)) +
  labs(title = "Resale Price Distribution by Town") +
  xlab("Town") +
  ylab("Resale Price") +
  # Vertical labels are right-justified and vertically centred so that each
  # town name ends at, and lines up with, its own tick mark.
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

# Box plots of resale price, one box per flat type.
ggplot(resale_2023, aes(x = flat_type, y = resale_price)) +
  geom_boxplot() +
  labs(
    title = "Resale Price Distribution by Flat Type",
    x = "Flat Type",
    y = "Resale Price"
  ) +
  theme_minimal()

# Resale price by flat type, one panel per flat model; each panel keeps only
# its own x-axis categories (free x scale).
ggplot(resale_2023, aes(x = flat_type, y = resale_price)) +
  geom_boxplot() +
  facet_wrap(~ flat_model, scales = "free_x")

# Resale price by flat model, one panel per flat type; both axes are scaled
# independently per panel.
resale_2023 %>%
  ggplot() +
  geom_boxplot(aes(x = flat_model, y = resale_price)) +
  facet_wrap(~flat_type, scales = "free") +
  theme_minimal() +
  # Right-justify the slanted labels so each one ends at its own tick instead
  # of being centred on it.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Box plots of resale price, one box per storey range.
resale_2023 %>%
  ggplot() +
  geom_boxplot(aes(x = storey_range, y = resale_price)) +
  labs(title = "Resale Price Distribution by HDB Storey Range") +
  xlab("HDB Storey Range") +
  ylab("Resale Price") +
  theme_minimal() +
  # Right-justify the slanted labels so each one ends at its own tick instead
  # of being centred on it.
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

# Scatter plot of resale price against floor area.
ggplot(resale_2023, aes(x = floor_area_sqm, y = resale_price)) +
  geom_point() +
  labs(
    title = "Resale Price Distribution by Flat Floor Area (in m^2)",
    x = "Flat Floor Area (in m^2)",
    y = "Resale Price"
  ) +
  theme_minimal()

# Scatter plot of resale price against floor area, points coloured by storey
# range.
ggplot(
  resale_2023,
  aes(x = floor_area_sqm, y = resale_price, color = storey_range)
) +
  geom_point() +
  labs(
    title = "Resale Price Distribution by Flat Floor Area (in m^2)",
    x = "Flat Floor Area (in m^2)",
    y = "Resale Price"
  ) +
  theme_minimal()

# Box plots of resale price, one box per flat model.
ggplot(data = resale_2023, mapping = aes(x = flat_model, y = resale_price)) +
  geom_boxplot()

# Scatter plot of resale price against remaining lease.
ggplot(
  data = resale_2023,
  mapping = aes(x = remaining_lease, y = resale_price)
) +
  geom_point()

# Histogram of resale price. `bins` is stated explicitly (30, the value
# stat_bin() otherwise falls back to) so the plot is unchanged and no
# "Pick better value with `binwidth`" message is emitted.
resale_2023 %>%
  ggplot() +
  geom_histogram(aes(x = resale_price), bins = 30) +
  labs(title = "Resale Price Distribution") +
  ylab("Count") +
  xlab("Resale Price") +
  theme_minimal()

# Quantile-quantile plot of resale price with a reference line.
ggplot(data = resale_2023, mapping = aes(sample = resale_price)) +
  stat_qq() +
  stat_qq_line() +
  labs(
    title = "Resale Price QQ Plot",
    x = "Theoretical Quantiles",
    y = "Sample Quantiles"
  ) +
  theme_minimal()

# Histogram of remaining lease. `bins` is stated explicitly (30, the value
# stat_bin() otherwise falls back to) so the plot is unchanged and no
# "Pick better value with `binwidth`" message is emitted.
resale_2023 %>%
  ggplot() +
  geom_histogram(aes(x = remaining_lease), bins = 30)

# Histogram of floor area. `bins` is stated explicitly (30, the value
# stat_bin() otherwise falls back to) so the plot is unchanged and no
# "Pick better value with `binwidth`" message is emitted.
resale_2023 %>%
  ggplot() +
  geom_histogram(aes(x = floor_area_sqm), bins = 30)

# Pearson correlation matrix of the three numeric columns, rounded to two
# decimal places.
cormat <- round(
  cor(
    resale_2023[c("floor_area_sqm", "remaining_lease", "resale_price")],
    method = "pearson"
  ),
  digits = 2
)
head(cormat)
##                 floor_area_sqm remaining_lease resale_price
## floor_area_sqm            1.00            0.07         0.69
## remaining_lease           0.07            1.00         0.37
## resale_price              0.69            0.37         1.00
# Reshape the correlation matrix to long form: one row per (Var1, Var2) pair
# with its coefficient in `value`.
melted_cormat <- reshape2::melt(cormat)
head(melted_cormat)
##              Var1            Var2 value
## 1  floor_area_sqm  floor_area_sqm  1.00
## 2 remaining_lease  floor_area_sqm  0.07
## 3    resale_price  floor_area_sqm  0.69
## 4  floor_area_sqm remaining_lease  0.07
## 5 remaining_lease remaining_lease  1.00
## 6    resale_price remaining_lease  0.37
# Heatmap of the pairwise correlations, with each coefficient printed in its
# tile.
ggplot(data = melted_cormat, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  # Diverging blue-white-red palette centred on 0 and pinned to [-1, 1].
  # The argument is spelled `limits` in full rather than relying on partial
  # matching of `limit`.
  scale_fill_gradient2(
    low = "blue", high = "red", mid = "white",
    midpoint = 0, limits = c(-1, 1), space = "Lab",
    name = "Pearson\nCorrelation"
  ) +
  # The text layer inherits the plot-level x/y mapping, so every label is
  # placed on the tile whose value it shows.
  geom_text(aes(label = value), color = "black", size = 4)