# Library imports ----
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.1.3
library(ggplot2)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.1.3
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Exploratory data analysis (EDA) ----
# Quick data-quality overview of resale_2023.
# Total number of missing cells across the whole data frame.
resale_2023 %>%
  is.na() %>%
  sum()
## [1] 0
# Per-column summary statistics.
resale_2023 %>%
  summary()
## month town flat_type block
## Length:4410 Length:4410 Length:4410 Length:4410
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## street_name storey_range floor_area_sqm flat_model
## Length:4410 Length:4410 Min. : 37.00 Length:4410
## Class :character Class :character 1st Qu.: 73.00 Class :character
## Mode :character Mode :character Median : 93.00 Mode :character
## Mean : 94.82
## 3rd Qu.:111.75
## Max. :192.00
## resale_price remaining_lease
## Min. : 230000 Min. :43.00
## 1st Qu.: 430000 1st Qu.:61.17
## Median : 535000 Median :73.42
## Mean : 555608 Mean :73.79
## 3rd Qu.: 648000 3rd Qu.:90.58
## Max. :1340000 Max. :95.50
# Column types and the first few values of each column.
resale_2023 %>%
  str()
## 'data.frame': 4410 obs. of 10 variables:
## $ month : chr "2023-01" "2023-01" "2023-01" "2023-01" ...
## $ town : chr "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" ...
## $ flat_type : chr "2 ROOM" "2 ROOM" "2 ROOM" "2 ROOM" ...
## $ block : chr "406" "323" "314" "314" ...
## $ street_name : chr "ANG MO KIO AVE 10" "ANG MO KIO AVE 3" "ANG MO KIO AVE 3" "ANG MO KIO AVE 3" ...
## $ storey_range : chr "01 TO 03" "04 TO 06" "04 TO 06" "07 TO 09" ...
## $ floor_area_sqm : num 44 49 44 44 45 67 70 67 73 73 ...
## $ flat_model : chr "Improved" "Improved" "Improved" "Improved" ...
## $ resale_price : num 267000 300000 280000 282000 289800 ...
## $ remaining_lease: num 55.4 53.5 54.1 54.1 62.1 ...
# Univariate look at each column of resale_2023, in column order.
# Character columns: frequency of each level as a bar chart.
barplot(table(resale_2023$month))
barplot(table(resale_2023$town))
barplot(table(resale_2023$flat_type))
# block is not plotted; only its number of distinct values is reported.
length(unique(resale_2023$block))
## [1] 1650
barplot(table(resale_2023$street_name))
barplot(table(resale_2023$storey_range))
# Numeric columns use hist(): barplot(table(x)) draws one bar per distinct
# value, which does not show the shape of a continuous distribution.
hist(
  resale_2023$floor_area_sqm,
  main = "Distribution of floor_area_sqm",
  xlab = "floor_area_sqm"
)
barplot(table(resale_2023$flat_model))
hist(
  resale_2023$remaining_lease,
  main = "Distribution of remaining_lease",
  xlab = "remaining_lease"
)
hist(
  resale_2023$resale_price,
  main = "Distribution of resale_price",
  xlab = "resale_price"
)
# Box plot of resale_price for each month.
ggplot(resale_2023, aes(x = month, y = resale_price)) +
  geom_boxplot()
# Box plot of resale_price for each town.
# The town labels are rotated 90 degrees; hjust/vjust anchor the end of each
# rotated label on its tick so every label lines up with its own box.
resale_2023 %>%
  ggplot() +
  geom_boxplot(aes(x = town, y = resale_price)) +
  labs(title = "Resale Price Distribution by Town") +
  xlab("Town") +
  ylab("Resale Price") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
# Box plot of resale_price for each flat_type.
ggplot(resale_2023, aes(x = flat_type, y = resale_price)) +
  geom_boxplot() +
  labs(
    title = "Resale Price Distribution by Flat Type",
    x = "Flat Type",
    y = "Resale Price"
  ) +
  theme_minimal()
# Box plot of resale_price by flat_type, one panel per flat_model.
# scales = "free_x" lets each panel have its own x axis.
ggplot(resale_2023, aes(x = flat_type, y = resale_price)) +
  geom_boxplot() +
  facet_wrap(~flat_model, scales = "free_x")
# Box plot of resale_price by flat_model, one panel per flat_type, with
# both axes free per panel.
# hjust = 1 anchors the end of each 45-degree label on its tick instead of
# centring the label on it.
resale_2023 %>%
  ggplot() +
  geom_boxplot(aes(x = flat_model, y = resale_price)) +
  facet_wrap(~flat_type, scales = "free") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Box plot of resale_price for each storey_range.
# hjust = 1 anchors the end of each 30-degree label on its tick instead of
# centring the label on it.
resale_2023 %>%
  ggplot() +
  geom_boxplot(aes(x = storey_range, y = resale_price)) +
  labs(title = "Resale Price Distribution by HDB Storey Range") +
  xlab("HDB Storey Range") +
  ylab("Resale Price") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))
# Scatter plot of resale_price against floor_area_sqm.
ggplot(resale_2023, aes(x = floor_area_sqm, y = resale_price)) +
  geom_point() +
  labs(
    title = "Resale Price Distribution by Flat Floor Area (in m^2)",
    x = "Flat Floor Area (in m^2)",
    y = "Resale Price"
  ) +
  theme_minimal()
# Scatter plot of resale_price against floor_area_sqm, with points coloured
# by storey_range.
ggplot(
  resale_2023,
  aes(x = floor_area_sqm, y = resale_price, color = storey_range)
) +
  geom_point() +
  labs(
    title = "Resale Price Distribution by Flat Floor Area (in m^2)",
    x = "Flat Floor Area (in m^2)",
    y = "Resale Price"
  ) +
  theme_minimal()
# Box plot of resale_price for each flat_model.
ggplot(resale_2023, aes(x = flat_model, y = resale_price)) +
  geom_boxplot()
# Scatter plot of resale_price against remaining_lease.
ggplot(resale_2023, aes(x = remaining_lease, y = resale_price)) +
  geom_point()
# Histogram of resale_price.
# bins is set explicitly to 30, the value stat_bin() otherwise falls back to
# with a "Pick better value" message, so the plot is unchanged and the
# binning is a visible choice rather than an implicit default.
resale_2023 %>%
  ggplot() +
  geom_histogram(aes(x = resale_price), bins = 30) +
  labs(title = "Resale Price Distribution") +
  ylab("Count") +
  xlab("Resale Price") +
  theme_minimal()
# QQ plot of resale_price: sample quantiles as points plus a reference line.
ggplot(resale_2023, aes(sample = resale_price)) +
  stat_qq() +
  stat_qq_line() +
  labs(
    title = "Resale Price QQ Plot",
    x = "Theoretical Quantiles",
    y = "Sample Quantiles"
  ) +
  theme_minimal()
# Histogram of remaining_lease.
# bins = 30 is the value stat_bin() otherwise falls back to with a message;
# stating it keeps the plot identical and makes the binning explicit.
resale_2023 %>%
  ggplot() +
  geom_histogram(aes(x = remaining_lease), bins = 30)
# Histogram of floor_area_sqm.
# bins = 30 is the value stat_bin() otherwise falls back to with a message;
# stating it keeps the plot identical and makes the binning explicit.
resale_2023 %>%
  ggplot() +
  geom_histogram(aes(x = floor_area_sqm), bins = 30)
# Correlation matrix of the three numeric columns, rounded to 2 decimals.
cormat <- round(
  cor(resale_2023[, c("floor_area_sqm", "remaining_lease", "resale_price")]),
  2
)
head(cormat)
## floor_area_sqm remaining_lease resale_price
## floor_area_sqm 1.00 0.07 0.69
## remaining_lease 0.07 1.00 0.37
## resale_price 0.69 0.37 1.00
# Reshape the matrix to long form: one row per (Var1, Var2, value) cell.
melted_cormat <- melt(cormat)
head(melted_cormat)
## Var1 Var2 value
## 1 floor_area_sqm floor_area_sqm 1.00
## 2 remaining_lease floor_area_sqm 0.07
## 3 resale_price floor_area_sqm 0.69
## 4 floor_area_sqm remaining_lease 0.07
## 5 remaining_lease remaining_lease 1.00
## 6 resale_price remaining_lease 0.37
# Heatmap of the correlations with each cell labelled by its value.
# - `limits` is spelled out in full rather than relying on partial argument
#   matching of `limit`.
# - geom_text() inherits x/y from the plot so labels use the same cell
#   coordinates as the tiles (previously swapped, which was only harmless
#   because the correlation matrix is symmetric).
ggplot(data = melted_cormat, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(
    low = "blue", high = "red", mid = "white",
    midpoint = 0, limits = c(-1, 1), space = "Lab",
    name = "Pearson\nCorrelation"
  ) +
  geom_text(aes(label = value), color = "black", size = 4)