# Mount Google Drive so the dataset saved there is readable from this Colab notebook.
from google.colab import drive
drive.mount("/content/gdrive")
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
import json
import numpy as np
import pandas as pd
import requests
import time
!pip install beautifulsoup4
!pip install onemapsg
from bs4 import BeautifulSoup
from onemapsg import OneMapClient
from tqdm import tqdm
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ Requirement already satisfied: onemapsg in /usr/local/lib/python3.9/dist-packages (1.0.3) Requirement already satisfied: requests>=2.20.0 in /usr/local/lib/python3.9/dist-packages (from onemapsg) (2.25.1) Requirement already satisfied: requests-toolbelt in /usr/local/lib/python3.9/dist-packages (from onemapsg) (0.10.1) Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.9/dist-packages (from requests>=2.20.0->onemapsg) (2.10) Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.9/dist-packages (from requests>=2.20.0->onemapsg) (1.26.15) Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.9/dist-packages (from requests>=2.20.0->onemapsg) (2022.12.7) Requirement already satisfied: chardet<5,>=3.0.2 in /usr/local/lib/python3.9/dist-packages (from requests>=2.20.0->onemapsg) (4.0.0) Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.9/dist-packages (4.9.3) Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.9/dist-packages (from beautifulsoup4) (2.4)
# 2023 HDB resale transactions, previously saved to Google Drive.
resale = pd.read_csv("gdrive/MyDrive/dataset/ST4248/resale_2023.csv")
After browsing articles, property market sites and social media, we identified 5 additional features that could improve our model.
1. Nearest and Nearby MRTs (1 km)
2. Nearest and Nearby Bus stops (300 m)
3. Nearest and Nearby Schools (1 km)
4. Nearest and Nearby Primary Schools (1 km)
- Within 1km radius gives higher priority during balloting
( https://www.moe.gov.sg/primary/p1-registration/distance/ )
5. Nearest and Nearby Malls (1 km)
Assuming a straight line walking route, we can use geocoordinates from Singapore's onemap API to determine the actual land distance.
# Start onemap client
# NOTE(review): credentials and API token are hardcoded and now committed in
# plain text — rotate them and load from environment variables instead.
creds_email = "cliftonfelix@u.nus.edu"
creds_password = "ST4248GroupC4"
# NOTE(review): `token` appears unused below — OneMapClient authenticates with
# the email/password pair; confirm before removing.
token = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOjEwMDU0LCJ1c2VyX2lkIjoxMDA1NCwiZW1haWwiOiJjbGlmdG9uZmVsaXhAdS5udXMuZWR1IiwiZm9yZXZlciI6ZmFsc2UsImlzcyI6Imh0dHA6XC9cL29tMi5kZmUub25lbWFwLnNnXC9hcGlcL3YyXC91c2VyXC9zZXNzaW9uIiwiaWF0IjoxNjc5MDUwNjEyLCJleHAiOjE2Nzk0ODI2MTIsIm5iZiI6MTY3OTA1MDYxMiwianRpIjoiZjljNjFjN2Q4MTcwNDg5ZTUwNzU5ODRjM2MxYzdhOTYifQ.JNdAjvfVQdIn3RuH9rLlqrQCtJdSq4JYMqIdWBKZ4qI"
Client = OneMapClient(creds_email, creds_password)
def countdown(count, start):
    """Throttle OneMap queries to at most 250 per second.

    Call once per issued query. `count` is the number of queries made in the
    current one-second window and `start` the wall-clock time the window began.
    After the 250th query, sleep for whatever is left of the window (the
    original slept a full second regardless of elapsed time, and computed an
    unused `end` timestamp), then open a fresh window.

    Returns the updated (count, start) pair for the caller to carry forward.
    """
    count += 1
    if count >= 250:
        # Only sleep the remainder of the 1-second window, not a full second.
        elapsed = time.time() - start
        time.sleep(max(0.0, 1.0 - elapsed))
        start = time.time()
        count = 0
    return count, start
# # Distance Multiplier Calculation
# latlong_dist = np.sqrt(0.1**2 + 0.1**2) # 0.1 lat and 0.1 long difference
# actual_dist_m = 15.723 * 1000 # According to https://latlongdata.com/distance-calculator/
# latlong_multiplier = actual_dist_m/latlong_dist
# # Assuming average walking speed as 1.4m/s
# # (according to https://www.healthline.com/health/exercise-fitness/average-walking-speed#average-speed-by-age)
# kmpermin = (1.4/1000) * 60
# print(f"Expected time for 1km walking distance: {1/kmpermin}")
# https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points
from math import radians, cos, sin, asin, sqrt
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance in kilometers between two points
    on the earth (specified in decimal degrees)
    """
    EARTH_RADIUS_KM = 6371  # use 3956 for miles; determines return value units
    # work in radians
    phi1, phi2 = radians(lat1), radians(lat2)
    dphi = phi2 - phi1
    dlambda = radians(lon2) - radians(lon1)
    # haversine formula
    h = sin(dphi / 2) ** 2 + cos(phi1) * cos(phi2) * sin(dlambda / 2) ** 2
    return 2 * EARTH_RADIUS_KM * asin(sqrt(h))
Dataset can be downloaded directly from the Singapore data gov website at https://data.gov.sg/dataset/train-station-chinese-names or it can be accessed via API below.
# Import Dataset via API
# Fetch the train-station dataset from data.gov.sg and keep station/line names.
url = 'https://data.gov.sg/api/action/datastore_search?resource_id=65c1093b-0c34-41cf-8f0e-f11318766298&limit=1000'
response = requests.get(url)
print(f"Response Code : {response.status_code}")
mrt_df = pd.json_normalize(response.json()['result']['records'])
mrt_df = mrt_df.loc[:,["mrt_station_english","mrt_line_english"]]
mrt_df.columns = ["station","line"]
# MRT line names contain the word "Line"; LRT line names do not, so this
# filter keeps MRT stations only.
mrt_df = mrt_df.loc[mrt_df.line.str.contains("Line"),:] # remove LRTs
print(len(mrt_df))
mrt_df.head()
Response Code : 200 141
station | line | |
---|---|---|
0 | Jurong East | North South Line |
1 | Bukit Batok | North South Line |
2 | Bukit Gombak | North South Line |
3 | Choa Chu Kang | North South Line |
4 | Yew Tee | North South Line |
# Adding Longitude and Latitude Data
# Initialise with NaN (not "") so the isna() check below and the later
# dropna() actually detect stations the geocoder failed to resolve; with ""
# a failed lookup slipped through and crashed the float conversion later.
mrt_df.loc[:,"lat"] = np.nan
mrt_df.loc[:,"long"] = np.nan
start = time.time()
count = 0
for i,row in tqdm(mrt_df.iterrows(), total=mrt_df.shape[0]):
    station_name = row["station"] + " MRT"  # suffix disambiguates from roads/landmarks
    try:
        search = Client.search(station_name,page_num=1)["results"][0]
        mrt_df.loc[i,"lat"] = search["LATITUDE"]
        mrt_df.loc[i,"long"] = search["LONGITUDE"]
        count,start = countdown(count,start)
    except Exception:  # narrowed from a bare except so Ctrl-C still works
        continue
mrt_df.loc[mrt_df.lat.isna(),:] # Checking for any entries with no geo location
100%|██████████| 141/141 [01:37<00:00, 1.44it/s]
station | line | lat | long |
---|
# mrt_df = pd.read_csv("dataset/mrt_geo.csv") # If no access to onemap
# Inspect the geocoded MRT stations.
mrt_df.head()
station | line | lat | long | |
---|---|---|---|---|
0 | Jurong East | North South Line | 1.33315281585758 | 103.742286332403 |
1 | Bukit Batok | North South Line | 1.34903331201636 | 103.749566478309 |
2 | Bukit Gombak | North South Line | 1.35861159094192 | 103.751790910733 |
3 | Choa Chu Kang | North South Line | 1.38536316540225 | 103.744370779756 |
4 | Yew Tee | North South Line | 1.39753506936297 | 103.747405150236 |
# Drop stations the geocoder could not resolve, cast coordinates to float,
# and checkpoint the result.
mrt_df = mrt_df.dropna(axis=0)
# np.float was deprecated in NumPy 1.20 and removed in 1.24 — use builtin float.
mrt_df.lat = mrt_df.lat.astype(float)
mrt_df.long = mrt_df.long.astype(float)
mrt_df.to_csv("mrt_geo.csv",index=False)
<ipython-input-22-a04d9a7086c7>:2: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations mrt_df.lat = mrt_df.lat.astype(np.float) <ipython-input-22-a04d9a7086c7>:3: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations mrt_df.long = mrt_df.long.astype(np.float)
Dataset can be downloaded directly from the Singapore data gov website at https://data.gov.sg/dataset/school-directory-and-information?view_id=ba7c477d-a077-4303-96a1-ac1d4f25b190&resource_id=ede26d32-01af-4228-b1ed-f05c45a1d8ee or it can be accessed via API below.
# Import Dataset via API
# Fetch the school directory from data.gov.sg; keep name, postal code and level
# (PRIMARY/SECONDARY/...) for the nearest-school features.
url = 'https://data.gov.sg/api/action/datastore_search?resource_id=ede26d32-01af-4228-b1ed-f05c45a1d8ee&limit=1000'
response = requests.get(url)
print(f"Response Code : {response.status_code}")
school_df = pd.json_normalize(response.json()['result']['records'])
school_df = school_df.loc[:,["school_name","postal_code","mainlevel_code"]]
school_df.head()
Response Code : 200
school_name | postal_code | mainlevel_code | |
---|---|---|---|
0 | ADMIRALTY PRIMARY SCHOOL | 738907 | PRIMARY |
1 | ADMIRALTY SECONDARY SCHOOL | 737916 | SECONDARY |
2 | AHMAD IBRAHIM PRIMARY SCHOOL | 768643 | PRIMARY |
3 | AHMAD IBRAHIM SECONDARY SCHOOL | 768928 | SECONDARY |
4 | AI TONG SCHOOL | 579646 | PRIMARY |
# Adding Longitude and Latitude Data
# Initialise with NaN (not "") so failed lookups are caught by isna()/dropna().
school_df.loc[:,"lat"] = np.nan
school_df.loc[:,"long"] = np.nan
start = time.time()
count = 0
for i,row in tqdm(school_df.iterrows(), total=school_df.shape[0]):
    # Postal codes parsed as ints lose leading zeros; zfill restores the
    # canonical 6-digit form OneMap expects.
    postal_code = str(row["postal_code"]).zfill(6)
    try:
        search = Client.search(postal_code,page_num=1)["results"][0]
        school_df.loc[i,"lat"] = search["LATITUDE"]
        school_df.loc[i,"long"] = search["LONGITUDE"]
        count,start = countdown(count,start)
    except Exception:  # narrowed from a bare except so Ctrl-C still works
        continue
school_df.loc[school_df.lat.isna(),:]
100%|██████████| 346/346 [06:54<00:00, 1.20s/it]
school_name | postal_code | mainlevel_code | lat | long |
---|
# Inspect the geocoded schools.
school_df.head()
school_name | postal_code | mainlevel_code | lat | long | |
---|---|---|---|---|---|
0 | ADMIRALTY PRIMARY SCHOOL | 738907 | PRIMARY | 1.4426347903311 | 103.800040119743 |
1 | ADMIRALTY SECONDARY SCHOOL | 737916 | SECONDARY | 1.44589068910993 | 103.802398196596 |
2 | AHMAD IBRAHIM PRIMARY SCHOOL | 768643 | PRIMARY | 1.43315271543517 | 103.832942401086 |
3 | AHMAD IBRAHIM SECONDARY SCHOOL | 768928 | SECONDARY | 1.43605975368804 | 103.829718690077 |
4 | AI TONG SCHOOL | 579646 | PRIMARY | 1.36058343005814 | 103.83302033725 |
# NOTE(review): ZHENGHUA SECONDARY SCHOOL is excluded explicitly — presumably a
# bad geocode or duplicate; confirm the reason and document it here.
school_df = school_df[school_df["school_name"] != "ZHENGHUA SECONDARY SCHOOL"]
school_df = school_df.dropna(axis=0)
# np.float was deprecated in NumPy 1.20 and removed in 1.24 — use builtin float.
school_df.lat = school_df.lat.astype(float)
school_df.long = school_df.long.astype(float)
school_df.to_csv("schools_geo.csv",index=False)
<ipython-input-32-8a9b0e7a55d3>:2: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations school_df.lat = school_df.lat.astype(np.float) <ipython-input-32-8a9b0e7a55d3>:3: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations school_df.long = school_df.long.astype(np.float)
The list of malls are found by webscraping the wikipedia page due to no such central dataset in any government source.
# Scrape the list of malls from Wikipedia (no central government dataset exists).
url = "https://en.wikipedia.org/wiki/List_of_shopping_malls_in_Singapore"
response = requests.get(url=url)
print(f"Response Code : {response.status_code}")
soup = BeautifulSoup(response.content, 'html.parser')
all_region = soup.find(id="mw-content-text")
# FIX: the original wrapped this in `for region in all_region:` but never used
# `region` — it re-ran the same find_all on all_region once per child node,
# appending the full list repeatedly (duplicates were only masked by set()).
mall_list = []
# The first 8 <ul> lists hold the mall names, one region per list.
items = all_region.find_all("ul")
for item in items[:8]:
    for mall in item.select("li"):
        # NOTE: mall.string can be None for nested markup; str() turns that
        # into the literal "None", which is dropped manually further below.
        mall_list.append(str(mall.string))
mall_list = sorted(set(mall_list))
mall_df = pd.DataFrame(mall_list,columns=["mall"])
mall_df.head()
Response Code : 200
mall | |
---|---|
0 | 321 Clementi |
1 | AMK Hub |
2 | Admiralty Place |
3 | Alexandra Central |
4 | Alexandra Retail Centre |
mall_df = pd.DataFrame(mall_list,columns=["mall"])
# Initialise with NaN (not "") so failed lookups are caught by isna()/dropna().
mall_df.loc[:,"lat"] = np.nan
mall_df.loc[:,"long"] = np.nan
start = time.time()
count = 0
for i,row in tqdm(mall_df.iterrows(), total=mall_df.shape[0]):
    mall = row["mall"]
    try:
        search = Client.search(mall,page_num=1)["results"][0]
        mall_df.loc[i,"lat"] = search["LATITUDE"]
        mall_df.loc[i,"long"] = search["LONGITUDE"]
        count,start = countdown(count,start)
    except Exception:  # narrowed from a bare except so Ctrl-C still works
        continue
mall_df.loc[mall_df.lat.isna(),:]
100%|██████████| 149/149 [01:44<00:00, 1.43it/s]
mall | lat | long |
---|
Take note that some of the malls are not found using OneMap's API. Therefore, their latitude and longitude are input manually using Google Maps.
# Inspect the geocoded malls.
mall_df.head()
mall | lat | long | |
---|---|---|---|
0 | 321 Clementi | 1.31200212030821 | 103.764986676365 |
1 | AMK Hub | 1.36938922690344 | 103.84847796594 |
2 | Admiralty Place | 1.43938615380542 | 103.801706470491 |
3 | Alexandra Central | 1.28728320930592 | 103.805283367958 |
4 | Alexandra Retail Centre | 1.27384263888449 | 103.801375038176 |
# Singapore latitudes all start with "1"; this surfaces rows where the geocoder
# returned nothing, so they can be filled in manually below.
mall_df[mall_df["lat"].str[0] != "1"]
mall | lat | long | |
---|---|---|---|
26 | Clarke Quay Central | ||
47 | Holland Village Shopping Mall | ||
62 | KINEX (formerly OneKM) | ||
76 | Mustafa Shopping Centre | ||
79 | None | ||
82 | OD Mall | ||
89 | Paya Lebar Quarter (PLQ) | ||
95 | PoMo | ||
104 | Shaw House and Centre |
# Same check as above — presumably re-run after the manual fixes below to
# confirm no unresolved malls remain (its output table is empty).
mall_df[mall_df["lat"].str[0] != "1"]
mall | lat | long |
---|
# Manually-sourced coordinates (from Google Maps) for the malls OneMap could
# not resolve, keyed by their row index in mall_df.
manual_coords = {
    26: ("1.2890219680898987", "103.84660784783235"),   # Clarke Quay Central
    47: ("1.3102019819886", "103.7953010354647"),       # Holland Village Shopping Mall
    62: ("1.3147214068241624", "103.89475475854094"),   # KINEX (formerly OneKM)
    76: ("1.3099635534889562", "103.85542043537346"),   # Mustafa Shopping Centre
    82: ("1.3380055424196884", "103.79344084446043"),   # OD Mall
    89: ("1.3158143275605374", "103.8936469972349"),    # Paya Lebar Quarter (PLQ)
    95: ("1.3001929443895233", "103.84922663176022"),   # PoMo (renamed below)
    104: ("1.306292053298618", "103.83184564390703"),   # Shaw House and Centre
}
for idx, (lat, long_) in manual_coords.items():
    mall_df.loc[idx, "lat"] = lat
    mall_df.loc[idx, "long"] = long_
mall_df.loc[95, "mall"] = "GR.iD"  # PoMo was rebranded as GR.iD
mall_df.drop(79, axis = 0, inplace = True)  # row 79 is the literal "None" scrape artifact
mall_df = mall_df.dropna(axis=0)
# np.float was deprecated in NumPy 1.20 and removed in 1.24 — use builtin float.
mall_df.lat = mall_df.lat.astype(float)
mall_df.long = mall_df.long.astype(float)
mall_df.to_csv("malls_geo.csv",index=False)
<ipython-input-55-3cf3f03c3e7c>:2: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations mall_df.lat = mall_df.lat.astype(np.float) <ipython-input-55-3cf3f03c3e7c>:3: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations mall_df.long = mall_df.long.astype(np.float)
The algorithm will attempt several naming conventions for each entry to find the most exact geocoordinates using OneMap's API.
df = resale.copy()
# NOTE(review): lat/long are initialised to "" here (so the skip-check below
# can compare against ""), unlike the NaN convention used for the amenity
# frames — the isna() inspection at the end is only meaningful for rows the
# API genuinely nulls out.
df.loc[:,"lat"] = ""
df.loc[:,"long"] = ""
df2 = df.copy()  # frozen snapshot: iterated over and used for index lookups only
count = 0
start = time.time()
for i,row in tqdm(df2.iterrows(), total=df2.shape[0]):
    # FIX: check the live frame `df`, not the df2 snapshot. df2 is never
    # updated, so the original `row["lat"] == ""` was always true and every
    # already-filled address was re-queried, wasting API quota.
    if df.loc[i,"lat"] == "":
        block = row["block"]
        street = row["street_name"]
        location = block + " " + street # "123 Charming Street Ave 2"
        # All rows sharing this block + street get the same coordinates at once.
        # (index labels equal positions here because resale has a RangeIndex,
        # so iloc with these labels is safe.)
        index_ = df2.loc[(df2.street_name==street) &
                         (df2.block==block),:].index
        try:
            search = Client.search(location,page_num=1)["results"][0]
            df.iloc[index_,-2] = search["LATITUDE"]
            df.iloc[index_,-1] = search["LONGITUDE"]
            count,start = countdown(count,start)
        except Exception:
            # Fallback 1: search by street name alone.
            try:
                location = street # "Charming Street Ave 2"
                search = Client.search(location,page_num=1)["results"][0]
                df.iloc[index_,-2] = search["LATITUDE"]
                df.iloc[index_,-1] = search["LONGITUDE"]
                count,start = countdown(count,start)
            except Exception:
                # Fallback 2: search by block number + town.
                try:
                    town = row["town"]
                    location = block + " " + town # "123 Hougang"
                    search = Client.search(location,page_num=1)["results"][0]
                    df.iloc[index_,-2] = search["LATITUDE"]
                    df.iloc[index_,-1] = search["LONGITUDE"]
                    count,start = countdown(count,start)
                except Exception:
                    print(location)  # flag addresses no naming convention resolved
                    continue
df.loc[df.lat.isna(),:]
100%|██████████| 4410/4410 [52:01<00:00, 1.41it/s]
month | town | flat_type | block | street_name | storey_range | floor_area_sqm | flat_model | resale_price | remaining_lease | lat | long |
---|
# Checkpoint the geocoded resale data before the repair pass below.
df.to_csv("resale_loc.csv", index = False)
# Second round of iteration is required because the OneMap API sometimes fails
# to return a geolocation for an address it resolved before.
for i,row in df.loc[df.lat.isna(),:].iterrows():
    street_name = row["street_name"]
    block = row["block"]
    # Check whether other rows at the same address got a result.
    search_df = df.loc[(df.street_name==street_name) &
                       (df.block==block),:]
    if len(search_df)>1: # Yes, there are others
        for j,row2 in search_df.iterrows():
            # pd.notna is safe on both NaN and the string values this column
            # holds (np.isnan raises TypeError on strings).
            if pd.notna(row2["lat"]):
                df.loc[i,["lat","long"]] = df.loc[j,["lat","long"]]
                break
    # FIX: the original wrote `~block[-1].isdigit()`; `~` on a Python bool
    # yields -2 or -1, both truthy, so this branch ran unconditionally.
    elif not block[-1].isdigit(): # last char of block is a letter (e.g. "10A")
        # check for other results under the same numeric block prefix
        search_df = df.loc[(df.street_name==street_name) &
                           (df.block.str.contains(block[:-1])),:]
        if len(search_df)>1: # Yes, there are others
            for j,row2 in search_df.iterrows():
                if pd.notna(row2["lat"]):
                    df.loc[i,["lat","long"]] = df.loc[j,["lat","long"]]
                    break
        else: # no results
            try:
                location = street_name + " " + block[:-1] # find directly from onemap
                search = Client.search(location,page_num=1)["results"][0]
                df.loc[i,"lat"] = search["LATITUDE"]
                df.loc[i,"long"] = search["LONGITUDE"]
            except Exception:
                print("Not found")
    if pd.isna(df.loc[i,"lat"]):
        print(f"{street_name} at {block}")
df.loc[df.lat.isna(),:]
# df = pd.read_csv("dataset/df_geo.csv")
# Checkpoint the fully geocoded dataset (the commented line above reloads it
# when OneMap access is unavailable).
df.to_csv("df_geo.csv",index=False)
df.head()
month | town | flat_type | block | street_name | storey_range | floor_area_sqm | flat_model | resale_price | remaining_lease | lat | long | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2023-01 | ANG MO KIO | 2 ROOM | 406 | ANG MO KIO AVE 10 | 01 TO 03 | 44.0 | Improved | 267000.0 | 55.416667 | 1.36200453938712 | 103.853879910407 |
1 | 2023-01 | ANG MO KIO | 2 ROOM | 323 | ANG MO KIO AVE 3 | 04 TO 06 | 49.0 | Improved | 300000.0 | 53.500000 | 1.36790849360635 | 103.84771408812 |
2 | 2023-01 | ANG MO KIO | 2 ROOM | 314 | ANG MO KIO AVE 3 | 04 TO 06 | 44.0 | Improved | 280000.0 | 54.083333 | 1.36622707120636 | 103.850085858983 |
3 | 2023-01 | ANG MO KIO | 2 ROOM | 314 | ANG MO KIO AVE 3 | 07 TO 09 | 44.0 | Improved | 282000.0 | 54.083333 | 1.36622707120636 | 103.850085858983 |
4 | 2023-01 | ANG MO KIO | 2 ROOM | 170 | ANG MO KIO AVE 4 | 01 TO 03 | 45.0 | Improved | 289800.0 | 62.083333 | 1.37400071781295 | 103.83643153142 |
# Sanity check — duplicate of the head() call in the previous cell.
df.head()
month | town | flat_type | block | street_name | storey_range | floor_area_sqm | flat_model | resale_price | remaining_lease | lat | long | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2023-01 | ANG MO KIO | 2 ROOM | 406 | ANG MO KIO AVE 10 | 01 TO 03 | 44.0 | Improved | 267000.0 | 55.416667 | 1.36200453938712 | 103.853879910407 |
1 | 2023-01 | ANG MO KIO | 2 ROOM | 323 | ANG MO KIO AVE 3 | 04 TO 06 | 49.0 | Improved | 300000.0 | 53.500000 | 1.36790849360635 | 103.84771408812 |
2 | 2023-01 | ANG MO KIO | 2 ROOM | 314 | ANG MO KIO AVE 3 | 04 TO 06 | 44.0 | Improved | 280000.0 | 54.083333 | 1.36622707120636 | 103.850085858983 |
3 | 2023-01 | ANG MO KIO | 2 ROOM | 314 | ANG MO KIO AVE 3 | 07 TO 09 | 44.0 | Improved | 282000.0 | 54.083333 | 1.36622707120636 | 103.850085858983 |
4 | 2023-01 | ANG MO KIO | 2 ROOM | 170 | ANG MO KIO AVE 4 | 01 TO 03 | 45.0 | Improved | 289800.0 | 62.083333 | 1.37400071781295 | 103.83643153142 |
# Reload the geocoded amenity datasets saved by the earlier cells.
mall_df = pd.read_csv("malls_geo.csv")
mrt_df = pd.read_csv("mrt_geo.csv")
school_df = pd.read_csv("schools_geo.csv")
def find_nearest(resale_df, secondary_df, keyword, col, *primary):
    """Annotate `resale_df` in place with nearest-amenity features.

    For every flat, adds three columns: the great-circle distance (km) to the
    nearest amenity in `secondary_df`, that amenity's name, and the count of
    amenities within a 1 km radius.

    Parameters:
        resale_df: DataFrame with float-convertible "lat"/"long"; mutated in place.
        secondary_df: amenity DataFrame with "lat"/"long" and a name column.
        keyword: stem for the new column names (e.g. "mrt" -> "nearest_mrt_dist").
        col: name column in `secondary_df` (e.g. "station").
        *primary: pass any truthy extra positional argument to restrict the
            amenities to rows with mainlevel_code == "PRIMARY" and prefix the
            new columns with "primary_" (kept as varargs for call-compat).
    """
    # The original had two near-identical 17-line branches; they differed only
    # in the amenity subset and column prefix, so fold that into a prologue.
    if primary:
        secondary_df = secondary_df.loc[secondary_df["mainlevel_code"] == "PRIMARY", :]
        keyword = "primary_" + keyword
    dist_col = "nearest_" + keyword + "_dist"
    name_col = "nearest_" + keyword
    count_col = "total_nearby_" + keyword
    resale_df.loc[:, dist_col] = 0
    resale_df.loc[:, name_col] = ""
    resale_df.loc[:, count_col] = 0
    for index1, row1 in resale_df.iterrows():
        # Hoist the flat's coordinates out of the inner amenity loop.
        lon1, lat1 = float(row1["long"]), float(row1["lat"])
        nearest_name = ""
        nearest_dist = 1e6  # sentinel larger than any distance on Earth
        total_nearby = 0
        for index2, row2 in secondary_df.iterrows():
            curr_dist = haversine(lon1, lat1, float(row2["long"]), float(row2["lat"]))
            if curr_dist < nearest_dist:
                nearest_dist = curr_dist
                # Record the name now instead of an index looked up afterwards,
                # which also avoids a bogus .loc[0] lookup on an empty amenity set.
                nearest_name = row2[col]
            if curr_dist <= 1:
                total_nearby += 1
        resale_df.loc[index1, dist_col] = nearest_dist
        resale_df.loc[index1, name_col] = nearest_name
        resale_df.loc[index1, count_col] = total_nearby
find_nearest(df, mrt_df, "mrt", "station")
find_nearest(df, mall_df, "mall", "mall")
find_nearest(df, school_df, "school", "school_name")
find_nearest(df, school_df, "school", "school_name", True)
#https://github.com/cheeaun/sgbusdata/blob/main/data/v1/stops.json
# Bus-stop data comes from a local copy of the sgbusdata stops.json file.
# NOTE(review): the haversine call below passes value[0] as longitude and
# value[1] as latitude, i.e. it assumes each entry is [lng, lat, name] —
# confirm against the source file.
with open('stops.json') as json_file:
    bus_stops_dict = json.load(json_file)
def find_nearest_dict(resale_df, secondary_dict, keyword):
    """Annotate `resale_df` in place with nearest-stop features from a dict.

    Mirrors find_nearest but reads amenities from a dict whose values hold
    [lng, lat, name]; the nearby-count radius here is 300 m instead of 1 km.
    """
    dist_col = "nearest_" + keyword + "_dist"
    name_col = "nearest_" + keyword
    count_col = "total_nearby_" + keyword
    resale_df.loc[:, dist_col] = 0
    resale_df.loc[:, name_col] = ""
    resale_df.loc[:, count_col] = 0
    for idx, flat in resale_df.iterrows():
        flat_long = float(flat["long"])
        flat_lat = float(flat["lat"])
        best_key = 0
        best_dist = 1e6
        nearby = 0
        for key, value in secondary_dict.items():
            dist = haversine(flat_long, flat_lat, float(value[0]), float(value[1]))
            if dist < best_dist:
                best_dist = dist
                best_key = key
            if dist <= 0.3:
                nearby += 1
        resale_df.loc[idx, dist_col] = best_dist
        resale_df.loc[idx, name_col] = secondary_dict[best_key][2]
        resale_df.loc[idx, count_col] = nearby
find_nearest_dict(df, bus_stops_dict, "bus_stop")
# Binary flag for the MOE P1-registration priority radius (within 1 km of the
# nearest primary school).
df["within_1_km_to_nearest_primary_school"] = df["nearest_primary_school_dist"].apply(lambda x: 1 if x < 1 else 0)
df.head()
month | town | flat_type | block | street_name | storey_range | floor_area_sqm | flat_model | resale_price | remaining_lease | ... | nearest_bus_stop_dist | nearest_bus_stop | nearest_primary_school_dist | nearest_primary_school | within_1_km_to_nearest_primary_school | total_nearby_mrt | total_nearby_mall | total_nearby_school | total_nearby_primary_school | total_nearby_bus_stop | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2023-01 | ANG MO KIO | 2 ROOM | 406 | ANG MO KIO AVE 10 | 01 TO 03 | 44.0 | Improved | 267000.0 | 55.416667 | ... | 0.092462 | Opp Christ The King Ch | 0.229176 | TOWNSVILLE PRIMARY SCHOOL | 1 | 0 | 0 | 3 | 2 | 5 |
1 | 2023-01 | ANG MO KIO | 2 ROOM | 323 | ANG MO KIO AVE 3 | 04 TO 06 | 49.0 | Improved | 300000.0 | 53.500000 | ... | 0.109283 | Blk 322 | 0.444161 | TECK GHEE PRIMARY SCHOOL | 1 | 1 | 3 | 7 | 3 | 4 |
2 | 2023-01 | ANG MO KIO | 2 ROOM | 314 | ANG MO KIO AVE 3 | 04 TO 06 | 44.0 | Improved | 280000.0 | 54.083333 | ... | 0.119838 | Blk 346 | 0.121094 | TECK GHEE PRIMARY SCHOOL | 1 | 1 | 3 | 6 | 3 | 2 |
3 | 2023-01 | ANG MO KIO | 2 ROOM | 314 | ANG MO KIO AVE 3 | 07 TO 09 | 44.0 | Improved | 282000.0 | 54.083333 | ... | 0.119838 | Blk 346 | 0.121094 | TECK GHEE PRIMARY SCHOOL | 1 | 1 | 3 | 6 | 3 | 2 |
4 | 2023-01 | ANG MO KIO | 2 ROOM | 170 | ANG MO KIO AVE 4 | 01 TO 03 | 45.0 | Improved | 289800.0 | 62.083333 | ... | 0.135864 | Mayflower Stn Exit 2 | 0.248519 | MAYFLOWER PRIMARY SCHOOL | 1 | 0 | 0 | 6 | 2 | 2 |
5 rows × 28 columns
# Transaction counts per town / block / street, used as popularity features.
town_counts = df.groupby(["town"]).size()
block_counts = df.groupby(["block"]).size()
street_counts = df.groupby(["street_name"]).size()
# Series.map looks each key up in the count Series — same result as
# apply(lambda x: counts.loc[x]) without the per-row Python lambda.
df["total_resales_in_town"] = df["town"].map(town_counts)
df["total_resales_in_block"] = df["block"].map(block_counts)
df["total_resales_in_street"] = df["street_name"].map(street_counts)
df.head()
month | town | flat_type | block | street_name | storey_range | floor_area_sqm | flat_model | resale_price | remaining_lease | ... | nearest_primary_school | within_1_km_to_nearest_primary_school | total_nearby_mrt | total_nearby_mall | total_nearby_school | total_nearby_primary_school | total_nearby_bus_stop | total_resales_in_town | total_resales_in_block | total_resales_in_street | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2023-01 | ANG MO KIO | 2 ROOM | 406 | ANG MO KIO AVE 10 | 01 TO 03 | 44.0 | Improved | 267000.0 | 55.416667 | ... | TOWNSVILLE PRIMARY SCHOOL | 1 | 0 | 0 | 3 | 2 | 5 | 189 | 4 | 40 |
1 | 2023-01 | ANG MO KIO | 2 ROOM | 323 | ANG MO KIO AVE 3 | 04 TO 06 | 49.0 | Improved | 300000.0 | 53.500000 | ... | TECK GHEE PRIMARY SCHOOL | 1 | 1 | 3 | 7 | 3 | 4 | 189 | 4 | 37 |
2 | 2023-01 | ANG MO KIO | 2 ROOM | 314 | ANG MO KIO AVE 3 | 04 TO 06 | 44.0 | Improved | 280000.0 | 54.083333 | ... | TECK GHEE PRIMARY SCHOOL | 1 | 1 | 3 | 6 | 3 | 2 | 189 | 3 | 37 |
3 | 2023-01 | ANG MO KIO | 2 ROOM | 314 | ANG MO KIO AVE 3 | 07 TO 09 | 44.0 | Improved | 282000.0 | 54.083333 | ... | TECK GHEE PRIMARY SCHOOL | 1 | 1 | 3 | 6 | 3 | 2 | 189 | 3 | 37 |
4 | 2023-01 | ANG MO KIO | 2 ROOM | 170 | ANG MO KIO AVE 4 | 01 TO 03 | 45.0 | Improved | 289800.0 | 62.083333 | ... | MAYFLOWER PRIMARY SCHOOL | 1 | 0 | 0 | 6 | 2 | 2 | 189 | 2 | 20 |
5 rows × 31 columns
# Coordinates were only needed to derive the distance features; drop them
# before exporting the engineered dataset.
df.drop(["lat", "long"], axis = 1, inplace = True)
df.to_csv("resale_feature_engineering.csv", index = False)
# Reference lists of the numeric and categorical columns.
# NOTE(review): neither list is used by get_dummies below — presumably kept
# for later modelling cells; confirm before removing.
num_vars = ['floor_area_sqm', 'resale_price', 'remaining_lease',
            'nearest_mrt_dist', 'nearest_mall_dist',
            'nearest_school_dist', 'nearest_bus_stop_dist',
            'nearest_primary_school_dist',
            'within_1_km_to_nearest_primary_school',
            'total_nearby_mrt', 'total_nearby_mall', 'total_nearby_school',
            'total_nearby_primary_school', 'total_nearby_bus_stop',
            'total_resales_in_town', 'total_resales_in_block',
            'total_resales_in_street']
cat_vars = ['month', 'town', 'flat_type', 'block', 'street_name', 'storey_range', 'flat_model',
            'nearest_mrt', 'nearest_mall', 'nearest_school', 'nearest_bus_stop', 'nearest_primary_school']
# One-hot encode every non-numeric column.
df_one_hot = pd.get_dummies(df)
df_one_hot.to_csv('resale_feature_engineering_one_hot.csv', index = False)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# 80/20 split with a fixed seed, then standardise using statistics fitted on
# the training portion only (the same transform is applied to the test set).
train_df_one_hot, test_df_one_hot = train_test_split(df_one_hot, test_size = 0.2, random_state = 42)
scaler = StandardScaler().fit(train_df_one_hot)
print(scaler.mean_)
feature_cols = train_df_one_hot.columns
train_df_one_hot_scaled = pd.DataFrame(scaler.transform(train_df_one_hot), columns = feature_cols)
test_df_one_hot_scaled = pd.DataFrame(scaler.transform(test_df_one_hot), columns = feature_cols)
[9.50907880e+01 5.57036464e+05 7.37632511e+01 ... 8.21995465e-03 6.23582766e-03 3.11791383e-03]
# Persist the fitted scaler parameters so the deployed predictor can apply the
# same standardisation to user input, plus the scaled train/test splits.
mean = scaler.mean_
sd = scaler.scale_
scaler_data = pd.DataFrame({"column_name": train_df_one_hot.columns,
                            "mean": mean,
                            "sd": sd})
scaler_data.to_csv('scaler_data.csv', index = False)
train_df_one_hot_scaled.to_csv("train_df_one_hot_scaled.csv", index = False)
test_df_one_hot_scaled.to_csv("test_df_one_hot_scaled.csv", index = False)
Using the original dataset and engineered features, we discovered the major factors that affect HDB resale prices, such as the location, the flat type and the remaining lease of the flat. Features such as proximity to schools, malls and MRTs are heavily marketed on social media and in property markets. Although these features do not greatly affect the price of HDBs, they remain attractive selling points. The next time you buy an HDB resale flat in Singapore, make sure it has these features — that way you get the most bang for your buck!
Predictive program can be found at https://share.streamlit.io/liankeat/resaleflatsinsg/main.