# Mount Google Drive so the dataset saved there is readable from this Colab notebook.
from google.colab import drive
drive.mount("/content/gdrive")
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
import json
import numpy as np
import pandas as pd
import requests
import time
!pip install beautifulsoup4
!pip install onemapsg
from bs4 import BeautifulSoup
from onemapsg import OneMapClient
from tqdm import tqdm
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ Requirement already satisfied: onemapsg in /usr/local/lib/python3.9/dist-packages (1.0.3) Requirement already satisfied: requests>=2.20.0 in /usr/local/lib/python3.9/dist-packages (from onemapsg) (2.25.1) Requirement already satisfied: requests-toolbelt in /usr/local/lib/python3.9/dist-packages (from onemapsg) (0.10.1) Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.9/dist-packages (from requests>=2.20.0->onemapsg) (2.10) Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.9/dist-packages (from requests>=2.20.0->onemapsg) (1.26.15) Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.9/dist-packages (from requests>=2.20.0->onemapsg) (2022.12.7) Requirement already satisfied: chardet<5,>=3.0.2 in /usr/local/lib/python3.9/dist-packages (from requests>=2.20.0->onemapsg) (4.0.0) Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.9/dist-packages (4.9.3) Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.9/dist-packages (from beautifulsoup4) (2.4)
# 2023 HDB resale transactions, previously saved to Google Drive.
resale = pd.read_csv("gdrive/MyDrive/dataset/ST4248/resale_2023.csv")
After browsing articles, property market sites and social media, we identified 5 additional features that could improve our model.
1. Nearest and Nearby MRTs (1 km)
2. Nearest and Nearby Bus stops (300 m)
3. Nearest and Nearby Schools (1 km)
4. Nearest and Nearby Primary Schools (1 km)
- Within 1km radius gives higher priority during balloting
( https://www.moe.gov.sg/primary/p1-registration/distance/ )
5. Nearest and Nearby Malls (1 km)
Assuming a straight line walking route, we can use geocoordinates from Singapore's onemap API to determine the actual land distance.
# Start onemap client
# NOTE(review): credentials and API token are hardcoded and now committed in
# plain text — rotate them and load from environment variables instead.
creds_email = "cliftonfelix@u.nus.edu"
creds_password = "ST4248GroupC4"
# NOTE(review): `token` appears unused below — OneMapClient authenticates with
# the email/password pair; confirm before removing.
token = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOjEwMDU0LCJ1c2VyX2lkIjoxMDA1NCwiZW1haWwiOiJjbGlmdG9uZmVsaXhAdS5udXMuZWR1IiwiZm9yZXZlciI6ZmFsc2UsImlzcyI6Imh0dHA6XC9cL29tMi5kZmUub25lbWFwLnNnXC9hcGlcL3YyXC91c2VyXC9zZXNzaW9uIiwiaWF0IjoxNjc5MDUwNjEyLCJleHAiOjE2Nzk0ODI2MTIsIm5iZiI6MTY3OTA1MDYxMiwianRpIjoiZjljNjFjN2Q4MTcwNDg5ZTUwNzU5ODRjM2MxYzdhOTYifQ.JNdAjvfVQdIn3RuH9rLlqrQCtJdSq4JYMqIdWBKZ4qI"
Client = OneMapClient(creds_email, creds_password)
def countdown(count, start):
    """Throttle OneMap queries to at most 250 per second.

    Call once per issued query. `count` is the number of queries made in the
    current one-second window and `start` the wall-clock time the window began.
    After the 250th query, sleep for whatever is left of the window (the
    original slept a full second regardless of elapsed time, and computed an
    unused `end` timestamp), then open a fresh window.

    Returns the updated (count, start) pair for the caller to carry forward.
    """
    count += 1
    if count >= 250:
        # Only sleep the remainder of the 1-second window, not a full second.
        elapsed = time.time() - start
        time.sleep(max(0.0, 1.0 - elapsed))
        start = time.time()
        count = 0
    return count, start
# # Distance Multiplier Calculation
# latlong_dist = np.sqrt(0.1**2 + 0.1**2) # 0.1 lat and 0.1 long difference
# actual_dist_m = 15.723 * 1000 # According to https://latlongdata.com/distance-calculator/
# latlong_multiplier = actual_dist_m/latlong_dist
# # Assuming average walking speed as 1.4m/s
# # (according to https://www.healthline.com/health/exercise-fitness/average-walking-speed#average-speed-by-age)
# kmpermin = (1.4/1000) * 60
# print(f"Expected time for 1km walking distance: {1/kmpermin}")
# https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points
from math import radians, cos, sin, asin, sqrt
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance in kilometers between two points
    on the earth (specified in decimal degrees)
    """
    EARTH_RADIUS_KM = 6371  # use 3956 for miles; determines return value units
    # work in radians
    phi1, phi2 = radians(lat1), radians(lat2)
    dphi = phi2 - phi1
    dlambda = radians(lon2) - radians(lon1)
    # haversine formula
    h = sin(dphi / 2) ** 2 + cos(phi1) * cos(phi2) * sin(dlambda / 2) ** 2
    return 2 * EARTH_RADIUS_KM * asin(sqrt(h))
Dataset can be downloaded directly from the Singapore data gov website at https://data.gov.sg/dataset/train-station-chinese-names or it can be accessed via API below.
# Import Dataset via API
# Fetch the train-station dataset from data.gov.sg and keep station/line names.
url = 'https://data.gov.sg/api/action/datastore_search?resource_id=65c1093b-0c34-41cf-8f0e-f11318766298&limit=1000'
response = requests.get(url)
print(f"Response Code : {response.status_code}")
mrt_df = pd.json_normalize(response.json()['result']['records'])
mrt_df = mrt_df.loc[:,["mrt_station_english","mrt_line_english"]]
mrt_df.columns = ["station","line"]
# MRT line names contain the word "Line"; LRT line names do not, so this
# filter keeps MRT stations only.
mrt_df = mrt_df.loc[mrt_df.line.str.contains("Line"),:] # remove LRTs
print(len(mrt_df))
mrt_df.head()
Response Code : 200 141
station | line | |
---|---|---|
0 | Jurong East | North South Line |
1 | Bukit Batok | North South Line |
2 | Bukit Gombak | North South Line |
3 | Choa Chu Kang | North South Line |
4 | Yew Tee | North South Line |
# Adding Longitude and Latitude Data
# Initialise with NaN (not "") so the isna() check below and the later
# dropna() actually detect stations the geocoder failed to resolve; with ""
# a failed lookup slipped through and crashed the float conversion later.
mrt_df.loc[:,"lat"] = np.nan
mrt_df.loc[:,"long"] = np.nan
start = time.time()
count = 0
for i,row in tqdm(mrt_df.iterrows(), total=mrt_df.shape[0]):
    station_name = row["station"] + " MRT"  # suffix disambiguates from roads/landmarks
    try:
        search = Client.search(station_name,page_num=1)["results"][0]
        mrt_df.loc[i,"lat"] = search["LATITUDE"]
        mrt_df.loc[i,"long"] = search["LONGITUDE"]
        count,start = countdown(count,start)
    except Exception:  # narrowed from a bare except so Ctrl-C still works
        continue
mrt_df.loc[mrt_df.lat.isna(),:] # Checking for any entries with no geo location
100%|██████████| 141/141 [01:37<00:00, 1.44it/s]
station | line | lat | long |
---|
# mrt_df = pd.read_csv("dataset/mrt_geo.csv") # If no access to onemap
# Inspect the geocoded MRT stations.
mrt_df.head()
station | line | lat | long | |
---|---|---|---|---|
0 | Jurong East | North South Line | 1.33315281585758 | 103.742286332403 |
1 | Bukit Batok | North South Line | 1.34903331201636 | 103.749566478309 |
2 | Bukit Gombak | North South Line | 1.35861159094192 | 103.751790910733 |
3 | Choa Chu Kang | North South Line | 1.38536316540225 | 103.744370779756 |
4 | Yew Tee | North South Line | 1.39753506936297 | 103.747405150236 |
# Drop stations the geocoder could not resolve, cast coordinates to float,
# and checkpoint the result.
mrt_df = mrt_df.dropna(axis=0)
# np.float was deprecated in NumPy 1.20 and removed in 1.24 — use builtin float.
mrt_df.lat = mrt_df.lat.astype(float)
mrt_df.long = mrt_df.long.astype(float)
mrt_df.to_csv("mrt_geo.csv",index=False)
<ipython-input-22-a04d9a7086c7>:2: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations mrt_df.lat = mrt_df.lat.astype(np.float) <ipython-input-22-a04d9a7086c7>:3: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations mrt_df.long = mrt_df.long.astype(np.float)
Dataset can be downloaded directly from the Singapore data gov website at https://data.gov.sg/dataset/school-directory-and-information?view_id=ba7c477d-a077-4303-96a1-ac1d4f25b190&resource_id=ede26d32-01af-4228-b1ed-f05c45a1d8ee or it can be accessed via API below.
# Import Dataset via API
# Fetch the school directory from data.gov.sg; keep name, postal code and level
# (PRIMARY/SECONDARY/...) for the nearest-school features.
url = 'https://data.gov.sg/api/action/datastore_search?resource_id=ede26d32-01af-4228-b1ed-f05c45a1d8ee&limit=1000'
response = requests.get(url)
print(f"Response Code : {response.status_code}")
school_df = pd.json_normalize(response.json()['result']['records'])
school_df = school_df.loc[:,["school_name","postal_code","mainlevel_code"]]
school_df.head()
Response Code : 200
school_name | postal_code | mainlevel_code | |
---|---|---|---|
0 | ADMIRALTY PRIMARY SCHOOL | 738907 | PRIMARY |
1 | ADMIRALTY SECONDARY SCHOOL | 737916 | SECONDARY |
2 | AHMAD IBRAHIM PRIMARY SCHOOL | 768643 | PRIMARY |
3 | AHMAD IBRAHIM SECONDARY SCHOOL | 768928 | SECONDARY |
4 | AI TONG SCHOOL | 579646 | PRIMARY |
# Adding Longitude and Latitude Data
# Initialise with NaN (not "") so failed lookups are caught by isna()/dropna().
school_df.loc[:,"lat"] = np.nan
school_df.loc[:,"long"] = np.nan
start = time.time()
count = 0
for i,row in tqdm(school_df.iterrows(), total=school_df.shape[0]):
    # Postal codes parsed as ints lose leading zeros; zfill restores the
    # canonical 6-digit form OneMap expects.
    postal_code = str(row["postal_code"]).zfill(6)
    try:
        search = Client.search(postal_code,page_num=1)["results"][0]
        school_df.loc[i,"lat"] = search["LATITUDE"]
        school_df.loc[i,"long"] = search["LONGITUDE"]
        count,start = countdown(count,start)
    except Exception:  # narrowed from a bare except so Ctrl-C still works
        continue
school_df.loc[school_df.lat.isna(),:]
100%|██████████| 346/346 [06:54<00:00, 1.20s/it]
school_name | postal_code | mainlevel_code | lat | long |
---|
# Inspect the geocoded schools.
school_df.head()
school_name | postal_code | mainlevel_code | lat | long | |
---|---|---|---|---|---|
0 | ADMIRALTY PRIMARY SCHOOL | 738907 | PRIMARY | 1.4426347903311 | 103.800040119743 |
1 | ADMIRALTY SECONDARY SCHOOL | 737916 | SECONDARY | 1.44589068910993 | 103.802398196596 |
2 | AHMAD IBRAHIM PRIMARY SCHOOL | 768643 | PRIMARY | 1.43315271543517 | 103.832942401086 |
3 | AHMAD IBRAHIM SECONDARY SCHOOL | 768928 | SECONDARY | 1.43605975368804 | 103.829718690077 |
4 | AI TONG SCHOOL | 579646 | PRIMARY | 1.36058343005814 | 103.83302033725 |
# NOTE(review): ZHENGHUA SECONDARY SCHOOL is excluded explicitly — presumably a
# bad geocode or duplicate; confirm the reason and document it here.
school_df = school_df[school_df["school_name"] != "ZHENGHUA SECONDARY SCHOOL"]
school_df = school_df.dropna(axis=0)
# np.float was deprecated in NumPy 1.20 and removed in 1.24 — use builtin float.
school_df.lat = school_df.lat.astype(float)
school_df.long = school_df.long.astype(float)
school_df.to_csv("schools_geo.csv",index=False)
<ipython-input-32-8a9b0e7a55d3>:2: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations school_df.lat = school_df.lat.astype(np.float) <ipython-input-32-8a9b0e7a55d3>:3: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations school_df.long = school_df.long.astype(np.float)
The list of malls are found by webscraping the wikipedia page due to no such central dataset in any government source.
# Scrape the list of malls from Wikipedia (no central government dataset exists).
url = "https://en.wikipedia.org/wiki/List_of_shopping_malls_in_Singapore"
response = requests.get(url=url)
print(f"Response Code : {response.status_code}")
soup = BeautifulSoup(response.content, 'html.parser')
all_region = soup.find(id="mw-content-text")
# FIX: the original wrapped this in `for region in all_region:` but never used
# `region` — it re-ran the same find_all on all_region once per child node,
# appending the full list repeatedly (duplicates were only masked by set()).
mall_list = []
# The first 8 <ul> lists hold the mall names, one region per list.
items = all_region.find_all("ul")
for item in items[:8]:
    for mall in item.select("li"):
        # NOTE: mall.string can be None for nested markup; str() turns that
        # into the literal "None", which is dropped manually further below.
        mall_list.append(str(mall.string))
mall_list = sorted(set(mall_list))
mall_df = pd.DataFrame(mall_list,columns=["mall"])
mall_df.head()
Response Code : 200
mall | |
---|---|
0 | 321 Clementi |
1 | AMK Hub |
2 | Admiralty Place |
3 | Alexandra Central |
4 | Alexandra Retail Centre |
mall_df = pd.DataFrame(mall_list,columns=["mall"])
# Initialise with NaN (not "") so failed lookups are caught by isna()/dropna().
mall_df.loc[:,"lat"] = np.nan
mall_df.loc[:,"long"] = np.nan
start = time.time()
count = 0
for i,row in tqdm(mall_df.iterrows(), total=mall_df.shape[0]):
    mall = row["mall"]
    try:
        search = Client.search(mall,page_num=1)["results"][0]
        mall_df.loc[i,"lat"] = search["LATITUDE"]
        mall_df.loc[i,"long"] = search["LONGITUDE"]
        count,start = countdown(count,start)
    except Exception:  # narrowed from a bare except so Ctrl-C still works
        continue
mall_df.loc[mall_df.lat.isna(),:]
100%|██████████| 149/149 [01:44<00:00, 1.43it/s]
mall | lat | long |
---|
Take note that some of the malls are not found using OneMap's API. Therefore, their latitude and longitude are input manually using Google Maps.
# Inspect the geocoded malls.
mall_df.head()
mall | lat | long | |
---|---|---|---|
0 | 321 Clementi | 1.31200212030821 | 103.764986676365 |
1 | AMK Hub | 1.36938922690344 | 103.84847796594 |
2 | Admiralty Place | 1.43938615380542 | 103.801706470491 |
3 | Alexandra Central | 1.28728320930592 | 103.805283367958 |
4 | Alexandra Retail Centre | 1.27384263888449 | 103.801375038176 |
# Singapore latitudes all start with "1"; this surfaces rows where the geocoder
# returned nothing, so they can be filled in manually below.
mall_df[mall_df["lat"].str[0] != "1"]
mall | lat | long | |
---|---|---|---|
26 | Clarke Quay Central | ||
47 | Holland Village Shopping Mall | ||
62 | KINEX (formerly OneKM) | ||
76 | Mustafa Shopping Centre | ||
79 | None | ||
82 | OD Mall | ||
89 | Paya Lebar Quarter (PLQ) | ||
95 | PoMo | ||
104 | Shaw House and Centre |
# Same check as above — presumably re-run after the manual fixes below to
# confirm no unresolved malls remain (its output table is empty).
mall_df[mall_df["lat"].str[0] != "1"]
mall | lat | long |
---|
# Manually-sourced coordinates (from Google Maps) for the malls OneMap could
# not resolve, keyed by their row index in mall_df.
manual_coords = {
    26: ("1.2890219680898987", "103.84660784783235"),   # Clarke Quay Central
    47: ("1.3102019819886", "103.7953010354647"),       # Holland Village Shopping Mall
    62: ("1.3147214068241624", "103.89475475854094"),   # KINEX (formerly OneKM)
    76: ("1.3099635534889562", "103.85542043537346"),   # Mustafa Shopping Centre
    82: ("1.3380055424196884", "103.79344084446043"),   # OD Mall
    89: ("1.3158143275605374", "103.8936469972349"),    # Paya Lebar Quarter (PLQ)
    95: ("1.3001929443895233", "103.84922663176022"),   # PoMo (renamed below)
    104: ("1.306292053298618", "103.83184564390703"),   # Shaw House and Centre
}
for idx, (lat, long_) in manual_coords.items():
    mall_df.loc[idx, "lat"] = lat
    mall_df.loc[idx, "long"] = long_
mall_df.loc[95, "mall"] = "GR.iD"  # PoMo was rebranded as GR.iD
mall_df.drop(79, axis = 0, inplace = True)  # row 79 is the literal "None" scrape artifact
mall_df = mall_df.dropna(axis=0)
# np.float was deprecated in NumPy 1.20 and removed in 1.24 — use builtin float.
mall_df.lat = mall_df.lat.astype(float)
mall_df.long = mall_df.long.astype(float)
mall_df.to_csv("malls_geo.csv",index=False)
<ipython-input-55-3cf3f03c3e7c>:2: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations mall_df.lat = mall_df.lat.astype(np.float) <ipython-input-55-3cf3f03c3e7c>:3: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations mall_df.long = mall_df.long.astype(np.float)
The algorithm will attempt several naming conventions for each entry to find the most exact geocoordinates using OneMap's API.
df = resale.copy()
# NOTE(review): lat/long are initialised to "" here (so the skip-check below
# can compare against ""), unlike the NaN convention used for the amenity
# frames — the isna() inspection at the end is only meaningful for rows the
# API genuinely nulls out.
df.loc[:,"lat"] = ""
df.loc[:,"long"] = ""
df2 = df.copy()  # frozen snapshot: iterated over and used for index lookups only
count = 0
start = time.time()
for i,row in tqdm(df2.iterrows(), total=df2.shape[0]):
    # FIX: check the live frame `df`, not the df2 snapshot. df2 is never
    # updated, so the original `row["lat"] == ""` was always true and every
    # already-filled address was re-queried, wasting API quota.
    if df.loc[i,"lat"] == "":
        block = row["block"]
        street = row["street_name"]
        location = block + " " + street # "123 Charming Street Ave 2"
        # All rows sharing this block + street get the same coordinates at once.
        # (index labels equal positions here because resale has a RangeIndex,
        # so iloc with these labels is safe.)
        index_ = df2.loc[(df2.street_name==street) &
                         (df2.block==block),:].index
        try:
            search = Client.search(location,page_num=1)["results"][0]
            df.iloc[index_,-2] = search["LATITUDE"]
            df.iloc[index_,-1] = search["LONGITUDE"]
            count,start = countdown(count,start)
        except Exception:
            # Fallback 1: search by street name alone.
            try:
                location = street # "Charming Street Ave 2"
                search = Client.search(location,page_num=1)["results"][0]
                df.iloc[index_,-2] = search["LATITUDE"]
                df.iloc[index_,-1] = search["LONGITUDE"]
                count,start = countdown(count,start)
            except Exception:
                # Fallback 2: search by block number + town.
                try:
                    town = row["town"]
                    location = block + " " + town # "123 Hougang"
                    search = Client.search(location,page_num=1)["results"][0]
                    df.iloc[index_,-2] = search["LATITUDE"]
                    df.iloc[index_,-1] = search["LONGITUDE"]
                    count,start = countdown(count,start)
                except Exception:
                    print(location)  # flag addresses no naming convention resolved
                    continue
df.loc[df.lat.isna(),:]
100%|██████████| 4410/4410 [52:01<00:00, 1.41it/s]
month | town | flat_type | block | street_name | storey_range | floor_area_sqm | flat_model | resale_price | remaining_lease | lat | long |
---|
# Checkpoint the geocoded resale data before the repair pass below.
df.to_csv("resale_loc.csv", index = False)
# Second round of iteration is required because the OneMap API sometimes fails
# to return a geolocation for an address it resolved before.
for i,row in df.loc[df.lat.isna(),:].iterrows():
    street_name = row["street_name"]
    block = row["block"]
    # Check whether other rows at the same address got a result.
    search_df = df.loc[(df.street_name==street_name) &
                       (df.block==block),:]
    if len(search_df)>1: # Yes, there are others
        for j,row2 in search_df.iterrows():
            # pd.notna is safe on both NaN and the string values this column
            # holds (np.isnan raises TypeError on strings).
            if pd.notna(row2["lat"]):
                df.loc[i,["lat","long"]] = df.loc[j,["lat","long"]]
                break
    # FIX: the original wrote `~block[-1].isdigit()`; `~` on a Python bool
    # yields -2 or -1, both truthy, so this branch ran unconditionally.
    elif not block[-1].isdigit(): # last char of block is a letter (e.g. "10A")
        # check for other results under the same numeric block prefix
        search_df = df.loc[(df.street_name==street_name) &
                           (df.block.str.contains(block[:-1])),:]
        if len(search_df)>1: # Yes, there are others
            for j,row2 in search_df.iterrows():
                if pd.notna(row2["lat"]):
                    df.loc[i,["lat","long"]] = df.loc[j,["lat","long"]]
                    break
        else: # no results
            try:
                location = street_name + " " + block[:-1] # find directly from onemap
                search = Client.search(location,page_num=1)["results"][0]
                df.loc[i,"lat"] = search["LATITUDE"]
                df.loc[i,"long"] = search["LONGITUDE"]
            except Exception:
                print("Not found")
    if pd.isna(df.loc[i,"lat"]):
        print(f"{street_name} at {block}")
df.loc[df.lat.isna(),:]
# df = pd.read_csv("dataset/df_geo.csv")
# Checkpoint the fully geocoded dataset (the commented line above reloads it
# when OneMap access is unavailable).
df.to_csv("df_geo.csv",index=False)
df.head()
month | town | flat_type | block | street_name | storey_range | floor_area_sqm | flat_model | resale_price | remaining_lease | lat | long | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2023-01 | ANG MO KIO | 2 ROOM | 406 | ANG MO KIO AVE 10 | 01 TO 03 | 44.0 | Improved | 267000.0 | 55.416667 | 1.36200453938712 | 103.853879910407 |
1 | 2023-01 | ANG MO KIO | 2 ROOM | 323 | ANG MO KIO AVE 3 | 04 TO 06 | 49.0 | Improved | 300000.0 | 53.500000 | 1.36790849360635 | 103.84771408812 |
2 | 2023-01 | ANG MO KIO | 2 ROOM | 314 | ANG MO KIO AVE 3 | 04 TO 06 | 44.0 | Improved | 280000.0 | 54.083333 | 1.36622707120636 | 103.850085858983 |
3 | 2023-01 | ANG MO KIO | 2 ROOM | 314 | ANG MO KIO AVE 3 | 07 TO 09 | 44.0 | Improved | 282000.0 | 54.083333 | 1.36622707120636 | 103.850085858983 |
4 | 2023-01 | ANG MO KIO | 2 ROOM | 170 | ANG MO KIO AVE 4 | 01 TO 03 | 45.0 | Improved | 289800.0 | 62.083333 | 1.37400071781295 | 103.83643153142 |
# Sanity check — duplicate of the head() call in the previous cell.
df.head()
month | town | flat_type | block | street_name | storey_range | floor_area_sqm | flat_model | resale_price | remaining_lease | lat | long | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2023-01 | ANG MO KIO | 2 ROOM | 406 | ANG MO KIO AVE 10 | 01 TO 03 | 44.0 | Improved | 267000.0 | 55.416667 | 1.36200453938712 | 103.853879910407 |
1 | 2023-01 | ANG MO KIO | 2 ROOM | 323 | ANG MO KIO AVE 3 | 04 TO 06 | 49.0 | Improved | 300000.0 | 53.500000 | 1.36790849360635 | 103.84771408812 |
2 | 2023-01 | ANG MO KIO | 2 ROOM | 314 | ANG MO KIO AVE 3 | 04 TO 06 | 44.0 | Improved | 280000.0 | 54.083333 | 1.36622707120636 | 103.850085858983 |
3 | 2023-01 | ANG MO KIO | 2 ROOM | 314 | ANG MO KIO AVE 3 | 07 TO 09 | 44.0 | Improved | 282000.0 | 54.083333 | 1.36622707120636 | 103.850085858983 |
4 | 2023-01 | ANG MO KIO | 2 ROOM | 170 | ANG MO KIO AVE 4 | 01 TO 03 | 45.0 | Improved | 289800.0 | 62.083333 | 1.37400071781295 | 103.83643153142 |
# Reload the geocoded amenity datasets saved by the earlier cells.
mall_df = pd.read_csv("malls_geo.csv")
mrt_df = pd.read_csv("mrt_geo.csv")
school_df = pd.read_csv("schools_geo.csv")
def find_nearest(resale_df, secondary_df, keyword, col, *primary):
    """Annotate `resale_df` in place with nearest-amenity features.

    For every flat, adds three columns: the great-circle distance (km) to the
    nearest amenity in `secondary_df`, that amenity's name, and the count of
    amenities within a 1 km radius.

    Parameters:
        resale_df: DataFrame with float-convertible "lat"/"long"; mutated in place.
        secondary_df: amenity DataFrame with "lat"/"long" and a name column.
        keyword: stem for the new column names (e.g. "mrt" -> "nearest_mrt_dist").
        col: name column in `secondary_df` (e.g. "station").
        *primary: pass any truthy extra positional argument to restrict the
            amenities to rows with mainlevel_code == "PRIMARY" and prefix the
            new columns with "primary_" (kept as varargs for call-compat).
    """
    # The original had two near-identical 17-line branches; they differed only
    # in the amenity subset and column prefix, so fold that into a prologue.
    if primary:
        secondary_df = secondary_df.loc[secondary_df["mainlevel_code"] == "PRIMARY", :]
        keyword = "primary_" + keyword
    dist_col = "nearest_" + keyword + "_dist"
    name_col = "nearest_" + keyword
    count_col = "total_nearby_" + keyword
    resale_df.loc[:, dist_col] = 0
    resale_df.loc[:, name_col] = ""
    resale_df.loc[:, count_col] = 0
    for index1, row1 in resale_df.iterrows():
        # Hoist the flat's coordinates out of the inner amenity loop.
        lon1, lat1 = float(row1["long"]), float(row1["lat"])
        nearest_name = ""
        nearest_dist = 1e6  # sentinel larger than any distance on Earth
        total_nearby = 0
        for index2, row2 in secondary_df.iterrows():
            curr_dist = haversine(lon1, lat1, float(row2["long"]), float(row2["lat"]))
            if curr_dist < nearest_dist:
                nearest_dist = curr_dist
                # Record the name now instead of an index looked up afterwards,
                # which also avoids a bogus .loc[0] lookup on an empty amenity set.
                nearest_name = row2[col]
            if curr_dist <= 1:
                total_nearby += 1
        resale_df.loc[index1, dist_col] = nearest_dist
        resale_df.loc[index1, name_col] = nearest_name
        resale_df.loc[index1, count_col] = total_nearby
find_nearest(df, mrt_df, "mrt", "station")
find_nearest(df, mall_df, "mall", "mall")
find_nearest(df, school_df, "school", "school_name")
find_nearest(df, school_df, "school", "school_name", True)
#https://github.com/cheeaun/sgbusdata/blob/main/data/v1/stops.json
# Bus-stop data comes from a local copy of the sgbusdata stops.json file.
# NOTE(review): the haversine call below passes value[0] as longitude and
# value[1] as latitude, i.e. it assumes each entry is [lng, lat, name] —
# confirm against the source file.
with open('stops.json') as json_file:
    bus_stops_dict = json.load(json_file)
def find_nearest_dict(resale_df, secondary_dict, keyword):
    """Annotate `resale_df` in place with nearest-stop features from a dict.

    Mirrors find_nearest but reads amenities from a dict whose values hold
    [lng, lat, name]; the nearby-count radius here is 300 m instead of 1 km.
    """
    dist_col = "nearest_" + keyword + "_dist"
    name_col = "nearest_" + keyword
    count_col = "total_nearby_" + keyword
    resale_df.loc[:, dist_col] = 0
    resale_df.loc[:, name_col] = ""
    resale_df.loc[:, count_col] = 0
    for idx, flat in resale_df.iterrows():
        flat_long = float(flat["long"])
        flat_lat = float(flat["lat"])
        best_key = 0
        best_dist = 1e6
        nearby = 0
        for key, value in secondary_dict.items():
            dist = haversine(flat_long, flat_lat, float(value[0]), float(value[1]))
            if dist < best_dist:
                best_dist = dist
                best_key = key
            if dist <= 0.3:
                nearby += 1
        resale_df.loc[idx, dist_col] = best_dist
        resale_df.loc[idx, name_col] = secondary_dict[best_key][2]
        resale_df.loc[idx, count_col] = nearby
find_nearest_dict(df, bus_stops_dict, "bus_stop")
# Binary flag for the MOE P1-registration priority radius (within 1 km of the
# nearest primary school).
df["within_1_km_to_nearest_primary_school"] = df["nearest_primary_school_dist"].apply(lambda x: 1 if x < 1 else 0)
df.head()
month | town | flat_type | block | street_name | storey_range | floor_area_sqm | flat_model | resale_price | remaining_lease | ... | nearest_bus_stop_dist | nearest_bus_stop | nearest_primary_school_dist | nearest_primary_school | within_1_km_to_nearest_primary_school | total_nearby_mrt | total_nearby_mall | total_nearby_school | total_nearby_primary_school | total_nearby_bus_stop | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2023-01 | ANG MO KIO | 2 ROOM | 406 | ANG MO KIO AVE 10 | 01 TO 03 | 44.0 | Improved | 267000.0 | 55.416667 | ... | 0.092462 | Opp Christ The King Ch | 0.229176 | TOWNSVILLE PRIMARY SCHOOL | 1 | 0 | 0 | 3 | 2 | 5 |
1 | 2023-01 | ANG MO KIO | 2 ROOM | 323 | ANG MO KIO AVE 3 | 04 TO 06 | 49.0 | Improved | 300000.0 | 53.500000 | ... | 0.109283 | Blk 322 | 0.444161 | TECK GHEE PRIMARY SCHOOL | 1 | 1 | 3 | 7 | 3 | 4 |
2 | 2023-01 | ANG MO KIO | 2 ROOM | 314 | ANG MO KIO AVE 3 | 04 TO 06 | 44.0 | Improved | 280000.0 | 54.083333 | ... | 0.119838 | Blk 346 | 0.121094 | TECK GHEE PRIMARY SCHOOL | 1 | 1 | 3 | 6 | 3 | 2 |
3 | 2023-01 | ANG MO KIO | 2 ROOM | 314 | ANG MO KIO AVE 3 | 07 TO 09 | 44.0 | Improved | 282000.0 | 54.083333 | ... | 0.119838 | Blk 346 | 0.121094 | TECK GHEE PRIMARY SCHOOL | 1 | 1 | 3 | 6 | 3 | 2 |
4 | 2023-01 | ANG MO KIO | 2 ROOM | 170 | ANG MO KIO AVE 4 | 01 TO 03 | 45.0 | Improved | 289800.0 | 62.083333 | ... | 0.135864 | Mayflower Stn Exit 2 | 0.248519 | MAYFLOWER PRIMARY SCHOOL | 1 | 0 | 0 | 6 | 2 | 2 |
5 rows × 28 columns
# Transaction counts per town / block / street, used as popularity features.
town_counts = df.groupby(["town"]).size()
block_counts = df.groupby(["block"]).size()
street_counts = df.groupby(["street_name"]).size()
# Series.map looks each key up in the count Series — same result as
# apply(lambda x: counts.loc[x]) without the per-row Python lambda.
df["total_resales_in_town"] = df["town"].map(town_counts)
df["total_resales_in_block"] = df["block"].map(block_counts)
df["total_resales_in_street"] = df["street_name"].map(street_counts)
df.head()
month | town | flat_type | block | street_name | storey_range | floor_area_sqm | flat_model | resale_price | remaining_lease | ... | nearest_primary_school | within_1_km_to_nearest_primary_school | total_nearby_mrt | total_nearby_mall | total_nearby_school | total_nearby_primary_school | total_nearby_bus_stop | total_resales_in_town | total_resales_in_block | total_resales_in_street | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2023-01 | ANG MO KIO | 2 ROOM | 406 | ANG MO KIO AVE 10 | 01 TO 03 | 44.0 | Improved | 267000.0 | 55.416667 | ... | TOWNSVILLE PRIMARY SCHOOL | 1 | 0 | 0 | 3 | 2 | 5 | 189 | 4 | 40 |
1 | 2023-01 | ANG MO KIO | 2 ROOM | 323 | ANG MO KIO AVE 3 | 04 TO 06 | 49.0 | Improved | 300000.0 | 53.500000 | ... | TECK GHEE PRIMARY SCHOOL | 1 | 1 | 3 | 7 | 3 | 4 | 189 | 4 | 37 |
2 | 2023-01 | ANG MO KIO | 2 ROOM | 314 | ANG MO KIO AVE 3 | 04 TO 06 | 44.0 | Improved | 280000.0 | 54.083333 | ... | TECK GHEE PRIMARY SCHOOL | 1 | 1 | 3 | 6 | 3 | 2 | 189 | 3 | 37 |
3 | 2023-01 | ANG MO KIO | 2 ROOM | 314 | ANG MO KIO AVE 3 | 07 TO 09 | 44.0 | Improved | 282000.0 | 54.083333 | ... | TECK GHEE PRIMARY SCHOOL | 1 | 1 | 3 | 6 | 3 | 2 | 189 | 3 | 37 |
4 | 2023-01 | ANG MO KIO | 2 ROOM | 170 | ANG MO KIO AVE 4 | 01 TO 03 | 45.0 | Improved | 289800.0 | 62.083333 | ... | MAYFLOWER PRIMARY SCHOOL | 1 | 0 | 0 | 6 | 2 | 2 | 189 | 2 | 20 |
5 rows × 31 columns
# Coordinates were only needed to derive the distance features; drop them
# before exporting the engineered dataset.
df.drop(["lat", "long"], axis = 1, inplace = True)
df.to_csv("resale_feature_engineering.csv", index = False)
# Reference lists of the numeric and categorical columns.
# NOTE(review): neither list is used by get_dummies below — presumably kept
# for later modelling cells; confirm before removing.
num_vars = ['floor_area_sqm', 'resale_price', 'remaining_lease',
            'nearest_mrt_dist', 'nearest_mall_dist',
            'nearest_school_dist', 'nearest_bus_stop_dist',
            'nearest_primary_school_dist',
            'within_1_km_to_nearest_primary_school',
            'total_nearby_mrt', 'total_nearby_mall', 'total_nearby_school',
            'total_nearby_primary_school', 'total_nearby_bus_stop',
            'total_resales_in_town', 'total_resales_in_block',
            'total_resales_in_street']
cat_vars = ['month', 'town', 'flat_type', 'block', 'street_name', 'storey_range', 'flat_model',
            'nearest_mrt', 'nearest_mall', 'nearest_school', 'nearest_bus_stop', 'nearest_primary_school']
# One-hot encode every non-numeric column.
df_one_hot = pd.get_dummies(df)
df_one_hot.to_csv('resale_feature_engineering_one_hot.csv', index = False)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# 80/20 split with a fixed seed, then standardise using statistics fitted on
# the training portion only (the same transform is applied to the test set).
train_df_one_hot, test_df_one_hot = train_test_split(df_one_hot, test_size = 0.2, random_state = 42)
scaler = StandardScaler().fit(train_df_one_hot)
print(scaler.mean_)
feature_cols = train_df_one_hot.columns
train_df_one_hot_scaled = pd.DataFrame(scaler.transform(train_df_one_hot), columns = feature_cols)
test_df_one_hot_scaled = pd.DataFrame(scaler.transform(test_df_one_hot), columns = feature_cols)
[9.50907880e+01 5.57036464e+05 7.37632511e+01 ... 8.21995465e-03 6.23582766e-03 3.11791383e-03]
# Persist the fitted scaler parameters so the deployed predictor can apply the
# same standardisation to user input, plus the scaled train/test splits.
mean = scaler.mean_
sd = scaler.scale_
scaler_data = pd.DataFrame({"column_name": train_df_one_hot.columns,
                            "mean": mean,
                            "sd": sd})
scaler_data.to_csv('scaler_data.csv', index = False)
train_df_one_hot_scaled.to_csv("train_df_one_hot_scaled.csv", index = False)
test_df_one_hot_scaled.to_csv("test_df_one_hot_scaled.csv", index = False)
Using the original dataset and engineered features, we discovered the major factors that affect HDB resale prices, such as the location, the flat type and the remaining lease of the flat. Features such as proximity to schools, malls and MRTs are heavily marketed on social media and in property markets. Although these features do not greatly affect the price of HDBs, they remain attractive selling points. The next time you buy an HDB resale flat in Singapore, make sure it has these features — that way you get the most bang for your buck!
Predictive program can be found at https://share.streamlit.io/liankeat/resaleflatsinsg/main.