In [ ]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
In [ ]:
unscaled_data = pd.read_csv('./data/feature_selection/resale_feature_selection_not_scaled.csv')
In [ ]:
unscaled_data.head()
Out[ ]:
floor_area_sqm total_resales_in_town nearest_mrt_dist remaining_lease town_BUKIT MERAH flat_type_3 ROOM street_name_CANTONMENT RD storey_range_01 TO 03 storey_range_04 TO 06 storey_range_19 TO 21 ... town_JURONG EAST town_BEDOK storey_range_31 TO 33 nearest_primary_school_RADIN MAS PRIMARY SCHOOL nearest_mrt_Tiong Bahru nearest_mrt_Sengkang total_resales_in_street nearest_mrt_Outram Park storey_range_28 TO 30 resale_price
0 44.0 189 1.003996 55.416667 0 0 0 1 0 0 ... 0 0 0 0 0 0 40 0 0 267000.0
1 49.0 189 0.304447 53.500000 0 0 0 0 1 0 ... 0 0 0 0 0 0 37 0 0 300000.0
2 44.0 189 0.416219 54.083333 0 0 0 0 1 0 ... 0 0 0 0 0 0 37 0 0 280000.0
3 44.0 189 0.416219 54.083333 0 0 0 0 0 0 ... 0 0 0 0 0 0 37 0 0 282000.0
4 45.0 189 1.280521 62.083333 0 0 0 1 0 0 ... 0 0 0 0 0 0 20 0 0 289800.0

5 rows × 75 columns

In [ ]:
unscaled_data.describe()
Out[ ]:
floor_area_sqm total_resales_in_town nearest_mrt_dist remaining_lease town_BUKIT MERAH flat_type_3 ROOM street_name_CANTONMENT RD storey_range_01 TO 03 storey_range_04 TO 06 storey_range_19 TO 21 ... town_JURONG EAST town_BEDOK storey_range_31 TO 33 nearest_primary_school_RADIN MAS PRIMARY SCHOOL nearest_mrt_Tiong Bahru nearest_mrt_Sengkang total_resales_in_street nearest_mrt_Outram Park storey_range_28 TO 30 resale_price
count 4410.000000 4410.000000 4410.000000 4410.000000 4410.000000 4410.000000 4410.000000 4410.000000 4410.000000 4410.000000 ... 4410.000000 4410.000000 4410.000000 4410.000000 4410.000000 4410.000000 4410.000000 4410.000000 4410.000000 4.410000e+03
mean 94.821156 234.816780 0.859754 73.789021 0.036508 0.265986 0.002041 0.170975 0.229478 0.016780 ... 0.018821 0.053968 0.002041 0.003855 0.014512 0.085941 18.998639 0.003855 0.004989 5.556083e+05
std 24.082048 101.834821 0.453918 15.054199 0.187572 0.441907 0.045134 0.376530 0.420545 0.128461 ... 0.135907 0.225981 0.045134 0.061975 0.119604 0.280309 17.129396 0.061975 0.070462 1.686516e+05
min 37.000000 6.000000 0.036138 43.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 2.300000e+05
25% 73.000000 152.000000 0.498500 61.166667 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 8.000000 0.000000 0.000000 4.300000e+05
50% 93.000000 223.000000 0.787363 73.416667 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 14.000000 0.000000 0.000000 5.350000e+05
75% 111.750000 362.000000 1.160318 90.583333 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 24.000000 0.000000 0.000000 6.480000e+05
max 192.000000 387.000000 3.479159 95.500000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 93.000000 1.000000 1.000000 1.340000e+06

8 rows × 75 columns

In [ ]:
unscaled_data.columns
Out[ ]:
Index(['floor_area_sqm', 'total_resales_in_town', 'nearest_mrt_dist',
       'remaining_lease', 'town_BUKIT MERAH', 'flat_type_3 ROOM',
       'street_name_CANTONMENT RD', 'storey_range_01 TO 03',
       'storey_range_04 TO 06', 'storey_range_19 TO 21',
       'flat_model_Apartment', 'flat_model_Maisonette', 'town_PASIR RIS',
       'street_name_DAWSON RD', 'flat_model_Premium Apartment',
       'nearest_mall_dist', 'total_nearby_mrt', 'street_name_ANG MO KIO ST 51',
       'flat_model_New Generation', 'nearest_bus_stop_Blk 18',
       'town_SEMBAWANG', 'flat_type_2 ROOM', 'flat_type_5 ROOM',
       'flat_type_EXECUTIVE', 'flat_model_Standard', 'flat_model_Type S1',
       'flat_model_Model A', 'storey_range_16 TO 18', 'flat_model_Simplified',
       'flat_model_DBSS', 'town_MARINE PARADE', 'flat_model_Improved',
       'storey_range_07 TO 09',
       'nearest_primary_school_ZHANGDE PRIMARY SCHOOL',
       'street_name_JLN BAHAGIA', 'town_BUKIT BATOK', 'town_BUKIT PANJANG',
       'town_HOUGANG', 'town_CHOA CHU KANG',
       'nearest_school_WOODLANDS SECONDARY SCHOOL', 'nearest_mrt_Admiralty',
       'street_name_MCNAIR RD', 'nearest_mrt_Kallang',
       'nearest_mall_Sun Plaza', 'total_resales_in_block',
       'storey_range_10 TO 12', 'nearest_school_GAN ENG SENG PRIMARY SCHOOL',
       'nearest_school_BLANGAH RISE PRIMARY SCHOOL',
       'street_name_TELOK BLANGAH ST 31', 'storey_range_22 TO 24',
       'nearest_bus_stop_Opp Blk 565', 'town_CENTRAL AREA',
       'street_name_ANG MO KIO AVE 3', 'street_name_ANG MO KIO ST 44',
       'nearest_school_KHENG CHENG SCHOOL', 'storey_range_37 TO 39',
       'storey_range_25 TO 27', 'flat_type_4 ROOM', 'town_QUEENSTOWN',
       'town_ANG MO KIO', 'town_SENGKANG', 'town_TAMPINES', 'town_WOODLANDS',
       'nearest_mrt_Toa Payoh', 'street_name_CLEMENTI AVE 3',
       'town_JURONG EAST', 'town_BEDOK', 'storey_range_31 TO 33',
       'nearest_primary_school_RADIN MAS PRIMARY SCHOOL',
       'nearest_mrt_Tiong Bahru', 'nearest_mrt_Sengkang',
       'total_resales_in_street', 'nearest_mrt_Outram Park',
       'storey_range_28 TO 30', 'resale_price'],
      dtype='object')
In [ ]:
unscaled_data['price_per_sqm'] = unscaled_data['resale_price']/unscaled_data['floor_area_sqm']
In [ ]:
unscaled_data.head()
Out[ ]:
floor_area_sqm total_resales_in_town nearest_mrt_dist remaining_lease town_BUKIT MERAH flat_type_3 ROOM street_name_CANTONMENT RD storey_range_01 TO 03 storey_range_04 TO 06 storey_range_19 TO 21 ... town_BEDOK storey_range_31 TO 33 nearest_primary_school_RADIN MAS PRIMARY SCHOOL nearest_mrt_Tiong Bahru nearest_mrt_Sengkang total_resales_in_street nearest_mrt_Outram Park storey_range_28 TO 30 resale_price price_per_sqm
0 44.0 189 1.003996 55.416667 0 0 0 1 0 0 ... 0 0 0 0 0 40 0 0 267000.0 6068.181818
1 49.0 189 0.304447 53.500000 0 0 0 0 1 0 ... 0 0 0 0 0 37 0 0 300000.0 6122.448980
2 44.0 189 0.416219 54.083333 0 0 0 0 1 0 ... 0 0 0 0 0 37 0 0 280000.0 6363.636364
3 44.0 189 0.416219 54.083333 0 0 0 0 0 0 ... 0 0 0 0 0 37 0 0 282000.0 6409.090909
4 45.0 189 1.280521 62.083333 0 0 0 1 0 0 ... 0 0 0 0 0 20 0 0 289800.0 6440.000000

5 rows × 76 columns

In [ ]:
unscaled_data.describe()
Out[ ]:
floor_area_sqm total_resales_in_town nearest_mrt_dist remaining_lease town_BUKIT MERAH flat_type_3 ROOM street_name_CANTONMENT RD storey_range_01 TO 03 storey_range_04 TO 06 storey_range_19 TO 21 ... town_BEDOK storey_range_31 TO 33 nearest_primary_school_RADIN MAS PRIMARY SCHOOL nearest_mrt_Tiong Bahru nearest_mrt_Sengkang total_resales_in_street nearest_mrt_Outram Park storey_range_28 TO 30 resale_price price_per_sqm
count 4410.000000 4410.000000 4410.000000 4410.000000 4410.000000 4410.000000 4410.000000 4410.000000 4410.000000 4410.000000 ... 4410.000000 4410.000000 4410.000000 4410.000000 4410.000000 4410.000000 4410.000000 4410.000000 4.410000e+03 4410.000000
mean 94.821156 234.816780 0.859754 73.789021 0.036508 0.265986 0.002041 0.170975 0.229478 0.016780 ... 0.053968 0.002041 0.003855 0.014512 0.085941 18.998639 0.003855 0.004989 5.556083e+05 5923.272014
std 24.082048 101.834821 0.453918 15.054199 0.187572 0.441907 0.045134 0.376530 0.420545 0.128461 ... 0.225981 0.045134 0.061975 0.119604 0.280309 17.129396 0.061975 0.070462 1.686516e+05 1319.972500
min 37.000000 6.000000 0.036138 43.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 2.300000e+05 3403.508772
25% 73.000000 152.000000 0.498500 61.166667 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 8.000000 0.000000 0.000000 4.300000e+05 5066.342441
50% 93.000000 223.000000 0.787363 73.416667 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 14.000000 0.000000 0.000000 5.350000e+05 5585.236118
75% 111.750000 362.000000 1.160318 90.583333 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 24.000000 0.000000 0.000000 6.480000e+05 6400.000000
max 192.000000 387.000000 3.479159 95.500000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 93.000000 1.000000 1.000000 1.340000e+06 13723.404255

8 rows × 76 columns

In [ ]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(unscaled_data, test_size = 0.2, random_state = 42)
In [ ]:
# perform scaling on quantitative features

# total_resales_in_town
# nearest_mrt_dist
# remaining_lease'
    
# nearest_mall_dist
# total_nearby_mrt

# total_resales_in_block
# total_resales_in_street
In [ ]:
train_y = train_df['price_per_sqm']
train_x = train_df.drop(['price_per_sqm', 'floor_area_sqm', 'resale_price'], axis=1)

test_y = test_df['price_per_sqm']
test_x = test_df.drop(['price_per_sqm', 'floor_area_sqm', 'resale_price'], axis=1)
In [ ]:
reg = LinearRegression().fit(train_x, train_y)
In [ ]:
import statsmodels.api as sm

train_x2 = sm.add_constant(train_x)
est = sm.OLS(train_y, train_x2)
est2 = est.fit()
print(est2.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:          price_per_sqm   R-squared:                       0.850
Model:                            OLS   Adj. R-squared:                  0.847
Method:                 Least Squares   F-statistic:                     271.3
Date:                Sat, 25 Mar 2023   Prob (F-statistic):               0.00
Time:                        21:43:14   Log-Likelihood:                -27013.
No. Observations:                3528   AIC:                         5.417e+04
Df Residuals:                    3455   BIC:                         5.462e+04
Df Model:                          72                                         
Covariance Type:            nonrobust                                         
===================================================================================================================
                                                      coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------------------
const                                            4246.0972    526.129      8.070      0.000    3214.542    5277.652
total_resales_in_town                              -5.1221      0.174    -29.395      0.000      -5.464      -4.780
nearest_mrt_dist                                 -392.0953     28.667    -13.678      0.000    -448.301    -335.890
remaining_lease                                    64.8747      1.013     64.039      0.000      62.888      66.861
town_BUKIT MERAH                                  543.1916    119.334      4.552      0.000     309.219     777.165
flat_type_3 ROOM                                 -697.5989    523.358     -1.333      0.183   -1723.722     328.524
street_name_CANTONMENT RD                        2219.9375    441.854      5.024      0.000    1353.617    3086.258
storey_range_01 TO 03                            -620.6503     36.089    -17.198      0.000    -691.408    -549.893
storey_range_04 TO 06                            -435.5614     34.541    -12.610      0.000    -503.284    -367.839
storey_range_19 TO 21                             332.5007     75.659      4.395      0.000     184.160     480.841
flat_model_Apartment                              189.7051    132.501      1.432      0.152     -70.084     449.494
flat_model_Maisonette                             631.4407    134.905      4.681      0.000     366.938     895.943
town_PASIR RIS                                   -777.9371     63.807    -12.192      0.000    -903.040    -652.835
street_name_DAWSON RD                            2057.4598    210.342      9.781      0.000    1645.052    2469.868
flat_model_Premium Apartment                     -219.4219     74.512     -2.945      0.003    -365.515     -73.329
nearest_mall_dist                                -159.0245     27.036     -5.882      0.000    -212.032    -106.017
total_nearby_mrt                                   87.7080     12.609      6.956      0.000      62.986     112.430
street_name_ANG MO KIO ST 51                     1104.8803    184.839      5.978      0.000     742.475    1467.286
flat_model_New Generation                        -266.8412     75.245     -3.546      0.000    -414.370    -119.313
nearest_bus_stop_Blk 18                          1300.3506    230.744      5.635      0.000     847.943    1752.758
town_SEMBAWANG                                  -1078.6626     75.696    -14.250      0.000   -1227.077    -930.249
flat_type_2 ROOM                                 -222.1454    525.266     -0.423      0.672   -1252.009     807.719
flat_type_5 ROOM                                -1094.6596    523.698     -2.090      0.037   -2121.449     -67.870
flat_type_EXECUTIVE                             -1369.9033    533.029     -2.570      0.010   -2414.987    -324.819
flat_model_Standard                              -381.7736     90.390     -4.224      0.000    -558.997    -204.551
flat_model_Type S1                               -445.4836    418.125     -1.065      0.287   -1265.281     374.314
flat_model_Model A                               -438.4732     70.345     -6.233      0.000    -576.395    -300.551
storey_range_16 TO 18                             151.6447     53.136      2.854      0.004      47.463     255.827
flat_model_Simplified                              50.9849     80.475      0.634      0.526    -106.798     208.768
flat_model_DBSS                                   957.8831    105.763      9.057      0.000     750.519    1165.247
town_MARINE PARADE                               1438.8825    119.886     12.002      0.000    1203.828    1673.937
flat_model_Improved                              -425.4139     72.607     -5.859      0.000    -567.770    -283.058
storey_range_07 TO 09                            -278.6488     34.433     -8.092      0.000    -346.161    -211.137
nearest_primary_school_ZHANGDE PRIMARY SCHOOL     780.4289    163.710      4.767      0.000     459.450    1101.408
street_name_JLN BAHAGIA                          1980.6356    217.758      9.096      0.000    1553.687    2407.584
town_BUKIT BATOK                                 -583.2634     48.579    -12.007      0.000    -678.509    -488.018
town_BUKIT PANJANG                              -1366.4006     59.234    -23.068      0.000   -1482.537   -1250.264
town_HOUGANG                                     -210.0800     42.299     -4.967      0.000    -293.014    -127.147
town_CHOA CHU KANG                              -1195.6802     45.061    -26.535      0.000   -1284.030   -1107.331
nearest_school_WOODLANDS SECONDARY SCHOOL         642.6794    127.541      5.039      0.000     392.616     892.743
nearest_mrt_Admiralty                            -266.3586     63.304     -4.208      0.000    -390.475    -142.243
street_name_MCNAIR RD                            1031.7811    166.540      6.195      0.000     705.254    1358.308
nearest_mrt_Kallang                               940.8730    123.546      7.616      0.000     698.643    1183.103
nearest_mall_Sun Plaza                           -595.5670     96.541     -6.169      0.000    -784.851    -406.283
total_resales_in_block                             15.4206      3.068      5.026      0.000       9.405      21.437
storey_range_10 TO 12                            -166.1132     35.276     -4.709      0.000    -235.277     -96.949
nearest_school_GAN ENG SENG PRIMARY SCHOOL        698.3186    220.737      3.164      0.002     265.530    1131.107
nearest_school_BLANGAH RISE PRIMARY SCHOOL        380.9247    170.008      2.241      0.025      47.599     714.250
street_name_TELOK BLANGAH ST 31                   581.7126    171.348      3.395      0.001     245.758     917.667
storey_range_22 TO 24                             607.7347     85.951      7.071      0.000     439.215     776.255
nearest_bus_stop_Opp Blk 565                      541.1651     91.543      5.912      0.000     361.681     720.649
town_CENTRAL AREA                                 -39.3566    141.183     -0.279      0.780    -316.168     237.455
street_name_ANG MO KIO AVE 3                     -131.4407    111.968     -1.174      0.241    -350.971      88.090
street_name_ANG MO KIO ST 44                      541.1651     91.543      5.912      0.000     361.681     720.649
nearest_school_KHENG CHENG SCHOOL                 413.0946    108.657      3.802      0.000     200.056     626.133
storey_range_37 TO 39                            1191.1417    190.883      6.240      0.000     816.886    1565.398
storey_range_25 TO 27                             863.2411     94.851      9.101      0.000     677.271    1049.211
flat_type_4 ROOM                                -1013.1508    523.052     -1.937      0.053   -2038.672      12.371
town_QUEENSTOWN                                   692.4003     69.491      9.964      0.000     556.152     828.649
town_ANG MO KIO                                   410.1668     57.757      7.102      0.000     296.925     523.409
town_SENGKANG                                    -574.1014     45.146    -12.716      0.000    -662.617    -485.585
town_TAMPINES                                     371.2856     40.604      9.144      0.000     291.675     450.896
town_WOODLANDS                                   -201.8772     56.219     -3.591      0.000    -312.103     -91.651
nearest_mrt_Toa Payoh                             357.4936    109.452      3.266      0.001     142.896     572.091
street_name_CLEMENTI AVE 3                       1016.3727    161.198      6.305      0.000     700.320    1332.426
town_JURONG EAST                                 -986.1403     67.647    -14.578      0.000   -1118.773    -853.508
town_BEDOK                                        380.1097     45.925      8.277      0.000     290.066     470.153
storey_range_31 TO 33                            1161.1612    187.387      6.197      0.000     793.761    1528.562
nearest_primary_school_RADIN MAS PRIMARY SCHOOL  -382.6097    166.340     -2.300      0.021    -708.744     -56.475
nearest_mrt_Tiong Bahru                           395.6359    154.386      2.563      0.010      92.938     698.334
nearest_mrt_Sengkang                               36.5487     41.428      0.882      0.378     -44.678     117.775
total_resales_in_street                             0.0014      0.639      0.002      0.998      -1.251       1.253
nearest_mrt_Outram Park                          1271.9461    211.425      6.016      0.000     857.415    1686.477
storey_range_28 TO 30                            1034.0408    136.308      7.586      0.000     766.789    1301.293
==============================================================================
Omnibus:                      205.404   Durbin-Watson:                   2.005
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              554.498
Skew:                           0.309   Prob(JB):                    3.91e-121
Kurtosis:                       4.841   Cond. No.                     1.08e+16
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 2.14e-24. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [ ]:
est2.summary2()
Out[ ]:
Model: OLS Adj. R-squared: 0.847
Dependent Variable: price_per_sqm AIC: 54171.5485
Date: 2023-03-25 21:43 BIC: 54621.8480
No. Observations: 3528 Log-Likelihood: -27013.
Df Model: 72 F-statistic: 271.3
Df Residuals: 3455 Prob (F-statistic): 0.00
R-squared: 0.850 Scale: 2.6737e+05
Coef. Std.Err. t P>|t| [0.025 0.975]
const 4246.0972 526.1288 8.0705 0.0000 3214.5422 5277.6521
total_resales_in_town -5.1221 0.1742 -29.3955 0.0000 -5.4637 -4.7805
nearest_mrt_dist -392.0953 28.6666 -13.6778 0.0000 -448.3006 -335.8900
remaining_lease 64.8747 1.0131 64.0389 0.0000 62.8884 66.8609
town_BUKIT MERAH 543.1916 119.3344 4.5518 0.0000 309.2185 777.1647
flat_type_3 ROOM -697.5989 523.3584 -1.3329 0.1826 -1723.7221 328.5242
street_name_CANTONMENT RD 2219.9375 441.8537 5.0241 0.0000 1353.6167 3086.2584
storey_range_01 TO 03 -620.6503 36.0889 -17.1978 0.0000 -691.4081 -549.8925
storey_range_04 TO 06 -435.5614 34.5407 -12.6101 0.0000 -503.2837 -367.8391
storey_range_19 TO 21 332.5007 75.6587 4.3947 0.0000 184.1603 480.8411
flat_model_Apartment 189.7051 132.5014 1.4317 0.1523 -70.0839 449.4941
flat_model_Maisonette 631.4407 134.9054 4.6806 0.0000 366.9382 895.9431
town_PASIR RIS -777.9371 63.8066 -12.1921 0.0000 -903.0395 -652.8347
street_name_DAWSON RD 2057.4598 210.3423 9.7815 0.0000 1645.0520 2469.8676
flat_model_Premium Apartment -219.4219 74.5124 -2.9448 0.0033 -365.5147 -73.3290
nearest_mall_dist -159.0245 27.0358 -5.8820 0.0000 -212.0324 -106.0166
total_nearby_mrt 87.7080 12.6090 6.9560 0.0000 62.9861 112.4299
street_name_ANG MO KIO ST 51 1104.8803 184.8395 5.9775 0.0000 742.4746 1467.2860
flat_model_New Generation -266.8412 75.2446 -3.5463 0.0004 -414.3697 -119.3128
nearest_bus_stop_Blk 18 1300.3506 230.7436 5.6355 0.0000 847.9429 1752.7583
town_SEMBAWANG -1078.6626 75.6963 -14.2499 0.0000 -1227.0766 -930.2486
flat_type_2 ROOM -222.1454 525.2664 -0.4229 0.6724 -1252.0094 807.7186
flat_type_5 ROOM -1094.6596 523.6985 -2.0902 0.0367 -2121.4494 -67.8697
flat_type_EXECUTIVE -1369.9033 533.0292 -2.5700 0.0102 -2414.9874 -324.8193
flat_model_Standard -381.7736 90.3899 -4.2236 0.0000 -558.9967 -204.5505
flat_model_Type S1 -445.4836 418.1250 -1.0654 0.2868 -1265.2808 374.3136
flat_model_Model A -438.4732 70.3450 -6.2332 0.0000 -576.3952 -300.5513
storey_range_16 TO 18 151.6447 53.1364 2.8539 0.0043 47.4629 255.8266
flat_model_Simplified 50.9849 80.4748 0.6336 0.5264 -106.7982 208.7680
flat_model_DBSS 957.8831 105.7630 9.0569 0.0000 750.5186 1165.2475
town_MARINE PARADE 1438.8825 119.8861 12.0021 0.0000 1203.8277 1673.9374
flat_model_Improved -425.4139 72.6066 -5.8592 0.0000 -567.7700 -283.0578
storey_range_07 TO 09 -278.6488 34.4334 -8.0924 0.0000 -346.1606 -211.1370
nearest_primary_school_ZHANGDE PRIMARY SCHOOL 780.4289 163.7104 4.7671 0.0000 459.4499 1101.4079
street_name_JLN BAHAGIA 1980.6356 217.7584 9.0956 0.0000 1553.6873 2407.5838
town_BUKIT BATOK -583.2634 48.5786 -12.0066 0.0000 -678.5091 -488.0178
town_BUKIT PANJANG -1366.4006 59.2337 -23.0680 0.0000 -1482.5371 -1250.2641
town_HOUGANG -210.0800 42.2990 -4.9666 0.0000 -293.0135 -127.1465
town_CHOA CHU KANG -1195.6802 45.0613 -26.5345 0.0000 -1284.0297 -1107.3307
nearest_school_WOODLANDS SECONDARY SCHOOL 642.6794 127.5412 5.0390 0.0000 392.6156 892.7431
nearest_mrt_Admiralty -266.3586 63.3035 -4.2076 0.0000 -390.4746 -142.2425
street_name_MCNAIR RD 1031.7811 166.5402 6.1954 0.0000 705.2538 1358.3083
nearest_mrt_Kallang 940.8730 123.5458 7.6156 0.0000 698.6428 1183.1032
nearest_mall_Sun Plaza -595.5670 96.5412 -6.1690 0.0000 -784.8506 -406.2834
total_resales_in_block 15.4206 3.0684 5.0256 0.0000 9.4045 21.4368
storey_range_10 TO 12 -166.1132 35.2759 -4.7090 0.0000 -235.2770 -96.9495
nearest_school_GAN ENG SENG PRIMARY SCHOOL 698.3186 220.7372 3.1636 0.0016 265.5300 1131.1071
nearest_school_BLANGAH RISE PRIMARY SCHOOL 380.9247 170.0076 2.2406 0.0251 47.5991 714.2504
street_name_TELOK BLANGAH ST 31 581.7126 171.3485 3.3949 0.0007 245.7581 917.6671
storey_range_22 TO 24 607.7347 85.9511 7.0707 0.0000 439.2147 776.2547
nearest_bus_stop_Opp Blk 565 541.1651 91.5431 5.9116 0.0000 361.6811 720.6490
town_CENTRAL AREA -39.3566 141.1834 -0.2788 0.7804 -316.1681 237.4548
street_name_ANG MO KIO AVE 3 -131.4407 111.9683 -1.1739 0.2405 -350.9714 88.0900
street_name_ANG MO KIO ST 44 541.1651 91.5431 5.9116 0.0000 361.6811 720.6490
nearest_school_KHENG CHENG SCHOOL 413.0946 108.6572 3.8018 0.0001 200.0557 626.1335
storey_range_37 TO 39 1191.1417 190.8835 6.2402 0.0000 816.8859 1565.3976
storey_range_25 TO 27 863.2411 94.8512 9.1010 0.0000 677.2709 1049.2112
flat_type_4 ROOM -1013.1508 523.0515 -1.9370 0.0528 -2038.6723 12.3706
town_QUEENSTOWN 692.4003 69.4915 9.9638 0.0000 556.1519 828.6488
town_ANG MO KIO 410.1668 57.7573 7.1016 0.0000 296.9249 523.4087
town_SENGKANG -574.1014 45.1462 -12.7165 0.0000 -662.6174 -485.5855
town_TAMPINES 371.2856 40.6040 9.1441 0.0000 291.6754 450.8957
town_WOODLANDS -201.8772 56.2191 -3.5909 0.0003 -312.1033 -91.6511
nearest_mrt_Toa Payoh 357.4936 109.4523 3.2662 0.0011 142.8958 572.0914
street_name_CLEMENTI AVE 3 1016.3727 161.1979 6.3051 0.0000 700.3198 1332.4256
town_JURONG EAST -986.1403 67.6472 -14.5777 0.0000 -1118.7727 -853.5078
town_BEDOK 380.1097 45.9253 8.2767 0.0000 290.0662 470.1533
storey_range_31 TO 33 1161.1612 187.3870 6.1966 0.0000 793.7608 1528.5616
nearest_primary_school_RADIN MAS PRIMARY SCHOOL -382.6097 166.3400 -2.3002 0.0215 -708.7443 -56.4752
nearest_mrt_Tiong Bahru 395.6359 154.3864 2.5626 0.0104 92.9381 698.3337
nearest_mrt_Sengkang 36.5487 41.4282 0.8822 0.3777 -44.6776 117.7749
total_resales_in_street 0.0014 0.6386 0.0023 0.9982 -1.2505 1.2534
nearest_mrt_Outram Park 1271.9461 211.4250 6.0161 0.0000 857.4155 1686.4768
storey_range_28 TO 30 1034.0408 136.3078 7.5861 0.0000 766.7887 1301.2929
Omnibus: 205.404 Durbin-Watson: 2.005
Prob(Omnibus): 0.000 Jarque-Bera (JB): 554.498
Skew: 0.309 Prob(JB): 0.000
Kurtosis: 4.841 Condition No.: 10792461271321490
In [ ]:
num_test = test_x.shape[0]
num_pred_price = test_x.shape[1] 
num_pred_ppsqm = test_price_x.shape[1]

def adj_r2(r2_score, n, k):
    return 1 - (1 - r2_score) * (n - 1) / (n - k - 1)
In [ ]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error

pred_y = reg.predict(test_x) 

print(f"RMSE: {mean_squared_error(pred_y * test_df['floor_area_sqm'], test_df['resale_price'], squared=False)}")

print(f"MSE: {mean_squared_error(pred_y * test_df['floor_area_sqm'], test_df['resale_price'], squared=True)}")

print(f"MAPE: {mean_absolute_percentage_error(pred_y * test_df['floor_area_sqm'], test_df['resale_price'])}" )

r2 = r2_score(pred_y * test_df['floor_area_sqm'], test_df['resale_price'])

print(f"R2: {r2}")
      
print(f"Adj R2: {adj_r2(r2, num_test, num_pred_ppsqm)}")
RMSE: 49426.826997841184
MSE: 2443011227.0745225
MAPE: 0.06620348238280643
R2: 0.9028775431364398
Adj R2: 0.8939716425070674
In [ ]:
train_price_y = train_df['resale_price']
train_price_x = train_df.drop(['price_per_sqm', 'resale_price'], axis=1)

test_price_y = test_df['resale_price']
test_price_x = test_df.drop(['price_per_sqm', 'resale_price'], axis=1)
In [ ]:
train_df.describe()
Out[ ]:
floor_area_sqm total_resales_in_town nearest_mrt_dist remaining_lease town_BUKIT MERAH flat_type_3 ROOM street_name_CANTONMENT RD storey_range_01 TO 03 storey_range_04 TO 06 storey_range_19 TO 21 ... town_BEDOK storey_range_31 TO 33 nearest_primary_school_RADIN MAS PRIMARY SCHOOL nearest_mrt_Tiong Bahru nearest_mrt_Sengkang total_resales_in_street nearest_mrt_Outram Park storey_range_28 TO 30 resale_price price_per_sqm
count 3528.000000 3528.000000 3528.000000 3528.000000 3528.000000 3528.000000 3528.000000 3528.000000 3528.000000 3528.000000 ... 3528.000000 3528.000000 3528.000000 3528.000000 3528.000000 3528.000000 3528.000000 3528.000000 3.528000e+03 3528.000000
mean 95.090788 234.252551 0.856008 73.763251 0.035998 0.258503 0.002551 0.174036 0.227608 0.016156 ... 0.052721 0.002268 0.004252 0.014172 0.085601 18.755385 0.004819 0.004535 5.570365e+05 5918.672653
std 23.925711 102.023088 0.452228 14.959429 0.186311 0.437874 0.050450 0.379194 0.419347 0.126095 ... 0.223508 0.047572 0.065076 0.118218 0.279813 16.906956 0.069258 0.067200 1.694671e+05 1320.094818
min 37.000000 6.000000 0.036138 43.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 2.300000e+05 3403.508772
25% 74.000000 152.000000 0.496064 61.166667 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 8.000000 0.000000 0.000000 4.330000e+05 5068.261755
50% 93.000000 223.000000 0.782662 73.416667 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 14.000000 0.000000 0.000000 5.350000e+05 5567.270033
75% 111.000000 362.000000 1.153094 90.333333 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 24.000000 0.000000 0.000000 6.452220e+05 6397.256483
max 192.000000 387.000000 3.479159 95.250000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 93.000000 1.000000 1.000000 1.340000e+06 13723.404255

8 rows × 76 columns

In [ ]:
reg2 = LinearRegression().fit(train_price_x, train_price_y)
reg2.score(train_price_x, train_price_y)
Out[ ]:
0.9049534725102613
In [ ]:
pred_price_y = reg2.predict(test_price_x) 

print(f"RMSE: {mean_squared_error(pred_price_y, test_price_y, squared=False)}")

print(f"MSE: {mean_squared_error(pred_price_y, test_price_y, squared=True)}")

print(f"MAPE: {mean_absolute_percentage_error(pred_price_y, test_price_y)}" )

r2 = r2_score(np.array(pred_price_y), test_price_y)

print(f"R2: {r2}")
      
print(f"Adj R2: {adj_r2(r2, num_test, num_pred_price)}")
RMSE: 54316.37118057538
MSE: 2950268178.2260394
MAPE: 0.0780557332344016
R2: 0.8854189654232654
Adj R2: 0.8750669660122485
In [ ]:
print("TRAIN")
print(reg.score(train_x, train_y))
print(reg2.score(train_price_x, train_price_y))

print('\nTEST')
print(reg.score(test_x, test_y))
print(reg2.score(test_price_x, test_price_y))
TRAIN
0.8497037887882943
0.9049534725102613

TEST
0.8461168378576372
0.8919312720249223
In [ ]:
from sklearn.ensemble import GradientBoostingRegressor

reg = GradientBoostingRegressor(random_state=42, learning_rate=0.01, max_depth=10, n_estimators=1500, subsample=0.2)
reg.fit(train_x, train_y)
Out[ ]:
GradientBoostingRegressor(learning_rate=0.01, max_depth=10, n_estimators=1500,
                          random_state=42, subsample=0.2)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GradientBoostingRegressor(learning_rate=0.01, max_depth=10, n_estimators=1500,
                          random_state=42, subsample=0.2)
In [ ]:
print(reg.score(train_x, train_y))
print(reg.score(test_x, test_y))
0.9817539632699481
0.9210077325852002
In [ ]:
pred_y = reg.predict(test_x) 

print(f"RMSE: {mean_squared_error(pred_y * test_df['floor_area_sqm'], test_df['resale_price'], squared=False)}")

print(f"MSE: {mean_squared_error(pred_y * test_df['floor_area_sqm'], test_df['resale_price'], squared=True)}")

print(f"MAPE: {mean_absolute_percentage_error(pred_y * test_df['floor_area_sqm'], test_df['resale_price'])}" )

r2 = r2_score(pred_y * test_df['floor_area_sqm'], test_df['resale_price'])

print(f"R2: {r2}")
      
print(f"Adj R2: {adj_r2(r2, num_test, num_pred_ppsqm)}")
RMSE: 35003.75237082601
MSE: 1225262680.0381076
MAPE: 0.04342378394524176
R2: 0.9519062895582234
Adj R2: 0.9474962095424967
In [ ]:
import matplotlib.pyplot as plt

plt.style.use('ggplot')

feature_importance = reg.feature_importances_
# make importances relative to max importance
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
sorted_idx = sorted_idx[-10:]
pos = np.arange(sorted_idx.shape[0]) + .5
# plt.subplot(1, 2, 2)
plt.figure(figsize=(10, 7))
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, train_x.keys()[sorted_idx])
# plt.xlabel('Relative Importance')
plt.title('Gradient Boosted Regression Feature Importance')
plt.show()
In [ ]:
reg2 = GradientBoostingRegressor(random_state=1)
reg2.fit(train_price_x, train_price_y)
Out[ ]:
GradientBoostingRegressor(random_state=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GradientBoostingRegressor(random_state=1)
In [ ]:
pred_price_y = reg2.predict(test_price_x) 

print(f"RMSE: {mean_squared_error(pred_price_y, test_price_y, squared=False)}")

print(f"MSE: {mean_squared_error(pred_price_y, test_price_y, squared=True)}")

print(f"MAPE: {mean_absolute_percentage_error(pred_price_y, test_price_y)}" )

r2 = r2_score(np.array(pred_price_y), test_price_y)

print(f"R2: {r2}")
      
print(f"Adj R2: {adj_r2(r2, num_test, num_pred_price)}")
RMSE: 50644.27956427074
MSE: 2564843052.584011
MAPE: 0.06185336184942124
R2: 0.8842771840825315
Adj R2: 0.8738220286840473
In [ ]:
print("TRAIN")
print(reg.score(train_x, train_y))
print(reg2.score(train_price_x, train_price_y))

print('\nTEST')
print(reg.score(test_x, test_y))
print(reg2.score(test_price_x, test_price_y))
TRAIN
0.8838548946496559
0.9255936939558768

TEST
0.8662866062721712
0.9060494472352904
In [ ]: