import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
unscaled_data = pd.read_csv('./data/feature_selection/resale_feature_selection_not_scaled.csv')
unscaled_data.head()
floor_area_sqm | total_resales_in_town | nearest_mrt_dist | remaining_lease | town_BUKIT MERAH | flat_type_3 ROOM | street_name_CANTONMENT RD | storey_range_01 TO 03 | storey_range_04 TO 06 | storey_range_19 TO 21 | ... | town_JURONG EAST | town_BEDOK | storey_range_31 TO 33 | nearest_primary_school_RADIN MAS PRIMARY SCHOOL | nearest_mrt_Tiong Bahru | nearest_mrt_Sengkang | total_resales_in_street | nearest_mrt_Outram Park | storey_range_28 TO 30 | resale_price | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 44.0 | 189 | 1.003996 | 55.416667 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 40 | 0 | 0 | 267000.0 |
1 | 49.0 | 189 | 0.304447 | 53.500000 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 37 | 0 | 0 | 300000.0 |
2 | 44.0 | 189 | 0.416219 | 54.083333 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 37 | 0 | 0 | 280000.0 |
3 | 44.0 | 189 | 0.416219 | 54.083333 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 37 | 0 | 0 | 282000.0 |
4 | 45.0 | 189 | 1.280521 | 62.083333 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 20 | 0 | 0 | 289800.0 |
5 rows × 75 columns
unscaled_data.describe()
floor_area_sqm | total_resales_in_town | nearest_mrt_dist | remaining_lease | town_BUKIT MERAH | flat_type_3 ROOM | street_name_CANTONMENT RD | storey_range_01 TO 03 | storey_range_04 TO 06 | storey_range_19 TO 21 | ... | town_JURONG EAST | town_BEDOK | storey_range_31 TO 33 | nearest_primary_school_RADIN MAS PRIMARY SCHOOL | nearest_mrt_Tiong Bahru | nearest_mrt_Sengkang | total_resales_in_street | nearest_mrt_Outram Park | storey_range_28 TO 30 | resale_price | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 4410.000000 | 4410.000000 | 4410.000000 | 4410.000000 | 4410.000000 | 4410.000000 | 4410.000000 | 4410.000000 | 4410.000000 | 4410.000000 | ... | 4410.000000 | 4410.000000 | 4410.000000 | 4410.000000 | 4410.000000 | 4410.000000 | 4410.000000 | 4410.000000 | 4410.000000 | 4.410000e+03 |
mean | 94.821156 | 234.816780 | 0.859754 | 73.789021 | 0.036508 | 0.265986 | 0.002041 | 0.170975 | 0.229478 | 0.016780 | ... | 0.018821 | 0.053968 | 0.002041 | 0.003855 | 0.014512 | 0.085941 | 18.998639 | 0.003855 | 0.004989 | 5.556083e+05 |
std | 24.082048 | 101.834821 | 0.453918 | 15.054199 | 0.187572 | 0.441907 | 0.045134 | 0.376530 | 0.420545 | 0.128461 | ... | 0.135907 | 0.225981 | 0.045134 | 0.061975 | 0.119604 | 0.280309 | 17.129396 | 0.061975 | 0.070462 | 1.686516e+05 |
min | 37.000000 | 6.000000 | 0.036138 | 43.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 2.300000e+05 |
25% | 73.000000 | 152.000000 | 0.498500 | 61.166667 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 8.000000 | 0.000000 | 0.000000 | 4.300000e+05 |
50% | 93.000000 | 223.000000 | 0.787363 | 73.416667 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 14.000000 | 0.000000 | 0.000000 | 5.350000e+05 |
75% | 111.750000 | 362.000000 | 1.160318 | 90.583333 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 24.000000 | 0.000000 | 0.000000 | 6.480000e+05 |
max | 192.000000 | 387.000000 | 3.479159 | 95.500000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 93.000000 | 1.000000 | 1.000000 | 1.340000e+06 |
8 rows × 75 columns
unscaled_data.columns
Index(['floor_area_sqm', 'total_resales_in_town', 'nearest_mrt_dist', 'remaining_lease', 'town_BUKIT MERAH', 'flat_type_3 ROOM', 'street_name_CANTONMENT RD', 'storey_range_01 TO 03', 'storey_range_04 TO 06', 'storey_range_19 TO 21', 'flat_model_Apartment', 'flat_model_Maisonette', 'town_PASIR RIS', 'street_name_DAWSON RD', 'flat_model_Premium Apartment', 'nearest_mall_dist', 'total_nearby_mrt', 'street_name_ANG MO KIO ST 51', 'flat_model_New Generation', 'nearest_bus_stop_Blk 18', 'town_SEMBAWANG', 'flat_type_2 ROOM', 'flat_type_5 ROOM', 'flat_type_EXECUTIVE', 'flat_model_Standard', 'flat_model_Type S1', 'flat_model_Model A', 'storey_range_16 TO 18', 'flat_model_Simplified', 'flat_model_DBSS', 'town_MARINE PARADE', 'flat_model_Improved', 'storey_range_07 TO 09', 'nearest_primary_school_ZHANGDE PRIMARY SCHOOL', 'street_name_JLN BAHAGIA', 'town_BUKIT BATOK', 'town_BUKIT PANJANG', 'town_HOUGANG', 'town_CHOA CHU KANG', 'nearest_school_WOODLANDS SECONDARY SCHOOL', 'nearest_mrt_Admiralty', 'street_name_MCNAIR RD', 'nearest_mrt_Kallang', 'nearest_mall_Sun Plaza', 'total_resales_in_block', 'storey_range_10 TO 12', 'nearest_school_GAN ENG SENG PRIMARY SCHOOL', 'nearest_school_BLANGAH RISE PRIMARY SCHOOL', 'street_name_TELOK BLANGAH ST 31', 'storey_range_22 TO 24', 'nearest_bus_stop_Opp Blk 565', 'town_CENTRAL AREA', 'street_name_ANG MO KIO AVE 3', 'street_name_ANG MO KIO ST 44', 'nearest_school_KHENG CHENG SCHOOL', 'storey_range_37 TO 39', 'storey_range_25 TO 27', 'flat_type_4 ROOM', 'town_QUEENSTOWN', 'town_ANG MO KIO', 'town_SENGKANG', 'town_TAMPINES', 'town_WOODLANDS', 'nearest_mrt_Toa Payoh', 'street_name_CLEMENTI AVE 3', 'town_JURONG EAST', 'town_BEDOK', 'storey_range_31 TO 33', 'nearest_primary_school_RADIN MAS PRIMARY SCHOOL', 'nearest_mrt_Tiong Bahru', 'nearest_mrt_Sengkang', 'total_resales_in_street', 'nearest_mrt_Outram Park', 'storey_range_28 TO 30', 'resale_price'], dtype='object')
unscaled_data['price_per_sqm'] = unscaled_data['resale_price']/unscaled_data['floor_area_sqm']
unscaled_data.head()
floor_area_sqm | total_resales_in_town | nearest_mrt_dist | remaining_lease | town_BUKIT MERAH | flat_type_3 ROOM | street_name_CANTONMENT RD | storey_range_01 TO 03 | storey_range_04 TO 06 | storey_range_19 TO 21 | ... | town_BEDOK | storey_range_31 TO 33 | nearest_primary_school_RADIN MAS PRIMARY SCHOOL | nearest_mrt_Tiong Bahru | nearest_mrt_Sengkang | total_resales_in_street | nearest_mrt_Outram Park | storey_range_28 TO 30 | resale_price | price_per_sqm | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 44.0 | 189 | 1.003996 | 55.416667 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 40 | 0 | 0 | 267000.0 | 6068.181818 |
1 | 49.0 | 189 | 0.304447 | 53.500000 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 37 | 0 | 0 | 300000.0 | 6122.448980 |
2 | 44.0 | 189 | 0.416219 | 54.083333 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 37 | 0 | 0 | 280000.0 | 6363.636364 |
3 | 44.0 | 189 | 0.416219 | 54.083333 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 37 | 0 | 0 | 282000.0 | 6409.090909 |
4 | 45.0 | 189 | 1.280521 | 62.083333 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 20 | 0 | 0 | 289800.0 | 6440.000000 |
5 rows × 76 columns
unscaled_data.describe()
floor_area_sqm | total_resales_in_town | nearest_mrt_dist | remaining_lease | town_BUKIT MERAH | flat_type_3 ROOM | street_name_CANTONMENT RD | storey_range_01 TO 03 | storey_range_04 TO 06 | storey_range_19 TO 21 | ... | town_BEDOK | storey_range_31 TO 33 | nearest_primary_school_RADIN MAS PRIMARY SCHOOL | nearest_mrt_Tiong Bahru | nearest_mrt_Sengkang | total_resales_in_street | nearest_mrt_Outram Park | storey_range_28 TO 30 | resale_price | price_per_sqm | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 4410.000000 | 4410.000000 | 4410.000000 | 4410.000000 | 4410.000000 | 4410.000000 | 4410.000000 | 4410.000000 | 4410.000000 | 4410.000000 | ... | 4410.000000 | 4410.000000 | 4410.000000 | 4410.000000 | 4410.000000 | 4410.000000 | 4410.000000 | 4410.000000 | 4.410000e+03 | 4410.000000 |
mean | 94.821156 | 234.816780 | 0.859754 | 73.789021 | 0.036508 | 0.265986 | 0.002041 | 0.170975 | 0.229478 | 0.016780 | ... | 0.053968 | 0.002041 | 0.003855 | 0.014512 | 0.085941 | 18.998639 | 0.003855 | 0.004989 | 5.556083e+05 | 5923.272014 |
std | 24.082048 | 101.834821 | 0.453918 | 15.054199 | 0.187572 | 0.441907 | 0.045134 | 0.376530 | 0.420545 | 0.128461 | ... | 0.225981 | 0.045134 | 0.061975 | 0.119604 | 0.280309 | 17.129396 | 0.061975 | 0.070462 | 1.686516e+05 | 1319.972500 |
min | 37.000000 | 6.000000 | 0.036138 | 43.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 2.300000e+05 | 3403.508772 |
25% | 73.000000 | 152.000000 | 0.498500 | 61.166667 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 8.000000 | 0.000000 | 0.000000 | 4.300000e+05 | 5066.342441 |
50% | 93.000000 | 223.000000 | 0.787363 | 73.416667 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 14.000000 | 0.000000 | 0.000000 | 5.350000e+05 | 5585.236118 |
75% | 111.750000 | 362.000000 | 1.160318 | 90.583333 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 24.000000 | 0.000000 | 0.000000 | 6.480000e+05 | 6400.000000 |
max | 192.000000 | 387.000000 | 3.479159 | 95.500000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 93.000000 | 1.000000 | 1.000000 | 1.340000e+06 | 13723.404255 |
8 rows × 76 columns
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(unscaled_data, test_size = 0.2, random_state = 42)
# perform scaling on quantitative features
# total_resales_in_town
# nearest_mrt_dist
# remaining_lease'
# nearest_mall_dist
# total_nearby_mrt
# total_resales_in_block
# total_resales_in_street
train_y = train_df['price_per_sqm']
train_x = train_df.drop(['price_per_sqm', 'floor_area_sqm', 'resale_price'], axis=1)
test_y = test_df['price_per_sqm']
test_x = test_df.drop(['price_per_sqm', 'floor_area_sqm', 'resale_price'], axis=1)
reg = LinearRegression().fit(train_x, train_y)
import statsmodels.api as sm
train_x2 = sm.add_constant(train_x)
est = sm.OLS(train_y, train_x2)
est2 = est.fit()
print(est2.summary())
OLS Regression Results ============================================================================== Dep. Variable: price_per_sqm R-squared: 0.850 Model: OLS Adj. R-squared: 0.847 Method: Least Squares F-statistic: 271.3 Date: Sat, 25 Mar 2023 Prob (F-statistic): 0.00 Time: 21:43:14 Log-Likelihood: -27013. No. Observations: 3528 AIC: 5.417e+04 Df Residuals: 3455 BIC: 5.462e+04 Df Model: 72 Covariance Type: nonrobust =================================================================================================================== coef std err t P>|t| [0.025 0.975] ------------------------------------------------------------------------------------------------------------------- const 4246.0972 526.129 8.070 0.000 3214.542 5277.652 total_resales_in_town -5.1221 0.174 -29.395 0.000 -5.464 -4.780 nearest_mrt_dist -392.0953 28.667 -13.678 0.000 -448.301 -335.890 remaining_lease 64.8747 1.013 64.039 0.000 62.888 66.861 town_BUKIT MERAH 543.1916 119.334 4.552 0.000 309.219 777.165 flat_type_3 ROOM -697.5989 523.358 -1.333 0.183 -1723.722 328.524 street_name_CANTONMENT RD 2219.9375 441.854 5.024 0.000 1353.617 3086.258 storey_range_01 TO 03 -620.6503 36.089 -17.198 0.000 -691.408 -549.893 storey_range_04 TO 06 -435.5614 34.541 -12.610 0.000 -503.284 -367.839 storey_range_19 TO 21 332.5007 75.659 4.395 0.000 184.160 480.841 flat_model_Apartment 189.7051 132.501 1.432 0.152 -70.084 449.494 flat_model_Maisonette 631.4407 134.905 4.681 0.000 366.938 895.943 town_PASIR RIS -777.9371 63.807 -12.192 0.000 -903.040 -652.835 street_name_DAWSON RD 2057.4598 210.342 9.781 0.000 1645.052 2469.868 flat_model_Premium Apartment -219.4219 74.512 -2.945 0.003 -365.515 -73.329 nearest_mall_dist -159.0245 27.036 -5.882 0.000 -212.032 -106.017 total_nearby_mrt 87.7080 12.609 6.956 0.000 62.986 112.430 street_name_ANG MO KIO ST 51 1104.8803 184.839 5.978 0.000 742.475 1467.286 flat_model_New Generation -266.8412 75.245 -3.546 0.000 -414.370 -119.313 nearest_bus_stop_Blk 18 1300.3506 230.744 5.635 0.000 847.943 1752.758 town_SEMBAWANG -1078.6626 75.696 -14.250 0.000 -1227.077 -930.249 flat_type_2 ROOM -222.1454 525.266 -0.423 0.672 -1252.009 807.719 flat_type_5 ROOM -1094.6596 523.698 -2.090 0.037 -2121.449 -67.870 flat_type_EXECUTIVE -1369.9033 533.029 -2.570 0.010 -2414.987 -324.819 flat_model_Standard -381.7736 90.390 -4.224 0.000 -558.997 -204.551 flat_model_Type S1 -445.4836 418.125 -1.065 0.287 -1265.281 374.314 flat_model_Model A -438.4732 70.345 -6.233 0.000 -576.395 -300.551 storey_range_16 TO 18 151.6447 53.136 2.854 0.004 47.463 255.827 flat_model_Simplified 50.9849 80.475 0.634 0.526 -106.798 208.768 flat_model_DBSS 957.8831 105.763 9.057 0.000 750.519 1165.247 town_MARINE PARADE 1438.8825 119.886 12.002 0.000 1203.828 1673.937 flat_model_Improved -425.4139 72.607 -5.859 0.000 -567.770 -283.058 storey_range_07 TO 09 -278.6488 34.433 -8.092 0.000 -346.161 -211.137 nearest_primary_school_ZHANGDE PRIMARY SCHOOL 780.4289 163.710 4.767 0.000 459.450 1101.408 street_name_JLN BAHAGIA 1980.6356 217.758 9.096 0.000 1553.687 2407.584 town_BUKIT BATOK -583.2634 48.579 -12.007 0.000 -678.509 -488.018 town_BUKIT PANJANG -1366.4006 59.234 -23.068 0.000 -1482.537 -1250.264 town_HOUGANG -210.0800 42.299 -4.967 0.000 -293.014 -127.147 town_CHOA CHU KANG -1195.6802 45.061 -26.535 0.000 -1284.030 -1107.331 nearest_school_WOODLANDS SECONDARY SCHOOL 642.6794 127.541 5.039 0.000 392.616 892.743 nearest_mrt_Admiralty -266.3586 63.304 -4.208 0.000 -390.475 -142.243 street_name_MCNAIR RD 1031.7811 166.540 6.195 0.000 705.254 1358.308 nearest_mrt_Kallang 940.8730 123.546 7.616 0.000 698.643 1183.103 nearest_mall_Sun Plaza -595.5670 96.541 -6.169 0.000 -784.851 -406.283 total_resales_in_block 15.4206 3.068 5.026 0.000 9.405 21.437 storey_range_10 TO 12 -166.1132 35.276 -4.709 0.000 -235.277 -96.949 nearest_school_GAN ENG SENG PRIMARY SCHOOL 698.3186 220.737 3.164 0.002 265.530 1131.107 nearest_school_BLANGAH RISE PRIMARY SCHOOL 380.9247 170.008 2.241 0.025 47.599 714.250 street_name_TELOK BLANGAH ST 31 581.7126 171.348 3.395 0.001 245.758 917.667 storey_range_22 TO 24 607.7347 85.951 7.071 0.000 439.215 776.255 nearest_bus_stop_Opp Blk 565 541.1651 91.543 5.912 0.000 361.681 720.649 town_CENTRAL AREA -39.3566 141.183 -0.279 0.780 -316.168 237.455 street_name_ANG MO KIO AVE 3 -131.4407 111.968 -1.174 0.241 -350.971 88.090 street_name_ANG MO KIO ST 44 541.1651 91.543 5.912 0.000 361.681 720.649 nearest_school_KHENG CHENG SCHOOL 413.0946 108.657 3.802 0.000 200.056 626.133 storey_range_37 TO 39 1191.1417 190.883 6.240 0.000 816.886 1565.398 storey_range_25 TO 27 863.2411 94.851 9.101 0.000 677.271 1049.211 flat_type_4 ROOM -1013.1508 523.052 -1.937 0.053 -2038.672 12.371 town_QUEENSTOWN 692.4003 69.491 9.964 0.000 556.152 828.649 town_ANG MO KIO 410.1668 57.757 7.102 0.000 296.925 523.409 town_SENGKANG -574.1014 45.146 -12.716 0.000 -662.617 -485.585 town_TAMPINES 371.2856 40.604 9.144 0.000 291.675 450.896 town_WOODLANDS -201.8772 56.219 -3.591 0.000 -312.103 -91.651 nearest_mrt_Toa Payoh 357.4936 109.452 3.266 0.001 142.896 572.091 street_name_CLEMENTI AVE 3 1016.3727 161.198 6.305 0.000 700.320 1332.426 town_JURONG EAST -986.1403 67.647 -14.578 0.000 -1118.773 -853.508 town_BEDOK 380.1097 45.925 8.277 0.000 290.066 470.153 storey_range_31 TO 33 1161.1612 187.387 6.197 0.000 793.761 1528.562 nearest_primary_school_RADIN MAS PRIMARY SCHOOL -382.6097 166.340 -2.300 0.021 -708.744 -56.475 nearest_mrt_Tiong Bahru 395.6359 154.386 2.563 0.010 92.938 698.334 nearest_mrt_Sengkang 36.5487 41.428 0.882 0.378 -44.678 117.775 total_resales_in_street 0.0014 0.639 0.002 0.998 -1.251 1.253 nearest_mrt_Outram Park 1271.9461 211.425 6.016 0.000 857.415 1686.477 storey_range_28 TO 30 1034.0408 136.308 7.586 0.000 766.789 1301.293 ============================================================================== Omnibus: 205.404 Durbin-Watson: 2.005 Prob(Omnibus): 0.000 Jarque-Bera (JB): 554.498 Skew: 0.309 Prob(JB): 3.91e-121 Kurtosis: 4.841 Cond. No. 1.08e+16 ============================================================================== Notes: [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. [2] The smallest eigenvalue is 2.14e-24. This might indicate that there are strong multicollinearity problems or that the design matrix is singular.
est2.summary2()
Model: | OLS | Adj. R-squared: | 0.847 |
Dependent Variable: | price_per_sqm | AIC: | 54171.5485 |
Date: | 2023-03-25 21:43 | BIC: | 54621.8480 |
No. Observations: | 3528 | Log-Likelihood: | -27013. |
Df Model: | 72 | F-statistic: | 271.3 |
Df Residuals: | 3455 | Prob (F-statistic): | 0.00 |
R-squared: | 0.850 | Scale: | 2.6737e+05 |
Coef. | Std.Err. | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 4246.0972 | 526.1288 | 8.0705 | 0.0000 | 3214.5422 | 5277.6521 |
total_resales_in_town | -5.1221 | 0.1742 | -29.3955 | 0.0000 | -5.4637 | -4.7805 |
nearest_mrt_dist | -392.0953 | 28.6666 | -13.6778 | 0.0000 | -448.3006 | -335.8900 |
remaining_lease | 64.8747 | 1.0131 | 64.0389 | 0.0000 | 62.8884 | 66.8609 |
town_BUKIT MERAH | 543.1916 | 119.3344 | 4.5518 | 0.0000 | 309.2185 | 777.1647 |
flat_type_3 ROOM | -697.5989 | 523.3584 | -1.3329 | 0.1826 | -1723.7221 | 328.5242 |
street_name_CANTONMENT RD | 2219.9375 | 441.8537 | 5.0241 | 0.0000 | 1353.6167 | 3086.2584 |
storey_range_01 TO 03 | -620.6503 | 36.0889 | -17.1978 | 0.0000 | -691.4081 | -549.8925 |
storey_range_04 TO 06 | -435.5614 | 34.5407 | -12.6101 | 0.0000 | -503.2837 | -367.8391 |
storey_range_19 TO 21 | 332.5007 | 75.6587 | 4.3947 | 0.0000 | 184.1603 | 480.8411 |
flat_model_Apartment | 189.7051 | 132.5014 | 1.4317 | 0.1523 | -70.0839 | 449.4941 |
flat_model_Maisonette | 631.4407 | 134.9054 | 4.6806 | 0.0000 | 366.9382 | 895.9431 |
town_PASIR RIS | -777.9371 | 63.8066 | -12.1921 | 0.0000 | -903.0395 | -652.8347 |
street_name_DAWSON RD | 2057.4598 | 210.3423 | 9.7815 | 0.0000 | 1645.0520 | 2469.8676 |
flat_model_Premium Apartment | -219.4219 | 74.5124 | -2.9448 | 0.0033 | -365.5147 | -73.3290 |
nearest_mall_dist | -159.0245 | 27.0358 | -5.8820 | 0.0000 | -212.0324 | -106.0166 |
total_nearby_mrt | 87.7080 | 12.6090 | 6.9560 | 0.0000 | 62.9861 | 112.4299 |
street_name_ANG MO KIO ST 51 | 1104.8803 | 184.8395 | 5.9775 | 0.0000 | 742.4746 | 1467.2860 |
flat_model_New Generation | -266.8412 | 75.2446 | -3.5463 | 0.0004 | -414.3697 | -119.3128 |
nearest_bus_stop_Blk 18 | 1300.3506 | 230.7436 | 5.6355 | 0.0000 | 847.9429 | 1752.7583 |
town_SEMBAWANG | -1078.6626 | 75.6963 | -14.2499 | 0.0000 | -1227.0766 | -930.2486 |
flat_type_2 ROOM | -222.1454 | 525.2664 | -0.4229 | 0.6724 | -1252.0094 | 807.7186 |
flat_type_5 ROOM | -1094.6596 | 523.6985 | -2.0902 | 0.0367 | -2121.4494 | -67.8697 |
flat_type_EXECUTIVE | -1369.9033 | 533.0292 | -2.5700 | 0.0102 | -2414.9874 | -324.8193 |
flat_model_Standard | -381.7736 | 90.3899 | -4.2236 | 0.0000 | -558.9967 | -204.5505 |
flat_model_Type S1 | -445.4836 | 418.1250 | -1.0654 | 0.2868 | -1265.2808 | 374.3136 |
flat_model_Model A | -438.4732 | 70.3450 | -6.2332 | 0.0000 | -576.3952 | -300.5513 |
storey_range_16 TO 18 | 151.6447 | 53.1364 | 2.8539 | 0.0043 | 47.4629 | 255.8266 |
flat_model_Simplified | 50.9849 | 80.4748 | 0.6336 | 0.5264 | -106.7982 | 208.7680 |
flat_model_DBSS | 957.8831 | 105.7630 | 9.0569 | 0.0000 | 750.5186 | 1165.2475 |
town_MARINE PARADE | 1438.8825 | 119.8861 | 12.0021 | 0.0000 | 1203.8277 | 1673.9374 |
flat_model_Improved | -425.4139 | 72.6066 | -5.8592 | 0.0000 | -567.7700 | -283.0578 |
storey_range_07 TO 09 | -278.6488 | 34.4334 | -8.0924 | 0.0000 | -346.1606 | -211.1370 |
nearest_primary_school_ZHANGDE PRIMARY SCHOOL | 780.4289 | 163.7104 | 4.7671 | 0.0000 | 459.4499 | 1101.4079 |
street_name_JLN BAHAGIA | 1980.6356 | 217.7584 | 9.0956 | 0.0000 | 1553.6873 | 2407.5838 |
town_BUKIT BATOK | -583.2634 | 48.5786 | -12.0066 | 0.0000 | -678.5091 | -488.0178 |
town_BUKIT PANJANG | -1366.4006 | 59.2337 | -23.0680 | 0.0000 | -1482.5371 | -1250.2641 |
town_HOUGANG | -210.0800 | 42.2990 | -4.9666 | 0.0000 | -293.0135 | -127.1465 |
town_CHOA CHU KANG | -1195.6802 | 45.0613 | -26.5345 | 0.0000 | -1284.0297 | -1107.3307 |
nearest_school_WOODLANDS SECONDARY SCHOOL | 642.6794 | 127.5412 | 5.0390 | 0.0000 | 392.6156 | 892.7431 |
nearest_mrt_Admiralty | -266.3586 | 63.3035 | -4.2076 | 0.0000 | -390.4746 | -142.2425 |
street_name_MCNAIR RD | 1031.7811 | 166.5402 | 6.1954 | 0.0000 | 705.2538 | 1358.3083 |
nearest_mrt_Kallang | 940.8730 | 123.5458 | 7.6156 | 0.0000 | 698.6428 | 1183.1032 |
nearest_mall_Sun Plaza | -595.5670 | 96.5412 | -6.1690 | 0.0000 | -784.8506 | -406.2834 |
total_resales_in_block | 15.4206 | 3.0684 | 5.0256 | 0.0000 | 9.4045 | 21.4368 |
storey_range_10 TO 12 | -166.1132 | 35.2759 | -4.7090 | 0.0000 | -235.2770 | -96.9495 |
nearest_school_GAN ENG SENG PRIMARY SCHOOL | 698.3186 | 220.7372 | 3.1636 | 0.0016 | 265.5300 | 1131.1071 |
nearest_school_BLANGAH RISE PRIMARY SCHOOL | 380.9247 | 170.0076 | 2.2406 | 0.0251 | 47.5991 | 714.2504 |
street_name_TELOK BLANGAH ST 31 | 581.7126 | 171.3485 | 3.3949 | 0.0007 | 245.7581 | 917.6671 |
storey_range_22 TO 24 | 607.7347 | 85.9511 | 7.0707 | 0.0000 | 439.2147 | 776.2547 |
nearest_bus_stop_Opp Blk 565 | 541.1651 | 91.5431 | 5.9116 | 0.0000 | 361.6811 | 720.6490 |
town_CENTRAL AREA | -39.3566 | 141.1834 | -0.2788 | 0.7804 | -316.1681 | 237.4548 |
street_name_ANG MO KIO AVE 3 | -131.4407 | 111.9683 | -1.1739 | 0.2405 | -350.9714 | 88.0900 |
street_name_ANG MO KIO ST 44 | 541.1651 | 91.5431 | 5.9116 | 0.0000 | 361.6811 | 720.6490 |
nearest_school_KHENG CHENG SCHOOL | 413.0946 | 108.6572 | 3.8018 | 0.0001 | 200.0557 | 626.1335 |
storey_range_37 TO 39 | 1191.1417 | 190.8835 | 6.2402 | 0.0000 | 816.8859 | 1565.3976 |
storey_range_25 TO 27 | 863.2411 | 94.8512 | 9.1010 | 0.0000 | 677.2709 | 1049.2112 |
flat_type_4 ROOM | -1013.1508 | 523.0515 | -1.9370 | 0.0528 | -2038.6723 | 12.3706 |
town_QUEENSTOWN | 692.4003 | 69.4915 | 9.9638 | 0.0000 | 556.1519 | 828.6488 |
town_ANG MO KIO | 410.1668 | 57.7573 | 7.1016 | 0.0000 | 296.9249 | 523.4087 |
town_SENGKANG | -574.1014 | 45.1462 | -12.7165 | 0.0000 | -662.6174 | -485.5855 |
town_TAMPINES | 371.2856 | 40.6040 | 9.1441 | 0.0000 | 291.6754 | 450.8957 |
town_WOODLANDS | -201.8772 | 56.2191 | -3.5909 | 0.0003 | -312.1033 | -91.6511 |
nearest_mrt_Toa Payoh | 357.4936 | 109.4523 | 3.2662 | 0.0011 | 142.8958 | 572.0914 |
street_name_CLEMENTI AVE 3 | 1016.3727 | 161.1979 | 6.3051 | 0.0000 | 700.3198 | 1332.4256 |
town_JURONG EAST | -986.1403 | 67.6472 | -14.5777 | 0.0000 | -1118.7727 | -853.5078 |
town_BEDOK | 380.1097 | 45.9253 | 8.2767 | 0.0000 | 290.0662 | 470.1533 |
storey_range_31 TO 33 | 1161.1612 | 187.3870 | 6.1966 | 0.0000 | 793.7608 | 1528.5616 |
nearest_primary_school_RADIN MAS PRIMARY SCHOOL | -382.6097 | 166.3400 | -2.3002 | 0.0215 | -708.7443 | -56.4752 |
nearest_mrt_Tiong Bahru | 395.6359 | 154.3864 | 2.5626 | 0.0104 | 92.9381 | 698.3337 |
nearest_mrt_Sengkang | 36.5487 | 41.4282 | 0.8822 | 0.3777 | -44.6776 | 117.7749 |
total_resales_in_street | 0.0014 | 0.6386 | 0.0023 | 0.9982 | -1.2505 | 1.2534 |
nearest_mrt_Outram Park | 1271.9461 | 211.4250 | 6.0161 | 0.0000 | 857.4155 | 1686.4768 |
storey_range_28 TO 30 | 1034.0408 | 136.3078 | 7.5861 | 0.0000 | 766.7887 | 1301.2929 |
Omnibus: | 205.404 | Durbin-Watson: | 2.005 |
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 554.498 |
Skew: | 0.309 | Prob(JB): | 0.000 |
Kurtosis: | 4.841 | Condition No.: | 10792461271321490 |
num_test = test_x.shape[0]
num_pred_price = test_x.shape[1]
num_pred_ppsqm = test_price_x.shape[1]
def adj_r2(r2_score, n, k):
return 1 - (1 - r2_score) * (n - 1) / (n - k - 1)
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error
pred_y = reg.predict(test_x)
print(f"RMSE: {mean_squared_error(pred_y * test_df['floor_area_sqm'], test_df['resale_price'], squared=False)}")
print(f"MSE: {mean_squared_error(pred_y * test_df['floor_area_sqm'], test_df['resale_price'], squared=True)}")
print(f"MAPE: {mean_absolute_percentage_error(pred_y * test_df['floor_area_sqm'], test_df['resale_price'])}" )
r2 = r2_score(pred_y * test_df['floor_area_sqm'], test_df['resale_price'])
print(f"R2: {r2}")
print(f"Adj R2: {adj_r2(r2, num_test, num_pred_ppsqm)}")
RMSE: 49426.826997841184 MSE: 2443011227.0745225 MAPE: 0.06620348238280643 R2: 0.9028775431364398 Adj R2: 0.8939716425070674
train_price_y = train_df['resale_price']
train_price_x = train_df.drop(['price_per_sqm', 'resale_price'], axis=1)
test_price_y = test_df['resale_price']
test_price_x = test_df.drop(['price_per_sqm', 'resale_price'], axis=1)
train_df.describe()
floor_area_sqm | total_resales_in_town | nearest_mrt_dist | remaining_lease | town_BUKIT MERAH | flat_type_3 ROOM | street_name_CANTONMENT RD | storey_range_01 TO 03 | storey_range_04 TO 06 | storey_range_19 TO 21 | ... | town_BEDOK | storey_range_31 TO 33 | nearest_primary_school_RADIN MAS PRIMARY SCHOOL | nearest_mrt_Tiong Bahru | nearest_mrt_Sengkang | total_resales_in_street | nearest_mrt_Outram Park | storey_range_28 TO 30 | resale_price | price_per_sqm | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 3528.000000 | 3528.000000 | 3528.000000 | 3528.000000 | 3528.000000 | 3528.000000 | 3528.000000 | 3528.000000 | 3528.000000 | 3528.000000 | ... | 3528.000000 | 3528.000000 | 3528.000000 | 3528.000000 | 3528.000000 | 3528.000000 | 3528.000000 | 3528.000000 | 3.528000e+03 | 3528.000000 |
mean | 95.090788 | 234.252551 | 0.856008 | 73.763251 | 0.035998 | 0.258503 | 0.002551 | 0.174036 | 0.227608 | 0.016156 | ... | 0.052721 | 0.002268 | 0.004252 | 0.014172 | 0.085601 | 18.755385 | 0.004819 | 0.004535 | 5.570365e+05 | 5918.672653 |
std | 23.925711 | 102.023088 | 0.452228 | 14.959429 | 0.186311 | 0.437874 | 0.050450 | 0.379194 | 0.419347 | 0.126095 | ... | 0.223508 | 0.047572 | 0.065076 | 0.118218 | 0.279813 | 16.906956 | 0.069258 | 0.067200 | 1.694671e+05 | 1320.094818 |
min | 37.000000 | 6.000000 | 0.036138 | 43.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 2.300000e+05 | 3403.508772 |
25% | 74.000000 | 152.000000 | 0.496064 | 61.166667 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 8.000000 | 0.000000 | 0.000000 | 4.330000e+05 | 5068.261755 |
50% | 93.000000 | 223.000000 | 0.782662 | 73.416667 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 14.000000 | 0.000000 | 0.000000 | 5.350000e+05 | 5567.270033 |
75% | 111.000000 | 362.000000 | 1.153094 | 90.333333 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 24.000000 | 0.000000 | 0.000000 | 6.452220e+05 | 6397.256483 |
max | 192.000000 | 387.000000 | 3.479159 | 95.250000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 93.000000 | 1.000000 | 1.000000 | 1.340000e+06 | 13723.404255 |
8 rows × 76 columns
reg2 = LinearRegression().fit(train_price_x, train_price_y)
reg2.score(train_price_x, train_price_y)
0.9049534725102613
pred_price_y = reg2.predict(test_price_x)
print(f"RMSE: {mean_squared_error(pred_price_y, test_price_y, squared=False)}")
print(f"MSE: {mean_squared_error(pred_price_y, test_price_y, squared=True)}")
print(f"MAPE: {mean_absolute_percentage_error(pred_price_y, test_price_y)}" )
r2 = r2_score(np.array(pred_price_y), test_price_y)
print(f"R2: {r2}")
print(f"Adj R2: {adj_r2(r2, num_test, num_pred_price)}")
RMSE: 54316.37118057538 MSE: 2950268178.2260394 MAPE: 0.0780557332344016 R2: 0.8854189654232654 Adj R2: 0.8750669660122485
print("TRAIN")
print(reg.score(train_x, train_y))
print(reg2.score(train_price_x, train_price_y))
print('\nTEST')
print(reg.score(test_x, test_y))
print(reg2.score(test_price_x, test_price_y))
TRAIN 0.8497037887882943 0.9049534725102613 TEST 0.8461168378576372 0.8919312720249223
from sklearn.ensemble import GradientBoostingRegressor
reg = GradientBoostingRegressor(random_state=42, learning_rate=0.01, max_depth=10, n_estimators=1500, subsample=0.2)
reg.fit(train_x, train_y)
GradientBoostingRegressor(learning_rate=0.01, max_depth=10, n_estimators=1500, random_state=42, subsample=0.2)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GradientBoostingRegressor(learning_rate=0.01, max_depth=10, n_estimators=1500, random_state=42, subsample=0.2)
print(reg.score(train_x, train_y))
print(reg.score(test_x, test_y))
0.9817539632699481 0.9210077325852002
pred_y = reg.predict(test_x)
print(f"RMSE: {mean_squared_error(pred_y * test_df['floor_area_sqm'], test_df['resale_price'], squared=False)}")
print(f"MSE: {mean_squared_error(pred_y * test_df['floor_area_sqm'], test_df['resale_price'], squared=True)}")
print(f"MAPE: {mean_absolute_percentage_error(pred_y * test_df['floor_area_sqm'], test_df['resale_price'])}" )
r2 = r2_score(pred_y * test_df['floor_area_sqm'], test_df['resale_price'])
print(f"R2: {r2}")
print(f"Adj R2: {adj_r2(r2, num_test, num_pred_ppsqm)}")
RMSE: 35003.75237082601 MSE: 1225262680.0381076 MAPE: 0.04342378394524176 R2: 0.9519062895582234 Adj R2: 0.9474962095424967
import matplotlib.pyplot as plt
plt.style.use('ggplot')
feature_importance = reg.feature_importances_
# make importances relative to max importance
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
sorted_idx = sorted_idx[-10:]
pos = np.arange(sorted_idx.shape[0]) + .5
# plt.subplot(1, 2, 2)
plt.figure(figsize=(10, 7))
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, train_x.keys()[sorted_idx])
# plt.xlabel('Relative Importance')
plt.title('Gradient Boosted Regression Feature Importance')
plt.show()
reg2 = GradientBoostingRegressor(random_state=1)
reg2.fit(train_price_x, train_price_y)
GradientBoostingRegressor(random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GradientBoostingRegressor(random_state=1)
pred_price_y = reg2.predict(test_price_x)
print(f"RMSE: {mean_squared_error(pred_price_y, test_price_y, squared=False)}")
print(f"MSE: {mean_squared_error(pred_price_y, test_price_y, squared=True)}")
print(f"MAPE: {mean_absolute_percentage_error(pred_price_y, test_price_y)}" )
r2 = r2_score(np.array(pred_price_y), test_price_y)
print(f"R2: {r2}")
print(f"Adj R2: {adj_r2(r2, num_test, num_pred_price)}")
RMSE: 50644.27956427074 MSE: 2564843052.584011 MAPE: 0.06185336184942124 R2: 0.8842771840825315 Adj R2: 0.8738220286840473
print("TRAIN")
print(reg.score(train_x, train_y))
print(reg2.score(train_price_x, train_price_y))
print('\nTEST')
print(reg.score(test_x, test_y))
print(reg2.score(test_price_x, test_price_y))
TRAIN 0.8838548946496559 0.9255936939558768 TEST 0.8662866062721712 0.9060494472352904