from google.colab import drive
drive.mount('/content/drive')
!pip install shap
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from xgboost import plot_importance
matplotlib.style.use('ggplot')
train_df = pd.read_csv("drive/MyDrive/dataset/ST4248/resale_train_feature_selection.csv")
test_df = pd.read_csv("drive/MyDrive/dataset/ST4248/resale_test_feature_selection.csv")
X_train = train_df.drop("resale_price", axis = 1)
X_test = test_df.drop("resale_price", axis = 1)
y_train = train_df["resale_price"]
y_test = test_df["resale_price"]
test_params = {
'max_depth': [3, 6, 10],
'eta': [0.03, 0.1, 0.2, 0.3],
'min_child_weight': [1, 5, 10, 20],
'colsample_bytree': [0.5, 0.8, 1]
}
xgb_model = XGBRegressor()
cv_model = GridSearchCV(estimator = xgb_model, param_grid = test_params)
cv_model.fit(X_train, y_train)
cv_model.best_params_
{'colsample_bytree': 0.8, 'eta': 0.1, 'max_depth': 10, 'min_child_weight': 5}
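# GridSearchCV defaults to 5-fold cross-validation and scores with the
# estimator's default metric (R^2 for XGBRegressor); the grid above has
# 3 * 4 * 4 * 3 = 144 combinations, i.e. 720 model fits. A minimal sketch of
# an equivalent, more explicit search (RMSE scoring is an assumption, not
# what was run above):
cv_model_explicit = GridSearchCV(
    estimator = XGBRegressor(),
    param_grid = test_params,
    cv = 5,                                    # make the default 5-fold CV explicit
    scoring = "neg_root_mean_squared_error",   # rank candidates by RMSE instead of R^2
    n_jobs = -1,                               # parallelise across all cores
)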
# Fit the final model on the training data with the tuned hyperparameters
model = XGBRegressor(eta = 0.1, max_depth = 10, min_child_weight = 5, colsample_bytree = 0.8)
model.fit(X_train, y_train)
XGBRegressor(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=0.8, early_stopping_rounds=None, enable_categorical=False, eta=0.1, eval_metric=None, feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=10, max_leaves=None, min_child_weight=5, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, ...)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
train_n = X_train.shape[0]
train_p = X_train.shape[1]
train_mape = mean_absolute_percentage_error(y_train, y_train_pred)
train_rmse = mean_squared_error(y_train, y_train_pred, squared = False)
train_r2 = r2_score(y_train, y_train_pred)
train_adj_r2 = 1 - (1 - train_r2) * (train_n - 1) / (train_n - train_p - 1)
test_n = X_test.shape[0]
test_p = X_test.shape[1]
test_mape = mean_absolute_percentage_error(y_test, y_test_pred)
test_rmse = mean_squared_error(y_test, y_test_pred, squared = False)
test_r2 = r2_score(y_test, y_test_pred)
test_adj_r2 = 1 - (1 - test_r2) * (test_n - 1) / (test_n - test_p - 1)
print("Train MAPE: {:.2f}%".format(train_mape * 100))
print("Train RMSE:", train_rmse)
print("Train R2:", train_r2)
print("Train Adj R2:", train_adj_r2)
print()
print("Test MAPE: {:.2f}%".format(test_mape * 100))
print("Test RMSE:", test_rmse)
print("Test R2:", test_r2)
print("Test Adj R2:", test_adj_r2)
Train MAPE: 2.02%
Train RMSE: 14954.497949463695
Train R2: 0.9922107422211408
Train Adj R2: 0.9920438134416344

Test MAPE: 4.53%
Test RMSE: 38256.28238775734
Test R2: 0.9463902139899916
Test Adj R2: 0.9414743228317008
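# The adjusted R^2 above implements 1 - (1 - R^2) * (n - 1) / (n - p - 1),
# which penalises R^2 for the number of predictors p. A small helper (a
# sketch, not part of the original run) avoids repeating the formula:
def adjusted_r2(r2, n, p):
    """Adjusted R^2 for n observations and p predictors."""
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)

# e.g. adjusted_r2(test_r2, X_test.shape[0], X_test.shape[1])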
train_residuals = y_train - y_train_pred
test_residuals = y_test - y_test_pred
# Residual plots are commonly drawn against the fitted (predicted) values;
# versions against both predicted and actual prices follow
plt.scatter(y_train_pred, train_residuals)
plt.xlabel("Predicted Price")
plt.ylabel("Residual")
plt.title("Residual Plot")
plt.scatter(y_train, train_residuals)
plt.xlabel("Actual Price")
plt.ylabel("Residual")
plt.title("Residual Plot")
plt.scatter(y_train, y_train_pred)
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.title("Actual vs Predicted Price")
plot_importance(model, max_num_features = 10)
plot_importance(model, max_num_features = 15)
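# plot_importance ranks features by "weight" (the F score, i.e. how often a
# feature is used to split) by default; ranking by average gain often tells
# a different story. A minimal sketch:
plot_importance(model, max_num_features = 10, importance_type = "gain")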
# https://github.com/slundberg/shap
explainer = shap.Explainer(model)
shap_values = explainer(X_test)
# The waterfall plot explains one individual prediction, so it is less
# informative about the overall model than the summary plot below
shap.plots.waterfall(shap_values[0])
# The summary plot is informative; it shows:
# 1. Larger floor area (sqm) -> higher price
# 2. Longer remaining lease -> higher price
# 3. Fewer total resales in the town -> higher price
# 4. Shorter distance to the MRT -> higher price
# 5. HDB flats on storeys 1 to 3, 4 to 6, or 7 to 9 tend to have lower prices
# A positive SHAP value means a positive impact on the prediction;
# the colour gradient indicates the original value of that variable
shap.summary_plot(shap_values, X_test, show = False)
plt.title("SHAP Values of Predictors")
plt.gcf().set_size_inches(12,6)
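# As a complement to the beeswarm summary above, a bar plot of mean |SHAP|
# values ranks predictors by overall impact (a sketch, not in the original):
shap.plots.bar(shap_values, max_display = 10)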
train_df2 = train_df.copy()
test_df2 = test_df.copy()
train_df2["floor_area_sqm"] = train_df2["floor_area_sqm"].apply(lambda x : x * 23.922319549360488 + 95.09078798185942)
test_df2["floor_area_sqm"] = test_df2["floor_area_sqm"].apply(lambda x : x * 23.922319549360488 + 95.09078798185942)
train_df2["resale_price_per_sqm"] = train_df2["resale_price"] / train_df2["floor_area_sqm"]
test_df2["resale_price_per_sqm"] = test_df2["resale_price"] / test_df2["floor_area_sqm"]
X_train = train_df2.drop(["resale_price_per_sqm", "resale_price", "floor_area_sqm"], axis = 1)
X_test = test_df2.drop(["resale_price_per_sqm", "resale_price", "floor_area_sqm"], axis = 1)
y_train = train_df2["resale_price_per_sqm"]
y_test = test_df2["resale_price_per_sqm"]
test_params = {
'max_depth': [3, 6, 10],
'eta': [0.03, 0.1, 0.2, 0.3],
'min_child_weight': [1, 5, 10, 20],
'colsample_bytree': [0.5, 0.8, 1]
}
xgb_model = XGBRegressor()
cv_model = GridSearchCV(estimator = xgb_model, param_grid = test_params)
cv_model.fit(X_train, y_train)
cv_model.best_params_
{'colsample_bytree': 0.8, 'eta': 0.1, 'max_depth': 10, 'min_child_weight': 1}
# Fit the final model on the training data with the tuned hyperparameters
model = XGBRegressor(eta = 0.1, max_depth = 10, min_child_weight = 1, colsample_bytree = 0.8)
model.fit(X_train, y_train)
XGBRegressor(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=0.8, early_stopping_rounds=None, enable_categorical=False, eta=0.1, eval_metric=None, feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=10, max_leaves=None, min_child_weight=1, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, ...)
y_train_pred = model.predict(X_train)
# Convert the per-sqm target and predictions back to total resale price
# so the metrics are directly comparable with the first model's
y_train = y_train * train_df2["floor_area_sqm"]
y_train_pred = y_train_pred * train_df2["floor_area_sqm"]
y_test_pred = model.predict(X_test)
y_test = y_test * test_df2["floor_area_sqm"]
y_test_pred = y_test_pred * test_df2["floor_area_sqm"]
train_n = X_train.shape[0]
train_p = X_train.shape[1]
train_mape = mean_absolute_percentage_error(y_train, y_train_pred)
train_rmse = mean_squared_error(y_train, y_train_pred, squared = False)
train_r2 = r2_score(y_train, y_train_pred)
train_adj_r2 = 1 - (1 - train_r2) * (train_n - 1) / (train_n - train_p - 1)
test_n = X_test.shape[0]
test_p = X_test.shape[1]
test_mape = mean_absolute_percentage_error(y_test, y_test_pred)
test_rmse = mean_squared_error(y_test, y_test_pred, squared = False)
test_r2 = r2_score(y_test, y_test_pred)
test_adj_r2 = 1 - (1 - test_r2) * (test_n - 1) / (test_n - test_p - 1)
print("Train MAPE: {:.2f}%".format(train_mape * 100))
print("Train RMSE:", train_rmse)
print("Train R2:", train_r2)
print("Train Adj R2:", train_adj_r2)
print()
print("Test MAPE: {:.2f}%".format(test_mape * 100))
print("Test RMSE:", test_rmse)
print("Test R2:", test_r2)
print("Test Adj R2:", test_adj_r2)
Train MAPE: 1.71%
Train RMSE: 13035.20521611742
Train R2: 0.9940818203115429
Train Adj R2: 0.9939567400807214

Test MAPE: 4.40%
Test RMSE: 35366.68520642956
Test R2: 0.9541829357262053
Test Adj R2: 0.9500435227410728
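# Side-by-side test-set comparison of the two targets, with values copied
# from the outputs above (a sketch, not part of the original run):
comparison = pd.DataFrame(
    {
        "resale_price": [4.53, 38256.28, 0.9464, 0.9415],
        "resale_price_per_sqm": [4.40, 35366.69, 0.9542, 0.9500],
    },
    index = ["Test MAPE (%)", "Test RMSE", "Test R2", "Test Adj R2"],
)
print(comparison)
# Predicting price per sqm and converting back improves every test metric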
train_residuals = y_train - y_train_pred
test_residuals = y_test - y_test_pred
# Residual plots are commonly drawn against the fitted (predicted) values;
# versions against both predicted and actual prices follow
plt.scatter(y_train_pred, train_residuals)
plt.xlabel("Predicted Price")
plt.ylabel("Residual")
plt.title("Residual Plot")
plt.scatter(y_train, train_residuals)
plt.xlabel("Actual Price")
plt.ylabel("Residual")
plt.title("Residual Plot")
plt.scatter(y_train, y_train_pred)
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.title("Actual vs Predicted Price")
plot_importance(model, max_num_features = 10)
plot_importance(model, max_num_features = 15)
# https://github.com/slundberg/shap
explainer = shap.Explainer(model)
shap_values = explainer(X_test)
# The waterfall plot explains one individual prediction, so it is less
# informative about the overall model than the summary plot below
shap.plots.waterfall(shap_values[0])
# The summary plot is informative; it shows:
# 1. Longer remaining lease -> higher price
# 2. Fewer total resales in the town -> higher price
# 3. Shorter distance to the MRT -> higher price
# 4. HDB flats on storeys 1 to 3, 4 to 6, or 7 to 9 tend to have lower prices
# A positive SHAP value means a positive impact on the prediction;
# the colour gradient indicates the original value of that variable
shap.summary_plot(shap_values, X_test, show = False)
plt.title("SHAP Values of Predictors")
plt.gcf().set_size_inches(12,6)