from google.colab import drive
drive.mount('/content/drive')
!pip install shap
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from xgboost import plot_importance
matplotlib.style.use('ggplot')
train_df = pd.read_csv("drive/MyDrive/dataset/ST4248/resale_train_feature_selection.csv")
test_df = pd.read_csv("drive/MyDrive/dataset/ST4248/resale_test_feature_selection.csv")
X_train = train_df.drop("resale_price", axis = 1)
X_test = test_df.drop("resale_price", axis = 1)
y_train = train_df["resale_price"]
y_test = test_df["resale_price"]
test_params = {
'max_depth': [3, 6, 10],
'eta': [0.03, 0.1, 0.2, 0.3],
'min_child_weight': [1, 5, 10, 20],
'colsample_bytree': [0.5, 0.8, 1]
}
xgb_model = XGBRegressor()
cv_model = GridSearchCV(estimator = xgb_model, param_grid = test_params)
cv_model.fit(X_train, y_train)
cv_model.best_params_
{'colsample_bytree': 0.8, 'eta': 0.1, 'max_depth': 10, 'min_child_weight': 5}
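# GridSearchCV defaults to 5-fold cross-validation and scores with the
# estimator's default metric (R^2 for XGBRegressor); the grid above has
# 3 * 4 * 4 * 3 = 144 combinations, i.e. 720 model fits. A minimal sketch of
# an equivalent, more explicit search (RMSE scoring is an assumption, not
# what was run above):
cv_model_explicit = GridSearchCV(
    estimator = XGBRegressor(),
    param_grid = test_params,
    cv = 5,                                    # make the default 5-fold CV explicit
    scoring = "neg_root_mean_squared_error",   # rank candidates by RMSE instead of R^2
    n_jobs = -1,                               # parallelise across all cores
)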
# Fit the final model on the training data with the tuned hyperparameters
model = XGBRegressor(eta = 0.1, max_depth = 10, min_child_weight = 5, colsample_bytree = 0.8)
model.fit(X_train, y_train)
XGBRegressor(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=0.8, early_stopping_rounds=None, enable_categorical=False, eta=0.1, eval_metric=None, feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=10, max_leaves=None, min_child_weight=5, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, ...)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
train_n = X_train.shape[0]
train_p = X_train.shape[1]
train_mape = mean_absolute_percentage_error(y_train, y_train_pred)
train_rmse = mean_squared_error(y_train, y_train_pred, squared = False)
train_r2 = r2_score(y_train, y_train_pred)
train_adj_r2 = 1 - (1 - train_r2) * (train_n - 1) / (train_n - train_p - 1)
test_n = X_test.shape[0]
test_p = X_test.shape[1]
test_mape = mean_absolute_percentage_error(y_test, y_test_pred)
test_rmse = mean_squared_error(y_test, y_test_pred, squared = False)
test_r2 = r2_score(y_test, y_test_pred)
test_adj_r2 = 1 - (1 - test_r2) * (test_n - 1) / (test_n - test_p - 1)
print("Train MAPE: {:.2f}%".format(train_mape * 100))
print("Train RMSE:", train_rmse)
print("Train R2:", train_r2)
print("Train Adj R2:", train_adj_r2)
print()
print("Test MAPE: {:.2f}%".format(test_mape * 100))
print("Test RMSE:", test_rmse)
print("Test R2:", test_r2)
print("Test Adj R2:", test_adj_r2)
Train MAPE: 2.02%
Train RMSE: 14954.497949463695
Train R2: 0.9922107422211408
Train Adj R2: 0.9920438134416344

Test MAPE: 4.53%
Test RMSE: 38256.28238775734
Test R2: 0.9463902139899916
Test Adj R2: 0.9414743228317008
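# The adjusted R^2 above implements 1 - (1 - R^2) * (n - 1) / (n - p - 1),
# which penalises R^2 for the number of predictors p. A small helper (a
# sketch, not part of the original run) avoids repeating the formula:
def adjusted_r2(r2, n, p):
    """Adjusted R^2 for n observations and p predictors."""
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)

# e.g. adjusted_r2(test_r2, X_test.shape[0], X_test.shape[1])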
train_residuals = y_train - y_train_pred
test_residuals = y_test - y_test_pred
# Residual plots are commonly drawn against the fitted (predicted) values;
# versions against both predicted and actual prices follow
plt.scatter(y_train_pred, train_residuals)
plt.xlabel("Predicted Price")
plt.ylabel("Residual")
plt.title("Residual Plot")
plt.scatter(y_train, train_residuals)
plt.xlabel("Actual Price")
plt.ylabel("Residual")
plt.title("Residual Plot")
plt.scatter(y_train, y_train_pred)
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.title("Actual vs Predicted Price")
plot_importance(model, max_num_features = 10)
plot_importance(model, max_num_features = 15)
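# plot_importance ranks features by "weight" (the F score, i.e. how often a
# feature is used to split) by default; ranking by average gain often tells
# a different story. A minimal sketch:
plot_importance(model, max_num_features = 10, importance_type = "gain")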
# https://github.com/slundberg/shap
explainer = shap.Explainer(model)
shap_values = explainer(X_test)
# The waterfall plot explains one individual prediction, so it is less
# informative about the overall model than the summary plot below
shap.plots.waterfall(shap_values[0])
# The summary plot is informative; it shows:
# 1. Larger floor area (sqm) -> higher price
# 2. Longer remaining lease -> higher price
# 3. Fewer total resales in the town -> higher price
# 4. Shorter distance to the MRT -> higher price
# 5. HDB flats on storeys 1 to 3, 4 to 6, or 7 to 9 tend to have lower prices
# A positive SHAP value means a positive impact on the prediction;
# the colour gradient indicates the original value of that variable
shap.summary_plot(shap_values, X_test, show = False)
plt.title("SHAP Values of Predictors")
plt.gcf().set_size_inches(12,6)
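# As a complement to the beeswarm summary above, a bar plot of mean |SHAP|
# values ranks predictors by overall impact (a sketch, not in the original):
shap.plots.bar(shap_values, max_display = 10)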
train_df2 = train_df.copy()
test_df2 = test_df.copy()
train_df2["floor_area_sqm"] = train_df2["floor_area_sqm"].apply(lambda x : x * 23.922319549360488 + 95.09078798185942)
test_df2["floor_area_sqm"] = test_df2["floor_area_sqm"].apply(lambda x : x * 23.922319549360488 + 95.09078798185942)
train_df2["resale_price_per_sqm"] = train_df2["resale_price"] / train_df2["floor_area_sqm"]
test_df2["resale_price_per_sqm"] = test_df2["resale_price"] / test_df2["floor_area_sqm"]
X_train = train_df2.drop(["resale_price_per_sqm", "resale_price", "floor_area_sqm"], axis = 1)
X_test = test_df2.drop(["resale_price_per_sqm", "resale_price", "floor_area_sqm"], axis = 1)
y_train = train_df2["resale_price_per_sqm"]
y_test = test_df2["resale_price_per_sqm"]
test_params = {
'max_depth': [3, 6, 10],
'eta': [0.03, 0.1, 0.2, 0.3],
'min_child_weight': [1, 5, 10, 20],
'colsample_bytree': [0.5, 0.8, 1]
}
xgb_model = XGBRegressor()
cv_model = GridSearchCV(estimator = xgb_model, param_grid = test_params)
cv_model.fit(X_train, y_train)
cv_model.best_params_
{'colsample_bytree': 0.8, 'eta': 0.1, 'max_depth': 10, 'min_child_weight': 1}
# Fit the final model on the training data with the tuned hyperparameters
model = XGBRegressor(eta = 0.1, max_depth = 10, min_child_weight = 1, colsample_bytree = 0.8)
model.fit(X_train, y_train)
XGBRegressor(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=0.8, early_stopping_rounds=None, enable_categorical=False, eta=0.1, eval_metric=None, feature_types=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=10, max_leaves=None, min_child_weight=1, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, ...)
y_train_pred = model.predict(X_train)
# Convert the per-sqm target and predictions back to total resale price
# so the metrics are directly comparable with the first model's
y_train = y_train * train_df2["floor_area_sqm"]
y_train_pred = y_train_pred * train_df2["floor_area_sqm"]
y_test_pred = model.predict(X_test)
y_test = y_test * test_df2["floor_area_sqm"]
y_test_pred = y_test_pred * test_df2["floor_area_sqm"]
train_n = X_train.shape[0]
train_p = X_train.shape[1]
train_mape = mean_absolute_percentage_error(y_train, y_train_pred)
train_rmse = mean_squared_error(y_train, y_train_pred, squared = False)
train_r2 = r2_score(y_train, y_train_pred)
train_adj_r2 = 1 - (1 - train_r2) * (train_n - 1) / (train_n - train_p - 1)
test_n = X_test.shape[0]
test_p = X_test.shape[1]
test_mape = mean_absolute_percentage_error(y_test, y_test_pred)
test_rmse = mean_squared_error(y_test, y_test_pred, squared = False)
test_r2 = r2_score(y_test, y_test_pred)
test_adj_r2 = 1 - (1 - test_r2) * (test_n - 1) / (test_n - test_p - 1)
print("Train MAPE: {:.2f}%".format(train_mape * 100))
print("Train RMSE:", train_rmse)
print("Train R2:", train_r2)
print("Train Adj R2:", train_adj_r2)
print()
print("Test MAPE: {:.2f}%".format(test_mape * 100))
print("Test RMSE:", test_rmse)
print("Test R2:", test_r2)
print("Test Adj R2:", test_adj_r2)
Train MAPE: 1.71%
Train RMSE: 13035.20521611742
Train R2: 0.9940818203115429
Train Adj R2: 0.9939567400807214

Test MAPE: 4.40%
Test RMSE: 35366.68520642956
Test R2: 0.9541829357262053
Test Adj R2: 0.9500435227410728
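# Side-by-side test-set comparison of the two targets, with values copied
# from the outputs above (a sketch, not part of the original run):
comparison = pd.DataFrame(
    {
        "resale_price": [4.53, 38256.28, 0.9464, 0.9415],
        "resale_price_per_sqm": [4.40, 35366.69, 0.9542, 0.9500],
    },
    index = ["Test MAPE (%)", "Test RMSE", "Test R2", "Test Adj R2"],
)
print(comparison)
# Predicting price per sqm and converting back improves every test metric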
train_residuals = y_train - y_train_pred
test_residuals = y_test - y_test_pred
# Residual plots are commonly drawn against the fitted (predicted) values;
# versions against both predicted and actual prices follow
plt.scatter(y_train_pred, train_residuals)
plt.xlabel("Predicted Price")
plt.ylabel("Residual")
plt.title("Residual Plot")
plt.scatter(y_train, train_residuals)
plt.xlabel("Actual Price")
plt.ylabel("Residual")
plt.title("Residual Plot")
plt.scatter(y_train, y_train_pred)
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.title("Actual vs Predicted Price")
plot_importance(model, max_num_features = 10)
plot_importance(model, max_num_features = 15)
# https://github.com/slundberg/shap
explainer = shap.Explainer(model)
shap_values = explainer(X_test)
# The waterfall plot explains one individual prediction, so it is less
# informative about the overall model than the summary plot below
shap.plots.waterfall(shap_values[0])
# The summary plot is informative; it shows:
# 1. Longer remaining lease -> higher price
# 2. Fewer total resales in the town -> higher price
# 3. Shorter distance to the MRT -> higher price
# 4. HDB flats on storeys 1 to 3, 4 to 6, or 7 to 9 tend to have lower prices
# A positive SHAP value means a positive impact on the prediction;
# the colour gradient indicates the original value of that variable
shap.summary_plot(shap_values, X_test, show = False)
plt.title("SHAP Values of Predictors")
plt.gcf().set_size_inches(12,6)