!pip install shap scipy
Requirement already satisfied: shap in c:\users\user\appdata\local\programs\python\python39\lib\site-packages (0.41.0) Requirement already satisfied: scipy in c:\users\user\appdata\local\programs\python\python39\lib\site-packages (1.7.3) Requirement already satisfied: pandas in c:\users\user\appdata\local\programs\python\python39\lib\site-packages (from shap) (1.3.5) Requirement already satisfied: scikit-learn in c:\users\user\appdata\local\programs\python\python39\lib\site-packages (from shap) (0.24.2) Requirement already satisfied: numpy in c:\users\user\appdata\local\programs\python\python39\lib\site-packages (from shap) (1.22.0) Requirement already satisfied: tqdm>4.25.0 in c:\users\user\appdata\local\programs\python\python39\lib\site-packages (from shap) (4.62.0) Requirement already satisfied: numba in c:\users\user\appdata\local\programs\python\python39\lib\site-packages (from shap) (0.53.1) Requirement already satisfied: cloudpickle in c:\users\user\appdata\local\programs\python\python39\lib\site-packages (from shap) (2.0.0) Requirement already satisfied: slicer==0.0.7 in c:\users\user\appdata\local\programs\python\python39\lib\site-packages (from shap) (0.0.7) Requirement already satisfied: packaging>20.9 in c:\users\user\appdata\local\programs\python\python39\lib\site-packages (from shap) (21.3) Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in c:\users\user\appdata\local\programs\python\python39\lib\site-packages (from packaging>20.9->shap) (3.0.6) Requirement already satisfied: colorama in c:\users\user\appdata\local\programs\python\python39\lib\site-packages (from tqdm>4.25.0->shap) (0.4.4) Requirement already satisfied: llvmlite<0.37,>=0.36.0rc1 in c:\users\user\appdata\local\programs\python\python39\lib\site-packages (from numba->shap) (0.36.0) Requirement already satisfied: setuptools in c:\users\user\appdata\roaming\python\python39\site-packages (from numba->shap) (62.3.2) Requirement already satisfied: pytz>=2017.3 in c:\users\user\appdata\local\programs\python\python39\lib\site-packages (from pandas->shap) (2021.3) Requirement already satisfied: python-dateutil>=2.7.3 in c:\users\user\appdata\local\programs\python\python39\lib\site-packages (from pandas->shap) (2.8.2) Requirement already satisfied: joblib>=0.11 in c:\users\user\appdata\local\programs\python\python39\lib\site-packages (from scikit-learn->shap) (1.0.1) Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\user\appdata\local\programs\python\python39\lib\site-packages (from scikit-learn->shap) (2.2.0) Requirement already satisfied: six>=1.5 in c:\users\user\appdata\local\programs\python\python39\lib\site-packages (from python-dateutil>=2.7.3->pandas->shap) (1.16.0)
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
from scipy import stats
matplotlib.style.use('ggplot')
unscaled_data = pd.read_csv('./data/feature_selection/resale_feature_selection_not_scaled.csv')
unscaled_data['price_per_sqm'] = unscaled_data['resale_price']/unscaled_data['floor_area_sqm']
train_df, test_df = train_test_split(unscaled_data, test_size=0.3, random_state=4248)
train_y = train_df['price_per_sqm']
train_x = train_df.drop(['price_per_sqm', 'floor_area_sqm', 'resale_price'], axis=1)
test_y = test_df['price_per_sqm']
test_x = test_df.drop(['price_per_sqm', 'floor_area_sqm', 'resale_price'], axis=1)
train_price_y = train_df['resale_price']
train_price_x = train_df.drop(['price_per_sqm', 'resale_price'], axis=1)
test_price_y = test_df['resale_price']
test_price_x = test_df.drop(['price_per_sqm', 'resale_price'], axis=1)
def train_and_evaluate(model, parameters):
train_n = train_x.shape[0]
train_p = train_x.shape[1]
test_n = test_x.shape[0]
test_p = test_x.shape[1]
grid_model = GridSearchCV(estimator=model(), param_grid=parameters, cv=5, n_jobs=3, verbose=1)
grid_model.fit(train_x, train_y)
print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n", grid_model.best_estimator_)
print("\n The best score across ALL searched params:\n", grid_model.best_score_)
print("\n The best parameters across ALL searched params:\n", grid_model.best_params_)
print()
pred_y = grid_model.predict(test_x)
print(f"RMSE: {mean_squared_error(pred_y * test_df['floor_area_sqm'], test_df['resale_price'], squared=False)}")
print(f"MSE: {mean_squared_error(pred_y * test_df['floor_area_sqm'], test_df['resale_price'], squared=True)}")
print(f"MAPE: {mean_absolute_percentage_error(pred_y * test_df['floor_area_sqm'], test_df['resale_price'])}" )
print(f"AdjR2: {1 - (1 - r2_score(test_df['resale_price'], pred_y * test_df['floor_area_sqm'])) * (test_n - 1) / (test_n - test_p - 1)}")
print()
plt.scatter(pred_y * test_df['floor_area_sqm'], test_df['resale_price'] - pred_y * test_df['floor_area_sqm'])
plt.xlabel("Predicted Price")
plt.ylabel("Residual")
plt.title("Test Residual Plot")
plt.show()
train_score_1, test_score_1 = grid_model.score(train_x, train_y), grid_model.score(test_x, test_y)
grid_model_2 = GridSearchCV(estimator=model(), param_grid=parameters, cv=5, n_jobs=3, verbose=1)
grid_model_2.fit(train_price_x, train_price_y)
print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n", grid_model_2.best_estimator_)
print("\n The best score across ALL searched params:\n", grid_model_2.best_score_)
print("\n The best parameters across ALL searched params:\n", grid_model_2.best_params_)
print()
pred_price_y = grid_model_2.predict(test_price_x)
print(f"RMSE: {mean_squared_error(pred_price_y, test_price_y, squared=False)}")
print(f"MSE: {mean_squared_error(pred_price_y, test_price_y, squared=True)}")
print(f"MAPE: {mean_absolute_percentage_error(pred_price_y, test_price_y)}" )
print(f"AdjR2: {1 - (1 - r2_score(test_price_y, pred_price_y)) * (test_n - 1) / (test_n - test_p - 1)}")
print()
plt.scatter(pred_price_y, test_price_y - pred_price_y)
plt.xlabel("Predicted Price")
plt.ylabel("Residual")
plt.title("Test Residual Plot")
plt.show()
train_score_2, test_score_2 = grid_model_2.score(train_price_x, train_price_y), grid_model_2.score(test_price_x, test_price_y)
print('TRAIN')
print('Using price_per_sqm:', train_score_1)
print('Using resale_price:', train_score_2)
print()
print('TEST')
print('Using price_per_sqm:', test_score_1)
print('Using resale_price:', test_score_2)
print()
return grid_model, grid_model_2
parameters = {
'n_estimators' : [10, 100, 500],
'max_depth' : [10, 20, None],
'min_samples_split' : [2, 5, 10]
}
m1, m2 = train_and_evaluate(RandomForestRegressor, parameters)
Fitting 5 folds for each of 27 candidates, totalling 135 fits Results from Grid Search The best estimator across ALL searched params: RandomForestRegressor(max_depth=20, n_estimators=500) The best score across ALL searched params: 0.8670846334668434 The best parameters across ALL searched params: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 500} RMSE: 42237.149882087935 MSE: 1783976830.1619606 MAPE: 0.05090438068679358 AdjR2: 0.934509960910321
Fitting 5 folds for each of 27 candidates, totalling 135 fits Results from Grid Search The best estimator across ALL searched params: RandomForestRegressor(n_estimators=500) The best score across ALL searched params: 0.9031342965219459 The best parameters across ALL searched params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 500} RMSE: 46917.2661015701 MSE: 2201229858.445539 MAPE: 0.055084578538069466 AdjR2: 0.9191925438505391
TRAIN Using price_per_sqm: 0.9840487982479632 Using resale_price: 0.9886008784514781 TEST Using price_per_sqm: 0.8913448156239485 Using resale_price: 0.9236546802339813
plt.figure(figsize=(4, 9))
importances = np.mean(np.array([dt.feature_importances_ for dt in m1.best_estimator_.estimators_]), axis=0)
sorted_idx = np.argsort(importances)[-5:]
plt.barh(test_x.columns[sorted_idx], 100*importances[sorted_idx])
plt.xlabel('% of Importance')
plt.title('Random Forest Regressor Feature Importance')
plt.show()
parameters = {
'n_estimators': [10, 30, 50, 100],
'max_samples': [0.25, 0.5, 1.0]
}
m1, m2 = train_and_evaluate(BaggingRegressor, parameters)
Fitting 5 folds for each of 12 candidates, totalling 60 fits Results from Grid Search The best estimator across ALL searched params: BaggingRegressor(n_estimators=100) The best score across ALL searched params: 0.8665704218147925 The best parameters across ALL searched params: {'max_samples': 1.0, 'n_estimators': 100} RMSE: 42089.958783943934 MSE: 1771564630.434099 MAPE: 0.05076847163239013 AdjR2: 0.9349656144993268
Fitting 5 folds for each of 12 candidates, totalling 60 fits Results from Grid Search The best estimator across ALL searched params: BaggingRegressor(n_estimators=30) The best score across ALL searched params: 0.9019036041550578 The best parameters across ALL searched params: {'max_samples': 1.0, 'n_estimators': 30} RMSE: 48277.44898992802 MSE: 2330712080.9751024 MAPE: 0.05725793588440524 AdjR2: 0.9144392333413943
TRAIN Using price_per_sqm: 0.9841546200394318 Using resale_price: 0.9860053492971201 TEST Using price_per_sqm: 0.8926049029169024 Using resale_price: 0.9191638445108937
plt.figure(figsize=(4, 9))
importances = np.mean(np.array([dt.feature_importances_ for dt in m1.best_estimator_.estimators_]), axis=0)
sorted_idx = np.argsort(importances)[-10:]
plt.barh(test_x.columns[sorted_idx], 100*importances[sorted_idx])
plt.xlabel('% of Importance')
plt.title('(Top 10) Variable importance using price_per_sqm')
plt.show()
plt.figure(figsize=(4, 9))
importances = np.mean(np.array([dt.feature_importances_ for dt in m2.best_estimator_.estimators_]), axis=0)
sorted_idx = np.argsort(importances)[-10:]
plt.barh(test_price_x.columns[sorted_idx], 100*importances[sorted_idx])
plt.xlabel('% of Importance')
plt.title('(Top 10) Variable importance using resale_price')
plt.show()
$(1 / (2 * n_{samples})) * ||y - Xw||^2_2 + \alpha * ||w||_1$
parameters = {
'alpha': [0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50],
'max_iter': [10000, 50000]
}
m1, m2 = train_and_evaluate(Lasso, parameters)
Fitting 5 folds for each of 18 candidates, totalling 90 fits Results from Grid Search The best estimator across ALL searched params: Lasso(alpha=0.05, max_iter=10000) The best score across ALL searched params: 0.8321556633908885 The best parameters across ALL searched params: {'alpha': 0.05, 'max_iter': 10000} RMSE: 49333.700614474816 MSE: 2433814016.3186326 MAPE: 0.0665172230947111 AdjR2: 0.9106543468665759
Fitting 5 folds for each of 18 candidates, totalling 90 fits Results from Grid Search The best estimator across ALL searched params: Lasso(alpha=10, max_iter=10000) The best score across ALL searched params: 0.8895070307475461 The best parameters across ALL searched params: {'alpha': 10, 'max_iter': 10000} RMSE: 52623.79850001417 MSE: 2769264168.5700936 MAPE: 0.07518472763015036 AdjR2: 0.8983399248336437
TRAIN Using price_per_sqm: 0.8494220056332313 Using resale_price: 0.90107642580969 TEST Using price_per_sqm: 0.8455021782924822 Using resale_price: 0.9039535295894259
print('Using price_per_sqm')
print(dict(zip(test_x.columns[:3], m1.best_estimator_.coef_[:3])))
print('Using resale_price')
print(dict(zip(test_price_x.columns[:4], m2.best_estimator_.coef_[:4])))
Using price_per_sqm {'total_resales_in_town': -5.161742464495376, 'nearest_mrt_dist': -369.1629837081054, 'remaining_lease': 63.150948039244426} Using resale_price {'floor_area_sqm': 5226.204207530446, 'total_resales_in_town': -487.8176570587973, 'nearest_mrt_dist': -32700.38068888092, 'remaining_lease': 5585.818233950848}
$||y - Xw||^2_2 + \alpha * ||w||^2_2$
parameters = {
'alpha': [0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50],
'max_iter': [1000, 3000, 5000, 10000, 50000]
}
m1, m2 = train_and_evaluate(Ridge, parameters)
Fitting 5 folds for each of 45 candidates, totalling 225 fits Results from Grid Search The best estimator across ALL searched params: Ridge(alpha=1, max_iter=1000) The best score across ALL searched params: 0.8337657522879514 The best parameters across ALL searched params: {'alpha': 1, 'max_iter': 1000} RMSE: 49317.588269997104 MSE: 2432224512.768956 MAPE: 0.06653335443767361 AdjR2: 0.9107126977643238
Fitting 5 folds for each of 45 candidates, totalling 225 fits Results from Grid Search The best estimator across ALL searched params: Ridge(alpha=1, max_iter=1000) The best score across ALL searched params: 0.8904030192043949 The best parameters across ALL searched params: {'alpha': 1, 'max_iter': 1000} RMSE: 52715.6282066038 MSE: 2778937457.2168818 MAPE: 0.07524172577585853 AdjR2: 0.8979848170537146
TRAIN Using price_per_sqm: 0.8487127980662255 Using resale_price: 0.9006859451633299 TEST Using price_per_sqm: 0.845722393803419 Using resale_price: 0.9036180306354686
print('Using price_per_sqm')
print(dict(zip(test_x.columns[:3], m1.best_estimator_.coef_[:3])))
print('Using resale_price')
print(dict(zip(test_price_x.columns[:4], m2.best_estimator_.coef_[:4])))
Using price_per_sqm {'total_resales_in_town': -5.210500860630436, 'nearest_mrt_dist': -379.24859766333975, 'remaining_lease': 63.63971960375359} Using resale_price {'floor_area_sqm': 5210.656126099369, 'total_resales_in_town': -492.0750695229306, 'nearest_mrt_dist': -33605.47537811032, 'remaining_lease': 5614.191714901921}
$1 / (2 * n_{samples}) * ||y - Xw||^2_2 + \alpha * l1_{ratio} * ||w||_1 + 0.5 * \alpha * (1 - l1_{ratio}) * ||w||^2_2$
parameters = {
'alpha': [0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50],
'l1_ratio': [0.2, 0.5, 0.8],
'max_iter': [1000, 3000, 5000, 10000, 50000]
}
m1, m2 = train_and_evaluate(ElasticNet, parameters)
Fitting 5 folds for each of 135 candidates, totalling 675 fits Results from Grid Search The best estimator across ALL searched params: ElasticNet(alpha=0.001, max_iter=3000) The best score across ALL searched params: 0.8337388712181502 The best parameters across ALL searched params: {'alpha': 0.001, 'l1_ratio': 0.5, 'max_iter': 3000} RMSE: 49330.91701427542 MSE: 2433539373.4693284 MAPE: 0.06654964096889816 AdjR2: 0.9106644290440081
Fitting 5 folds for each of 135 candidates, totalling 675 fits Results from Grid Search The best estimator across ALL searched params: ElasticNet(alpha=0.001) The best score across ALL searched params: 0.8904372491320327 The best parameters across ALL searched params: {'alpha': 0.001, 'l1_ratio': 0.5, 'max_iter': 1000} RMSE: 52786.62103940151 MSE: 2786427360.7573857 MAPE: 0.07529515445270518 AdjR2: 0.8977098616465857
c:\Users\user\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\linear_model\_coordinate_descent.py:530: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 4454977330532.94, tolerance: 8725634172.54874 model = cd_fast.enet_coordinate_descent(
TRAIN Using price_per_sqm: 0.848100586839195 Using resale_price: 0.9003028029726797 TEST Using price_per_sqm: 0.8455970878164325 Using resale_price: 0.9033582580912145
print('Using price_per_sqm')
print(dict(zip(test_x.columns[:3], m1.best_estimator_.coef_[:3])))
print('Using resale_price')
print(dict(zip(test_price_x.columns[:4], m2.best_estimator_.coef_[:4])))
Using price_per_sqm {'total_resales_in_town': -5.232103944728176, 'nearest_mrt_dist': -382.9537076299686, 'remaining_lease': 63.90493056446854} Using resale_price {'floor_area_sqm': 5226.6213560332535, 'total_resales_in_town': -494.2279091694513, 'nearest_mrt_dist': -34046.35209046231, 'remaining_lease': 5639.998219106059}