9. Gradient Boosting Machine Code
[Purpose]
- Practice and interpret GBM, an improvement over AdaBoost among the Boosting models that aim to reduce bias
[Process]
- Define X’s & Y
- Split Train & Valid dataset
- Modeling
- Model interpretation
import os
import gc
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.ensemble import GradientBoostingClassifier
from collections import Counter
# Data Loading (surgical mortality data)
data=pd.read_csv("https://raw.githubusercontent.com/GonieAhn/Data-Science-online-course-from-gonie/main/Data%20Store/example_data.csv")
# X's & Y Split
Y = data['censor']
X = data.drop(columns=['censor'])
- In practice, X and Y are often not split into separate objects (memory concerns); the DataFrame is indexed directly instead, as in the sketch below.
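A minimal sketch of that pattern, assuming the target 'censor' is the first column and using the train_idx defined in the next cell:
# Index the original DataFrame directly instead of materializing X / Y copies
model = GradientBoostingClassifier(random_state=119)
model.fit(data.iloc[train_idx, 1:], data.iloc[train_idx, 0])   # X's, Y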
idx = list(range(X.shape[0]))
train_idx, valid_idx = train_test_split(idx, test_size=0.3, random_state=2021)
print(">>>> # of Train data : {}".format(len(train_idx)))
print(">>>> # of valid data : {}".format(len(valid_idx)))
print(">>>> # of Train data Y : {}".format(Counter(Y.iloc[train_idx])))
print(">>>> # of valid data Y : {}".format(Counter(Y.iloc[valid_idx])))
[Gradient Boosting Machine Parameters]
- Package : https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
- n_estimators : number of trees (boosting stages)
- learning_rate : trades off against n_estimators (a smaller rate generally needs more trees)
- Shrinks the contribution of each tree at each boosting iteration
- max_features : samples a subset of features per split (helps prevent overfitting)
- subsample : fraction of rows used to fit each tree, drawn without replacement (helps prevent overfitting; not a bootstrap)
- max_depth : limits the maximum depth of each tree (all of these are set explicitly in the sketch below)
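For reference, a minimal sketch setting every parameter above explicitly (the values are illustrative only, not tuned for this data):
model = GradientBoostingClassifier(n_estimators=100,   # number of trees (boosting stages)
                                   learning_rate=0.1,  # shrinks each tree's contribution
                                   subsample=0.8,      # row sampling without replacement
                                   max_features=0.8,   # fraction of features considered per split
                                   max_depth=3,        # depth limit of each tree
                                   random_state=119)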
# GBM Hyperparameter
estimators = [10, 20, 50]
learning = [0.05, 0.1, 0.5]
subsam = [0.5, 0.75, 1]
# Modeling
save_est = []
save_lr = []
save_sub = []
f1_score_ = []
cnt = 0
for est in estimators:
    for lr in learning:
        for sub in subsam:
            print(">>> {} <<<".format(cnt))
            cnt += 1
            print("Number of Estimators : {}, Learning Rate : {}, Subsample : {}".format(est, lr, sub))
            model = GradientBoostingClassifier(n_estimators=est,
                                               learning_rate=lr,
                                               subsample=sub,
                                               random_state=119)
            model.fit(X.iloc[train_idx], Y.iloc[train_idx])
            # Train performance
            y_pre_train = model.predict(X.iloc[train_idx])
            cm_train = confusion_matrix(Y.iloc[train_idx], y_pre_train)
            print("Train Confusion Matrix")
            print(cm_train)
            print("Train Acc : {}".format((cm_train[0,0] + cm_train[1,1])/cm_train.sum()))
            print("Train F1-Score : {}".format(f1_score(Y.iloc[train_idx], y_pre_train)))
            # Validation performance
            y_pre_test = model.predict(X.iloc[valid_idx])
            cm_test = confusion_matrix(Y.iloc[valid_idx], y_pre_test)
            print("Test Confusion Matrix")
            print(cm_test)
            print("Test Acc : {}".format((cm_test[0,0] + cm_test[1,1])/cm_test.sum()))
            print("Test F1-Score : {}".format(f1_score(Y.iloc[valid_idx], y_pre_test)))
            print("-----------------------------------------------------------------------")
            print("-----------------------------------------------------------------------")
            # Record this combination and its validation F1 for later selection
            save_est.append(est)
            save_lr.append(lr)
            save_sub.append(sub)
            f1_score_.append(f1_score(Y.iloc[valid_idx], y_pre_test))
- With only three hyperparameters, the grid search is done with nested for loops (a GridSearchCV alternative is sketched below)
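For larger grids, the same search could be written with scikit-learn's GridSearchCV; a sketch using F1 as the selection metric to match the loop above (cross-validation on the training split instead of a single holdout):
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators': [10, 20, 50],
              'learning_rate': [0.05, 0.1, 0.5],
              'subsample': [0.5, 0.75, 1]}
search = GridSearchCV(GradientBoostingClassifier(random_state=119), param_grid, scoring='f1', cv=3)
search.fit(X.iloc[train_idx], Y.iloc[train_idx])
print(search.best_params_, search.best_score_)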
print(">>> {} <<<\nBest Test f1-score : {}\nBest n_estimators : {}\nBest SubSampling : {}\nBest Learning Rate : {}".format(np.argmax(f1_score_),
f1_score_[np.argmax(f1_score_)],
save_est[np.argmax(f1_score_)],
save_sub[np.argmax(f1_score_)],
save_lr[np.argmax(f1_score_)]))
best_model = GradientBoostingClassifier(n_estimators=save_est[np.argmax(f1_score_)],
learning_rate=save_lr[np.argmax(f1_score_)],
subsample = save_sub[np.argmax(f1_score_)],
random_state=119)
best_model.fit(X.iloc[train_idx], Y.iloc[train_idx])
# Train Acc
y_pre_train = best_model.predict(X.iloc[train_idx])
cm_train = confusion_matrix(Y.iloc[train_idx], y_pre_train)
print("Train Confusion Matrix")
print(cm_train)
print("Train Acc : {}".format((cm_train[0,0] + cm_train[1,1])/cm_train.sum()))
print("Train F1-Score : {}".format(f1_score(Y.iloc[train_idx], y_pre_train)))
# Test Acc
y_pre_test = best_model.predict(X.iloc[valid_idx])
cm_test = confusion_matrix(Y.iloc[valid_idx], y_pre_test)
print("Test Confusion Matrix")
print(cm_test)
print("TesT Acc : {}".format((cm_test[0,0] + cm_test[1,1])/cm_test.sum()))
print("Test F1-Score : {}".format(f1_score(Y.iloc[valid_idx], y_pre_test)))
feature_map = pd.DataFrame(sorted(zip(best_model.feature_importances_, X.columns), reverse=True), columns=['Score', 'Feature'])
print(feature_map)
- F1-score ≈ 0.88; this looks like the best result so far
# Importance Score Top 10
feature_map_10 = feature_map.iloc[:10]
plt.figure(figsize=(20, 10))
sns.barplot(x="Score", y="Feature", data=feature_map_10.sort_values(by="Score", ascending=False), errwidth=40)
plt.title('Gradient Boosting Machine Importance Features')
plt.tight_layout()
plt.show()
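feature_importances_ is impurity-based and reflects the training data; as a cross-check, permutation importance measures how much shuffling each feature degrades the validation score. A sketch:
from sklearn.inspection import permutation_importance
# Score drop on the validation split when each feature is permuted
perm = permutation_importance(best_model, X.iloc[valid_idx], Y.iloc[valid_idx],
                              scoring='f1', n_repeats=10, random_state=119)
perm_map = pd.DataFrame({'Feature': X.columns, 'Score': perm.importances_mean})
print(perm_map.sort_values(by='Score', ascending=False).head(10))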
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
- Import the GBM regressor plus MSE and R² metrics in preparation for regression
# Data Loading (diabetes data)
data = pd.read_csv('https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt', sep='\t')
- The diabetes dataset (tab-separated text)
# X's & Y Split
Y = data['Y']
X = data.drop(columns=['Y'])
X = pd.get_dummies(X, columns=['SEX'])
idx = list(range(X.shape[0]))
train_idx, valid_idx = train_test_split(idx, test_size=0.3, random_state=2023)
print(">>>> # of Train data : {}".format(len(train_idx)))
print(">>>> # of valid data : {}".format(len(valid_idx)))
- Indices are assigned in the same way as before
# GBM Hyperparameter
estimators = [70, 90, 120]
learning = [0.05, 0.1, 0.5]
subsam = [0.5, 0.75, 1]
# Modeling
save_est = []
save_lr = []
save_sub = []
r2_list = []
cnt = 0
for est in estimators:
    for lr in learning:
        for sub in subsam:
            print(">>> {} <<<".format(cnt))
            cnt += 1
            print("Number of Estimators : {}, Learning Rate : {}, Subsample : {}".format(est, lr, sub))
            model = GradientBoostingRegressor(n_estimators=est,
                                              learning_rate=lr,
                                              subsample=sub,
                                              random_state=119)
            model.fit(X.iloc[train_idx], Y.iloc[train_idx])
            # Train performance
            y_pre_train = model.predict(X.iloc[train_idx])
            rmse_train = np.sqrt(mean_squared_error(Y.iloc[train_idx], y_pre_train))
            print("Train RMSE : {}".format(rmse_train))
            print("Train R2 : {}".format(r2_score(Y.iloc[train_idx], y_pre_train)))
            # Validation performance
            y_pre_test = model.predict(X.iloc[valid_idx])
            print("Test RMSE : {}".format(np.sqrt(mean_squared_error(Y.iloc[valid_idx], y_pre_test))))
            print("Test R2 : {}".format(r2_score(Y.iloc[valid_idx], y_pre_test)))
            print("-----------------------------------------------------------------------")
            print("-----------------------------------------------------------------------")
            # Record this combination and its validation R^2 for later selection
            save_est.append(est)
            save_lr.append(lr)
            save_sub.append(sub)
            r2_list.append(r2_score(Y.iloc[valid_idx], y_pre_test))
- Everything else is identical; GradientBoostingRegressor simply replaces GradientBoostingClassifier
- The R² came out lower than the earlier ElasticNet result → for a regression problem on this (small!) dataset, ElasticNet appears to be the better choice (sketch below)
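For reference, the ElasticNet comparison can be reproduced on the same split; a minimal sketch (alpha and l1_ratio are illustrative, not the values from the earlier ElasticNet chapter):
from sklearn.linear_model import ElasticNet
enet = ElasticNet(alpha=0.01, l1_ratio=0.5)   # illustrative hyperparameters - tune for a fair comparison
enet.fit(X.iloc[train_idx], Y.iloc[train_idx])
print("ElasticNet Test R2 : {}".format(r2_score(Y.iloc[valid_idx], enet.predict(X.iloc[valid_idx]))))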