
[Purpose]

  • Hands-on practice and interpretation of GBM, a boosting model that improves on AdaBoost and aims to reduce bias (core idea sketched below)
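  • The sketch below is not from the original post; it only illustrates the core GBM idea for squared-error loss, where each new tree is fit to the residuals (the negative gradients) of the current ensemble. Names such as gbm_sketch and n_rounds are illustrative.

# Minimal sketch of gradient boosting with squared-error loss (illustrative only)
import numpy as np
from sklearn.tree import DecisionTreeRegressor

def gbm_sketch(X, y, n_rounds=50, learning_rate=0.1, max_depth=3):
    pred = np.full(len(y), y.mean(), dtype=float)  # start from a constant prediction
    trees = []
    for _ in range(n_rounds):
        residual = y - pred                         # negative gradient of squared error
        tree = DecisionTreeRegressor(max_depth=max_depth).fit(X, residual)
        pred += learning_rate * tree.predict(X)     # shrink each tree's contribution
        trees.append(tree)
    return trees, pred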

[Process]

  1. Define X’s & Y
  2. Split Train & Valid dataset
  3. Modeling
  4. Model interpretation
import os
import gc
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.ensemble import GradientBoostingClassifier
from collections import Counter
# Data Loading (surgical mortality data)
data=pd.read_csv("https://raw.githubusercontent.com/GonieAhn/Data-Science-online-course-from-gonie/main/Data%20Store/example_data.csv")
# X's & Y Split
Y = data['censor']
X = data.drop(columns=['censor'])
  • In practice, X and Y are often not split into separate objects; to save memory, the columns are indexed directly, e.g.:

      data.iloc[train_idx, 1:]
      data.iloc[train_idx, 0]
    
idx = list(range(X.shape[0]))
train_idx, valid_idx = train_test_split(idx, test_size=0.3, random_state=2021)
print(">>>> # of Train data : {}".format(len(train_idx)))
print(">>>> # of valid data : {}".format(len(valid_idx)))
print(">>>> # of Train data Y : {}".format(Counter(Y.iloc[train_idx])))
print(">>>> # of valid data Y : {}".format(Counter(Y.iloc[valid_idx])))

[Gradient Boosting Machine Parameters]

  • Package : https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
  • n_estimators : number of boosting trees
  • learning_rate : trades off against n_estimators (a smaller rate typically needs more trees)
    • Shrinks the contribution of each tree at each boosting iteration
  • max_features : number of features sampled per split (helps prevent overfitting)
  • subsample : fraction of rows used to fit each tree (helps prevent overfitting; sampled without replacement, not a bootstrap)
  • max_depth : limits the maximum depth of each tree
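  • max_features and max_depth are not tuned in the search below; an illustrative instantiation (values here are arbitrary, not from the original post) would look like:

# Illustrative only: row/feature subsampling and a depth limit to curb overfitting
example_model = GradientBoostingClassifier(n_estimators=100,
                                           learning_rate=0.1,
                                           subsample=0.75,    # fraction of rows per tree, no replacement
                                           max_features=0.5,  # fraction of features considered per split
                                           max_depth=3,       # scikit-learn's default depth
                                           random_state=119)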
# GBM Hyperparameter
estimators = [10, 20, 50]
learning = [0.05, 0.1, 0.5]
subsam = [0.5, 0.75, 1]

# Modeling
save_est = []
save_lr = []
save_sub = []
f1_score_ = []

cnt = 0
for est in estimators:
    for lr in learning:
        for sub in subsam:
            print(">>> {} <<<".format(cnt))
            cnt += 1
            print("Number of Estimators : {}, Learning Rate : {}, Subsample : {}".format(est, lr, sub))
            
            model = GradientBoostingClassifier(n_estimators=est, 
                                               learning_rate=lr, 
                                               subsample=sub,
                                               random_state=119)
            model.fit(X.iloc[train_idx], Y.iloc[train_idx])

            # Train Acc
            y_pre_train = model.predict(X.iloc[train_idx])
            cm_train = confusion_matrix(Y.iloc[train_idx], y_pre_train)
            print("Train Confusion Matrix")
            print(cm_train)
            print("Train Acc : {}".format((cm_train[0,0] + cm_train[1,1])/cm_train.sum()))
            print("Train F1-Score : {}".format(f1_score(Y.iloc[train_idx], y_pre_train)))

            # Test Acc
            y_pre_test = model.predict(X.iloc[valid_idx])
            cm_test = confusion_matrix(Y.iloc[valid_idx], y_pre_test)
            print("Test Confusion Matrix")
            print(cm_test)
            print("TesT Acc : {}".format((cm_test[0,0] + cm_test[1,1])/cm_test.sum()))
            print("Test F1-Score : {}".format(f1_score(Y.iloc[valid_idx], y_pre_test)))
            print("-----------------------------------------------------------------------")
            print("-----------------------------------------------------------------------")
            save_est.append(est)
            save_lr.append(lr)
            save_sub.append(sub)
            f1_score_.append(f1_score(Y.iloc[valid_idx], y_pre_test))
  • With only three hyperparameters, a simple nested for loop is enough for the search (an equivalent ParameterGrid version is sketched below)
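  • The same search can also be written with scikit-learn's ParameterGrid; a minimal sketch, equivalent to the loops above up to iteration order:

# Equivalent search expressed with ParameterGrid
from sklearn.model_selection import ParameterGrid

param_grid = {"n_estimators": estimators, "learning_rate": learning, "subsample": subsam}
for params in ParameterGrid(param_grid):
    model = GradientBoostingClassifier(random_state=119, **params)
    model.fit(X.iloc[train_idx], Y.iloc[train_idx])
    score = f1_score(Y.iloc[valid_idx], model.predict(X.iloc[valid_idx]))
    print(params, round(score, 4))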
print(">>> {} <<<\nBest Test f1-score : {}\nBest n_estimators : {}\nBest SubSampling : {}\nBest Learning Rate : {}".format(np.argmax(f1_score_),
                                                                                                                           f1_score_[np.argmax(f1_score_)], 
                                                                                                                           save_est[np.argmax(f1_score_)],
                                                                                                                           save_sub[np.argmax(f1_score_)],
                                                                                                                           save_lr[np.argmax(f1_score_)]))
best_model = GradientBoostingClassifier(n_estimators=save_est[np.argmax(f1_score_)], 
                                        learning_rate=save_lr[np.argmax(f1_score_)],
                                        subsample = save_sub[np.argmax(f1_score_)], 
                                        random_state=119)
best_model.fit(X.iloc[train_idx], Y.iloc[train_idx])

# Train Acc
y_pre_train = best_model.predict(X.iloc[train_idx])
cm_train = confusion_matrix(Y.iloc[train_idx], y_pre_train)
print("Train Confusion Matrix")
print(cm_train)
print("Train Acc : {}".format((cm_train[0,0] + cm_train[1,1])/cm_train.sum()))
print("Train F1-Score : {}".format(f1_score(Y.iloc[train_idx], y_pre_train)))

# Test Acc
y_pre_test = best_model.predict(X.iloc[valid_idx])
cm_test = confusion_matrix(Y.iloc[valid_idx], y_pre_test)
print("Test Confusion Matrix")
print(cm_test)
print("TesT Acc : {}".format((cm_test[0,0] + cm_test[1,1])/cm_test.sum()))
print("Test F1-Score : {}".format(f1_score(Y.iloc[valid_idx], y_pre_test)))
feature_map = pd.DataFrame(sorted(zip(best_model.feature_importances_, X.columns), reverse=True), columns=['Score', 'Feature'])
print(feature_map)
  • Test F1-score is about 0.88, the best result among the settings tried
# Importance Score Top 10
feature_map_10 = feature_map.iloc[:10]
plt.figure(figsize=(20, 10))
sns.barplot(x="Score", y="Feature", data=feature_map_10.sort_values(by="Score", ascending=False))
plt.title('Gradient Boosting Machine Importance Features')
plt.tight_layout()
plt.show()

[Figure] Gradient Boosting Machine Importance Features (top-10 bar plot)
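  • feature_importances_ above is impurity-based; as a cross-check, permutation importance on the validation split can be computed with scikit-learn. A minimal sketch, not part of the original post:

# Permutation importance: how much the valid F1 drops when each feature is shuffled
from sklearn.inspection import permutation_importance

perm = permutation_importance(best_model, X.iloc[valid_idx], Y.iloc[valid_idx],
                              scoring="f1", n_repeats=10, random_state=119)
perm_map = pd.DataFrame({"Feature": X.columns, "Score": perm.importances_mean})
print(perm_map.sort_values("Score", ascending=False).head(10))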

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
  • Switching to regression: GBM evaluated with MSE (RMSE) and R^2
# Data Loading (diabetes data)
data = pd.read_csv('https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt', sep='\t')
  • Diabetes dataset (target Y is a quantitative measure of disease progression)
# X's & Y Split
Y = data['Y']
X = data.drop(columns=['Y']) 
X = pd.get_dummies(X, columns=['SEX'])
idx = list(range(X.shape[0]))
train_idx, valid_idx = train_test_split(idx, test_size=0.3, random_state=2023)
print(">>>> # of Train data : {}".format(len(train_idx)))
print(">>>> # of valid data : {}".format(len(valid_idx)))
  • Indices are assigned in the same way as in the classification example
# GBM Hyperparameter
estimators = [70, 90, 120]
learning = [0.05, 0.1, 0.5]
subsam = [0.5, 0.75, 1]

# Modeling
save_est = []
save_lr = []
save_sub = []
r2_score_ = []

cnt = 0
for est in estimators:
    for lr in learning:
        for sub in subsam:
            print(">>> {} <<<".format(cnt))
            cnt += 1
            print("Number of Estimators : {}, Learning Rate : {}, Subsample : {}".format(est, lr, sub))
            
            model = GradientBoostingRegressor(n_estimators=est, 
                                               learning_rate=lr, 
                                               subsample=sub,
                                               random_state=119)
            model.fit(X.iloc[train_idx], Y.iloc[train_idx])

            # Train RMSE / R2
            y_pre_train = model.predict(X.iloc[train_idx])
            rmse_train = np.sqrt(mean_squared_error(Y.iloc[train_idx], y_pre_train))
            print("Train RMSE : {}".format(rmse_train))
            print("Train R2 : {}".format(r2_score(Y.iloc[train_idx], y_pre_train)))

            # Test RMSE / R2
            y_pre_test = model.predict(X.iloc[valid_idx])
            print("TesT RMSE : {}".format(np.sqrt(mean_squared_error(Y.iloc[valid_idx], y_pre_test))))
            print("Test R2 : {}".format(r2_score(Y.iloc[valid_idx], y_pre_test)))
            print("-----------------------------------------------------------------------")
            print("-----------------------------------------------------------------------")
  • Everything else is identical; GradientBoostingRegressor simply replaces GradientBoostingClassifier
  • The R^2 comes out lower than the earlier ElasticNet result -> for a small, largely linear dataset like this, ElasticNet seems to do better (a rough comparison is sketched below)
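  • The ElasticNet figure mentioned above comes from an earlier post; a rough way to reproduce the comparison on the same split (alpha and l1_ratio here are illustrative, not the settings used in that post):

# Quick ElasticNet baseline on the same diabetes split (hyperparameters are illustrative)
from sklearn.linear_model import ElasticNet

enet = ElasticNet(alpha=0.01, l1_ratio=0.5, random_state=119)
enet.fit(X.iloc[train_idx], Y.iloc[train_idx])
print("ElasticNet Test R2 : {}".format(r2_score(Y.iloc[valid_idx], enet.predict(X.iloc[valid_idx]))))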
