12. SHAP Code
[Purpose]
- Compute Global Importance Scores and Local Importance Scores
- Hands-on practice with locality using SHAP plots
[Process]
- Define X's & Y
- Split train & validation datasets
- Modeling
- Interpret the model
- Interpret the black box
!pip install shap
- Install separately
import os
import gc
import re
import pickle
import joblib
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import shap
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score
from lightgbm import LGBMClassifier, LGBMRegressor
from collections import Counter
# Data loading (mortality-after-surgery data)
data = pd.read_csv("https://raw.githubusercontent.com/GonieAhn/Data-Science-online-course-from-gonie/main/Data%20Store/example_data.csv")
# Data Checking
col = []
missing = []
level = []
for name in data.columns:
    # Missing percentage
    missper = data[name].isnull().sum() / data.shape[0]
    missing.append(round(missper, 4))
    # Leveling (number of distinct non-missing values)
    lel = data[name].dropna()
    level.append(len(set(lel)))
    # Column name
    col.append(name)
summary = pd.concat([pd.DataFrame(col, columns=['name']),
                     pd.DataFrame(missing, columns=['Missing Percentage']),
                     pd.DataFrame(level, columns=['Level'])], axis=1)
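# Drop columns with a single level or >= 80% missing values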
drop_col = summary['name'][(summary['Level'] <= 1) | (summary['Missing Percentage'] >= 0.8)]
data.drop(columns=drop_col, inplace=True)
print(">>>> Data Shape : {}".format(data.shape))
# X's & Y Split
Y = data['censor']
X = data.drop(columns=['censor'])
idx = list(range(X.shape[0]))
train_idx, valid_idx = train_test_split(idx, test_size=0.3, random_state=2021)
print(">>>> # of Train data : {}".format(len(train_idx)))
print(">>>> # of valid data : {}".format(len(valid_idx)))
print(">>>> # of Train data Y : {}".format(Counter(Y.iloc[train_idx])))
print(">>>> # of valid data Y : {}".format(Counter(Y.iloc[valid_idx])))
# n_estimators
n_tree = [5, 10, 20]
# learning_rate
l_rate = [0.1, 0.3]
# max_depth
m_depth = [3, 5]
# reg_alpha
L1_norm = [0.1, 0.3, 0.5]
# Modeling
save_n = []
save_l = []
save_m = []
save_L1 = []
f1_score_ = []
cnt = 0
for n in n_tree:
    for l in l_rate:
        for m in m_depth:
            for L1 in L1_norm:
                print(">>> {} <<<".format(cnt))
                cnt += 1
                print("n_estimators : {}, learning_rate : {}, max_depth : {}, reg_alpha : {}".format(n, l, m, L1))
                model = LGBMClassifier(n_estimators=n, learning_rate=l,
                                       max_depth=m, reg_alpha=L1,
                                       n_jobs=-1, objective='cross_entropy')
                model.fit(X.iloc[train_idx], Y.iloc[train_idx])
                # Train Acc
                y_pre_train = model.predict(X.iloc[train_idx])
                cm_train = confusion_matrix(Y.iloc[train_idx], y_pre_train)
                print("Train Confusion Matrix")
                print(cm_train)
                print("Train Acc : {}".format((cm_train[0,0] + cm_train[1,1])/cm_train.sum()))
                print("Train F1-Score : {}".format(f1_score(Y.iloc[train_idx], y_pre_train)))
                # Test Acc
                y_pre_test = model.predict(X.iloc[valid_idx])
                cm_test = confusion_matrix(Y.iloc[valid_idx], y_pre_test)
                print("Test Confusion Matrix")
                print(cm_test)
                print("Test Acc : {}".format((cm_test[0,0] + cm_test[1,1])/cm_test.sum()))
                print("Test F1-Score : {}".format(f1_score(Y.iloc[valid_idx], y_pre_test)))
                print("-----------------------------------------------------------------------")
                print("-----------------------------------------------------------------------")
                save_n.append(n)
                save_l.append(l)
                save_m.append(m)
                save_L1.append(L1)
                f1_score_.append(f1_score(Y.iloc[valid_idx], y_pre_test))
                #joblib.dump(model, './LightGBM_model/Result_{}_{}_{}_{}_{}.pkl'.format(n, l, m, L1, round(f1_score_[-1], 4)))
                #gc.collect()
print(">>> {} <<<\nBest Test f1-score : {}\nBest n_estimators : {}\nBest Learning Rate : {}\nBest Max_depth : {}\nBest L1-norm : {}".format(np.argmax(f1_score_),
f1_score_[np.argmax(f1_score_)],
save_n[np.argmax(f1_score_)],
save_l[np.argmax(f1_score_)],
save_m[np.argmax(f1_score_)],
save_L1[np.argmax(f1_score_)]))
best_model = LGBMClassifier(n_estimators=save_n[np.argmax(f1_score_)], learning_rate=save_l[np.argmax(f1_score_)],
max_depth=save_m[np.argmax(f1_score_)], reg_alpha=save_L1[np.argmax(f1_score_)], objective='cross_entropy',
random_state=119)
best_model.fit(X.iloc[train_idx], Y.iloc[train_idx])
# Train Acc
y_pre_train = best_model.predict(X.iloc[train_idx])
cm_train = confusion_matrix(Y.iloc[train_idx], y_pre_train)
print("Train Confusion Matrix")
print(cm_train)
print("Train Acc : {}".format((cm_train[0,0] + cm_train[1,1])/cm_train.sum()))
print("Train F1-Score : {}".format(f1_score(Y.iloc[train_idx], y_pre_train)))
# Test Acc
y_pre_test = best_model.predict(X.iloc[valid_idx])
cm_test = confusion_matrix(Y.iloc[valid_idx], y_pre_test)
print("Test Confusion Matrix")
print(cm_test)
print("TesT Acc : {}".format((cm_test[0,0] + cm_test[1,1])/cm_test.sum()))
print("Test F1-Score : {}".format(f1_score(Y.iloc[valid_idx], y_pre_test)))
feature_map = pd.DataFrame(sorted(zip(best_model.feature_importances_, X.columns), reverse=True), columns=['Score', 'Feature'])
print(feature_map)
# Importance Score Top 10
feature_map_10 = feature_map.iloc[:10]
plt.figure(figsize=(20, 10))
sns.barplot(x="Score", y="Feature", data=feature_map_10.sort_values(by="Score", ascending=False), errwidth=40)
plt.title('XGBoost Importance Features')
plt.tight_layout()
plt.show()
[Shapley Values]
- Package : https://shap.readthedocs.io/en/latest/
- Package Code & Example : https://github.com/slundberg/shap
[Shapley value built-in plots]
- force_plot : arranges the Shapley values for a single observation, or the whole dataset, along a one-dimensional axis
- dependence_plot : shows the Shapley value of each feature
  - the feature shown on the y-axis captures its relationship (interaction effect) with the chosen x feature; a vertical color pattern in the plot suggests such a relationship exists
- summary_plot : shows the Shapley values globally
  - visualizes how each feature affects the distribution of Shapley values
- shap_interaction_values : examines the relationships (correlations) among the X's
[Shapley value caveats]
- Computing Shapley values takes a long time
- The larger the dataset, the longer it takes, so extracting only the indices you need is recommended (see the sketch after this list)
- The current dataset is small, so we will run on the full data
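- A minimal sketch of such index subsampling; `n_sample` and `sample_idx` are illustrative names, not from the original, and the explainer it feeds is built in the next step:
# Sample a subset of row indices before computing SHAP values (illustrative sketch)
n_sample = 1000
rng = np.random.default_rng(2021)
sample_idx = rng.choice(train_idx, size=min(n_sample, len(train_idx)), replace=False)
# shap_values_sample = explainer.shap_values(X.iloc[sample_idx])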
[TreeExplainer]
- While SHAP can explain the output of any machine learning model, we have developed a high-speed exact algorithm for tree ensemble methods (see our Nature MI paper). Fast C++ implementations are supported for XGBoost, LightGBM, CatBoost, scikit-learn and pyspark tree models
[GradientExplainer]
- Expected gradients combines ideas from Integrated Gradients, SHAP, and SmoothGrad into a single expected value equation. This allows an entire dataset to be used as the background distribution (as opposed to a single reference value) and allows local smoothing. If we approximate the model with a linear function between each background data sample and the current input to be explained, and we assume the input features are independent then expected gradients will compute approximate SHAP values. In the example below we have explained how the 7th intermediate layer of the VGG16 ImageNet model impacts the output probabilities.
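- A minimal, hedged sketch of GradientExplainer usage, assuming TensorFlow/Keras is installed; the toy network and random arrays are placeholders, not the docs' VGG16 example:
import tensorflow as tf
toy_model = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation='relu', input_shape=(20,)),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
background = np.random.rand(100, 20).astype(np.float32)  # background distribution (a whole dataset is allowed)
to_explain = np.random.rand(5, 20).astype(np.float32)    # inputs to explain
g_explainer = shap.GradientExplainer(toy_model, background)
g_shap_values = g_explainer.shap_values(to_explain)      # expected-gradients approximation of SHAP values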
[KernelExplainer] - takes a very long time to compute…
- Kernel SHAP uses a specially-weighted local linear regression to estimate SHAP values for any model. Below is a simple example for explaining a multi-class SVM on the classic iris dataset.
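- Following that description, a hedged sketch of Kernel SHAP on an SVM over the iris data; the subsampled background, `nsamples=100`, and the 10-row slice are illustrative choices to keep the runtime manageable:
from sklearn.svm import SVC
X_iris, y_iris = shap.datasets.iris()
svm = SVC(kernel='rbf', probability=True).fit(X_iris, y_iris)
k_explainer = shap.KernelExplainer(svm.predict_proba, shap.sample(X_iris, 50))  # subsampled background
k_shap_values = k_explainer.shap_values(X_iris.iloc[:10], nsamples=100)         # explain the first 10 rows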
# Build the SHAP explainer
explainer = shap.TreeExplainer(best_model)
# Compute the SHAP values
shap_values = explainer.shap_values(X.iloc[train_idx])
- The SHAP values can be computed on the train set, the validation set, or the full data; any of them works
- Then check the shape of the result
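- A quick sanity check of that shape; note (an assumption to verify on your shap version) that a binary classifier may return either a single (n_samples, n_features) array or a list of per-class arrays:
print(np.array(shap_values).shape)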
# force_plot : arranges the Shapley values for a single observation or the whole dataset along a one-dimensional axis
# Example: the third observation
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values[2, :], X.iloc[train_idx[2], :], link="logit")
- shap.initjs(): initializes the JS needed to render SHAP plots; always call it first!
- base value: 0.2988 -> prediction 0.85
- the biggest driver is "event"
Y.iloc[train_idx[2]]
- the label is 1, so it is close to the 0.85 prediction
# Cumulative visualization over all observations
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values, X.iloc[train_idx])
- Visualizes the stacked values: "event" clearly has the largest effect
- The two classes appear to split around sample 200
# Sort features by the absolute magnitude of their SHAP values
top_inds = np.argsort(-np.sum(np.abs(shap_values), 0))
# Dependence plots for the top-3 most influential features
for i in range(3):
    shap.dependence_plot(top_inds[i], shap_values, X.iloc[train_idx])
- The three most influential features: event, cd496, preanti
# Global Importance Score
shap.summary_plot(shap_values, X.iloc[train_idx])
- Right side (2.0): class 1, left side (-2.0): class 0
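- As an alternative global view (a standard option of shap's summary_plot), the same values can be drawn as a bar chart of mean |SHAP| per feature:
shap.summary_plot(shap_values, X.iloc[train_idx], plot_type="bar")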
# Interaction Plot
shap_interaction_values = explainer.shap_interaction_values(X.iloc[train_idx])
shap.summary_plot(shap_interaction_values, X.iloc[train_idx])