4 분 소요

  • [목적]
    1. Linear Regression
    • 단순 Linear Regression을 활용하여 변수의 중요도 및 방향성을 알아봄
    • 매우 단순한 모델이기 때문에 변수 간 관계가 복잡한(비선형) 대규모 데이터에서는 예측 성능이 떨어질 수 있음
    • 하지만 설명력에서는 큰 장점이 있음
  • [Process]
    1. Define X’s & Y
    2. Split Train & Valid dataset
    3. Modeling
    4. Model 해석
# Package
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split #train:test = 7:3
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, RidgeCV # CV: cross validation
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler # scaling 해주는것
from sklearn.metrics import mean_squared_error # MSE

# Sklearn Toy Data
from sklearn.datasets import load_diabetes
# Data loading (diabetes dataset bundled with scikit-learn)
data = load_diabetes()
print('>>>>X Shape : {}'.format(data['data'].shape))
print('>>>>Y Shape : {}'.format(data['target'].shape))
# Author's note: X has 10 features and Y is a single target column.
# Print the dataset description that ships with the bundle.
print(data['DESCR'])
# Notebook-style display of summary statistics formatted to two decimals.
pd.DataFrame(data['data'], columns=data['feature_names']).describe().applymap(lambda x: f"{x:0.2f}")
# Re-load the data from the tab-separated file at this URL; from here on
# `data` is a pandas DataFrame rather than the scikit-learn bundle.
data = pd.read_csv('https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt', sep='\t')
data.describe()
# X's & Y split: Y is the target column, X is everything else.
Y = data['Y']
X = data.drop(columns=['Y'])
# SEX is a categorical code (author's note: coded 1/2), so it is dummy-encoded.
# Only one dummy is kept (drop_first=True): two complementary dummies always
# sum to one, which is perfectly collinear with the intercept column that
# coef_se() prepends, making X'X singular and the standard errors meaningless.
# dtype=float keeps the design matrix purely numeric (newer pandas would
# otherwise produce bool columns).
X = pd.get_dummies(X, columns=['SEX'], drop_first=True, dtype=float)
  • [Data Split]
    • Data Split을 진행할 때 BigData의 경우 데이터를 직접 나누지 말고 index만 분할한 뒤, 그 index로 필요한 행을 선택하여 모델에 적용해야 함
    • 이유는 분할된 Data set을 새로 만들어 보관하면 데이터가 복사되어 메모리에 부담을 주기 때문
# Number of rows in X (bare expression: only displayed when run in a notebook).
X.shape[0]
# Split row positions instead of the data itself; rows are selected with
# .iloc only where they are needed.
idx = list(range(X.shape[0]))
train_idx, valid_idx = train_test_split(idx, test_size=0.3, random_state=2023) # without a fixed random_state the split indices change on every run
print(">>>> # of Train data : {}".format(len(train_idx)))
print(">>>> # of valid data : {}".format(len(valid_idx)))
# Linear Regression: fit on the training rows only.
results = LinearRegression().fit(X.iloc[train_idx], Y.iloc[train_idx])
import scipy
from sklearn import metrics

def sse(clf, X, y):
    """Return the mean of the squared residuals of the model on (X, y).

    Despite the name, the sum of squared errors is divided by the number
    of rows of ``X``, so the result is a mean squared error.

    Parameters
    ----------
    clf : object
        A fitted model exposing a ``predict()`` method.
    X : array-like
        Feature matrix; ``X.shape[0]`` is the number of observations.
    y : array-like
        Observed target values, one per row of ``X``.

    Returns
    -------
    float
        Sum of squared residuals divided by ``X.shape[0]``.
    """
    residuals = clf.predict(X) - y
    squared_total = np.sum(residuals ** 2)
    # X.shape is (rows, cols); index 0 is the number of observations.
    n_rows = X.shape[0]
    return squared_total / n_rows


def adj_r2_score(clf, X, y):
    """Return the adjusted R-squared of the model on (X, y).

    The plain R-squared is penalised for the number of predictors:
    ``1 - (1 - R^2) * (n - 1) / (n - p - 1)``.

    Parameters
    ----------
    clf : object
        A fitted model exposing a ``predict()`` method.
    X : array-like
        Feature matrix of shape ``(n, p)``.
    y : array-like
        Observed target values, one per row of ``X``.

    Returns
    -------
    float
        The adjusted R-squared.
    """
    n_obs = X.shape[0]
    n_features = X.shape[1]
    plain_r2 = metrics.r2_score(y, clf.predict(X))
    # Ratio of total to residual degrees of freedom.
    penalty = (n_obs - 1) / (n_obs - n_features - 1)
    return 1 - (1 - plain_r2) * penalty


def coef_se(clf, X, y):
    """Calculate standard errors for the intercept and beta coefficients.

    Uses the classical OLS formula ``sqrt(diag(s^2 * (X1'X1)^-1))`` where
    ``X1`` is ``X`` with a leading column of ones (the intercept) and
    ``s^2 = RSS / (n - p - 1)`` is the residual variance.

    Parameters
    ----------
    clf : object
        A fitted linear model exposing a ``predict()`` method.
    X : array-like
        Feature matrix of shape ``(n, p)``; must be convertible to float.
    y : array-like
        Target values of shape ``(n,)``.

    Returns
    -------
    numpy.ndarray
        Array of length ``p + 1``: the intercept's standard error first,
        followed by one standard error per column of ``X``.

    Raises
    ------
    ValueError
        If there are no residual degrees of freedom (``n <= p + 1``).
    numpy.linalg.LinAlgError
        If ``X1'X1`` is singular (e.g. perfectly collinear columns).
    """
    # Plain float ndarray: avoids the deprecated np.matrix and object-dtype
    # arrays that arise from DataFrames mixing bool and numeric columns.
    features = np.asarray(X, dtype=float)
    n, p = features.shape[0], features.shape[1]
    dof = n - p - 1
    if dof <= 0:
        raise ValueError(
            "Need more observations than parameters to estimate standard "
            "errors (n={0}, p={1}).".format(n, p))
    # Prepend a column of ones so the intercept gets a standard error too.
    X1 = np.hstack((np.ones((n, 1)), features))
    # Pass the original X to predict() so the model sees the same input type
    # it would get from any other caller.
    residuals = (np.ravel(np.asarray(y, dtype=float))
                 - np.ravel(np.asarray(clf.predict(X), dtype=float)))
    sigma2 = (residuals @ residuals) / dof
    cov = sigma2 * np.linalg.inv(X1.T @ X1)
    # Standard errors are the element-wise square roots of the variances on
    # the diagonal -- not the diagonal of the matrix square root.
    return np.sqrt(np.diagonal(cov))


def coef_tval(clf, X, y):
    """Calculate t-statistics for the intercept and beta coefficients.

    Each estimate is divided by its standard error from ``coef_se``.

    Parameters
    ----------
    clf : object
        A fitted linear model exposing ``intercept_``, ``coef_`` and a
        ``predict()`` method.
    X : array-like
        Feature matrix the statistics are evaluated on.
    y : array-like
        Target values, one per row of ``X``.

    Returns
    -------
    numpy.ndarray
        Flat array with the intercept's t-statistic first, followed by
        one t-statistic per coefficient.
    """
    # Compute the standard errors once: coef_se re-runs clf.predict and
    # rebuilds the design matrix on every call.
    std_errors = coef_se(clf, X, y)
    a = np.array(clf.intercept_ / std_errors[0])
    b = np.array(clf.coef_ / std_errors[1:])
    return np.append(a, b)


def coef_pval(clf, X, y):
    """Calculate two-sided p-values for the intercept and beta coefficients.

    The t-statistics from ``coef_tval`` are compared against a Student's t
    distribution with ``n - p - 1`` degrees of freedom (observations minus
    predictors minus the intercept), the same residual degrees of freedom
    used by ``adj_r2_score``.

    Parameters
    ----------
    clf : object
        A fitted linear model exposing ``intercept_``, ``coef_`` and a
        ``predict()`` method.
    X : array-like
        Feature matrix of shape ``(n, p)``.
    y : array-like
        Target values, one per row of ``X``.

    Returns
    -------
    numpy.ndarray
        Array of p-values, intercept first.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.stats`` is loaded on every SciPy version.
    from scipy import stats

    n = X.shape[0]
    p = X.shape[1]
    t = coef_tval(clf, X, y)
    # sf(x) == 1 - cdf(x) but keeps precision for large |t|, where
    # ``1 - cdf`` would round to exactly 0.
    return 2 * stats.t.sf(np.abs(t), n - p - 1)

def summary(clf, X, y, xlabels=None):
    """Print a coefficient table and fit statistics for a fitted regression.

    The table lists, for the intercept and every predictor, the estimate,
    its standard error, t value and p value. Below it the R-squared,
    adjusted R-squared and the value returned by ``sse`` (labelled MSE)
    are printed.

    Parameters
    ----------
    clf : object
        A fitted linear model exposing ``intercept_``, ``coef_`` and a
        ``predict()`` method.
    X : array-like
        Feature matrix the statistics are evaluated on.
    y : array-like
        Target values, one per row of ``X``.
    xlabels : list, tuple or array-like, optional
        Labels for the predictors, one per column of ``X``. Defaults to
        ``x1 .. xp``.

    Raises
    ------
    AssertionError
        If the number of labels does not match the number of columns of X.
    """
    # Check and/or make xlabels
    ncols = X.shape[1]
    if xlabels is None:
        xlabels = np.array(
            ['x{0}'.format(i) for i in range(1, ncols + 1)], dtype='str')
    elif isinstance(xlabels, (tuple, list)):
        xlabels = np.array(xlabels, dtype='str')
    # Make sure dims of xlabels matches dims of X
    if xlabels.shape[0] != ncols:
        raise AssertionError(
            "Dimension of xlabels {0} does not match "
            "X {1}.".format(xlabels.shape, X.shape))
    # Flatten intercept_ and coef_ explicitly so both the usual layout
    # (scalar intercept, 1-D coef_) and the single-target 2-D layout
    # (intercept_ of shape (1,), coef_ of shape (1, p)) produce one flat
    # vector, without using a broad try/except as control flow.
    estimates = np.round(
        np.concatenate((np.ravel(clf.intercept_), np.ravel(clf.coef_))), 6)
    # Create data frame of coefficient estimates and associated stats
    coef_df = pd.DataFrame(
        {
            'Estimate': estimates,
            'Std. Error': np.round(coef_se(clf, X, y), 6),
            't value': np.round(coef_tval(clf, X, y), 4),
            'p value': np.round(coef_pval(clf, X, y), 6),
        },
        index=['_intercept'] + list(xlabels),
    )
    # Output results
    print('Coefficients:')
    print(coef_df.to_string(index=True))
    print('---')
    print('R-squared:  {0:.6f},    Adjusted R-squared:  {1:.6f},    MSE: {2:.1f}'.format(
        metrics.r2_score(y, clf.predict(X)), adj_r2_score(clf, X, y), sse(clf, X, y)))
# Report the unscaled model on the validation rows, then on the training rows.
summary(results, X.iloc[valid_idx], Y.iloc[valid_idx], xlabels=X.columns)
summary(results, X.iloc[train_idx], Y.iloc[train_idx], xlabels=X.columns)
# Scaling
scaler = MinMaxScaler().fit(X.iloc[train_idx]) # fit the scaler on the training rows only
# Apply the train-fitted scaler to every row, then restore the column names.
X_scal = scaler.transform(X)
X_scal = pd.DataFrame(X_scal, columns=X.columns)
# Linear Regression on the scaled features (training rows only).
results = LinearRegression().fit(X_scal.iloc[train_idx], Y.iloc[train_idx])
summary(results, X_scal.iloc[valid_idx], Y.iloc[valid_idx], xlabels=X_scal.columns)
# Author's note on the p-values from their run: BMI, BP, S5.
# Author's note: S5 has the largest beta (207), so it is read as the most important.

카테고리:

업데이트:

댓글남기기