import numpy as np
import pandas as pd
import scipy
import scipy.linalg
import scipy.stats
from sklearn import metrics
def sse(clf, X, y):  # can be read as the MSE
    """Return the mean of the squared residuals of the model on ``X``.

    The squared differences between ``clf.predict(X)`` and ``y`` are summed
    and then divided by the number of rows in ``X``.

    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the model.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].

    Returns
    -------
    float
        Sum of squared residuals divided by ``X.shape[0]``.
    """
    residuals = clf.predict(X) - y
    # X.shape is (rows, cols); X.shape[0] is therefore the number of rows.
    n_rows = X.shape[0]
    squared_total = np.sum(residuals ** 2)
    return squared_total / n_rows
def adj_r2_score(clf, X, y):  # adjusted R^2 value
    """Return the adjusted :math:`R^2` of the model on ``X`` and ``y``.

    The plain :math:`R^2` from ``metrics.r2_score`` is corrected with the
    factor ``(n - 1) / (n - p - 1)``, where ``n`` is the number of rows and
    ``p`` the number of columns of ``X``.

    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the model.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].

    Returns
    -------
    float
        The adjusted :math:`R^2` of the model.
    """
    n_obs, n_features = X.shape[0], X.shape[1]
    plain_r2 = metrics.r2_score(y, clf.predict(X))
    # Penalty factor that grows with the number of predictors.
    penalty = (n_obs - 1) / (n_obs - n_features - 1)
    unexplained = 1 - plain_r2
    return 1 - unexplained * penalty
def coef_se(clf, X, y):  # standard errors of the regression coefficients (betas)
    """Calculate standard errors for the intercept and beta coefficients.

    Uses the classical OLS estimate: the square root of the diagonal of
    ``s^2 * (X1' X1)^-1``, where ``X1`` is ``X`` with a leading column of
    ones (for the intercept) and ``s^2`` is the residual variance, i.e. the
    mean squared residual rescaled from ``n`` to the residual degrees of
    freedom ``n - p - 1``.

    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the model, of shape =
        [n_samples, n_features].
    y : numpy.ndarray
        Target training values, of shape = [n_samples].

    Returns
    -------
    numpy.ndarray
        Standard errors, intercept first, of shape = [n_features + 1].
        Every entry is NaN when there are no residual degrees of freedom
        (``n_samples <= n_features + 1``), because the residual variance
        is undefined in that case.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``X1' X1`` is singular and cannot be inverted.
    """
    n, p = X.shape[0], X.shape[1]
    dof = n - p - 1  # residual degrees of freedom (intercept + p slopes)
    if dof <= 0:
        return np.full(p + 1, np.nan)

    # Design matrix: prepend a column of ones for the intercept term.
    design = np.column_stack((np.ones(n), np.asarray(X, dtype=float)))

    # Flatten both sides so an (n,) target and an (n, 1) prediction (or the
    # reverse) are subtracted element-wise instead of being broadcast.
    residuals = (np.ravel(y).astype(float)
                 - np.ravel(clf.predict(X)).astype(float))
    sigma2 = np.mean(residuals ** 2) * n / dof

    covariance = sigma2 * np.linalg.inv(design.T.dot(design))
    # Standard errors are the element-wise square roots of the variances on
    # the diagonal; a matrix square root would mix in the covariances.
    return np.sqrt(np.diagonal(covariance))
def coef_tval(clf, X, y):  # t values
    """Calculate t-statistics for the intercept and beta coefficients.

    Each estimate (``clf.intercept_`` first, then ``clf.coef_``) is divided
    by its standard error as returned by ``coef_se``.

    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model with a `predict()` method and fitted
        ``intercept_`` and ``coef_`` attributes.
    X : numpy.ndarray
        Training data used to fit the model.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].

    Returns
    -------
    numpy.ndarray
        A flat array of t-statistic values, intercept first.
    """
    # Compute the standard errors once; they are shared by the intercept
    # and the slope coefficients.
    std_errors = coef_se(clf, X, y)
    intercept_t = np.array(clf.intercept_ / std_errors[0])
    slopes_t = np.array(clf.coef_ / std_errors[1:])
    # np.append flattens both pieces into a single 1-D array.
    return np.append(intercept_t, slopes_t)
def coef_pval(clf, X, y):  # p values
    """Calculate two-sided p-values for the intercept and beta coefficients.

    The t-statistics from ``coef_tval`` are compared against a Student t
    distribution with ``n - p - 1`` residual degrees of freedom, where ``n``
    is the number of rows and ``p`` the number of columns of ``X``.

    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the model.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].

    Returns
    -------
    numpy.ndarray
        An array of p-values, intercept first.
    """
    # One degree of freedom is consumed by the intercept and one by each
    # predictor column.
    dof = X.shape[0] - X.shape[1] - 1
    t_values = coef_tval(clf, X, y)
    # The survival function stays accurate in the far tail, whereas
    # ``1 - cdf`` cancels to exactly 0 for large |t|.
    return 2 * scipy.stats.t.sf(np.abs(t_values), dof)
def summary(clf, X, y, xlabels=None):
    """Print summary statistics for a fitted regression model.

    Prints a coefficient table (estimate, standard error, t value and
    p value for the intercept and every predictor) followed by a line with
    R-squared, adjusted R-squared and the value returned by ``sse``.

    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model with a `predict()` method and fitted
        ``intercept_`` and ``coef_`` attributes.
    X : numpy.ndarray
        Training data used to fit the model.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    xlabels : list, tuple
        The labels for the predictors. Defaults to ``x1 .. xN``.

    Raises
    ------
    AssertionError
        If the number of labels does not match the number of columns of
        ``X``.
    """
    # Check and/or make xlabels
    ncols = X.shape[1]
    if xlabels is None:
        xlabels = np.array(
            ['x{0}'.format(i) for i in range(1, ncols + 1)], dtype='str')
    elif isinstance(xlabels, (tuple, list)):
        xlabels = np.array(xlabels, dtype='str')
    # Make sure dims of xlabels matches dims of X
    if xlabels.shape[0] != ncols:
        raise AssertionError(
            "Dimension of xlabels {0} does not match "
            "X {1}.".format(xlabels.shape, X.shape))

    # Flatten intercept and coefficients so that both the 1-D case (scalar
    # intercept, coef_ of shape (p,)) and the 2-D case (intercept of shape
    # (1,), coef_ of shape (1, p)) produce one flat vector of estimates.
    estimates = np.concatenate(
        (np.ravel(clf.intercept_), np.ravel(clf.coef_)))

    # Create data frame of coefficient estimates and associated stats
    coef_df = pd.DataFrame(
        index=['_intercept'] + list(xlabels),
        columns=['Estimate', 'Std. Error', 't value', 'p value']
    )
    coef_df['Estimate'] = np.round(estimates, 6)
    coef_df['Std. Error'] = np.round(coef_se(clf, X, y), 6)
    coef_df['t value'] = np.round(coef_tval(clf, X, y), 4)
    coef_df['p value'] = np.round(coef_pval(clf, X, y), 6)

    # Output results
    print('Coefficients:')
    print(coef_df.to_string(index=True))
    print('---')
    print('R-squared: {0:.6f}, Adjusted R-squared: {1:.6f}, MSE: {2:.1f}'.format(
        metrics.r2_score(y, clf.predict(X)),
        adj_r2_score(clf, X, y),
        sse(clf, X, y)))
# Stray blog-page footer text ("Leave a comment"); not part of the module.