第100+28步 ChatGPT学习:概率校准 Bayesian Calibration

发布于:2024-10-17 ⋅ 阅读:(11) ⋅ 点赞:(0)

基于Python 3.9版本演示

一、写在前面

最近看了一篇在Lancet子刊《eClinicalMedicine》上发表的机器学习分类的文章:《Development of a novel dementia risk prediction model in the general population: A large, longitudinal, population-based machine-learning study》。

学到一种叫做“概率校准”的骚操作,顺手利用GPT系统学习学习。

文章中用的技术是:保序回归(Isotonic regression)。

为了体现举一反三,顺便问了GPT还有哪些方法也可以实现概率校准。它给我列举了很多,那么就一个一个学习吧。

这一期,介绍一个叫做 Bayesian Calibration 的方法。

二、Bayesian Calibration

Bayesian Calibration是一种统计方法,用于调整模型的输出,通常是概率预测,以更好地与观察到的结果对齐。这种方法利用贝叶斯推断,通过结合新的数据更新先验的模型参数,从而确保预测结果具有良好的校准性。

(1)主要步骤

1)先验分布:定义需要校准的参数的先验分布。这个先验分布反映了在考虑当前数据之前对参数值的初始信念。

2)似然函数:定义一个似然函数,它描述了在给定参数值的情况下,观察到的数据的可能性。

3)后验分布:使用贝叶斯定理,将先验分布和似然函数结合起来,得到参数的后验分布。

4)预测和调整:使用后验分布来调整模型的预测结果,改善其与观察数据的校准性。

三、Bayesian Calibration代码实现

下面,我编一个1比3的不太平衡的数据进行测试,对照组使用不进行校准的SVM模型,实验组就是加入校准的SVM模型,看看性能能够提高多少?

(1)不进行校准的SVM模型(默认参数)

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve

# 加载数据
dataset = pd.read_csv('8PSMjianmo.csv')
X = dataset.iloc[:, 1:20].values
Y = dataset.iloc[:, 0].values

# 分割数据集
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=666)

# 标准化数据
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# 使用SVM分类器
classifier = SVC(kernel='linear', probability=True)
classifier.fit(X_train, y_train)

# 预测结果
y_pred = classifier.predict(X_test)
y_testprba = classifier.decision_function(X_test)

y_trainpred = classifier.predict(X_train)
y_trainprba = classifier.decision_function(X_train)

# 混淆矩阵
cm_test = confusion_matrix(y_test, y_pred)
cm_train = confusion_matrix(y_train, y_trainpred)
print(cm_train)
print(cm_test)

# 绘制测试集混淆矩阵
classes = list(set(y_test))
classes.sort()
plt.imshow(cm_test, cmap=plt.cm.Blues)
indices = range(len(cm_test))
plt.xticks(indices, classes)
plt.yticks(indices, classes)
plt.colorbar()
plt.xlabel('Predicted')
plt.ylabel('Actual')
for first_index in range(len(cm_test)):
    for second_index in range(len(cm_test[first_index])):
        plt.text(first_index, second_index, cm_test[first_index][second_index])

plt.show()

# 绘制训练集混淆矩阵
classes = list(set(y_train))
classes.sort()
plt.imshow(cm_train, cmap=plt.cm.Blues)
indices = range(len(cm_train))
plt.xticks(indices, classes)
plt.yticks(indices, classes)
plt.colorbar()
plt.xlabel('Predicted')
plt.ylabel('Actual')
for first_index in range(len(cm_train)):
    for second_index in range(len(cm_train[first_index])):
        plt.text(first_index, second_index, cm_train[first_index][second_index])

plt.show()

# 计算并打印性能参数
def calculate_metrics(cm, y_true, y_pred_prob):
    a = cm[0, 0]
    b = cm[0, 1]
    c = cm[1, 0]
    d = cm[1, 1]
    acc = (a + d) / (a + b + c + d)
    error_rate = 1 - acc
    sen = d / (d + c)
    sep = a / (a + b)
    precision = d / (b + d)
    F1 = (2 * precision * sen) / (precision + sen)
    MCC = (d * a - b * c) / (np.sqrt((d + b) * (d + c) * (a + b) * (a + c)))
    auc_score = roc_auc_score(y_true, y_pred_prob)
    
    metrics = {
        "Accuracy": acc,
        "Error Rate": error_rate,
        "Sensitivity": sen,
        "Specificity": sep,
        "Precision": precision,
        "F1 Score": F1,
        "MCC": MCC,
        "AUC": auc_score
    }
    return metrics

metrics_test = calculate_metrics(cm_test, y_test, y_testprba)
metrics_train = calculate_metrics(cm_train, y_train, y_trainprba)

print("Performance Metrics (Test):")
for key, value in metrics_test.items():
    print(f"{key}: {value:.4f}")

print("\nPerformance Metrics (Train):")
for key, value in metrics_train.items():
print(f"{key}: {value:.4f}")

结果输出:

记住这些个数字。

这个参数的SVM还没有LR好。

(2)进行校准的SVM模型(默认参数)

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, roc_auc_score, brier_score_loss, precision_score, f1_score, matthews_corrcoef
from sklearn.calibration import calibration_curve
import statsmodels.api as sm

# 加载数据
dataset = pd.read_csv('8PSMjianmo.csv')
X = dataset.iloc[:, 1:20].values
Y = dataset.iloc[:, 0].values

# 分割数据集
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=666)

# 标准化数据
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# 使用SVM分类器
classifier = SVC(kernel='rbf', C=0.1, probability=True)
classifier.fit(X_train, y_train)

# 获取未校准的概率预测
y_train_probs = classifier.predict_proba(X_train)[:, 1]
y_test_probs = classifier.predict_proba(X_test)[:, 1]

# 贝叶斯二项回归校准类
class BayesianBinomialCalibration:
    def __init__(self):
        self.model = None

    def fit(self, probs, true_labels):
        # 使用逻辑回归模型进行校准
        self.model = sm.GLM(true_labels, sm.add_constant(probs), family=sm.families.Binomial()).fit()

    def predict(self, probs):
        # 使用校准模型预测校准后的概率
        calibrated_probs = self.model.predict(sm.add_constant(probs))
        return calibrated_probs

# 训练贝叶斯二项回归校准模型
bbc = BayesianBinomialCalibration()
bbc.fit(y_train_probs, y_train)

# 进行校准
calibrated_train_probs = bbc.predict(y_train_probs)
calibrated_test_probs = bbc.predict(y_test_probs)

# 预测结果
y_train_pred = (calibrated_train_probs >= 0.5).astype(int)
y_test_pred = (calibrated_test_probs >= 0.5).astype(int)

# 混淆矩阵
cm_test = confusion_matrix(y_test, y_test_pred)
cm_train = confusion_matrix(y_train, y_train_pred)
print(cm_train)
print(cm_test)

# 绘制混淆矩阵函数
def plot_confusion_matrix(cm, classes, title='Confusion Matrix'):
    plt.imshow(cm, cmap=plt.cm.Blues)
    indices = range(len(cm))
    plt.xticks(indices, classes)
    plt.yticks(indices, classes)
    plt.colorbar()
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    for first_index in range(len(cm)):
        for second_index in range(len(cm[first_index])):
            plt.text(second_index, first_index, cm[first_index][second_index])
    plt.title(title)
    plt.show()

# 绘制测试集混淆矩阵
plot_confusion_matrix(cm_test, list(set(y_test)), 'Confusion Matrix (Test)')

# 绘制训练集混淆矩阵
plot_confusion_matrix(cm_train, list(set(y_train)), 'Confusion Matrix (Train)')

# 计算并打印性能参数
def calculate_metrics(cm, y_true, y_pred, y_pred_prob):
    a = cm[0, 0]
    b = cm[0, 1]
    c = cm[1, 0]
    d = cm[1, 1]
    acc = (a + d) / (a + b + c + d)
    error_rate = 1 - acc
    sen = d / (d + c) if (d + c) > 0 else 0
    sep = a / (a + b) if (a + b) > 0 else 0
    precision = precision_score(y_true, y_pred, zero_division=0)
    F1 = f1_score(y_true, y_pred, zero_division=0)
    MCC = matthews_corrcoef(y_true, y_pred)
    auc_score = roc_auc_score(y_true, y_pred_prob)
    brier_score = brier_score_loss(y_true, y_pred_prob)
    
    metrics = {
        "Accuracy": acc,
        "Error Rate": error_rate,
        "Sensitivity": sen,
        "Specificity": sep,
        "Precision": precision,
        "F1 Score": F1,
        "MCC": MCC,
        "AUC": auc_score,
        "Brier Score": brier_score
    }
    return metrics

metrics_test = calculate_metrics(cm_test, y_test, y_test_pred, calibrated_test_probs)
metrics_train = calculate_metrics(cm_train, y_train, y_train_pred, calibrated_train_probs)

print("Performance Metrics (Test):")
for key, value in metrics_test.items():
    print(f"{key}: {value:.4f}")

print("\nPerformance Metrics (Train):")
for key, value in metrics_train.items():
    print(f"{key}: {value:.4f}")

看看结果:

大同小异吧。

四、换个策略

参考那篇文章的策略:采用五折交叉验证来建立和评估模型,其中四折用于训练,一折用于评估,在训练集中,其中三折用于建立SVM模型,另一折采用Bayesian Calibration概率校正,在训练集内部采用交叉验证对超参数进行调参。

代码:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, roc_auc_score, brier_score_loss, precision_score, f1_score, matthews_corrcoef
from sklearn.calibration import calibration_curve
import statsmodels.api as sm

# 加载数据
dataset = pd.read_csv('8PSMjianmo.csv')
X = dataset.iloc[:, 1:20].values
Y = dataset.iloc[:, 0].values

# 分割数据集
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=666)

# 标准化数据
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# 定义五折交叉验证
kf = KFold(n_splits=5, shuffle=True, random_state=666)
calibrated_probs = []
true_labels = []

# 贝叶斯二项回归校准类
class BayesianBinomialCalibration:
    def __init__(self):
        self.model = None

    def fit(self, probs, true_labels):
        # 使用逻辑回归模型进行校准
        self.model = sm.GLM(true_labels, sm.add_constant(probs), family=sm.families.Binomial()).fit()

    def predict(self, probs):
        # 使用校准模型预测校准后的概率
        calibrated_probs = self.model.predict(sm.add_constant(probs))
        return calibrated_probs

best_params = None  # 用于存储最优参数

for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
    
    # 内部三折交叉验证用于超参数调优
    inner_kf = KFold(n_splits=3, shuffle=True, random_state=666)
    param_grid = {'C': [0.01, 0.1, 1, 10, 100], 'kernel': ['rbf']}
    svm = SVC(probability=True)
    clf = GridSearchCV(svm, param_grid, cv=inner_kf, scoring='roc_auc')
    clf.fit(X_train_fold, y_train_fold)
    best_params = clf.best_params_
    
    # 使用最佳参数训练SVM
    classifier = SVC(kernel=best_params['kernel'], C=best_params['C'], probability=True)
    classifier.fit(X_train_fold, y_train_fold)
    
    # 获取未校准的概率预测
    y_val_fold_probs = classifier.predict_proba(X_val_fold)[:, 1]
    
    # 贝叶斯二项回归校准
    bbc = BayesianBinomialCalibration()
    bbc.fit(y_val_fold_probs, y_val_fold)
    calibrated_val_fold_probs = bbc.predict(y_val_fold_probs)
    
    calibrated_probs.extend(calibrated_val_fold_probs)
    true_labels.extend(y_val_fold)

# 用于测试集的SVM模型训练和校准
classifier_final = SVC(kernel=best_params['kernel'], C=best_params['C'], probability=True)
classifier_final.fit(X_train, y_train)
y_test_probs = classifier_final.predict_proba(X_test)[:, 1]

# 贝叶斯二项回归校准
bbc_final = BayesianBinomialCalibration()
bbc_final.fit(y_test_probs, y_test)
calibrated_test_probs = bbc_final.predict(y_test_probs)

# 预测结果
y_train_pred = (np.array(calibrated_probs) >= 0.5).astype(int)
y_test_pred = (calibrated_test_probs >= 0.5).astype(int)

# 混淆矩阵
cm_test = confusion_matrix(y_test, y_test_pred)
cm_train = confusion_matrix(true_labels, y_train_pred)
print("Training Confusion Matrix:\n", cm_train)
print("Testing Confusion Matrix:\n", cm_test)

# 绘制混淆矩阵函数
def plot_confusion_matrix(cm, classes, title='Confusion Matrix'):
    plt.imshow(cm, cmap=plt.cm.Blues)
    indices = range(len(cm))
    plt.xticks(indices, classes)
    plt.yticks(indices, classes)
    plt.colorbar()
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    for first_index in range(len(cm)):
        for second_index in range(len(cm[first_index])):
            plt.text(second_index, first_index, cm[first_index][second_index])
    plt.title(title)
    plt.show()

# 绘制测试集混淆矩阵
plot_confusion_matrix(cm_test, list(set(y_test)), 'Confusion Matrix (Test)')

# 绘制训练集混淆矩阵
plot_confusion_matrix(cm_train, list(set(true_labels)), 'Confusion Matrix (Train)')

# 计算并打印性能参数
def calculate_metrics(cm, y_true, y_pred, y_pred_prob):
    a = cm[0, 0]
    b = cm[0, 1]
    c = cm[1, 0]
    d = cm[1, 1]
    acc = (a + d) / (a + b + c + d)
    error_rate = 1 - acc
    sen = d / (d + c) if (d + c) > 0 else 0
    sep = a / (a + b) if (a + b) > 0 else 0
    precision = precision_score(y_true, y_pred, zero_division=0)
    F1 = f1_score(y_true, y_pred, zero_division=0)
    MCC = matthews_corrcoef(y_true, y_pred)
    auc_score = roc_auc_score(y_true, y_pred_prob)
    brier_score = brier_score_loss(y_true, y_pred_prob)
    
    metrics = {
        "Accuracy": acc,
        "Error Rate": error_rate,
        "Sensitivity": sen,
        "Specificity": sep,
        "Precision": precision,
        "F1 Score": F1,
        "MCC": MCC,
        "AUC": auc_score,
        "Brier Score": brier_score
    }
    return metrics

metrics_test = calculate_metrics(cm_test, y_test, y_test_pred, calibrated_test_probs)
metrics_train = calculate_metrics(cm_train, true_labels, y_train_pred, np.array(calibrated_probs))

print("Performance Metrics (Test):")
for key, value in metrics_test.items():
    print(f"{key}: {value:.4f}")

print("\nPerformance Metrics (Train):")
for key, value in metrics_train.items():
    print(f"{key}: {value:.4f}")

# 绘制校准曲线
def plot_calibration_curve(y_true, probs, title='Calibration Curve'):
    fraction_of_positives, mean_predicted_value = calibration_curve(y_true, probs, n_bins=10)
    plt.plot(mean_predicted_value, fraction_of_positives, "s-", label="Bayesian Binomial Calibration")
    plt.plot([0, 1], [0, 1], "k--")
    plt.xlabel('Mean predicted value')
    plt.ylabel('Fraction of positives')
    plt.title(title)
    plt.legend()
    plt.show()

# 绘制校准曲线
plot_calibration_curve(y_test, calibrated_test_probs, title='Calibration Curve (Test)')
plot_calibration_curve(true_labels, np.array(calibrated_probs), title='Calibration Curve (Train)')

输出:

效果甚至还变差了呢。

五、最后

各位可以去试一试在其他数据或者在其他机器学习分类模型中使用的效果。

数据不分享啦。