NaiveBayes
朴素贝叶斯的核心是贝叶斯定理,它描述了如何根据新证据更新事件的概率。
要求:
1、实现朴素贝叶斯分类算法,验证算法的正确性,并将算法应用于给定的Data_User_Modeling数据集,选择一部分数据集作为已知结果,然后用剩下的数据集作为测试集,验证算法的分类情况
2、重新选取训练样本和测试集,对比并分析分类结果
3、选取一部分数据集作为训练样本,实现分类;不断从测试集中选取数据加入到训练集中,对比并分析分类结果
代码实现:
import pandas as pd
import numpy as np
from math import pi, sqrt, exp
# ========================== Data preprocessing ==========================
# Path to the dataset file (adjust to your local location)
DATA_PATH = r"D:\课程\数据挖掘\实验四\实验4-Data_User_Modeling_Dataset_Hamdi Tolga KAHRAMAN.xls"
# Load the Excel workbook
excel_file = pd.ExcelFile(DATA_PATH)
df = excel_file.parse('Training_Data') # assumes the worksheet is named 'Training_Data'
# Clean the target column: strip whitespace and lower-case so label
# variants like "High " and "high" compare equal
df['UNS'] = df['UNS'].str.strip().str.lower()
# Split into training (70%) and test (30%) sets; fixed seed for reproducibility
train_df = df.sample(frac=0.7, random_state=42)
test_df = df.drop(train_df.index)
# Separate the feature columns from the 'UNS' class label
X_train = train_df.drop('UNS', axis=1)
y_train = train_df['UNS']
X_test = test_df.drop('UNS', axis=1)
y_test = test_df['UNS']
# ========================== 朴素贝叶斯算法实现 ==========================
class NaiveBayesClassifier:
    """Gaussian Naive Bayes classifier for continuous features.

    Assumes each feature follows a per-class Gaussian distribution and
    that features are conditionally independent given the class.
    """

    def __init__(self):
        self.classes = None  # array of class labels seen during fit
        self.mean = {}       # per-class feature means
        self.var = {}        # per-class feature variances
        self.prior = {}      # per-class prior probabilities

    def fit(self, X, y):
        """Estimate per-class priors, means and variances.

        Args:
            X: pandas DataFrame of continuous feature columns.
            y: pandas Series of class labels aligned with X's rows.
        """
        self.classes = np.unique(y)
        n_samples = X.shape[0]
        for cls in self.classes:
            cls_data = X[y == cls]  # rows belonging to this class
            self.mean[cls] = cls_data.mean(axis=0)
            # Unbiased variance (ddof=1), floored at a tiny epsilon so a
            # feature that is constant within a class cannot cause a
            # division by zero in the Gaussian density.
            self.var[cls] = cls_data.var(axis=0, ddof=1).clip(lower=1e-9)
            self.prior[cls] = len(cls_data) / n_samples

    def _log_gaussian_probability(self, x, mean, var):
        """Return log N(x; mean, var), the log Gaussian density at x."""
        return -0.5 * (np.log(2 * pi * var) + ((x - mean) ** 2) / var)

    def predict(self, X):
        """Predict a class label for each row of X.

        Works entirely in log space: log-posterior = log-prior + sum of
        per-feature log-likelihoods. (The original multiplied raw
        densities and logged the product afterwards, which underflows
        to 0 for many features — exactly what log-domain math avoids.)

        Args:
            X: pandas DataFrame with the same columns used in fit().

        Returns:
            numpy array of predicted class labels, one per row of X.
        """
        predictions = []
        for _, sample in X.iterrows():
            posteriors = {}
            for cls in self.classes:
                log_posterior = np.log(self.prior[cls])
                for feature in X.columns:
                    log_posterior += self._log_gaussian_probability(
                        sample[feature],
                        self.mean[cls][feature],
                        self.var[cls][feature],
                    )
                posteriors[cls] = log_posterior
            # Pick the class with the highest log-posterior
            predictions.append(max(posteriors, key=posteriors.get))
        return np.array(predictions)
# ========================== Model training and evaluation ==========================
# Fit the classifier on the training split
nb_classifier = NaiveBayesClassifier()
nb_classifier.fit(X_train, y_train)
# Classify the held-out test split
y_pred = nb_classifier.predict(X_test)
# Accuracy = fraction of test samples whose prediction matches the true label
accuracy = np.mean(y_pred == y_test)
print(f"朴素贝叶斯分类准确率: {accuracy * 100:.2f}%")
# ========================== 可选:增量学习实验(按实验要求扩展) ==========================
def incremental_learning_evaluation(initial_train, test_data, steps=5):
    """Incrementally move test data into the training set and track accuracy.

    At each of `steps` rounds, the next chunk of test rows is appended to
    the training data, the model is retrained from scratch, and accuracy
    is measured on the test rows that remain.

    Args:
        initial_train: DataFrame with feature columns plus the 'UNS' label.
        test_data: DataFrame with the same columns; consumed chunk by chunk.
        steps: number of incremental rounds (default 5).
    """
    train_data = initial_train.copy()
    X_incr = test_data.drop('UNS', axis=1)
    y_incr = test_data['UNS']
    n_test = len(X_incr)
    step_size = n_test // steps if n_test >= steps else n_test  # avoid zero-size chunks
    for i in range(steps):
        start_idx = i * step_size
        # Bug fix: when n_test is not divisible by steps, the original
        # stopped at steps*step_size and silently never absorbed the last
        # n_test % steps rows; the final round now takes the remainder too.
        end_idx = n_test if i == steps - 1 else (i + 1) * step_size
        add_X = X_incr.iloc[start_idx:end_idx]
        add_y = y_incr.iloc[start_idx:end_idx]
        # Merge the new chunk (features + label recombined) into training data
        train_data = pd.concat([train_data, pd.concat([add_X, add_y], axis=1)])
        current_X_train = train_data.drop('UNS', axis=1)
        current_y_train = train_data['UNS']
        # Retrain from scratch on the enlarged training set
        incr_classifier = NaiveBayesClassifier()
        incr_classifier.fit(current_X_train, current_y_train)
        # Evaluate on whatever test data has not yet been absorbed
        remaining_X_test = X_incr.iloc[end_idx:]
        remaining_y_test = y_incr.iloc[end_idx:]
        if not remaining_X_test.empty:
            y_pred_incr = incr_classifier.predict(remaining_X_test)
            current_accuracy = np.sum(y_pred_incr == remaining_y_test) / len(remaining_y_test)
            print(f"加入{end_idx}条数据后准确率: {current_accuracy * 100:.2f}%")
        else:
            print("所有测试数据已加入训练集")
# 执行增量学习实验(可选,取消注释后运行)
# incremental_learning_evaluation(train_df, test_df, steps=5)