K-Means
K-Means是一种经典的无监督学习算法，用于将数据集划分为K个簇（clusters），使得同一簇内的数据点相似度高、不同簇间的相似度低。它在数据挖掘、模式识别和机器学习中应用广泛，例如客户细分、图像压缩和异常检测。其核心步骤为：随机选取K个样本作为初始质心；将每个样本分配给距离最近的质心；用各簇样本的均值更新质心；重复“分配—更新”过程，直到质心不再变化或达到最大迭代次数。下面给出算法的实现，以及在iris数据集上的聚类实验。
要求:
理解并掌握K-Means算法的原理，能够独立实现该算法，并对给定的数据集进行聚类。
代码实现:
import numpy as np
import matplotlib.pyplot as plt
import random
#算法实现
class KMeans:
    """Minimal K-Means clustering (Lloyd's algorithm) using Euclidean distance.

    Args:
        n_clusters: Number of clusters (and centroids) to form.
        max_iter: Maximum number of assignment/update rounds in ``fit``.
        random_state: Optional seed for centroid initialisation. When ``None``
            (the default) the module-level ``random`` generator is used, so
            ``random.seed(...)`` still controls the result.

    Attributes:
        centroids: Array of cluster centres, ``None`` until ``fit`` is called.
        labels: Cluster index of every training sample, ``None`` until
            ``fit`` is called.
    """

    def __init__(self, n_clusters=3, max_iter=300, random_state=None):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.random_state = random_state
        self.centroids = None
        self.labels = None

    def fit(self, X):
        """Cluster the samples in ``X`` and store ``centroids`` and ``labels``.

        Args:
            X: Samples, one per row; any array-like convertible to float.

        Raises:
            ValueError: If ``n_clusters`` exceeds the number of samples
                (raised by ``random.sample``).
        """
        X = np.asarray(X, dtype=float)

        # Initialise centroids with distinct, randomly chosen samples.
        if self.random_state is None:
            rng = random
        else:
            rng = random.Random(self.random_state)
        indices = rng.sample(range(len(X)), self.n_clusters)
        self.centroids = X[indices]

        for _ in range(self.max_iter):
            # Assignment step: every sample goes to its nearest centroid.
            self.labels = np.argmin(self._calc_distances(X), axis=1)

            # Update step: move each centroid to the mean of its members.
            # A cluster that lost all members keeps its previous centroid;
            # taking the mean of an empty slice would yield NaN and poison
            # every later distance computation.
            new_centroids = self.centroids.copy()
            for k in range(self.n_clusters):
                members = X[self.labels == k]
                if len(members) > 0:
                    new_centroids[k] = members.mean(axis=0)

            # Converged: centroids (and therefore labels) are stable.
            if np.allclose(self.centroids, new_centroids):
                break
            self.centroids = new_centroids
        else:
            # Iteration budget exhausted (or max_iter == 0): the centroids
            # moved after the last assignment, so refresh the labels to
            # keep them consistent with the stored centroids.
            self.labels = np.argmin(self._calc_distances(X), axis=1)

    def _calc_distances(self, X):
        """Return the (n_samples, n_clusters) matrix of Euclidean distances.

        Raises:
            RuntimeError: If the model has not been fitted yet.
        """
        if self.centroids is None:
            raise RuntimeError("KMeans is not fitted yet; call fit() first.")
        points = np.asarray(X, dtype=float)
        centers = np.asarray(self.centroids, dtype=float)
        # Broadcast to (n_samples, n_clusters[, n_features]) instead of
        # looping over every sample/centroid pair in Python.
        diff = points[:, None] - centers[None, :]
        # Sum over the feature axis when there is one; for 1-D data the
        # axis tuple is empty and this reduces to the absolute difference.
        feature_axes = tuple(range(2, diff.ndim))
        return np.sqrt(np.sum(diff * diff, axis=feature_axes))

    def predict(self, X):
        """Return the index of the nearest centroid for every sample in ``X``."""
        distances = self._calc_distances(X)
        return np.argmin(distances, axis=1)
#数据加载函数
def load_data(file_path):
    """Load two numeric features and a label per line from a comma-separated file.

    Each non-blank line must have at least three comma-separated fields: the
    first two are parsed as floats and the third is kept as the label string.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A tuple ``(data, labels)`` where ``data`` is a float array of shape
        ``(n_samples, 2)`` and ``labels`` is an array of label strings.

    Raises:
        ValueError: If one of the first two fields is not a valid float.
        IndexError: If a non-blank line has fewer than three fields.
    """
    data = []
    labels = []
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            # Skip blank lines (data files commonly end with one); parsing
            # them would fail on float('').
            if not line:
                continue
            values = [field.strip() for field in line.split(',')]
            # Keep the first two features and the label.
            data.append([float(values[0]), float(values[1])])
            labels.append(values[2])
    # reshape keeps the (n, 2) shape even when the file holds no samples.
    return np.array(data, dtype=float).reshape(-1, 2), np.array(labels)
#评估函数(使用多数投票原则)
def evaluate_clustering(true_labels, pred_labels):
    """Score a clustering against ground truth using majority voting.

    Every cluster is mapped to the true label that occurs most often among
    its members; accuracy is the fraction of samples whose mapped label
    equals their true label.

    Args:
        true_labels: Ground-truth label of every sample (array-like).
        pred_labels: Cluster id assigned to every sample (array-like).

    Returns:
        A tuple ``(accuracy, label_mapping)`` where ``accuracy`` is a float
        in ``[0, 1]`` and ``label_mapping`` maps cluster id to majority
        label. Empty input yields ``(0.0, {})``.

    Raises:
        ValueError: If the two inputs differ in length.
    """
    from collections import Counter

    true_arr = np.asarray(true_labels)
    pred_arr = np.asarray(pred_labels)
    if len(true_arr) != len(pred_arr):
        raise ValueError("true_labels and pred_labels must have the same length")
    if len(true_arr) == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0, {}

    # Build the cluster -> majority label mapping.
    label_mapping = {}
    for cluster_id in np.unique(pred_arr).tolist():
        members = true_arr[pred_arr == cluster_id].tolist()
        # Counter counts in a single pass, and most_common breaks ties by
        # first occurrence, so the mapping is the same on every run.
        label_mapping[cluster_id] = Counter(members).most_common(1)[0][0]

    # Accuracy: samples whose cluster's majority label matches the truth.
    correct = sum(
        label_mapping[pred] == true
        for pred, true in zip(pred_arr.tolist(), true_arr.tolist())
    )
    return correct / len(true_arr), label_mapping
if __name__ == "__main__":
    # The figure uses Chinese titles and axis labels. Prefer fonts that
    # contain CJK glyphs so they do not render as empty boxes; fonts that
    # are not installed are skipped in favour of the next entry.
    plt.rcParams['font.sans-serif'] = [
        'SimHei', 'Microsoft YaHei', 'Arial Unicode MS', 'DejaVu Sans',
    ]
    # Keep the minus sign renderable when a CJK font is selected.
    plt.rcParams['axes.unicode_minus'] = False

    # Load the data set (two features plus the species label per sample).
    file_path = "D:/课程/数据挖掘/实验六/实验6-iris-聚类.txt"
    X, true_labels = load_data(file_path)

    # Create and train the K-Means model.
    kmeans = KMeans(n_clusters=3)
    kmeans.fit(X)

    # Cluster assignment of every training sample.
    pred_labels = kmeans.labels

    # Evaluate the clustering against the true species labels.
    accuracy, mapping = evaluate_clustering(true_labels, pred_labels)
    print(f"聚类准确率: {accuracy:.2%}")
    print("聚类标签映射关系:")
    for cluster_id, species in mapping.items():
        print(f"聚类{cluster_id} -> {species}")

    plt.figure(figsize=(12, 5))

    # Left panel: samples coloured by their true label.
    plt.subplot(121)
    colors = {'Iris-setosa': 'red', 'Iris-versicolor': 'green', 'Iris-virginica': 'blue'}
    for species in np.unique(true_labels):
        species_mask = true_labels == species
        plt.scatter(X[species_mask, 0],
                    X[species_mask, 1],
                    label=species,
                    # Labels missing from the palette fall back to
                    # matplotlib's colour cycle instead of raising KeyError.
                    c=colors.get(species),
                    alpha=0.6)
    plt.title('真实标签分布')
    plt.xlabel('花萼长度')
    plt.ylabel('花萼宽度')
    plt.legend()

    # Right panel: samples grouped by predicted cluster.
    plt.subplot(122)
    # Iterate over the model's cluster count rather than a hard-coded 3 so
    # the plot stays correct when n_clusters changes.
    for cluster_id in range(kmeans.n_clusters):
        cluster_mask = pred_labels == cluster_id
        plt.scatter(X[cluster_mask, 0],
                    X[cluster_mask, 1],
                    label=f'聚类{cluster_id}',
                    alpha=0.6)
    # Mark the centroid positions.
    plt.scatter(kmeans.centroids[:, 0], kmeans.centroids[:, 1],
                marker='X', s=200, c='black', label='质心')
    plt.title('K-Means聚类结果')
    plt.xlabel('花萼长度')
    plt.ylabel('花萼宽度')
    plt.legend()

    plt.tight_layout()
    # Save before show(): the figure may be released once the window closes.
    plt.savefig('kmeans_clustering_result.png')
    plt.show()
运行结果:
左图为真实的标签分布，右图为K-Means的聚类结果（横轴为花萼长度，纵轴为花萼宽度，黑色“X”标记为各簇的质心）。