【数据挖掘】聚类算法学习—K-Means

发布于:2025-06-30 ⋅ 阅读:(12) ⋅ 点赞:(0)

K-Means

        K-Means是一种经典的无监督学习算法，用于将数据集划分为K个簇（clusters），使得同一簇内的数据点相似度高，不同簇间的相似度低。它在数据挖掘、模式识别和机器学习中广泛应用，如客户细分、图像压缩和异常检测。其基本步骤为：随机选取K个初始质心；将每个样本分配给距离最近的质心；用各簇样本的均值更新质心；重复“分配—更新”两步，直到质心不再变化或达到最大迭代次数。下面给出实验要求、代码实现和运行结果。

要求:

        理解并掌握K-Means算法,理解算法的原理,能够实现算法,并对给定的数据集进行聚类

代码实现:

import numpy as np
import matplotlib.pyplot as plt
import random
#算法实现
class KMeans:
    """Minimal K-Means clustering with randomly chosen initial centroids.

    After ``fit`` the instance exposes:

    * ``centroids`` -- array with one row per cluster centre.
    * ``labels`` -- cluster index assigned to every training sample during
      the last assignment step.
    """

    def __init__(self, n_clusters=3, max_iter=300):
        """Store the number of clusters and the iteration cap."""
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.centroids = None
        self.labels = None

    def fit(self, X):
        """Cluster the samples in ``X`` (one sample per row).

        Initial centroids are ``n_clusters`` distinct rows of ``X`` drawn
        with ``random.sample``. Assignment and centroid update then
        alternate until the centroids stop moving (``np.allclose``) or
        ``max_iter`` rounds have run.

        Raises:
            ValueError: if ``n_clusters`` is not between 1 and ``len(X)``.
        """
        X = np.asarray(X, dtype=float)
        if not 0 < self.n_clusters <= len(X):
            raise ValueError(
                f"n_clusters must be between 1 and the number of samples "
                f"({len(X)}), got {self.n_clusters}"
            )

        # Random initialisation: pick distinct samples as starting centroids.
        indices = random.sample(range(len(X)), self.n_clusters)
        self.centroids = X[indices]

        for _ in range(self.max_iter):
            # Assignment step: each sample goes to its nearest centroid.
            self.labels = np.argmin(self._calc_distances(X), axis=1)

            # Update step. A cluster that received no samples keeps its
            # previous centroid; taking the mean of an empty selection
            # would yield NaN and corrupt every later distance computation.
            new_centroids = self.centroids.copy()
            for k in range(self.n_clusters):
                members = X[self.labels == k]
                if len(members):
                    new_centroids[k] = members.mean(axis=0)

            # Convergence check: stop once the centroids no longer move.
            if np.allclose(self.centroids, new_centroids):
                break

            self.centroids = new_centroids

    def _calc_distances(self, X):
        """Return the (n_samples, n_clusters) Euclidean distance matrix."""
        X = np.asarray(X, dtype=float)
        # Broadcast samples against centroids instead of looping in Python.
        diff = X[:, np.newaxis] - self.centroids[np.newaxis]
        # Flatten the feature axes so one norm call also covers 1-D samples.
        diff = diff.reshape(diff.shape[0], diff.shape[1], -1)
        return np.linalg.norm(diff, axis=2)

    def predict(self, X):
        """Return the index of the nearest fitted centroid for each row of ``X``."""
        distances = self._calc_distances(X)
        return np.argmin(distances, axis=1)

#数据加载函数
def load_data(file_path):
    """Read a comma-separated file of ``feature1,feature2,label`` rows.

    Only the first two fields are parsed as floats and the third is kept
    as the label string; any further fields are ignored. Blank lines are
    skipped so a trailing newline at the end of the file does not crash
    the float conversion.

    Returns:
        A tuple ``(data, labels)``: ``data`` is a float array of shape
        ``(n_rows, 2)`` and ``labels`` is an array of the label strings.
    """
    data = []
    labels = []
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # Strip each field so "5.1, 3.5, Iris-setosa" yields a clean label.
            values = [field.strip() for field in line.split(',')]
            data.append([float(values[0]), float(values[1])])
            labels.append(values[2])
    # reshape keeps the (n, 2) layout even when the file holds no rows.
    return np.array(data, dtype=float).reshape(-1, 2), np.array(labels)

#评估函数(使用多数投票原则)
def evaluate_clustering(true_labels, pred_labels):
    """Score a clustering against ground truth by majority vote.

    Each cluster id is mapped to the true label that occurs most often
    among its members. Accuracy is the fraction of samples whose true
    label equals the label mapped to their cluster.

    Ties are broken deterministically: ``np.unique`` returns candidates in
    sorted order and ``np.argmax`` takes the first maximum, so the smallest
    tied label wins.

    Args:
        true_labels: sequence of ground-truth labels, one per sample.
        pred_labels: sequence of cluster ids, aligned with ``true_labels``.

    Returns:
        ``(accuracy, label_mapping)`` where ``label_mapping`` maps each
        cluster id to its majority label. Empty input gives ``(0.0, {})``.

    Raises:
        ValueError: if the two sequences differ in length.
    """
    true_labels = np.asarray(true_labels)
    pred_labels = np.asarray(pred_labels)
    if len(true_labels) != len(pred_labels):
        raise ValueError(
            f"true_labels and pred_labels must have the same length, "
            f"got {len(true_labels)} and {len(pred_labels)}"
        )

    label_mapping = {}
    if len(true_labels) == 0:
        # Nothing to score; avoid dividing by zero below.
        return 0.0, label_mapping

    correct = 0
    for cluster_id in np.unique(pred_labels):
        candidates, counts = np.unique(
            true_labels[pred_labels == cluster_id], return_counts=True
        )
        winner = np.argmax(counts)
        label_mapping[cluster_id] = candidates[winner]
        # Exactly the members carrying the majority label count as correct.
        correct += int(counts[winner])

    return correct / len(true_labels), label_mapping

if __name__ == "__main__":
    # Load the two features and the species label of every sample.
    file_path = "D:/课程/数据挖掘/实验六/实验6-iris-聚类.txt"
    X, true_labels = load_data(file_path)

    # Build the K-Means model and fit it.
    kmeans = KMeans(n_clusters=3)
    kmeans.fit(X)

    # Cluster assignments produced during fitting.
    pred_labels = kmeans.labels

    # Score the clustering against the true labels.
    accuracy, mapping = evaluate_clustering(true_labels, pred_labels)

    print(f"聚类准确率: {accuracy:.2%}")
    print("聚类标签映射关系:")
    for cluster_id, species in mapping.items():
        print(f"聚类{cluster_id} -> {species}")

    # The titles and axis labels below are Chinese. Put CJK-capable fonts
    # ahead of matplotlib's defaults so they do not render as empty boxes.
    # NOTE(review): which of these fonts exist depends on the machine.
    plt.rcParams['font.sans-serif'] = (
        ['SimHei', 'Microsoft YaHei', 'Noto Sans CJK SC']
        + list(plt.rcParams['font.sans-serif'])
    )
    # CJK fonts often lack the Unicode minus glyph; use ASCII '-' on ticks.
    plt.rcParams['axes.unicode_minus'] = False

    plt.figure(figsize=(12, 5))

    # Left panel: samples coloured by true label.
    plt.subplot(121)
    colors = {'Iris-setosa': 'red', 'Iris-versicolor': 'green', 'Iris-virginica': 'blue'}
    for species in np.unique(true_labels):
        plt.scatter(X[true_labels == species, 0],
                    X[true_labels == species, 1],
                    label=species,
                    # Unknown labels fall back to matplotlib's default colour
                    # instead of raising KeyError.
                    c=colors.get(species),
                    alpha=0.6)
    plt.title('真实标签分布')
    plt.xlabel('花萼长度')
    plt.ylabel('花萼宽度')
    plt.legend()

    # Right panel: samples coloured by assigned cluster.
    plt.subplot(122)
    # Iterate over the model's cluster count rather than a hard-coded 3.
    for cluster_id in range(kmeans.n_clusters):
        plt.scatter(X[pred_labels == cluster_id, 0],
                    X[pred_labels == cluster_id, 1],
                    label=f'聚类{cluster_id}',
                    alpha=0.6)
    # Mark the centroid positions.
    plt.scatter(kmeans.centroids[:, 0], kmeans.centroids[:, 1],
                marker='X', s=200, c='black', label='质心')
    plt.title('K-Means聚类结果')
    plt.xlabel('花萼长度')
    plt.ylabel('花萼宽度')
    plt.legend()

    plt.tight_layout()
    plt.savefig('kmeans_clustering_result.png')
    plt.show()

运行结果:

左图为真实的标签分布,右图为K-Means的聚类结果


网站公告

今日签到

点亮在社区的每一天
去签到