机器学习knnlearn4

发布于:2025-03-30 ⋅ 阅读:(23) ⋅ 点赞:(0)
import numpy as np
import operator


def classify0(test_data: np.ndarray, training_data: np.ndarray, labels: list, k: int) -> str:
    """
    Classify one sample by majority vote of its k nearest neighbours (kNN).

    Distances are Euclidean, computed between ``test_data`` and every row of
    ``training_data``.

    :param test_data: feature vector of the sample to classify
    :param training_data: training feature matrix, one row per sample
    :param labels: class label of each training row, in row order
    :param k: number of nearest neighbours that vote; a value larger than the
        number of training rows is clamped to that number
    :return: the label with the most votes (returned unchanged, whatever its
        type); on a tie, the tied label that appears first in nearest-first
        order wins
    :raises ValueError: if ``k`` is smaller than 1 or the training set is empty
    """
    num_samples = training_data.shape[0]
    if k < 1 or num_samples == 0:
        raise ValueError("k must be >= 1 and training_data must not be empty")

    # Broadcasting subtracts every training row from test_data without
    # building a tiled copy of the test vector.
    differences = np.asarray(test_data) - training_data
    distances = np.sqrt(np.sum(differences ** 2, axis=1))
    sorted_indices = distances.argsort()

    class_count = {}
    # Never ask for more neighbours than there are training rows.
    for index in sorted_indices[:min(k, num_samples)]:
        label = labels[index]
        class_count[label] = class_count.get(label, 0) + 1

    # max() returns the first maximal key in insertion order, which is the
    # same winner a stable descending sort of the vote counts would pick.
    return max(class_count, key=class_count.get)


def file2matrix(filename: str) -> (np.ndarray, list):
    """
    Parse a tab-separated data file into a feature matrix and numeric labels.

    Every non-blank line holds three numeric features followed by a text label
    in its last field. Labels are encoded as 'didntLike' -> 1,
    'smallDoses' -> 2, 'largeDoses' -> 3. Blank lines are ignored, and lines
    with any other label are skipped with a printed warning so that the
    feature rows and the label list always stay aligned.

    :param filename: path of the data file
    :return: (feature matrix with three columns and one row per kept line,
        list of integer labels in the same order); (empty array, empty list)
        when the file does not exist
    :raises ValueError: if a non-blank line has fewer than four fields or a
        feature that is not a number
    """
    label_codes = {'didntLike': 1, 'smallDoses': 2, 'largeDoses': 3}

    # Only the file access can raise FileNotFoundError, so only it is guarded.
    try:
        with open(filename, 'r') as file:
            lines = file.readlines()
    except FileNotFoundError:
        print(f"错误:未找到文件 {filename}")
        return np.array([]), []

    feature_rows = []
    label_vector = []
    for line_number, line in enumerate(lines, start=1):
        line = line.strip()
        if not line:
            # A blank line carries no sample.
            continue
        data = line.split('\t')
        if len(data) < 4:
            raise ValueError(f"{filename} 第 {line_number} 行格式错误:需要 3 个特征和 1 个标签")
        label = label_codes.get(data[-1])
        if label is None:
            # Dropping the whole row keeps features and labels the same length.
            print(f"警告:{filename} 第 {line_number} 行的标签 {data[-1]!r} 无法识别,已跳过该行")
            continue
        feature_rows.append([float(value) for value in data[0:3]])
        label_vector.append(label)

    # reshape keeps the (0, 3) shape when no row was kept.
    feature_matrix = np.array(feature_rows, dtype=float).reshape(-1, 3)
    return feature_matrix, label_vector


def autoNorm(data_set: np.ndarray) -> (np.ndarray, np.ndarray, np.ndarray):
    """
    Min-max normalise every feature column: (value - column min) / column range.

    :param data_set: feature matrix, one row per sample
    :return: (normalised matrix, per-column divisor, per-column minimum). The
        divisor is the column range (max - min), except that a constant column
        reports 1 instead of 0 so it can be reused as a divisor safely.
    """
    min_vals = data_set.min(0)
    max_vals = data_set.max(0)
    ranges = max_vals - min_vals
    # A constant column has range 0 and would turn into NaN when divided;
    # dividing by 1 instead maps every value of that column to 0.
    ranges = np.where(ranges == 0, 1, ranges)
    norm_data_set = (data_set - min_vals) / ranges
    return norm_data_set, ranges, min_vals


def classifyPerson(filename: str = "datingTestSet.txt", k: int = 3):
    """
    Ask for one person's three features on stdin and print the predicted class.

    The training data is loaded with ``file2matrix``, normalised with
    ``autoNorm`` and queried with ``classify0``. Nothing is returned; the
    result or an error message is printed.

    :param filename: training data file; defaults to the file that was
        previously hard-coded
    :param k: number of neighbours used by the kNN vote; defaults to the value
        that was previously hard-coded
    """
    result_list = ['讨厌', '有些喜欢', '非常喜欢']

    # Only the float() conversions are guarded here, so that a problem in the
    # data file is not reported as a bad keyboard input.
    try:
        percent_tats = float(input("玩视频游戏所耗时间百分比:"))
        ff_miles = float(input("每年获得的飞行常客里程数:"))
        ice_cream = float(input("每周消费的冰激淋公升数:"))
    except ValueError:
        print("错误:输入必须为有效的数字")
        return

    try:
        dating_data_matrix, dating_labels = file2matrix(filename)
    except ValueError:
        print(f"错误:数据文件 {filename} 格式无效")
        return
    if dating_data_matrix.size == 0:
        # Missing or empty data file: nothing to classify against.
        return

    norm_matrix, ranges, min_vals = autoNorm(dating_data_matrix)
    # The query must use the same column order as the data file. The standard
    # datingTestSet.txt stores miles, game-time percentage, ice cream
    # (NOTE(review): confirm if a differently laid-out file is used).
    test_array = np.array([ff_miles, percent_tats, ice_cream])
    # Guard against a zero range so a constant feature cannot produce NaN.
    safe_ranges = np.where(ranges == 0, 1, ranges)
    norm_test_array = (test_array - min_vals) / safe_ranges
    classifier_result = classify0(norm_test_array, norm_matrix, dating_labels, k)
    # Labels are 1-based, result_list is 0-based.
    print(f"你可能{result_list[classifier_result - 1]}这个人")

# Start the interactive classifier only when this file is run as a script,
# not when it is imported.
if __name__ == '__main__':
    classifyPerson()
    
你可能有些喜欢这个人
import numpy as np
# Sample table with five rows and five columns. np.array replaces np.mat,
# which was removed in NumPy 2.0. The mixed int/float/bool values are all
# stored as floats.
data = np.array([[1, 200, 105, 3, False],
                 [2, 165, 80, 2, False],
                 [3, 184.5, 120, 2, False],
                 [4, 116, 70, 8, False],
                 [5, 270, 150, 4, True]])

# The first axis length is the number of rows; no counting loop is needed.
row = data.shape[0]
print(row)

# Total number of elements (rows x columns).
print(data.size)
5
25
import numpy as np
# Sample table with five rows and five columns. np.array replaces np.mat,
# which was removed in NumPy 2.0. The booleans are stored as 0.0 / 1.0.
data = np.array([[1, 200, 105, 3, False],
                 [2, 165, 80, 2, False],
                 [3, 184.5, 120, 2, False],
                 [4, 116, 70, 8, False],
                 [5, 270, 150, 4, True]])
# NOTE: the inner print() shows the element and returns None, so the outer
# print() then prints "None". The nesting is kept so the output listed after
# this snippet stays accurate; a single print() is enough to show the value.
print(print(data[0, 3]))
print(print(data[0, 4]))
3.0
None
0.0
None
import numpy as np
# Sample table with five rows and five columns. np.array replaces np.mat,
# which was removed in NumPy 2.0.
data = np.array([[1, 200, 105, 3, False],
                 [2, 165, 80, 2, False],
                 [3, 184.5, 120, 2, False],
                 [4, 116, 70, 8, False],
                 [5, 270, 150, 4, True]])

# Values of column 1, one per row.
coll = [row[1] for row in data]

# Sum, mean, standard deviation and variance of that column.
print(np.sum(coll))
print(np.mean(coll))
print(np.std(coll))
print(np.var(coll))
935.5
187.1
50.17808286493218
2517.84
import numpy as np
import pylab
import scipy.stats as stats

# Sample table with five rows and five columns. np.array replaces np.mat,
# which was removed in NumPy 2.0.
data = np.array([[1, 200, 105, 3, False],
                 [2, 165, 80, 2, False],
                 [3, 184.5, 120, 2, False],
                 [4, 116, 70, 8, False],
                 [5, 270, 150, 4, True]])

# Values of column 1, one per row.
coll = [row[1] for row in data]

# Probability plot of the column against probplot's default distribution
# (the normal distribution), drawn through pylab and then displayed.
stats.probplot(coll, plot=pylab)
pylab.show()

(此处原为 stats.probplot 生成的概率图;外链图片转存失败,源站可能有防盗链机制,图片未能显示)


import pandas as pd
import matplotlib.pyplot as plot

# Small demo table with five rows and five columns.
rocksVmines = pd.DataFrame([[1, 200, 105, 3, False],
                            [2, 165, 80, 2, False],
                            [3, 184.5, 120, 2, False],
                            [4, 116, 70, 8, False],
                            [5, 270, 150, 4, True]])


def _show_scatter(x_values, y_values, x_label, y_label):
    """Draw one labelled scatter plot and display it."""
    plot.scatter(x_values, y_values)
    plot.xlabel(x_label)
    plot.ylabel(y_label)
    plot.show()


# First three columns of the rows at positions 1, 2 and 3.
datarow1, datarow2, datarow3 = (rocksVmines.iloc[position, 0:3] for position in (1, 2, 3))

# Row 1 against row 2, then row 2 against row 3, each in its own figure.
_show_scatter(datarow1, datarow2, "Attribute1", "Attribute2")
_show_scatter(datarow2, datarow3, "Attribute2", "Attribute3")

(此处原为 Attribute1 与 Attribute2 的散点图;外链图片转存失败,源站可能有防盗链机制,图片未能显示)

(此处原为 Attribute2 与 Attribute3 的散点图;外链图片转存失败,源站可能有防盗链机制,图片未能显示)

# Scatter plot of datarow2 against datarow3. The axis labels use plain ASCII
# quotes; the typographic quotes in the original are a SyntaxError in Python.
plot.scatter(datarow2, datarow3)
plot.xlabel("Attribute2")
plot.ylabel("Attribute3")
plot.show()



    
[外链图片转存中...(img-BJOeAcI2-1743117156398)]
    



    
[外链图片转存中...(img-pvuHG2dQ-1743117156398)]