Machine Learning: A Summary of the KNN and Linear Regression Algorithms

Published: 2025-03-30

I. The KNN (K-Nearest Neighbors) Algorithm

KNN predicts a sample's label by majority vote (or, for regression, the average) over its K closest training points under a chosen distance metric.

1. Algorithm Implementation (sklearn.neighbors.KNeighborsClassifier)

# Main import
from sklearn.neighbors import KNeighborsClassifier

# Create the KNN classifier object
kNN_classifier = KNeighborsClassifier(n_neighbors=5)  # n_neighbors is K

# Fit the training data (X_train and y_train are assumed to be defined)
kNN_classifier.fit(X_train, y_train)

# Predict a single new sample; reshape(1, -1) turns a 1-D feature vector into one row
predict_y = kNN_classifier.predict(data_new.reshape(1, -1))
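
For reference, here is a minimal end-to-end sketch; the iris dataset and the 70/30 split are my own choices for illustration, not part of the original notes:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Load a small labeled dataset and split it
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, train_size=0.7, random_state=233)

# Fit and evaluate a K=5 classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
print(knn.score(X_test, y_test))  # mean accuracy on the held-out test set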

2. Splitting the Dataset (sklearn.model_selection.train_test_split)

# Dataset splitting utility
from sklearn.model_selection import train_test_split

# 70% train / 30% test; stratify=y keeps the class proportions equal in both splits
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=233, stratify=y)

3. Model Evaluation (sklearn.metrics.accuracy_score)

# Get to know the iris dataset
from sklearn import datasets
iris = datasets.load_iris()

x = iris.data    # features
y = iris.target  # labels

# Split, fit, and predict (as in the previous steps) so there is something to evaluate
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=233, stratify=y)
y_predict = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train).predict(x_test)

# sklearn model evaluation utility
from sklearn.metrics import accuracy_score

# Evaluate: the fraction of test samples predicted correctly
accuracy_score(y_test, y_predict)

4. Hyperparameter Search (sklearn.model_selection.GridSearchCV)

from sklearn.model_selection import GridSearchCV

# Parameter grid for GridSearchCV to search exhaustively
params = {
    'n_neighbors': list(range(1, 20)),
    'weights': ['uniform', 'distance'],
    'p': list(range(1, 7))  # Minkowski exponent: p=1 is Manhattan, p=2 is Euclidean
}

# Create the search object (n_jobs=-1 uses all CPU cores)
grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=params,
    n_jobs=-1
)

# Fit: runs a cross-validated search over the whole grid
grid.fit(x_train, y_train)

# Best parameter combination found
grid.best_params_

# Best mean cross-validation score
grid.best_score_

# Estimator refit on the full training set with the best parameters
grid.best_estimator_

# Predict with the best estimator
grid.best_estimator_.predict(x_test)

# Test-set accuracy of the best estimator
grid.best_estimator_.score(x_test, y_test)
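
Since GridSearchCV refits the best estimator on the whole training set by default (refit=True), the fitted grid object can also predict and score directly; a short equivalent to the two calls above:

# Delegates to grid.best_estimator_ under the hood
grid.predict(x_test)
grid.score(x_test, y_test)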

5. Feature Standardization (sklearn.preprocessing.StandardScaler)

Note that although this step is often filed under "normalization", StandardScaler standardizes features to zero mean and unit variance; min-max normalization is a different transform (MinMaxScaler).

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, train_size=0.8, random_state=666)

from sklearn.preprocessing import StandardScaler
standard_scaler = StandardScaler()
standard_scaler.fit(X_train)  # learn mean and std from the training set only, to avoid leakage
standard_scaler.mean_         # per-feature means
standard_scaler.scale_        # per-feature standard deviations
X_train_standard = standard_scaler.transform(X_train)
X_test_standard = standard_scaler.transform(X_test)  # reuse the training statistics on the test set

from sklearn.neighbors import KNeighborsClassifier
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train_standard, y_train)
knn_classifier.score(X_test_standard, y_test)

6. Simple KNN Regression (predicting continuous values) (sklearn.neighbors.KNeighborsRegressor)

# x and y are assumed to be features and a continuous target
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=233)

from sklearn.neighbors import KNeighborsRegressor

# weights='distance' gives closer neighbors more influence; p=2 is Euclidean distance
knn_reg = KNeighborsRegressor(n_neighbors=5, weights='distance', p=2)

knn_reg.fit(x_train, y_train)

knn_reg.score(x_test, y_test)  # R^2 on the test set
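
For a self-contained run, here is a sketch on synthetic data; make_regression and all its parameter values are my own illustration, not from the original notes:

from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

# 200 samples, 3 features, noisy linear target
x, y = make_regression(n_samples=200, n_features=3, noise=10.0, random_state=233)
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=233)

knn_reg = KNeighborsRegressor(n_neighbors=5, weights='distance', p=2)
knn_reg.fit(x_train, y_train)
print(knn_reg.score(x_test, y_test))  # R^2 on the test set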

II. Linear Regression Algorithms

These models fit a weighted sum of the features; this part covers plain linear regression, polynomial features, and logistic regression.

1. Linear Regression (sklearn.linear_model.LinearRegression)

import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()

# reshape(-1, 1): sklearn expects a 2-D feature matrix, even with a single feature
lin_reg.fit(x_train.reshape(-1, 1), y_train)

y_predict = lin_reg.predict(x_test.reshape(-1, 1))

# Plot the test points and the fitted line
plt.scatter(x_test, y_test)
plt.plot(x_test, y_predict, c='r')
plt.show()

2. Evaluating Linear Regression Models (MSE, RMSE, MAE, R²)

MSE, RMSE:  sklearn.metrics.mean_squared_error
MAE:        sklearn.metrics.mean_absolute_error
R²:         sklearn.metrics.r2_score
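
A short sketch computing all four metrics, assuming y_test and y_predict from the previous step:

import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse = mean_squared_error(y_test, y_predict)
rmse = np.sqrt(mse)                  # RMSE is just the square root of MSE
mae = mean_absolute_error(y_test, y_predict)
r2 = r2_score(y_test, y_predict)     # the same value LinearRegression.score() reports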

3. Polynomial Regression (sklearn.preprocessing.PolynomialFeatures)

from sklearn.preprocessing import PolynomialFeatures

# degree=2 adds squared and pairwise-product terms (plus a bias column)
poly = PolynomialFeatures(degree=2)

X_poly = poly.fit_transform(X)
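
Polynomial regression is then just linear regression on the expanded features. A minimal sketch with noisy quadratic toy data (the data generation is my own illustration):

import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# y = 0.5*x^2 + x + 2 plus Gaussian noise
np.random.seed(0)
x = np.random.uniform(-3, 3, size=(100, 1))
y = 0.5 * x[:, 0]**2 + x[:, 0] + 2 + np.random.normal(0, 1, 100)

X_poly = PolynomialFeatures(degree=2).fit_transform(x)  # columns: [1, x, x^2]

lin_reg = LinearRegression()
lin_reg.fit(X_poly, y)
print(lin_reg.coef_, lin_reg.intercept_)  # roughly [0, 1, 0.5] and 2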

4. Logistic Regression (sklearn.linear_model.LogisticRegression)

import numpy as np
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(x_train, y_train)
clf.score(x_train, y_train)  # training accuracy
clf.score(x_test, y_test)    # test accuracy
clf.predict(x_test)          # predicted class labels
np.argmax(clf.predict_proba(x_test), axis=1)  # same labels recovered from the per-class probabilities (column order follows clf.classes_)

5. More Complex Logistic Regression (Polynomial Features / Multiclass)

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Make a dataset: the label is 1 inside the circle x1^2 + x2^2 < 2, else 0
np.random.seed(0)
X = np.random.normal(0, 1, size=(200, 2))
y = np.array((X[:, 0]**2) + (X[:, 1]**2) < 2, dtype='int')
print(X)
print(y)

# Split (the original snippet used x_train/x_test below without defining them)
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=666)

plt.scatter(x_train[:, 0], x_train[:, 1], c=y_train)
plt.show()  # scatter plot of the training points


# Plain (linear) logistic regression
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(x_train, y_train)
clf.score(x_train, y_train)  # 0.7071428571428572 in the original run
clf.score(x_test, y_test)    # 0.6666666666666666: a linear boundary clearly fits this circular data poorly



# Polynomial logistic regression

from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2)  # highest polynomial degree to generate
poly.fit(x_train)  # fit the feature expansion (it learns the output feature layout)

x2 = poly.transform(x_train)   # expanded training features (the new x_train)
x2t = poly.transform(x_test)   # expanded test features (the new x_test)

clf.fit(x2, y_train)  # train the same linear logistic regression on the expanded features

clf.score(x2, y_train)   # 1.0
clf.score(x2t, y_test)   # 0.9666666666666667 in the original run: a large improvement over the linear model
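
The same idea is tidier as a pipeline, which chains the feature expansion and the classifier so the transform and fit steps cannot be mixed up; a sketch of this restructuring (not in the original notes):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression

poly_clf = make_pipeline(PolynomialFeatures(degree=2), LogisticRegression())
poly_clf.fit(x_train, y_train)    # expands the features, then fits the classifier
poly_clf.score(x_test, y_test)    # applies the same fitted expansion to x_test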


# Multiclass logistic regression (for problems with more than two classes)

from sklearn import datasets
iris = datasets.load_iris()  # load the iris dataset
x = iris.data
y = iris.target
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=666)

plt.scatter(x_train[:, 0], x_train[:, 1], c=y_train)
plt.show()

# One-vs-rest: one binary classifier per class; cheaper, usually slightly less accurate
from sklearn.multiclass import OneVsRestClassifier

ovr = OneVsRestClassifier(clf)
ovr.fit(x_train, y_train)
ovr.score(x_test, y_test)  # 0.9736842105263158

# One-vs-one: one binary classifier per pair of classes, K*(K-1)/2 in total; costlier, usually more accurate
from sklearn.multiclass import OneVsOneClassifier

ovo = OneVsOneClassifier(clf)
ovo.fit(x_train, y_train)
ovo.score(x_test, y_test)  # 1.0 in the original run