Overall workflow
Data preprocessing: standardize the features, then append a bias column of all ones
Training: gradient descent, translating the math formulas into code (see the formulas below)
Prediction
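For reference, these are the formulas the training code below implements: the linear hypothesis, the (halved) mean-squared-error cost, and the batch gradient descent update, where X is the m x (n+1) design matrix with a leading column of ones, y the label column vector, and alpha the learning rate:

$$h_\theta(X) = X\theta, \qquad J(\theta) = \frac{1}{2m}\,(X\theta - y)^\top (X\theta - y), \qquad \theta \leftarrow \theta - \frac{\alpha}{m}\, X^\top (X\theta - y)$$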
Model code
import numpy as np
# Standardization: zero-mean, unit-variance scaling of each feature column.
# Returns the standardized features along with the mean and std, which are
# reused to standardize new data at prediction time.
def standard(feats):
    new_feats = np.copy(feats).astype(float)
    mean = np.mean(new_feats, axis=0)
    std = np.std(new_feats, axis=0)
    std[std == 0] = 1  # guard against division by zero for constant columns
    new_feats = (new_feats - mean) / std
    return new_feats, mean, std
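As a quick sanity check of standard() (e.g., in a REPL; the toy array below is made up for illustration), each non-constant column ends up with mean 0 and std 1, while a constant column maps to zeros thanks to the std == 0 guard:

feats = np.array([[1.0, 10.0],
                  [2.0, 10.0],
                  [3.0, 10.0]])  # toy data; the second column is constant
scaled, mu, sigma = standard(feats)
print(scaled.mean(axis=0))  # ~[0. 0.]
print(scaled.std(axis=0))   # [1. 0.] -- the constant column stays at 0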
class LinearRegression:
    def __init__(self, data, labels):
        # Standardize the training data
        new_data, mean, std = standard(data)
        # Store the mean and std for standardizing data at prediction time
        self.mean = mean
        self.std = std
        # Number of samples m and number of original features n
        m, n = new_data.shape
        # Prepend a column of ones to the feature matrix as the bias term
        X = np.hstack((np.ones((m, 1)), new_data))  # shape (m, n+1)
        self.X = X          # training features, (m, n+1)
        self.y = labels     # training labels, (m, 1)
        self.m = m          # number of samples
        self.n = n + 1      # number of features (including the bias)
        # Initialize the parameter vector theta to zeros
        self.theta = np.zeros((self.n, 1))
    def train(self, alpha, num_iterations=500):
        """
        Run batch gradient descent.
        :param alpha: learning rate
        :param num_iterations: number of iterations
        :return: the learned theta and the cost history, one entry per iteration
        """
        cost_history = []
        for _ in range(num_iterations):
            self.gradient_step(alpha)
            cost_history.append(self.cost_function())
        return self.theta, cost_history
    def gradient_step(self, alpha):
        # Compute predictions under the current theta
        predictions = self.X.dot(self.theta)  # shape (m, 1)
        # Residuals between predictions and labels
        delta = predictions - self.y          # shape (m, 1)
        # Compute the gradient and update theta
        grad = self.X.T.dot(delta) / self.m   # shape (n+1, 1)
        self.theta -= alpha * grad
    def cost_function(self):
        # Halved mean squared error under the current theta
        delta = self.X.dot(self.theta) - self.y  # shape (m, 1)
        return float(delta.T.dot(delta) / (2 * self.m))
    def predict(self, data):
        """
        Predict on new data.
        :param data: new data, shape (m_new, n)
        :return: predictions, shape (m_new, 1)
        """
        # Ensure the input is a 2-D array
        data = np.array(data, ndmin=2)
        # Standardize with the mean and std saved from training
        new_data = (data - self.mean) / self.std
        # Prepend the bias column of ones
        m_new = new_data.shape[0]
        X_new = np.hstack((np.ones((m_new, 1)), new_data))
        # Return the predictions
        return X_new.dot(self.theta)
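Before moving to the real dataset, a minimal smoke test on synthetic data helps confirm the class works end to end (the values and expected outputs here are illustrative): for noise-free labels y = 2x + 1, gradient descent should drive the cost to roughly zero.

if __name__ == '__main__':
    # Smoke test on the noise-free line y = 2x + 1
    x = np.arange(10, dtype=float).reshape(-1, 1)
    y = 2 * x + 1
    model = LinearRegression(x, y)
    _, costs = model.train(alpha=0.1, num_iterations=500)
    print(costs[-1])               # ~0 once the line is fit
    print(model.predict([[5.0]]))  # ~[[11.]]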
Test code
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from linear_regression import LinearRegression
data = pd.read_csv('../data/world-happiness-report-2017.csv')
# 80/20 random train/test split
train_data = data.sample(frac=0.8)
test_data = data.drop(train_data.index)
input_param_name = 'Economy..GDP.per.Capita.'
output_param_name = 'Happiness.Score'
# Extract the GDP-per-capita values and the corresponding happiness scores
x_train = train_data[[input_param_name]].values
y_train = train_data[[output_param_name]].values
x_test = test_data[[input_param_name]].values
y_test = test_data[[output_param_name]].values
num_iterations = 500
learning_rate = 0.01
# Train: x_train holds the GDP values, y_train the happiness scores
linear_regression = LinearRegression(x_train, y_train)
# Arguments: learning rate, number of iterations
(theta, cost_history) = linear_regression.train(learning_rate, num_iterations)
print('Initial cost:', cost_history[0])
print('Cost after training:', cost_history[-1])
plt.plot(range(num_iterations), cost_history)
plt.xlabel('Iteration')
plt.ylabel('Cost')
plt.title('Gradient Descent')
plt.show()
predictions_num = 100
# Evenly spaced values from the training minimum to maximum, as a column vector
x_predictions = np.linspace(x_train.min(), x_train.max(), predictions_num).reshape(predictions_num, 1)
y_predictions = linear_regression.predict(x_predictions)
plt.scatter(x_train, y_train, label='Train data')
plt.scatter(x_test, y_test, label='Test data')
plt.plot(x_predictions, y_predictions, 'r', label='Prediction')
plt.xlabel(input_param_name)
plt.ylabel(output_param_name)
plt.title('Happiness Score vs. GDP per Capita')
plt.legend()
plt.show()
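As an optional follow-up not in the original script, the same halved-MSE cost can be evaluated on the held-out test set via predict(), giving a number directly comparable to the training cost printed above:

# Evaluate the trained model on the test split (same cost as cost_function)
test_delta = linear_regression.predict(x_test) - y_test
test_cost = float(test_delta.T.dot(test_delta) / (2 * x_test.shape[0]))
print('Test-set cost:', test_cost)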