波士顿房价预测工具 - XGBoost实现
项目概述
这是一个基于PyQt5和XGBoost的波士顿房价预测桌面应用程序。该工具提供了友好的图形界面,允许用户调整XGBoost模型参数,并实时查看训练结果和预测效果。
开发过程详解
1. 项目架构设计
项目采用模块化的MVC架构,将代码按功能分为以下几个主要模块:
boston_housing_predictor/
├── main.py                          # 程序入口
├── run.py                           # 启动脚本
├── path_config.py                   # 路径配置
├── ui/                              # UI界面模块
│   ├── __init__.py
│   ├── main_window.py               # 主窗口
│   └── widgets/                     # UI组件
│       ├── __init__.py
│       ├── canvas.py
│       ├── control_panel.py
│       ├── data_table_widget.py
│       ├── prediction_widget.py
│       └── visualization_panel.py
├── models/                          # 模型相关
│   ├── __init__.py
│   ├── base_model.py
│   └── xgboost_model.py
├── utils/                           # 工具函数
│   ├── __init__.py
│   ├── data_loader.py
│   ├── validators.py
│   └── visualizers.py
├── config/                          # 配置管理
│   ├── __init__.py
│   └── settings.py
└── threads/                         # 多线程支持
    ├── __init__.py
    └── training_thread.py
2. 核心功能实现
2.1 数据加载与验证
数据加载模块支持多种数据源:
- CSV文件(支持多种编码)
- sklearn内置数据集
- 自动生成示例数据
2.2 XGBoost模型封装
将XGBoost模型封装为类,提供统一的接口:
- 训练方法(支持早停法)
- 预测方法
- 参数管理
- 特征重要性获取
2.3 可视化功能
提供多种可视化图表:
- 数据探索(分布图、相关性热力图)
- 特征重要性排序
- 预测结果对比
- 残差分析
2.4 预测及其他功能
- 数据预测功能:支持手动输入特征值和CSV批量预测
- GPU加速支持:自动检测并使用GPU加速训练
- 数据表格展示:在第一个标签页显示原始数据
3. 完整代码实现
3.1 程序入口 (main.py)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
波士顿房价预测工具 - 主程序入口
"""
import sys
from PyQt5.QtWidgets import QApplication
from ui.main_window import BostonHousingApp
from utils.visualizers import setup_chinese_font
def main():
    """Start the desktop application and exit with the event loop's status.

    The Chinese font is configured first, then the QApplication and the
    main window are created and shown. The call does not return: the
    process exits through ``sys.exit`` once the Qt event loop finishes.
    """
    # Font setup happens before any widget is created.
    setup_chinese_font()

    application = QApplication(sys.argv)
    application.setApplicationName("波士顿房价预测工具")
    application.setOrganizationName("ML Tools Inc.")

    main_window = BostonHousingApp()
    main_window.show()

    # Hand the event loop's return code back to the operating system.
    exit_code = application.exec_()
    sys.exit(exit_code)


if __name__ == '__main__':
    main()
3.2 路径配置 (path_config.py)
"""
路径配置模块 - 确保所有模块都能正确导入
"""
import sys
import os
def setup_path():
    """Put the directory containing this file at the front of ``sys.path``.

    Nothing is changed when that directory is already listed, so calling
    the function repeatedly is harmless.
    """
    project_root = os.path.dirname(os.path.abspath(__file__))
    if project_root in sys.path:
        return
    sys.path.insert(0, project_root)


# Executed on import, so importing this module is enough to set up the path.
setup_path()
3.3 配置文件 (config/settings.py)
"""
配置文件 - 存储所有默认参数和常量
"""
# Default XGBoost hyperparameters.
DEFAULT_XGBOOST_PARAMS = dict(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.0,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1,
    tree_method='hist',
)
# Integer slider ranges for each tunable parameter. Where noted, the slider
# value is a scaled integer and must be divided to obtain the real value.
SLIDER_RANGES = dict(
    train_test_split=dict(min=50, max=90, default=80),
    n_estimators=dict(min=10, max=500, default=100),
    max_depth=dict(min=1, max=20, default=6),
    # Real value = slider value / 100
    learning_rate=dict(min=1, max=50, default=10),
    # Real value = slider value / 100
    subsample=dict(min=50, max=100, default=80),
    # Real value = slider value / 100
    colsample_bytree=dict(min=50, max=100, default=80),
    # Real value = slider value / 100
    reg_alpha=dict(min=0, max=100, default=0),
    # Real value = slider value / 10
    reg_lambda=dict(min=0, max=100, default=10),
)
# Data file settings: candidate file names and the encodings listed for them.
DATA_FILE_NAMES = [
    '波士顿房价数据集.csv',
    'boston_housing.csv',
]
DATA_ENCODINGS = [
    'utf-8',
    'gbk',
    'gb2312',
]

# UI settings.
WINDOW_TITLE = '波士顿房价预测工具 - XGBoost'
WINDOW_SIZE = (1400, 900)

# Visualization settings.
FIGURE_DPI = 100
FIGURE_SIZE = (8, 6)

# Data display settings: show at most 100 rows.
MAX_DISPLAY_ROWS = 100
3.4 基础模型类 (models/base_model.py)
"""
基础模型类
"""
from abc import ABC, abstractmethod
import numpy as np
from typing import Tuple, Dict, Any
class BaseModel(ABC):
    """Abstract base class for the models in this project.

    Subclasses supply training, prediction and parameter access;
    ``evaluate`` is implemented here on top of ``predict``.
    """

    def __init__(self):
        # Underlying estimator object; starts out empty.
        self.model = None
        # ``evaluate`` refuses to run while this is False.
        self.is_trained = False

    @abstractmethod
    def train(self, X_train: np.ndarray, y_train: np.ndarray,
              X_val: np.ndarray = None, y_val: np.ndarray = None,
              **kwargs) -> None:
        """Fit the model on the training data (validation data optional)."""
        pass

    @abstractmethod
    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return predictions for ``X``."""
        pass

    @abstractmethod
    def get_params(self) -> Dict[str, Any]:
        """Return the model parameters."""
        pass

    @abstractmethod
    def set_params(self, **params) -> None:
        """Update the model parameters."""
        pass

    def evaluate(self, X_test: np.ndarray, y_test: np.ndarray) -> Dict[str, float]:
        """Score the model's predictions on a test set.

        Args:
            X_test: Test-set features.
            y_test: Test-set targets.

        Returns:
            Dict[str, float]: Mean squared error under ``'mse'``, mean
            absolute error under ``'mae'`` and R² under ``'r2'``.

        Raises:
            ValueError: If the model has not been trained yet.
        """
        from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

        if not self.is_trained:
            raise ValueError("模型还未训练")

        predictions = self.predict(X_test)
        # Key order matches the order the metrics are reported in.
        metric_functions = (
            ('mse', mean_squared_error),
            ('mae', mean_absolute_error),
            ('r2', r2_score),
        )
        return {
            metric_name: metric(y_test, predictions)
            for metric_name, metric in metric_functions
        }
3.5 XGBoost模型实现 (models/xgboost_model.py)
"""
XGBoost模型实现
"""
import xgboost as xgb
import numpy as np
from typing import Dict, Any, Optional, List
from models.base_model import BaseModel
class XGBoostModel(BaseModel):
    """XGBoost regression model exposed through the ``BaseModel`` interface."""

    # Extra estimator settings applied only to a run that uses early stopping.
    _EARLY_STOPPING_PARAMS = {'early_stopping_rounds': 10, 'eval_metric': 'rmse'}

    def __init__(self, **params):
        """Store the keyword arguments later passed to ``xgb.XGBRegressor``."""
        super().__init__()
        self.params = params
        self.model = None
        self.feature_names = None

    def train(self, X_train: np.ndarray, y_train: np.ndarray,
              X_val: Optional[np.ndarray] = None,
              y_val: Optional[np.ndarray] = None,
              early_stopping: bool = False,
              feature_names: Optional[List[str]] = None) -> None:
        """Fit a fresh ``xgb.XGBRegressor`` on the training data.

        Args:
            X_train: Training features.
            y_train: Training targets.
            X_val: Validation features; required for early stopping.
            y_val: Validation targets; required for early stopping.
            early_stopping: Request early stopping. It is only applied when
                both ``X_val`` and ``y_val`` are provided.
            feature_names: Optional feature names, stored on the wrapper and
                attached to the estimator as ``feature_names``.
        """
        use_early_stopping = (
            early_stopping and X_val is not None and y_val is not None
        )

        self.feature_names = feature_names
        # If this run raises, the wrapper must not keep reporting the
        # previous run's "trained" state for a new, unfitted estimator.
        self.is_trained = False

        # Build the estimator settings on a copy: writing the early-stopping
        # keys into self.params would leak them into later runs that have no
        # validation set, where fit() cannot honour them.
        fit_params = dict(self.params)
        if use_early_stopping:
            fit_params.update(self._EARLY_STOPPING_PARAMS)

        self.model = xgb.XGBRegressor(**fit_params)

        # Explicit None/length test: truthiness of an array-like (for
        # example a pandas Index of column names) raises ValueError.
        if feature_names is not None and len(feature_names) > 0:
            self.model.feature_names = feature_names

        if use_early_stopping:
            eval_set = [(X_train, y_train), (X_val, y_val)]
            self.model.fit(X_train, y_train, eval_set=eval_set, verbose=False)
        else:
            self.model.fit(X_train, y_train)

        self.is_trained = True

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return predictions for ``X``.

        Raises:
            ValueError: If the model has not been trained yet.
        """
        if not self.is_trained:
            raise ValueError("模型还未训练")
        return self.model.predict(X)

    def get_params(self) -> Dict[str, Any]:
        """Return a shallow copy of the stored parameters."""
        return self.params.copy()

    def set_params(self, **params) -> None:
        """Merge ``params`` into the stored parameters.

        When an estimator already exists, the same values are forwarded to
        its ``set_params``.
        """
        self.params.update(params)
        if self.model is not None:
            self.model.set_params(**params)

    def get_feature_importance(self) -> Optional[np.ndarray]:
        """Return the estimator's ``feature_importances_``, or None if unavailable."""
        if self.is_trained and hasattr(self.model, 'feature_importances_'):
            return self.model.feature_importances_
        return None

    def get_best_iteration(self) -> Optional[int]:
        """Return the estimator's ``best_iteration``, or None if unavailable."""
        if self.is_trained and hasattr(self.model, 'best_iteration'):
            return self.model.best_iteration
        return None
3.6 训练线程 (threads/training_thread.py)
"""
训练线程模块
"""
from PyQt5.QtCore import QThread, pyqtSignal
from typing import Optional
import numpy as np
class TrainingThread(QThread):
    """Worker thread that runs model training so the UI stays responsive.

    Signals:
        progress(str): Human-readable status messages.
        finished(object): The trained model on success, ``None`` on failure.
        error(str): The error message when training raises.
    """

    progress = pyqtSignal(str)
    # NOTE(review): QThread already provides a no-argument ``finished``
    # signal; this declaration reuses the name with an object payload.
    # Kept as-is because callers connect to it — confirm that is intended.
    finished = pyqtSignal(object)
    error = pyqtSignal(str)

    def __init__(self, model, X_train: np.ndarray, y_train: np.ndarray,
                 X_val: Optional[np.ndarray] = None,
                 y_val: Optional[np.ndarray] = None,
                 early_stopping: bool = False,
                 feature_names: Optional[list] = None):
        """Store the model and the data to be used when the thread runs.

        Args:
            model: Object exposing ``train(X_train, y_train, X_val, y_val,
                early_stopping=..., feature_names=...)``.
            X_train: Training features.
            y_train: Training targets.
            X_val: Optional validation features.
            y_val: Optional validation targets.
            early_stopping: Whether early stopping is requested.
            feature_names: Optional feature names forwarded to the model.
        """
        super().__init__()
        self.model = model
        self.X_train = X_train
        self.y_train = y_train
        self.X_val = X_val
        self.y_val = y_val
        self.early_stopping = early_stopping
        self.feature_names = feature_names

    def run(self):
        """Train the model and report the outcome through the signals."""
        try:
            self.progress.emit("开始训练模型...")
            self.model.train(
                self.X_train, self.y_train,
                self.X_val, self.y_val,
                early_stopping=self.early_stopping,
                feature_names=self.feature_names
            )
            # Report early stopping only when a complete validation set was
            # supplied (both features and targets).
            used_early_stopping = (
                self.early_stopping
                and self.X_val is not None
                and self.y_val is not None
            )
            if used_early_stopping:
                self.progress.emit("训练完成(使用早停法)!")
            else:
                self.progress.emit("训练完成!")
            self.finished.emit(self.model)
        except Exception as e:
            # Thread boundary: convert any failure into signals instead of
            # letting the exception escape the worker thread.
            error_msg = f"训练出错:{str(e)}"
            self.progress.emit(error_msg)
            self.error.emit(error_msg)
            self.finished.emit(None)
3.7 主窗口 (ui/main_window.py)
[主窗口代码已在文档中,这里省略以节省空间]
3.8 UI组件 - 画布 (ui/widgets/canvas.py)
"""
Matplotlib画布组件
"""
from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas
from matplotlib.figure import Figure
from utils.visualizers import setup_chinese_font
class MplCanvas(FigureCanvas):
    """Qt canvas widget that owns a Matplotlib figure, exposed as ``self.fig``."""

    def __init__(self, parent=None, width=5, height=4, dpi=100):
        """Create the figure and attach the canvas to ``parent``.

        Args:
            parent: Parent widget, passed to ``setParent``.
            width: Figure width handed to ``Figure(figsize=...)``.
            height: Figure height handed to ``Figure(figsize=...)``.
            dpi: Figure resolution.
        """
        # Make sure the Chinese font is configured before the figure exists.
        setup_chinese_font()

        figure_size = (width, height)
        self.fig = Figure(figsize=figure_size, dpi=dpi, tight_layout=True)
        super().__init__(self.fig)
        self.setParent(parent)
3.9 UI组件 - 控制面板 (ui/widgets/control_panel.py)
"""
控制面板组件
"""
from PyQt5.QtWidgets import *
from PyQt5.QtCore import Qt
from PyQt5