题意:
给出每个人的基本信息,预测运动后的卡路里消耗值。
数据处理:
1.构造出人体机能、运动相关的特征值。
2.对所有特征值进行重新组合(两两相乘),注意维度爆炸
3.对连续特征进行分箱,转换为离散特征
建立模型:
1.xgb模型,lgb模型,cat模型
2.使用stack堆叠融合,使用3折交叉验证
3.对xgb、lgb、cat进行K折交叉验证,最终和stack进行结果融合。
代码:
import os
import sys
import warnings
import numpy as np
import pandas as pd
import seaborn
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from matplotlib import pyplot as plt
import lightgbm
from mlxtend.regressor import StackingCVRegressor
from sklearn import clone
from sklearn.ensemble import VotingRegressor, StackingClassifier, StackingRegressor
from sklearn.linear_model import Lasso, LogisticRegression, RidgeCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer, mean_squared_log_error
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
def init():
    """Configure process-wide logging, warning and pandas display settings."""
    # Only emit error-level logs from the TensorFlow C++ backend.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    # Silence Python warnings for the rest of the run.
    warnings.simplefilter('ignore')
    # Widen every pandas display limit so printed frames are not truncated.
    display_limits = (
        'display.width',
        'display.max_colwidth',
        'display.max_rows',
        'display.max_columns',
    )
    for option_name in display_limits:
        pd.set_option(option_name, 1000)
def show_dataframe(df):
    """Print a quick exploratory summary of ``df`` to stdout.

    Sections, in order: column dtypes, the first 10 rows, ``describe()``
    statistics, the number of duplicated rows, per-column null counts and
    the ``DataFrame.info()`` report.

    Args:
        df: The pandas DataFrame to summarise.
    """
    # Local import so the function stays self-contained.
    import io

    separator = "\n" + "-" * 100
    print("查看特征值和特征值类型\n" + str(df.dtypes) + separator)
    # The heading promises 10 rows; head() alone would only show 5.
    print("查看前10行信息\n" + str(df.head(10)) + separator)
    print("查看每个特征值的各种数据统计信息\n" + str(df.describe()) + separator)
    print("输出重复行的个数\n" + str(df.duplicated().sum()) + separator)
    print("查看每列的缺失值个数\n" + str(df.isnull().sum()) + separator)
    # DataFrame.info() writes to a stream and returns None, so str(df.info())
    # would print the literal "None". Capture the report in a buffer instead.
    info_buffer = io.StringIO()
    df.info(buf=info_buffer)
    print("查看缺失值的具体信息\n" + info_buffer.getvalue().rstrip("\n") + separator)
    # print("输出X所有值出现的是什么,还有对应出现的次数\n" + str(df['X'].value_counts()) + "\n" + "-" * 100)
def show_relation(data, colx, coly):
    """Plot the relationship between feature ``colx`` and target ``coly``.

    A box plot is drawn when the feature has dtype ``object``/``category``
    or fewer than 20 distinct values; otherwise a scatter plot is drawn.

    Args:
        data: DataFrame holding both columns.
        colx: Name of the feature column (x axis).
        coly: Name of the target column (y axis).
    """
    feature = data[colx]
    treat_as_discrete = (
        feature.dtype == 'object'
        or feature.dtype == 'category'
        or len(feature.unique()) < 20
    )
    if not treat_as_discrete:
        plt.scatter(feature, data[coly])
    else:
        seaborn.boxplot(x=colx, y=coly, data=data)
    # NOTE(review): the source indentation was lost; the labels and show()
    # are assumed to apply to both plot kinds.
    plt.xlabel(colx)
    plt.ylabel(coly)
    plt.show()
# Custom RMSLE score. GridSearchCV maximises its score, so the negated
# RMSLE is returned.
def rmsle_scorer(y_true, y_pred):
    """Return the negative root mean squared logarithmic error.

    Both inputs are floored at 1e-15 before the logarithm is taken.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.

    Returns:
        ``-RMSLE`` so that a larger value means a better fit.
    """
    floor = 1e-15
    safe_pred = np.clip(y_pred, floor, None)
    safe_true = np.clip(y_true, floor, None)
    residual = np.log(safe_pred + 1) - np.log(safe_true + 1)
    return -np.sqrt(np.mean(residual ** 2))
if __name__ == '__main__':
    # Script entry point: build features, run K-fold CV for CatBoost /
    # XGBoost / LightGBM, fit a stacking ensemble, blend and write the
    # submission file. The target is modelled in log1p space throughout.
    from itertools import combinations  # local import: only this script body needs it

    init()
    df_train = pd.read_csv('/kaggle/input/playground-series-s5e5/train.csv')
    df_test = pd.read_csv('/kaggle/input/playground-series-s5e5/test.csv')
    # for col in df_train.columns:
    #     show_relation(df_train, col, 'Calories')

    # ---- Feature engineering (train and test are processed together) ----
    df_all = pd.concat([df_train.drop(['id', 'Calories'], axis=1), df_test.drop(['id'], axis=1)], axis=0)
    df_all['Sex'] = df_all['Sex'].map({'male': 0, 'female': 1})
    df_all = df_all.reset_index(drop=True)

    # Body-mass index (Height is divided by 100 before squaring).
    df_all['BMI'] = df_all['Weight'] / (df_all['Height'] / 100) ** 2

    # Harris-Benedict basal metabolic rate, one formula per sex.
    # Initialise as float: assigning floats into an int column triggers a
    # dtype-upcast warning in recent pandas.
    df_all['BMR'] = 0.0
    df_all.loc[df_all['Sex'] == 0, 'BMR'] = 88.362 + (13.397 * df_all['Weight']) + (4.799 * df_all['Height']) - (5.677 * df_all['Age'])
    df_all.loc[df_all['Sex'] == 1, 'BMR'] = 447.593 + (9.247 * df_all['Weight']) + (3.098 * df_all['Height']) - (4.330 * df_all['Age'])

    # Numeric feature standardisation (disabled).
    # numeric_features = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']
    # scaler = StandardScaler()
    # df_all[numeric_features] = scaler.fit_transform(df_all[numeric_features])

    # Exercise-intensity features.
    df_all['Max_HR'] = 220 - df_all['Age']  # estimated maximum heart rate
    df_all['HR_Reserve_Ratio'] = df_all['Heart_Rate'] / df_all['Max_HR']

    # Hand-picked interaction features.
    df_all['Weight_Duration'] = df_all['Weight'] * df_all['Duration']
    df_all['Sex_Weight'] = df_all['Sex'] * df_all['Weight']

    # Workload proxy.
    df_all['workload'] = df_all['Weight'] * df_all['Duration'] * df_all['Heart_Rate'] / 1000

    # Physiological ratio.
    df_all['age_heart_ratio'] = df_all['Age'] / df_all['Heart_Rate']

    # NOTE(review): despite its name this is just Duration / 60 / 24; there is
    # no timestamp involved.
    df_all['hour_of_day'] = df_all['Duration'] / 60 / 24

    # Pairwise products of every column present so far. The products are built
    # in one go and concatenated once, which avoids the DataFrame fragmentation
    # caused by inserting many columns one at a time. Column order matches the
    # nested i < j loop.
    numeric_cols = list(df_all.columns)
    pair_products = {
        f'{feature_1}_x_{feature_2}': df_all[feature_1] * df_all[feature_2]
        for feature_1, feature_2 in combinations(numeric_cols, 2)
    }
    df_all = pd.concat([df_all, pd.DataFrame(pair_products)], axis=1)

    # Robust scaling (disabled).
    # scaler = RobustScaler()
    # df_all = scaler.fit_transform(df_all)

    # Equal-width binning (10 bins) of the continuous features.
    now_col = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp', 'BMI']
    for col_name in now_col:
        df_all[col_name + "_box"] = pd.cut(df_all[col_name], bins=10, labels=False, right=False)

    # Split the combined frame back into train / test parts.
    X_train = df_all[:df_train.shape[0]]
    Y_train = np.log1p(df_train['Calories'])
    x_test = df_all[df_train.shape[0]:]

    # ---- Base models ----
    # XGBoost (the stray `estimator=` alias from the chained assignment was dropped).
    model_xgb = XGBRegressor(
        random_state=42,
        n_estimators=8000,
        objective='reg:squarederror',
        eval_metric='rmse',
        device='cuda',
        learning_rate=0.05,
        max_depth=8,
        colsample_bytree=0.75,
        subsample=0.9,
        # reg_lambda=1,
        # reg_alpha=0.5,
        early_stopping_rounds=500,
    )
    # LightGBM
    model_lgb = lightgbm.LGBMRegressor(
        n_estimators=3000,           # many rounds, relying on early stopping
        learning_rate=0.03,          # small learning rate
        num_leaves=15,               # limit model complexity
        min_child_samples=25,        # larger minimum leaf size
        reg_alpha=0.1,               # L1 regularisation
        reg_lambda=0.1,              # L2 regularisation
        objective='regression_l1',   # MAE loss
        early_stopping_rounds=500,
    )
    # CatBoost
    model_cat = CatBoostRegressor(
        iterations=3500,
        learning_rate=0.02,
        depth=12,
        loss_function='RMSE',
        l2_leaf_reg=3,
        random_seed=42,
        eval_metric='RMSE',
        early_stopping_rounds=200,
        verbose=1000,
        task_type='GPU',
    )

    # ---- Stacking ensemble ----
    # Copies of the base models with early stopping disabled, because
    # StackingRegressor does not pass an eval_set to its estimators.
    base_models = [
        ('xgb', XGBRegressor(
            early_stopping_rounds=None,
            **{k: v for k, v in model_xgb.get_params().items() if k != 'early_stopping_rounds'}
        )),
        ('lgb', LGBMRegressor(
            early_stopping_rounds=None,
            **{k: v for k, v in model_lgb.get_params().items() if k != 'early_stopping_rounds'}
        )),
        ('cat', CatBoostRegressor(
            early_stopping_rounds=None,
            **{k: v for k, v in model_cat.get_params().items() if k != 'early_stopping_rounds'}
        ))
    ]
    meta_model = RidgeCV()
    model_stack = StackingRegressor(
        estimators=base_models,
        final_estimator=meta_model,
        cv=3,               # 3-fold CV to generate the meta features
        passthrough=False,  # meta model sees only base-model predictions
        verbose=1
    )

    # ---- K-fold CV of the individual base models ----
    FOLDS = 20
    KF = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
    cat_features = ['Sex']
    oof_cat = np.zeros(len(df_train))
    pred_cat = np.zeros(len(df_test))
    oof_xgb = np.zeros(len(df_train))
    pred_xgb = np.zeros(len(df_test))
    oof_lgb = np.zeros(len(df_train))
    pred_lgb = np.zeros(len(df_test))
    for i, (train_idx, valid_idx) in enumerate(KF.split(X_train, Y_train)):
        print('#' * 15, i + 1, '#' * 15)
        # Split this fold.
        x_train, y_train = X_train.iloc[train_idx], Y_train.iloc[train_idx]
        x_valid, y_valid = X_train.iloc[valid_idx], Y_train.iloc[valid_idx]
        # Fit the three models, each early-stopped on the validation fold.
        model_cat.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], cat_features=cat_features,
                      use_best_model=True, verbose=0)
        model_xgb.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=0)
        model_lgb.fit(x_train, y_train, eval_set=[(x_valid, y_valid)])
        # Out-of-fold predictions plus accumulated test predictions (log1p space).
        oof_cat[valid_idx] = model_cat.predict(x_valid)
        pred_cat += model_cat.predict(x_test)
        oof_xgb[valid_idx] = model_xgb.predict(x_valid)
        pred_xgb += model_xgb.predict(x_test)
        oof_lgb[valid_idx] = model_lgb.predict(x_valid)
        pred_lgb += model_lgb.predict(x_test)
        # RMSE on the log1p target.
        cat_rmse = mean_squared_error(y_valid, oof_cat[valid_idx]) ** 0.5
        xgb_rmse = mean_squared_error(y_valid, oof_xgb[valid_idx]) ** 0.5
        lgb_rmse = mean_squared_error(y_valid, oof_lgb[valid_idx]) ** 0.5
        print(
            f'FOLD {i + 1} CATBOOST_RMSE = {cat_rmse:.4f} <=> XGB_RMSE = {xgb_rmse:.4f} <=> LGB_RMSE = {lgb_rmse:.4f}')

    # ---- Final predictions ----
    # Average the per-fold test predictions.
    pred_cat /= FOLDS
    pred_xgb /= FOLDS
    pred_lgb /= FOLDS
    # The stack must be fitted before predicting, and it must score the
    # engineered test features (x_test), not the raw df_test frame, which still
    # carries 'id' and the string-valued 'Sex' column.
    model_stack.fit(X_train, Y_train)
    pred_stack = model_stack.predict(x_test)
    # Blend on the original scale. NOTE(review): pred_lgb is computed but is
    # not part of this blend (weights are xgb 0.1, stack 0.8, cat 0.1).
    pred_all = np.expm1(pred_xgb) * 0.1 + np.expm1(pred_stack) * 0.80 + np.expm1(pred_cat) * 0.1
    submission = pd.DataFrame({
        'id': df_test['id'],
        'Calories': pred_all
    })
    # Clamp predictions into [1, 20 * Duration].
    submission['Calories'] = np.clip(submission['Calories'], a_min=1, a_max=20*df_test['Duration'])
    submission.to_csv('/kaggle/working/submission.csv', index=False)
代码
使用k折交叉验证,对预测结果再进行训练预测。
import os
import sys
import warnings
import numpy as np
import pandas as pd
import seaborn
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from matplotlib import pyplot as plt
import lightgbm
from mlxtend.regressor import StackingCVRegressor
from sklearn import clone
from sklearn.ensemble import VotingRegressor, StackingClassifier, StackingRegressor
from sklearn.linear_model import Lasso, LogisticRegression, RidgeCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer, mean_squared_log_error
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
def init():
    """Set up quiet logging and wide pandas console output for the run."""
    # TensorFlow C++ backend: errors only.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    # Suppress Python warnings.
    warnings.simplefilter('ignore')
    # Raise each pandas display limit to 1000 so frames print in full.
    for option_name in ('display.width', 'display.max_colwidth',
                        'display.max_rows', 'display.max_columns'):
        pd.set_option(option_name, 1000)
def show_dataframe(df):
    """Print a quick exploratory summary of ``df`` to stdout.

    Sections, in order: column dtypes, the first 10 rows, ``describe()``
    statistics, the number of duplicated rows, per-column null counts and
    the ``DataFrame.info()`` report.

    Args:
        df: The pandas DataFrame to summarise.
    """
    # Local import so the function stays self-contained.
    import io

    separator = "\n" + "-" * 100
    print("查看特征值和特征值类型\n" + str(df.dtypes) + separator)
    # The heading promises 10 rows; head() alone would only show 5.
    print("查看前10行信息\n" + str(df.head(10)) + separator)
    print("查看每个特征值的各种数据统计信息\n" + str(df.describe()) + separator)
    print("输出重复行的个数\n" + str(df.duplicated().sum()) + separator)
    print("查看每列的缺失值个数\n" + str(df.isnull().sum()) + separator)
    # DataFrame.info() writes to a stream and returns None, so str(df.info())
    # would print the literal "None". Capture the report in a buffer instead.
    info_buffer = io.StringIO()
    df.info(buf=info_buffer)
    print("查看缺失值的具体信息\n" + info_buffer.getvalue().rstrip("\n") + separator)
    # print("输出X所有值出现的是什么,还有对应出现的次数\n" + str(df['X'].value_counts()) + "\n" + "-" * 100)
def show_relation(data, colx, coly):
    """Visualise how feature ``colx`` relates to target ``coly``.

    Features with dtype ``object``/``category`` or with fewer than 20
    distinct values get a box plot; all others get a scatter plot.

    Args:
        data: DataFrame holding both columns.
        colx: Name of the feature column (x axis).
        coly: Name of the target column (y axis).
    """
    series = data[colx]
    low_cardinality = (
        series.dtype == 'object'
        or series.dtype == 'category'
        or len(series.unique()) < 20
    )
    if not low_cardinality:
        plt.scatter(series, data[coly])
    else:
        seaborn.boxplot(x=colx, y=coly, data=data)
    # NOTE(review): the source indentation was lost; the labels and show()
    # are assumed to apply to both plot kinds.
    plt.xlabel(colx)
    plt.ylabel(coly)
    plt.show()
def show_score(model_name, pred, y_true=None):
    """Print MSE, MAE and R² of ``pred`` for the model called ``model_name``.

    Args:
        model_name: Label printed above the metrics.
        pred: Predicted values.
        y_true: Ground-truth values to score against. When omitted, the
            module-level ``y_val`` is used, which only exists once the
            cross-validation loop of the script has assigned it.

    Raises:
        NameError: If ``y_true`` is omitted and no global ``y_val`` exists.
    """
    if y_true is None:
        # Backward-compatible fallback to the validation fold held in the
        # script's global scope.
        y_true = y_val
    mse = mean_squared_error(y_true, pred)
    mae = mean_absolute_error(y_true, pred)
    score = r2_score(y_true, pred)
    print(model_name)
    print(f"{'MSE':<10}{mse:<15.4f}")
    print(f"{'MAE':<10}{mae:<15.4f}")
    print(f"{'R²':<10}{score:<15.4f}")
    print("-" * 100)
# Root mean squared logarithmic error between two sets of values.
def rmsle(y_true, y_pred):
    """Return sqrt(mean((log1p(y_true) - log1p(y_pred)) ** 2)).

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values.
    """
    log_gap = np.log1p(y_true) - np.log1p(y_pred)
    squared_gap = np.power(log_gap, 2)
    return np.sqrt(np.mean(squared_gap))
if __name__ == '__main__':
    # Script entry point: build features, collect out-of-fold (OOF)
    # predictions from CatBoost / XGBoost / LightGBM with 5-fold CV, train a
    # Ridge meta-model on those predictions, and write the submission file.
    # The target is modelled in log1p space from start to finish and is only
    # converted back with expm1 for the final submission.
    from itertools import combinations  # local import: only this script body needs it

    init()
    df_train = pd.read_csv('train.csv')
    df_test = pd.read_csv('test.csv')
    # for col in df_train.columns:
    #     show_relation(df_train, col, 'Calories')

    # ---- Feature engineering (train and test are processed together) ----
    df_all = pd.concat([df_train.drop(['id', 'Calories'], axis=1), df_test.drop(['id'], axis=1)], axis=0)
    df_all['Sex_encoded'] = df_all['Sex'].map({'male': 0, 'female': 1})
    df_all.drop(['Sex'], axis=1, inplace=True)
    df_all = df_all.reset_index(drop=True)

    # Body-mass index (Height is divided by 100 before squaring).
    df_all['BMI'] = df_all['Weight'] / (df_all['Height'] / 100) ** 2

    # Harris-Benedict basal metabolic rate, one formula per sex.
    # Initialise as float: assigning floats into an int column triggers a
    # dtype-upcast warning in recent pandas.
    df_all['BMR'] = 0.0
    df_all.loc[df_all['Sex_encoded'] == 0, 'BMR'] = 88.362 + (13.397 * df_all['Weight']) + (4.799 * df_all['Height']) - (5.677 * df_all['Age'])
    df_all.loc[df_all['Sex_encoded'] == 1, 'BMR'] = 447.593 + (9.247 * df_all['Weight']) + (3.098 * df_all['Height']) - (4.330 * df_all['Age'])

    # Numeric feature standardisation (disabled).
    # numeric_features = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']
    # scaler = StandardScaler()
    # df_all[numeric_features] = scaler.fit_transform(df_all[numeric_features])

    # Exercise-intensity features.
    df_all['Max_HR'] = 220 - df_all['Age']  # estimated maximum heart rate
    df_all['HR_Reserve_Ratio'] = df_all['Heart_Rate'] / df_all['Max_HR']

    # Hand-picked interaction features.
    df_all['Weight_Duration'] = df_all['Weight'] * df_all['Duration']
    df_all['Sex_Weight'] = df_all['Sex_encoded'] * df_all['Weight']

    # Workload proxy.
    df_all['workload'] = df_all['Weight'] * df_all['Duration'] * df_all['Heart_Rate'] / 1000

    # Physiological ratio.
    df_all['age_heart_ratio'] = df_all['Age'] / df_all['Heart_Rate']

    # NOTE(review): despite its name this is just Duration / 60 / 24; there is
    # no timestamp involved.
    df_all['hour_of_day'] = df_all['Duration'] / 60 / 24

    # Pairwise products of every column present so far. The products are built
    # in one go and concatenated once, which avoids the DataFrame fragmentation
    # caused by inserting many columns one at a time. Column order matches the
    # nested i < j loop.
    numeric_cols = list(df_all.columns)
    pair_products = {
        f'{feature_1}_x_{feature_2}': df_all[feature_1] * df_all[feature_2]
        for feature_1, feature_2 in combinations(numeric_cols, 2)
    }
    df_all = pd.concat([df_all, pd.DataFrame(pair_products)], axis=1)

    # Robust scaling (disabled).
    # scaler = RobustScaler()
    # df_all = scaler.fit_transform(df_all)

    # Equal-width binning (10 bins): turn continuous features into bin ids.
    now_col = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp', 'BMI']
    for col_name in now_col:
        df_all[col_name + "_box"] = pd.cut(df_all[col_name], bins=10, labels=False, right=False)

    # Additional hand-crafted features, computed on train and test together.
    baseline_temp = 37.0
    df_all['Temp_Change'] = df_all['Body_Temp'] - baseline_temp
    df_all['Intensity'] = df_all['Heart_Rate'] / df_all['Duration']
    df_all['Heart_Rate_Ratio'] = df_all['Heart_Rate'] / df_all['Age']
    df_all['Duration_x_HeartRate'] = df_all['Duration'] * df_all['Heart_Rate']
    # NOTE(review): the next two names already exist from the pairwise step
    # above; these assignments overwrite them with the same products.
    df_all['Weight_x_Duration'] = df_all['Weight'] * df_all['Duration']
    df_all['Height_x_Duration'] = df_all['Height'] * df_all['Duration']
    df_all['Weight_x_Height'] = df_all['Weight'] * df_all['Height']
    df_all['Weight_x_Intensity'] = df_all['Weight'] * df_all['Intensity']
    df_all['Height_x_Intensity'] = df_all['Height'] * df_all['Intensity']

    # Split the combined frame back into train / test parts.
    X_train = df_all[:df_train.shape[0]]
    # The target is log1p-transformed exactly once, here.
    Y_train = np.log1p(df_train['Calories'])
    x_test = df_all[df_train.shape[0]:]

    # ---- Base models ----
    # XGBoost (the stray `estimator=` alias from the chained assignment was dropped).
    model_xgb = XGBRegressor(
        random_state=42,
        n_estimators=8000,
        objective='reg:squarederror',
        eval_metric='rmse',
        device='cuda',
        learning_rate=0.05,
        max_depth=8,
        colsample_bytree=0.75,
        subsample=0.9,
        # reg_lambda=1,
        # reg_alpha=0.5,
        early_stopping_rounds=200,
    )
    # LightGBM
    model_lgb = lightgbm.LGBMRegressor(
        n_estimators=3000,           # many rounds, relying on early stopping
        learning_rate=0.03,          # small learning rate
        num_leaves=15,               # limit model complexity
        min_child_samples=25,        # larger minimum leaf size
        reg_alpha=0.1,               # L1 regularisation
        reg_lambda=0.1,              # L2 regularisation
        objective='regression_l1',   # MAE loss
        early_stopping_rounds=200,
        # LightGBM's evaluation-metric parameter is `metric`; `eval_metric` is
        # only accepted by fit(), so passing it to the constructor ends up as
        # an unknown parameter.
        metric='rmse',
    )
    # CatBoost
    model_cat = CatBoostRegressor(
        iterations=3500,
        learning_rate=0.02,
        depth=12,
        loss_function='RMSE',
        l2_leaf_reg=3,
        random_seed=42,
        eval_metric='RMSE',
        verbose=1000,
        task_type='GPU',
        early_stopping_rounds=200,
    )

    # ---- K-fold cross-validation ----
    print("🔄Generating Out-of-Fold (OOF) predictions and Test predictions for Base Models...\n" + "-"*70 + "\n")
    # OOF predictions, one slot per training row (log1p space).
    add_pred_val_cat = np.zeros(len(X_train))
    add_pred_val_xgb = np.zeros(len(X_train))
    add_pred_val_lgb = np.zeros(len(X_train))
    # Test predictions, averaged across folds (log1p space).
    add_pred_test_cat = np.zeros(len(x_test))
    add_pred_test_xgb = np.zeros(len(x_test))
    add_pred_test_lgb = np.zeros(len(x_test))
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    for fold, (train_index, val_index) in enumerate(kf.split(X_train, Y_train)):
        print(f"\n---Fold {fold + 1}/{kf.n_splits} ---")
        x_train, x_val = X_train.iloc[train_index], X_train.iloc[val_index]
        # Y_train is already log1p(Calories). Applying log1p again here made
        # the models learn log1p(log1p(y)) and the reported fold scores
        # meaningless, so the fold targets are used as they are.
        y_train, y_val = Y_train.iloc[train_index], Y_train.iloc[val_index]

        # --- CatBoost training and prediction ---
        print(" ➡ Training CatBoost...")
        model_cat.fit(x_train, y_train,
                      eval_set=[(x_val, y_val)],
                      verbose=0  # set to 100 to see progress
                      )
        pred_val_cat = model_cat.predict(x_val)
        pred_test_cat = model_cat.predict(x_test)

        # --- XGBoost training and prediction ---
        print(" ➡ Training XGBoost...")
        model_xgb.fit(x_train, y_train,
                      eval_set=[(x_val, y_val)],
                      verbose=0  # set to 100 to see progress
                      )
        pred_val_xgb = model_xgb.predict(x_val)
        pred_test_xgb = model_xgb.predict(x_test)

        # --- LightGBM training and prediction ---
        print(" ➡ Training LGBM...")
        model_lgb.fit(x_train, y_train,
                      eval_set=[(x_val, y_val)],
                      )
        pred_val_lgb = model_lgb.predict(x_val)
        pred_test_lgb = model_lgb.predict(x_test)

        # --- Store OOF and test predictions (kept in log1p space) ---
        add_pred_val_cat[val_index] = pred_val_cat
        add_pred_val_xgb[val_index] = pred_val_xgb
        add_pred_val_lgb[val_index] = pred_val_lgb
        add_pred_test_cat += pred_test_cat / kf.n_splits
        add_pred_test_xgb += pred_test_xgb / kf.n_splits
        add_pred_test_lgb += pred_test_lgb / kf.n_splits

        # log1p of a non-negative target is never negative; clamp stray
        # negative OOF predictions to 0.
        np.clip(add_pred_val_cat, 0, None, out=add_pred_val_cat)
        np.clip(add_pred_val_xgb, 0, None, out=add_pred_val_xgb)
        np.clip(add_pred_val_lgb, 0, None, out=add_pred_val_lgb)

        # RMSE between log1p targets and log1p-space predictions is the RMSLE
        # on the original Calories scale.
        print(f" CatBoost RMSLE (Fold {fold + 1}): {mean_squared_error(y_val, add_pred_val_cat[val_index]) ** 0.5:.4f}")
        print(f" XGBoost RMSLE (Fold {fold + 1}): {mean_squared_error(y_val, add_pred_val_xgb[val_index]) ** 0.5:.4f}")
        print(f" LGBM RMSLE (Fold {fold + 1}): {mean_squared_error(y_val, add_pred_val_lgb[val_index]) ** 0.5:.4f}")

    # ---- Meta model: Ridge on the base-model predictions (log1p space) ----
    x_meta_train = pd.DataFrame({
        'cat_pred': add_pred_val_cat,
        'xgb_pred': add_pred_val_xgb,
        'lgbm_pred': add_pred_val_lgb,
    })
    y_meta_train = Y_train
    x_meta_test = pd.DataFrame({
        'cat_pred': add_pred_test_cat,
        'xgb_pred': add_pred_test_xgb,
        'lgbm_pred': add_pred_test_lgb,
    })
    model_meta = Ridge(random_state=42)
    model_meta.fit(x_meta_train, y_meta_train)
    # In-sample score of the meta model, again as RMSE in log1p space.
    meta_fit_pred = model_meta.predict(x_meta_train)
    print(f" meta RMSLE :{mean_squared_error(y_meta_train, meta_fit_pred) ** 0.5:.4f}")

    # Back to the original Calories scale for the submission.
    pred_all = np.expm1(model_meta.predict(x_meta_test))
    submission = pd.DataFrame({
        'id': df_test['id'],
        'Calories': pred_all
    })
    # Clamp each prediction into [Duration, 20 * Duration].
    submission['Calories'] = np.clip(submission['Calories'], a_min=df_test['Duration'], a_max=20*df_test['Duration'])
    submission.to_csv('submission.csv', index=False)