下游任务的两阶段(two-stage)处理
下载数据并预处理的程序:
# -------------------------------
# Import required libraries
# -------------------------------
import numpy as np
import torch
from benchpots.datasets import preprocess_physionet2012
from pypots.imputation import SAITS
from pypots.optim import Adam
from pypots.nn.functional import calc_mse
from pypots.data.saving import pickle_dump
# -------------------------------
# Device configuration: use CUDA when available, otherwise fall back to CPU
# -------------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# -------------------------------
# Preprocess the PhysioNet-2012 dataset: benchpots masks out 10% ("rate")
# of the observed values with the "point" pattern to serve as imputation
# ground truth (see the [INFO] log lines it prints).
# -------------------------------
physionet2012_dataset = preprocess_physionet2012(
    subset="set-a",
    pattern="point",
    rate=0.1,
)
print(physionet2012_dataset.keys())
# -------------------------------
# Build the indicating mask and replace NaNs.
# The XOR of the two NaN masks is True exactly at positions that are
# observed in test_X_ori but artificially masked out in test_X — i.e. the
# held-out ground-truth entries used for evaluation below.
# -------------------------------
physionet2012_dataset["test_X_indicating_mask"] = (
    np.isnan(physionet2012_dataset["test_X"]) ^
    np.isnan(physionet2012_dataset["test_X_ori"])
)
# Replace remaining NaNs with 0 so calc_mse receives finite values;
# positions outside the indicating mask are excluded by the masked MSE.
physionet2012_dataset["test_X_ori"] = np.nan_to_num(physionet2012_dataset["test_X_ori"])
# -------------------------------
# Assemble train / val / test sets in the dict format PyPOTS expects
# -------------------------------
train_set = {"X": physionet2012_dataset["train_X"]}
val_set = {
    "X": physionet2012_dataset["val_X"],
    "X_ori": physionet2012_dataset["val_X_ori"],
}
test_set = {
    "X": physionet2012_dataset["test_X"],
    "X_ori": physionet2012_dataset["test_X_ori"],
}
# -------------------------------
# Initialize the SAITS imputation model
# -------------------------------
saits = SAITS(
    n_steps=physionet2012_dataset['n_steps'],
    n_features=physionet2012_dataset['n_features'],
    n_layers=3,
    d_model=64,
    n_heads=4,
    d_k=16,
    d_v=16,
    d_ffn=128,
    dropout=0.1,
    ORT_weight=1,  # weight of the Observed Reconstruction Task loss
    MIT_weight=1,  # weight of the Masked Imputation Task loss
    batch_size=32,
    epochs=10,
    patience=3,  # early-stopping patience (epochs without val improvement)
    optimizer=Adam(lr=1e-3),
    num_workers=0,
    device=DEVICE,
    saving_path="result_saving/imputation/saits",
    model_saving_strategy="best",  # keep only the best checkpoint on disk
)
# -------------------------------
# Train the model (the validation set drives model selection)
# -------------------------------
saits.fit(train_set, val_set)
# -------------------------------
# Predict on the test set and evaluate MSE on the held-out entries only
# -------------------------------
test_set_imputation_results = saits.predict(test_set)
test_MSE = calc_mse(
    test_set_imputation_results["imputation"],
    physionet2012_dataset["test_X_ori"],
    physionet2012_dataset["test_X_indicating_mask"],
)
print(f"SAITS test_MSE: {test_MSE}")
# -------------------------------
# Impute all three splits and save the results together with the labels
# for the downstream (second-stage) classification script
# -------------------------------
train_set_imputation = saits.impute(train_set)
val_set_imputation = saits.impute(val_set)
test_set_imputation = test_set_imputation_results["imputation"]
dict_to_save = {
    'train_set_imputation': train_set_imputation,
    'train_set_labels': physionet2012_dataset['train_y'],
    'val_set_imputation': val_set_imputation,
    'val_set_labels': physionet2012_dataset['val_y'],
    'test_set_imputation': test_set_imputation,
    'test_set_labels': physionet2012_dataset['test_y'],
}
pickle_dump(dict_to_save, "result_saving/imputed_physionet2012.pkl")
结果:
████████╗██╗███╗ ███╗███████╗ ███████╗███████╗██████╗ ██╗███████╗███████╗ █████╗ ██╗
╚══██╔══╝██║████╗ ████║██╔════╝ ██╔════╝██╔════╝██╔══██╗██║██╔════╝██╔════╝ ██╔══██╗██║
██║ ██║██╔████╔██║█████╗█████╗███████╗█████╗ ██████╔╝██║█████╗ ███████╗ ███████║██║
██║ ██║██║╚██╔╝██║██╔══╝╚════╝╚════██║██╔══╝ ██╔══██╗██║██╔══╝ ╚════██║ ██╔══██║██║
██║ ██║██║ ╚═╝ ██║███████╗ ███████║███████╗██║ ██║██║███████╗███████║██╗██║ ██║██║
╚═╝ ╚═╝╚═╝ ╚═╝╚══════╝ ╚══════╝╚══════╝╚═╝ ╚═╝╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═╝╚═╝
ai4ts v0.0.3 - building AI for unified time-series analysis, https://time-series.ai
2025-05-17 22:48:33 [INFO]: You're using dataset physionet_2012, please cite it properly in your work. You can find its reference information at the below link:
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/physionet_2012
2025-05-17 22:48:33 [INFO]: Dataset physionet_2012 has already been downloaded. Processing directly...
2025-05-17 22:48:33 [INFO]: Dataset physionet_2012 has already been cached. Loading from cache directly...
2025-05-17 22:48:33 [INFO]: Loaded successfully!
2025-05-17 22:48:40 [WARNING]: Note that physionet_2012 has sparse observations in the time series, hence we don't add additional missing values to the training dataset.
2025-05-17 22:48:40 [INFO]: 23126 values masked out in the val set as ground truth, take 10.03% of the original observed values
2025-05-17 22:48:40 [INFO]: 28703 values masked out in the test set as ground truth, take 10.00% of the original observed values
2025-05-17 22:48:40 [INFO]: Total sample number: 3997
2025-05-17 22:48:40 [INFO]: Training set size: 2557 (63.97%)
2025-05-17 22:48:40 [INFO]: Validation set size: 640 (16.01%)
2025-05-17 22:48:40 [INFO]: Test set size: 800 (20.02%)
2025-05-17 22:48:40 [INFO]: Number of steps: 48
2025-05-17 22:48:40 [INFO]: Number of features: 37
2025-05-17 22:48:40 [INFO]: Train set missing rate: 79.70%
2025-05-17 22:48:40 [INFO]: Validating set missing rate: 81.74%
2025-05-17 22:48:40 [INFO]: Test set missing rate: 81.82%
dict_keys(['n_classes', 'n_steps', 'n_features', 'scaler', 'train_X', 'train_y', 'train_ICUType', 'val_X', 'val_y', 'val_ICUType', 'test_X', 'test_y', 'test_ICUType', 'val_X_ori', 'test_X_ori'])
2025-05-17 22:48:40 [INFO]: Using the given device: cpu
2025-05-17 22:48:40 [INFO]: Model files will be saved to result_saving/imputation/saits\20250517_T224840
2025-05-17 22:48:40 [INFO]: Tensorboard file will be saved to result_saving/imputation/saits\20250517_T224840\tensorboard
2025-05-17 22:48:40 [INFO]: Using customized MAE as the training loss function.
2025-05-17 22:48:40 [INFO]: Using customized MSE as the validation metric function.
2025-05-17 22:48:40 [INFO]: SAITS initialized with the given hyperparameters, the number of trainable parameters: 218,294
2025-05-17 22:48:46 [INFO]: Epoch 001 - training loss (MAE): 1.0853, validation MSE: 1.1169
2025-05-17 22:48:50 [INFO]: Epoch 002 - training loss (MAE): 0.7760, validation MSE: 1.0304
2025-05-17 22:48:54 [INFO]: Epoch 003 - training loss (MAE): 0.6874, validation MSE: 1.0108
2025-05-17 22:48:59 [INFO]: Epoch 004 - training loss (MAE): 0.6466, validation MSE: 0.9866
2025-05-17 22:49:03 [INFO]: Epoch 005 - training loss (MAE): 0.6152, validation MSE: 0.9774
2025-05-17 22:49:08 [INFO]: Epoch 006 - training loss (MAE): 0.5928, validation MSE: 0.9666
2025-05-17 22:49:12 [INFO]: Epoch 007 - training loss (MAE): 0.5806, validation MSE: 0.9617
2025-05-17 22:49:17 [INFO]: Epoch 008 - training loss (MAE): 0.5676, validation MSE: 0.9548
2025-05-17 22:49:21 [INFO]: Epoch 009 - training loss (MAE): 0.5569, validation MSE: 0.9529
2025-05-17 22:49:25 [INFO]: Epoch 010 - training loss (MAE): 0.5460, validation MSE: 0.9478
2025-05-17 22:49:25 [INFO]: Finished training. The best model is from epoch#10.
2025-05-17 22:49:25 [INFO]: Saved the model to result_saving/imputation/saits\20250517_T224840\SAITS.pypots
SAITS test_MSE: 0.38155692078597747
2025-05-17 22:49:27 [INFO]: Successfully saved to result_saving/imputed_physionet2012.pkl
(pypots-env) PS D:\Projects\pypots-experiments>
两阶段处理的脚本程序:
# 导入必要库
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from copy import deepcopy
from pypots.data.saving import pickle_load
from pypots.nn.functional.classification import calc_binary_classification_metrics
# Device for the downstream classifier.
# NOTE(review): hard-coded to CPU here, while the imputation script
# auto-detects CUDA — confirm this is intentional.
DEVICE = "cpu"
# Dataset wrapper pairing imputed samples with their labels.
class LoadImputedDataAndLabel(Dataset):
    """Torch Dataset over pre-imputed time-series arrays and class labels."""

    def __init__(self, imputed_data, labels):
        self.imputed_data = imputed_data
        self.labels = labels

    def __len__(self):
        # One sample per label.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = torch.from_numpy(self.imputed_data[idx]).to(torch.float32)
        label = torch.tensor(self.labels[idx]).to(torch.long)
        return sample, label
# Simple LSTM-based classifier over (batch, steps, features) input.
class ClassificationLSTM(torch.nn.Module):
    """Single-layer LSTM followed by a linear head; emits per-class sigmoid scores.

    NOTE(review): the training loop in this script feeds this sigmoid output
    straight into F.cross_entropy, which expects raw logits — confirm that
    this combination is intentional.
    """

    def __init__(self, n_features, rnn_hidden_size, n_classes):
        super().__init__()
        self.rnn = torch.nn.LSTM(
            n_features,
            hidden_size=rnn_hidden_size,
            batch_first=True,
        )
        self.fcn = torch.nn.Linear(rnn_hidden_size, n_classes)

    def forward(self, data):
        # h_n is the final hidden state — identical to taking the last time
        # step of the output sequence for this single-layer, unidirectional LSTM.
        _, (h_n, _) = self.rnn(data)
        logits = self.fcn(h_n[-1])
        return torch.sigmoid(logits)
# Build train/val/test DataLoaders from imputed arrays and labels.
def get_dataloaders(train_X, train_y, val_X, val_y, test_X, test_y, batch_size=128):
    """Return (train, val, test) DataLoaders; only the training split is shuffled."""
    splits = (
        (train_X, train_y, True),
        (val_X, val_y, False),
        (test_X, test_y, False),
    )
    loaders = tuple(
        DataLoader(LoadImputedDataAndLabel(X, y), batch_size, shuffle=shuffle)
        for X, y, shuffle in splits
    )
    return loaders
# Training loop with early stopping, then inference on the test set.
def train(model, train_dataloader, val_dataloader, test_loader, n_epochs=20, patience=5):
    """Train `model` with Adam + early stopping, then predict on the test set.

    Parameters
    ----------
    model : torch.nn.Module
        Classifier mapping a batch tensor to per-class scores.
    train_dataloader, val_dataloader, test_loader : DataLoader
        Loaders yielding (X, y) pairs on each iteration.
    n_epochs : int
        Maximum number of training epochs (default 20, as before).
    patience : int
        Early-stopping patience: epochs without validation improvement.

    Returns
    -------
    np.ndarray
        Model outputs for every test sample, in loader order.
    """
    optimizer = torch.optim.Adam(model.parameters(), 1e-3)
    current_patience = patience
    best_loss = float("inf")
    # Fix: initialize best_model so load_state_dict below can never hit an
    # unbound name (previously a NaN validation loss would leave it unset).
    best_model = deepcopy(model.state_dict())
    for epoch in range(n_epochs):
        model.train()
        for data in train_dataloader:
            X, y = (t.to(DEVICE) for t in data)
            optimizer.zero_grad()
            probabilities = model(X)
            # NOTE(review): F.cross_entropy expects raw logits, but the LSTM
            # classifier in this script outputs sigmoid scores — confirm this
            # combination is intentional.
            loss = F.cross_entropy(probabilities, y.reshape(-1))
            loss.backward()
            optimizer.step()
        # Validation pass: mean cross-entropy over the validation loader.
        model.eval()
        loss_collector = []
        with torch.no_grad():
            for data in val_dataloader:
                X, y = (t.to(DEVICE) for t in data)
                probabilities = model(X)
                loss = F.cross_entropy(probabilities, y.reshape(-1))
                loss_collector.append(loss.item())
        loss = np.asarray(loss_collector).mean()
        # Early-stopping bookkeeping: keep the best checkpoint seen so far.
        if best_loss > loss:
            current_patience = patience
            best_loss = loss
            best_model = deepcopy(model.state_dict())
        else:
            current_patience -= 1
        if current_patience == 0:
            break
    # Restore the best checkpoint before test-set inference.
    model.load_state_dict(best_model)
    model.eval()
    probability_collector = []
    # Fix: run inference under no_grad — the original built a useless
    # autograd graph for every test batch.
    with torch.no_grad():
        for data in test_loader:
            X, y = (t.to(DEVICE) for t in data)
            probabilities = model(X)
            probability_collector += probabilities.cpu().tolist()
    return np.asarray(probability_collector)
# Load the imputed data saved by the first-stage (imputation) script
data = pickle_load('result_saving/imputed_physionet2012.pkl')
train_X, val_X, test_X = data['train_set_imputation'], data['val_set_imputation'], data['test_set_imputation']
train_y, val_y, test_y = data['train_set_labels'], data['val_set_labels'], data['test_set_labels']
# Build the train/val/test DataLoaders
train_loader, val_loader, test_loader = get_dataloaders(train_X, train_y, val_X, val_y, test_X, test_y)
# Initialize and train the classifier.
# NOTE(review): n_features=37 must match the feature count produced by the
# first-stage preprocessing — confirm if the upstream dataset changes.
rnn_classifier = ClassificationLSTM(n_features=37, rnn_hidden_size=128, n_classes=2)
proba_predictions = train(rnn_classifier, train_loader, val_loader, test_loader)
# Report the test-set class balance and classification performance
pos_num = test_y.sum()
neg_num = len(test_y) - pos_num
print(f'test_set中的正负样本比例为{pos_num}:{neg_num}, 正样本占比为{pos_num/len(test_y):.4f}\n')
classification_metrics = calc_binary_classification_metrics(proba_predictions, test_y)
print(f"LSTM在测试集上的ROC-AUC为: {classification_metrics['roc_auc']:.4f}")
print(f"LSTM在测试集上的PR-AUC为: {classification_metrics['pr_auc']:.4f}")
结果如下: