太阳黑子数量 (Sunspots)
- 数据描述: 每年观测到的太阳黑子数量。
- 无明显趋势: 长期来看,数据没有持续的上升或下降趋势。
- 强周期性 (Cyclical): 数据呈现非常明显的周期性波动,大约每11年一个周期。注意,这与“季节性”不同,季节性周期是固定的(如12个月),而这里的周期长度是近似的。
- 相对平稳: 经过检验,数据通常被认为是平稳或近似平稳的。
非常适合用来理解ARMA模型。由于数据本身比较平稳,不需要差分,可以专注于用ACF和PACF图来确定 p 和 q 的值。
加载数据及预处理
import pandas as pd
from statsmodels.datasets import sunspots
# 加载数据
data = sunspots.load_pandas().data
data['YEAR'] = pd.to_datetime(data['YEAR'], format='%Y')
data.set_index('YEAR', inplace=True)
data = data['SUNACTIVITY'] # 假设列名为'SUNACTIVITY'
# 检查缺失值
print("缺失值数量:", data.isnull().sum())
数据可视化
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
data.plot(title='Sunspots Activity Over Time')
plt.xlabel('Year')
plt.ylabel('Sunspot Count')
plt.show()
平稳性检验
from statsmodels.tsa.stattools import adfuller
# ADF检验
def adf_test(series):
result = adfuller(series)
print(f'ADF Statistic: {result[0]}')
print(f'p-value: {result[1]}')
if result[1] <= 0.05:
print("序列平稳")
else:
print("序列非平稳")
adf_test(data)
# 若非平稳,进行差分
data_diff = data.diff().dropna()
adf_test(data_diff) # 重复直到平稳,确定d值
确定ARIMA参数 (p, d, q)
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# 绘制差分后的ACF和PACF
plot_acf(data_diff, lags=20)
plot_pacf(data_diff, lags=20)
plt.show()
# 自动选择参数(需安装pmdarima)
from pmdarima import auto_arima
model = auto_arima(data, seasonal=False, trace=True)
print(f'Best ARIMA Order: {model.order}') # 输出(p, d, q)
拟合ARIMA模型
from statsmodels.tsa.arima.model import ARIMA
# 手动设置参数示例:ARIMA(2,1,2)
model = ARIMA(data, order=(2,1,2))
results = model.fit()
print(results.summary())
模型诊断
# 残差检查
residuals = results.resid
# 残差ACF/PACF
plot_acf(residuals, lags=20)
plot_pacf(residuals, lags=20)
plt.show()
# Ljung-Box检验(残差是否为白噪声)
from statsmodels.stats.diagnostic import acorr_ljungbox
lb_test = acorr_ljungbox(residuals, lags=20)
print(lb_test)
预测未来值
# 预测未来5年
forecast_steps = 5
forecast = results.get_forecast(steps=forecast_steps)
forecast_mean = forecast.predicted_mean
conf_int = forecast.conf_int()
# 可视化结果
plt.figure(figsize=(10,6))
plt.plot(data, label='Observed')
plt.plot(results.fittedvalues, color='red', label='Fitted')
plt.plot(forecast_mean.index, forecast_mean, color='green', label='Forecast')
plt.fill_between(conf_int.index, conf_int.iloc[:,0], conf_int.iloc[:,1], color='pink', alpha=0.3)
plt.title('Sunspots Forecast with ARIMA')
plt.legend()
plt.show()
模型评估
# 计算均方根误差(需划分训练集/测试集)
from sklearn.metrics import mean_squared_error
import numpy as np
train_size = int(len(data) * 0.8)
train, test = data[:train_size], data[train_size:]
model = ARIMA(train, order=(2,1,2)).fit()
forecast = model.get_forecast(steps=len(test))
rmse = np.sqrt(mean_squared_error(test, forecast.predicted_mean))
print(f'RMSE: {rmse}')