1. 基于滑动窗口的中值绝对偏差(MAD)方法
/**
 * Replaces samples that lie too far from their local median, using a
 * sliding-window median absolute deviation (MAD) test.
 *
 * <p>For every index the window spans {@code windowSize / 2} neighbours on each
 * side, clipped to the array bounds. A sample is overwritten with the window
 * median when the window's MAD is non-zero and the sample's distance from that
 * median is greater than {@code threshold * MAD}. Detection always reads the
 * original samples; the input array is not modified.
 *
 * @param data       samples to filter
 * @param windowSize nominal window width; only {@code windowSize / 2} is used
 * @param threshold  multiple of the MAD beyond which a sample is replaced
 * @return a new array of the same length with the flagged samples replaced
 */
public static double[] removeContinuousOutliersMAD(double[] data, int windowSize, double threshold) {
    final int n = data.length;
    final int half = windowSize / 2;
    double[] filtered = Arrays.copyOf(data, n);

    for (int center = 0; center < n; center++) {
        int from = Math.max(0, center - half);
        int to = Math.min(n - 1, center + half);

        // Collect the neighbourhood and take its median.
        List<Double> window = new ArrayList<>();
        for (int k = from; k <= to; k++) {
            window.add(data[k]);
        }
        double median = getMedian(window);

        // MAD: the median of the absolute distances to the window median.
        List<Double> deviations = new ArrayList<>();
        for (Double sample : window) {
            deviations.add(Math.abs(sample - median));
        }
        double mad = getMedian(deviations);

        // A zero MAD gives no usable scale, so such windows never flag anything.
        if (mad == 0) {
            continue;
        }
        double distance = Math.abs(data[center] - median);
        if (distance > threshold * mad) {
            filtered[center] = median; // a neighbouring value could be used instead
        }
    }
    return filtered;
}
/**
 * Returns the median of the given values.
 *
 * <p>For an odd number of values this is the middle element of the sorted
 * order; for an even number it is the mean of the two middle elements. The
 * argument is left untouched: sorting is done on a private copy.
 *
 * @param list values to take the median of; must contain at least one element
 * @return the median value
 * @throws IllegalArgumentException if {@code list} is {@code null} or empty
 */
private static double getMedian(List<Double> list) {
    if (list == null || list.isEmpty()) {
        throw new IllegalArgumentException("Cannot compute the median of an empty list");
    }
    // Sort a copy so the caller's list keeps its original order.
    List<Double> sorted = new ArrayList<>(list);
    Collections.sort(sorted);

    int mid = sorted.size() / 2;
    if (sorted.size() % 2 == 1) {
        return sorted.get(mid);
    }
    // Even count: there is no single middle element, so average the two central ones.
    return (sorted.get(mid - 1) + sorted.get(mid)) / 2.0;
}
2. 基于连续异常值计数的剔除方法
/**
 * Replaces outliers that occur late in a run of consecutive outliers.
 *
 * <p>A sample counts as an outlier when its distance from the mean of the whole
 * array exceeds {@code threshold} times the standard deviation of the whole
 * array. Consecutive outliers are counted; once the count within a run is
 * greater than {@code maxConsecutive}, each further outlier in that run is
 * overwritten with the value from {@code findReplacementValue}. The first
 * {@code maxConsecutive} outliers of a run are kept as they are. The input
 * array is not modified.
 *
 * @param data           samples to filter
 * @param threshold      number of standard deviations that marks an outlier
 * @param maxConsecutive run length that must be exceeded before replacing
 * @return a new array of the same length with the selected outliers replaced
 */
public static double[] removeContinuousOutliers(double[] data, double threshold, int maxConsecutive) {
    double[] filtered = Arrays.copyOf(data, data.length);
    double mean = calculateMean(data);
    double stdDev = calculateStdDev(data, mean);
    double limit = threshold * stdDev;

    int run = 0;
    for (int idx = 0; idx < data.length; idx++) {
        boolean outlier = Math.abs(data[idx] - mean) > limit;
        if (!outlier) {
            // A normal sample ends the current run.
            run = 0;
            continue;
        }
        run++;
        if (run > maxConsecutive) {
            // Replace using the surrounding non-outlier samples.
            filtered[idx] = findReplacementValue(data, idx);
        }
    }
    return filtered;
}
/**
 * Computes a replacement for the sample at {@code index} from its nearest
 * non-outlier neighbours.
 *
 * <p>A neighbour qualifies when it lies within one standard deviation of the
 * mean of the whole array (this band is fixed and independent of any threshold
 * the caller used for detection). The nearest qualifying sample is searched on
 * each side:
 * <ul>
 *   <li>both found: their average is returned;</li>
 *   <li>only one found: that value is returned;</li>
 *   <li>none found: the original sample is returned unchanged.</li>
 * </ul>
 *
 * @param data  the full sample array
 * @param index position of the sample to replace
 * @return the replacement value
 */
private static double findReplacementValue(double[] data, int index) {
    // The statistics do not change during the search, so compute them once.
    final double mean = calculateMean(data);
    final double stdDev = calculateStdDev(data, mean);

    // Explicit flags instead of a 0 sentinel: 0 may be a legitimate sample value,
    // and a missing neighbour must not be averaged in as if it were 0.
    boolean hasPrev = false;
    double prev = 0;
    for (int i = index - 1; i >= 0; i--) {
        if (Math.abs(data[i] - mean) <= stdDev) {
            prev = data[i];
            hasPrev = true;
            break;
        }
    }

    boolean hasNext = false;
    double next = 0;
    for (int i = index + 1; i < data.length; i++) {
        if (Math.abs(data[i] - mean) <= stdDev) {
            next = data[i];
            hasNext = true;
            break;
        }
    }

    if (hasPrev && hasNext) {
        return (prev + next) / 2.0;
    }
    if (hasPrev) {
        return prev;
    }
    if (hasNext) {
        return next;
    }
    // No trustworthy neighbour anywhere: leave the sample as it is.
    return data[index];
}
3. 使用指数加权移动平均(EWMA)检测连续异常
/**
 * Smooths runs of anomalies using an exponentially weighted moving average.
 *
 * <p>The EWMA starts at the first sample and is updated with every following
 * sample as {@code lambda * x + (1 - lambda) * ewma}. A sample is anomalous
 * when its absolute distance from the just-updated EWMA exceeds
 * {@code threshold}. From the third consecutive anomalous sample onward, the
 * sample is replaced by the current EWMA value. The input array is not
 * modified; an empty input yields an empty result.
 *
 * @param data      samples to filter
 * @param lambda    weight given to the newest sample in the EWMA update
 * @param threshold absolute residual above which a sample is anomalous
 * @return a new array of the same length with the selected samples replaced
 */
public static double[] detectContinuousAnomaliesEWMA(double[] data, double lambda, double threshold) {
    double[] filtered = Arrays.copyOf(data, data.length);
    if (data.length == 0) {
        // Nothing to seed the average with; reading data[0] would throw.
        return filtered;
    }

    // Number of consecutive anomalous samples required before replacing.
    final int minStreak = 3;

    double ewma = data[0];
    int anomalyStreak = 0;
    for (int i = 1; i < data.length; i++) {
        ewma = lambda * data[i] + (1 - lambda) * ewma;
        double residual = Math.abs(data[i] - ewma);
        if (residual > threshold) {
            anomalyStreak++;
            if (anomalyStreak >= minStreak) {
                filtered[i] = ewma;
            }
        } else {
            anomalyStreak = 0;
        }
    }
    return filtered;
}
4. 基于变化率的连续异常检测
/**
 * Smooths consecutive spikes detected from the point-to-point change rate.
 *
 * <p>The change rate is the absolute difference between adjacent samples. A
 * rate is "large" when it exceeds the mean rate plus {@code rateThreshold}
 * times the standard deviation of all rates. An interior sample is a spike
 * candidate when both the rate into it and the rate out of it are large. From
 * the second consecutive candidate onward, the sample is replaced by the
 * average of its two original neighbours. The first and last samples are never
 * replaced, and the input array is not modified. Inputs with fewer than three
 * samples have no interior point and are returned as an unchanged copy.
 *
 * @param data          samples to filter
 * @param rateThreshold number of standard deviations above the mean rate that
 *                      marks a rate as large
 * @return a new array of the same length with the selected samples replaced
 */
public static double[] removeContinuousSpikes(double[] data, double rateThreshold) {
    double[] filtered = Arrays.copyOf(data, data.length);
    if (data.length < 3) {
        // No interior samples; also avoids allocating a negative-length rate array.
        return filtered;
    }

    // rates[i] is the absolute change from data[i] to data[i + 1].
    double[] rates = new double[data.length - 1];
    for (int i = 0; i < rates.length; i++) {
        rates[i] = Math.abs(data[i + 1] - data[i]);
    }

    double rateMean = calculateMean(rates);
    double rateStd = calculateStdDev(rates, rateMean);
    double limit = rateMean + rateThreshold * rateStd;

    int spikeLength = 0;
    for (int i = 1; i < data.length - 1; i++) {
        double prevRate = rates[i - 1];
        double nextRate = rates[i];
        if (prevRate > limit && nextRate > limit) {
            spikeLength++;
            if (spikeLength >= 2) {
                // Two candidates in a row: bridge over this one with its neighbours.
                filtered[i] = (data[i - 1] + data[i + 1]) / 2.0;
            }
        } else {
            spikeLength = 0;
        }
    }
    return filtered;
}
辅助方法
/**
 * Returns the arithmetic mean of the given samples.
 *
 * <p>The samples are added in array order and the total is divided by the
 * array length, so an empty array produces {@code NaN}.
 *
 * @param data samples to average
 * @return the sum of the samples divided by their count
 */
private static double calculateMean(double[] data) {
    double total = 0;
    for (int i = 0; i < data.length; i++) {
        total += data[i];
    }
    return total / data.length;
}
/**
 * Returns the population standard deviation of the samples around a given mean.
 *
 * <p>The squared distances from {@code mean} are summed in array order, divided
 * by the number of samples (not {@code n - 1}), and the square root is taken.
 *
 * @param data samples to measure
 * @param mean centre value the deviations are measured from
 * @return the square root of the mean squared deviation
 */
private static double calculateStdDev(double[] data, double mean) {
    double squaredSum = 0;
    for (int i = 0; i < data.length; i++) {
        double offset = data[i] - mean;
        squaredSum += Math.pow(offset, 2);
    }
    double meanSquare = squaredSum / data.length;
    return Math.sqrt(meanSquare);
}
测试:
/**
 * Demo entry point: runs the MAD, consecutive-count and EWMA filters on one
 * sample series and prints the original series followed by each result.
 *
 * @param args unused
 */
public static void main(String[] args) {
    double[] data = {10, 10.1, 10.2, 50, 55, 52, 10.3, 10.2, 10.1, 60, 65, 10};

    // Run every filter first, then print, so output follows all computation.
    double[] byMad = removeContinuousOutliersMAD(data, 5, 3.0);
    double[] byCount = removeContinuousOutliers(data, 2.5, 2);
    double[] byEwma = detectContinuousAnomaliesEWMA(data, 0.2, 3.0);

    String[] labels = {"原始数据: ", "MAD方法: ", "连续计数方法: ", "EWMA方法: "};
    double[][] series = {data, byMad, byCount, byEwma};
    for (int k = 0; k < labels.length; k++) {
        System.out.println(labels[k] + Arrays.toString(series[k]));
    }
}
方法选择建议
MAD方法:对非正态分布数据更鲁棒,适合数据分布未知的情况
连续计数方法:适合已知异常值最大连续长度的情况
EWMA方法:适合时间序列数据,对缓慢变化的异常更敏感
变化率方法:适合检测数据中突然的连续跳跃
对于特别长的连续异常,可能需要结合领域知识或更复杂的算法,如基于机器学习的方法。