本文采用小波包分解和随机森林分类器对uOttawa轴承数据集进行分类,比较简单,直接看代码就可以看懂,并可迁移至其他的一维数据集,比如心电信号,肌电信号,脑电信号,微振信号,各种声信号等等,顺便把python学一下,结合自己的领域学python能有效避免劝退。
数据集分为5类,分别为健康状态,内圈故障,外圈故障,滚动体故障和复合故障
其中复合故障数据文件CompF内容如下
健康工况数据文件Healthy内容如下
首先导入相关的信号处理模块,若没有小波模块pywt,要首先 pip install PyWavelets(注意:安装包名为 PyWavelets,导入名才是 pywt)
import glob
from scipy.io import loadmat
from numpy import asarray
import matplotlib.pyplot as plt
import numpy as np
from scipy import signal
import scipy
import re
import os
import pandas as pd
import pywt
from scipy.fftpack import fft
from warnings import warn
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')
然后定义个FFT函数便于以后的特征提取(频谱中较高幅值及其对应的频率)
def apply_fft(x, fs, num_samples):
    """Return the one-sided amplitude spectrum of *x*.

    Parameters
    ----------
    x : array_like
        Time-domain signal of length ``num_samples``.
    fs : float
        Sampling frequency in Hz.
    num_samples : int
        Number of samples used for the FFT.

    Returns
    -------
    (ndarray, ndarray)
        Frequency axis in Hz and the scaled one-sided magnitudes, each of
        length ``num_samples // 2``.
    """
    # Bug fix: FFT bin k sits at k * fs / num_samples. The original
    # np.linspace(0.0, fs/2.0, num_samples//2) included the endpoint fs/2,
    # which belongs to bin num_samples//2 (not returned), so every bin's
    # frequency was slightly stretched.
    f = np.arange(num_samples // 2) * (fs / num_samples)
    freq_values = fft(x)
    # Two-sided -> one-sided amplitude scaling. NOTE: this also doubles the
    # DC bin, matching the original implementation.
    freq_values = 2.0 / num_samples * np.abs(freq_values[0:num_samples // 2])
    return f, freq_values
然后定义一个从原始振动信号创建数据集的函数,该函数处理.mat 振动数据文件并对振动信号进行分割
def make_dataset(data_src, num_samples, class_):
    """Build a segmented dataset from all .mat files matching *data_src*.

    Each file's vibration signal is cut into non-overlapping windows of
    ``num_samples`` points; windows from every file are stacked, duplicate
    rows are dropped, and the result is shuffled in place.

    Parameters
    ----------
    data_src : str
        Glob pattern pointing at the .mat files of one fault class.
    num_samples : int
        Window (segment) length in samples.
    class_ : int
        Numeric label assigned to every segment of this class.

    Returns
    -------
    (ndarray, ndarray)
        ``(segments, labels)``: segments has shape
        ``(n_segments, num_samples)``; labels is ``n_segments`` copies of
        ``class_``.
    """
    files = np.sort(glob.glob(data_src))
    # Same per-file logic for every file — the original duplicated the whole
    # load/segment sequence for files[0] before looping over the rest.
    segmented_data = np.concatenate(
        [_segment_mat_file(file, num_samples) for file in files], axis=0
    )
    segmented_data = np.unique(segmented_data, axis=0)  # drop duplicate rows
    np.random.shuffle(segmented_data)                   # randomize row order
    labels = np.ones(len(segmented_data)) * class_
    return segmented_data, labels


def _segment_mat_file(path, num_samples):
    """Load one .mat file and reshape its signal into full-length windows."""
    data = loadmat(path)
    # NOTE(review): assumes the signal's key sorts before loadmat's
    # '__header__'-style metadata keys (true for keys starting with an
    # uppercase letter, e.g. 'Channel_1') — confirm for each dataset.
    key = sorted(data.keys())[0]
    signal_1d = data[key].reshape(-1)
    num_segments = len(signal_1d) // num_samples
    # A direct reshape is exactly equivalent to the original
    # np.split(..., num_samples) followed by reshape, without the detour
    # (and without the 'silces' typo variable).
    return signal_1d[:num_segments * num_samples].reshape(num_segments, num_samples)
对振动信号进行分组以生成数据集,下载数据集后,振动信号将根据其运行条件/属性*分组在 5 个文件夹中(对应于数据集中的运行类别数:1 个正常类别和 4 个故障类别),文件夹名称如下:
#Healthy
#IR
#OR
#BF
#CompF
#*IR = Inner Race fault
#*OR = Outer Race faults
#*BF = Ball faults
#*CompF = Combination of faults
# ---------- dataset configuration ----------
num_samples = 40000        # segment (input window) length in samples
fs = 200000                # sampling frequency in Hz
data_path = r"D:\dataset"  # root folder holding the five class sub-folders

# One glob pattern per operating condition; their order fixes labels 0-4.
cls_1 = 'Healthy/*'
cls_2 = 'IR/*'
cls_3 = 'OR/*'
cls_4 = 'BF/*'
cls_5 = 'CompF/*'
数据集创建
# Segment each class folder; enumerate order fixes the labels:
# 0=Healthy, 1=IR, 2=OR, 3=BF, 4=CompF.
_class_patterns = (cls_1, cls_2, cls_3, cls_4, cls_5)
(norm, y_norm), (defc1, y_defc1), (defc2, y_defc2), (defc3, y_defc3), (defc4, y_defc4) = [
    make_dataset(os.path.join(data_path, pattern), num_samples, label)
    for label, pattern in enumerate(_class_patterns)
]
# Stack every class into one design matrix X and one label vector Y.
X = np.concatenate((norm, defc1, defc2, defc3, defc4), axis=0)
Y = np.concatenate((y_norm, y_defc1, y_defc2, y_defc3, y_defc4), axis=0)
开始特征提取步骤,此处花费时间较长。选择db4小波,分解层数为7,代码还是很容易看懂的
# ---- Wavelet-packet decomposition + FFT-peak feature extraction ----
wavelet_function = "db4"  # Daubechies-4 mother wavelet
num_levels = 7            # decomposition depth
m = 5                     # strongest spectral peaks kept per sub-band

num_features = 2 ** num_levels  # leaf nodes at the deepest level (2^7 = 128)
# One row per signal, m values per leaf node; np.full == the original
# np.repeat(np.nan, ...).reshape(...).
features = np.full((len(X), m * num_features), np.nan)
for i in range(len(X)):
    # Full wavelet-packet tree of the i-th segment.
    wp = pywt.WaveletPacket(X[i], wavelet=wavelet_function, maxlevel=num_levels)
    packet_names = [node.path for node in wp.get_level(num_levels, "natural")]
    for j in range(num_features):
        # Rebuild a time signal from a single leaf node's coefficients.
        new_wp = pywt.WaveletPacket(data=None, wavelet=wavelet_function, maxlevel=num_levels)
        new_wp[packet_names[j]] = wp[packet_names[j]].data
        reconstructed_signal = new_wp.reconstruct(update=False)
        # Renamed from f/c: the original reused `f` as its innermost loop
        # variable, shadowing the frequency axis returned here.
        freqs, amps = apply_fft(reconstructed_signal, fs, len(reconstructed_signal))
        z = np.abs(amps)
        # Indices of the m largest magnitudes (argpartition: O(n), unordered).
        maximal_idx = np.argpartition(z, -m)[-m:]
        # Feature = peak amplitude * its frequency; slice assignment replaces
        # the original manual l-counter loop (same order, same values).
        features[i, j * m:(j + 1) * m] = z[maximal_idx] * freqs[maximal_idx]
特征提取完毕后,开始基于随机森林的故障分类,导入相关机器学习模块
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score, confusion_matrix
下面进行标签转换
# Wrap the numeric label vector in a pandas categorical for sklearn.
labels = pd.Categorical(values=Y)
然后,训练集和测试集划分
# Hold out 20% for testing; stratify keeps class proportions in both splits.
# (Pass random_state=42 for a reproducible split.)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    shuffle=True,
    stratify=labels,
)
并进行数据标准化
# Standardize features: fit the statistics on the training split only,
# then apply that same transform to both splits (no test-set leakage).
scaler = StandardScaler()
scaler.fit(X_train)
train_data_scaled = scaler.transform(X_train)
test_data_scaled = scaler.transform(X_test)
最后开始随机森林训练
# Random forest: 200 entropy-criterion trees, depth capped at 20;
# max_features=1 forces each split to consider one random feature.
rf_params = {
    'n_estimators': 200,
    'criterion': 'entropy',
    'max_depth': 20,
    'max_features': 1,
    'min_samples_split': 3,
    'min_samples_leaf': 1,
}
clf_RF = RandomForestClassifier(**rf_params)
clf_RF.fit(train_data_scaled, y_train)
看一下混淆矩阵等结果
样本长度 No. of Samples = 40000 / 分解层数 k = 7 / 峰值个数 m = 5
ROC AUC = 1.000
F1 Score = 0.9915254237288136
Accuracy = 99.153 %
面包多代码如下
本文含有隐藏内容,请 开通VIP 后查看