设有一离散序列xi(i=1,2...,n),其数学均值μ与方差σ2传统计算方式如下:
该方法的缺陷在于,每次新增一个数据点时,都需要重新遍历全部历史数据以更新均值与方差,这在实际的实时信号处理系统中难以实现,尤其当数据量较大时,计算效率将显著下降。为解决这一问题,本设计采用Welford递推算法,实现对均值与方差的增量式更新,从而避免对历史数据的重复访问与遍历,显著提升计算效率。
Welford 算法的递推公式如下:
其中
module exp_var_calc(
input i_clk ,
input i_rst ,
input [31:0] i_data ,
input i_start ,
output reg [31:0] o_exp ,
output reg [79:0] o_var ,
output reg o_done ,
output o_ready
);
(*max_fanout = 20, keep = "true"*)
reg [12:0] r_cstate ;
reg [12:0] r_nstate ;
localparam S_IDLE = 13'b0000000000001 ,
S_STEP0 = 13'b0000000000010 ,//delta0
S_STEP1 = 13'b0000000000100 ,//delta0 / i
S_STEP2 = 13'b0000000001000 ,
S_STEP3 = 13'b0000000010000 ,//exp + delta0 / i
S_EXP = 13'b0000000100000 ,
S_STEP4 = 13'b0000001000000 ,//delta1
S_STEP5 = 13'b0000010000000 ,//delta0 * delta1
S_STEP6 = 13'b0000100000000 ,//平方偏差累积
S_STEP7 = 13'b0001000000000 ,
S_STEP8 = 13'b0010000000000 ,//平方偏差累积 / (i - 1)
S_VAR = 13'b0100000000000 ,
S_DONE = 13'b1000000000000 ;
// 寄存输入值
reg signed [47:0] r_data_reg ;
wire signed [47:0] w_data_reg ;
assign o_ready = r_cstate == S_IDLE;
always@(posedge i_clk or posedge i_rst) begin
if(i_rst)
r_data_reg <= 'd0;
else if(i_start & r_cstate == S_IDLE)
r_data_reg <= w_data_reg;
else
r_data_reg <= r_data_reg;
end
wire [47:0] w_exp;
bit_extension# (
.BW_IN (32'd32 ),
.BW_OUT (32'd48 ),
.SIGNED (1'b1 ))
data_ext_u(
.i_data (i_data ),
.o_data (w_data_reg )
);
bit_extension# (
.BW_IN (32'd32 ),
.BW_OUT (32'd48 ),
.SIGNED (1'b1 ))
exp_ext_u(
.i_data (o_exp ),
.o_data (w_exp )
);
// STEP0 start
wire signed [47:0] w_delta0 ;
addsub_S48S48
delta0 (
.A (r_data_reg ), // input wire [47 : 0] A
.B (w_exp ), // input wire [47 : 0] B
.CLK(i_clk ), // input wire CLK
.ADD(1'b0 ), // input wire ADD
.CE (1'b1 ), // input wire CE
.S (w_delta0 ) // output wire [47 : 0] S
);
// STEP0 end
// STEP1 start
(*max_fanout = 20, keep = "true"*)
reg [15:0] r_din_cnt ;
always@(posedge i_clk or posedge i_rst) begin
if(i_rst)
r_din_cnt <= 'd0;
else if(r_cstate == S_IDLE & i_start)
r_din_cnt <= r_din_cnt + 1'd1;
else
r_din_cnt <= r_din_cnt;
end
reg [15:0] r_divisor_data ;
reg [47:0] r_dividend_data ;
reg r_dividend_vld ;
reg r_divisor_vld ;
wire w_dividend_rdy ;
wire w_divisor_rdy ;
wire w_div_res_vld ;
wire [63:0] w_div_res ;
wire [47:0] w_quotient ;
wire [15:0] w_reminder ;
assign w_quotient = w_div_res[63:16];
assign w_reminder = w_div_res[15:0];
reg signed [47:0] r_round_quotient;
always@(posedge i_clk or posedge i_rst) begin
if(i_rst)
r_round_quotient <= 'd0;
else if(w_quotient[47])
if(w_reminder >= 16'h8000)
r_round_quotient <= $signed(w_quotient) + 'd1;
else
r_round_quotient <= w_quotient;
else
if(w_reminder >= 16'h8000)
r_round_quotient <= w_quotient + 'd1;
else
r_round_quotient <= w_quotient;
end
always@(posedge i_clk or posedge i_rst) begin
if(i_rst) begin
r_dividend_data <= 'd0;
r_divisor_data <= 'd0;
r_dividend_vld <= 'd0;
r_divisor_vld <= 'd0;
end
else if(r_cstate == S_STEP1 & w_dividend_rdy & w_divisor_rdy) begin
r_dividend_data <= w_delta0;
r_divisor_data <= r_din_cnt;
r_dividend_vld <= 'd1;
r_divisor_vld <= 'd1;
end
else begin
r_dividend_data <= r_dividend_data;
r_divisor_data <= r_divisor_data;
r_dividend_vld <= 'd0;
r_divisor_vld <= 'd0;
end
end
div_S48S16
delta_div_i (
.aclk (i_clk ), // input wire aclk
.s_axis_divisor_tvalid (r_divisor_vld ), // input wire s_axis_divisor_tvalid
.s_axis_divisor_tready (w_divisor_rdy ), // output wire s_axis_divisor_tready
.s_axis_divisor_tdata (r_divisor_data ), // input wire [15 : 0] s_axis_divisor_tdata
.s_axis_dividend_tvalid (r_dividend_vld ), // input wire s_axis_dividend_tvalid
.s_axis_dividend_tready (w_dividend_rdy ), // output wire s_axis_dividend_tready
.s_axis_dividend_tdata (r_dividend_data), // input wire [47 : 0] s_axis_dividend_tdata
.m_axis_dout_tvalid (w_div_res_vld ), // output wire m_axis_dout_tvalid
.m_axis_dout_tdata (w_div_res ) // output wire [63 : 0] m_axis_dout_tdata
);
// STEP1 end
// STEP3 start
wire [47:0] w_new_exp ;
reg r_beat_delay ;//延迟一拍凑时许
always@(posedge i_clk or posedge i_rst) begin
if(i_rst)
r_beat_delay <= 'd0;
else if(r_cstate == S_STEP3)
r_beat_delay <= 'd1;
else
r_beat_delay <= 'd0;
end
addsub_S48S48
new_exp (
.A (r_round_quotient ), // input wire [47 : 0] A
.B (w_exp ), // input wire [47 : 0] B
.CLK(i_clk ), // input wire CLK
.ADD(1'b1 ), // input wire ADD
.CE (1'b1 ), // input wire CE
.S (w_new_exp ) // output wire [47 : 0] S
);
// STEP3 end
reg signed [47:0] r_new_exp ;
always@(posedge i_clk or posedge i_rst) begin
if(i_rst)
r_new_exp <= 'd0;
else if(r_cstate == S_EXP)
r_new_exp <= w_new_exp;
else
r_new_exp <= r_new_exp;
end
// STEP4 start
wire [47:0] w_delta1;
addsub_S48S48
delta1 (
.A (r_data_reg ), // input wire [47 : 0] A
.B (r_new_exp ), // input wire [47 : 0] B
.CLK(i_clk ), // input wire CLK
.ADD(1'b0 ), // input wire ADD
.CE (1'b1 ), // input wire CE
.S (w_delta1 ) // output wire [47 : 0] S
);
// STEP4 end
// STEP5 start
reg [3:0] r_mult_cnt ;
always@(posedge i_clk or posedge i_rst) begin
if(i_rst)
r_mult_cnt <= 'd0;
else if(r_cstate == S_STEP5)
r_mult_cnt <= r_mult_cnt + 1;
else
r_mult_cnt <= 'd0;
end
wire signed [63:0] w_delta0Xdelta1;
mult_S32S32
delta0_delta1 (
.CLK (i_clk ), // input wire CLK
.A ({w_delta0[47],w_delta0[30:0]}), // input wire [31 : 0] A
.B ({w_delta1[47],w_delta1[30:0]}), // input wire [31 : 0] B
.P (w_delta0Xdelta1) // output wire [63 : 0] P
);
// STEP5 end
// STEP6 start
reg [3:0] r_add_cnt ;//延迟计数
always@(posedge i_clk or posedge i_rst) begin
if(i_rst)
r_add_cnt <= 'd0;
else if(r_cstate == S_STEP6)
r_add_cnt <= r_add_cnt + 1;
else
r_add_cnt <= 'd0;
end
wire [79:0] w_sd_acc ;
reg signed [79:0] r_sd_acc ;//平方偏差累积
always@(posedge i_clk or posedge i_rst) begin
if(i_rst)
r_sd_acc <= 'd0 ;
else if(r_cstate == S_STEP6 & r_add_cnt >= 7)
r_sd_acc <= w_sd_acc;
else
r_sd_acc <= r_sd_acc;
end
add_S80S64
ad_acc (
.A (r_sd_acc ), // input wire [79 : 0] A
.B (w_delta0Xdelta1 ), // input wire [63 : 0] B
.CLK(i_clk ), // input wire CLK
.S (w_sd_acc ) // output wire [79 : 0] S
);
// STEP6 end
// STEP7 start
reg r_div_start ;
wire [79:0] w_new_var ;
wire [79:0] w_round_var ;
wire w_div_done ;
always@(posedge i_clk or posedge i_rst) begin
if(i_rst)
r_div_start <= 'd0;
else if(r_cstate == S_STEP7)
r_div_start <= 'd1;
else
r_div_start <= 'd0;
end
wire [15:0] w_var_rem;
assign w_round_var = w_var_rem >= (r_din_cnt >> 1) ? w_new_var + 'd1 : w_new_var;
reg [79:0] r_round_var ;
always@(posedge i_clk or posedge i_rst) begin
if(i_rst)
r_round_var <= 'd0;
else if(r_cstate == S_VAR)
r_round_var <= w_round_var;
else
r_round_var <= r_round_var;
end
my_divider#(
.DIVIDEND_BW(80) ,
.DIVISOR_BW (16))
my_divider_u(
.i_clk (i_clk ),
.i_rst (i_rst ),
.i_dividend (r_sd_acc ),
.i_divisor (r_din_cnt - 1 ),
.i_start (r_div_start),
.o_quotient (w_new_var ),
.o_reminder (w_var_rem ),
.o_done (w_div_done )
);
always@(posedge i_clk or posedge i_rst) begin
if(i_rst) begin
o_exp <= 'd0;
o_var <= 'd0;
o_done <= 'd0;
end
else if(r_cstate == S_DONE) begin
o_exp <= r_new_exp;
o_var <= r_din_cnt > 1 ? r_round_var : 'd0;
o_done <= 'd1;
end
else begin
o_exp <= o_exp;
o_var <= o_var;
o_done <= 'd0;
end
end
always@(posedge i_clk or posedge i_rst) begin
if(i_rst)
r_cstate <= S_IDLE;
else
r_cstate <= r_nstate;
end
always@(*) begin
case(r_cstate)
S_IDLE: r_nstate = i_start ? S_STEP0 : S_IDLE;
S_STEP0: r_nstate = S_STEP1;
S_STEP1: r_nstate = w_dividend_rdy & w_divisor_rdy ? S_STEP2 : S_STEP1;
S_STEP2: r_nstate = w_div_res_vld ? S_STEP3 : S_STEP2;
S_STEP3: r_nstate = r_beat_delay ? S_EXP : S_STEP3;
S_EXP: r_nstate = S_STEP4;
S_STEP4: r_nstate = S_STEP5;
S_STEP5: r_nstate = r_mult_cnt >= 6 ? S_STEP6 : S_STEP5;
S_STEP6: r_nstate = r_add_cnt >= 7 ? S_STEP7 : S_STEP6;
S_STEP7: r_nstate = S_STEP8;
S_STEP8: r_nstate = w_div_done ? S_VAR : S_STEP8;
S_VAR: r_nstate = S_DONE;
S_DONE: r_nstate = S_IDLE;
default: r_nstate = S_IDLE;
endcase
end
endmodule
逻辑级数比较大,目前勉强跑200mhz,可以在状态机中再加一拍处理round var的逻辑运算,tb文件如下
`timescale 1ns / 1ps
module exp_var_tb();
reg i_clk ;
reg i_rst ;
reg [31:0] i_data ;
reg i_start ;
wire [31:0] o_exp ;
wire [79:0] o_var ;
wire o_done ;
wire o_ready ;
initial begin
i_clk = 1;
i_rst = 1;
i_data = 0;
i_start = 0;
#100;
i_rst = 0;
repeat(1000) begin
#10
i_data = $random()%8'hff;
i_start = 1;
#10
i_start = 0;
@(posedge o_ready);
end
#1000
$stop;
end
always#5 i_clk = ~i_clk;
exp_var_calc
exp_var_calc_u(
i_clk ,
i_rst ,
i_data ,
i_start ,
o_exp ,
o_var ,
o_done ,
o_ready
);
endmodule
仿真波形白色是满足[0,255]的均匀分布随机数列,橙色表示均值收敛在127,黄色代表方差收敛在5400左右。