// NOTE: no functional run (did not compile)
// NOTE: (no automated diagnostics fired)
// mcu3 -- small 3-stage (F fetch, X execute, W writeback) processor core.
//
// One-byte instructions, selected by instr[7:6]:
//   00 rd[5:4] imm[3:0]   ADDI : R[rd] = R[rd] + sign_ext(imm)
//   01 rd[5:4] rs[3:2]    MOV  : R[rd] = R[rs]
//   10 cc[5:4] off[3:0]   BRcc : cc 00 always, 01 R0 == 0, 10 R0 < 0, 11 R0 > 0
//   11 ......             HALT
// A taken branch at address A continues at A + sign_ext(off) + 1 (mod 32)
// after the single delay-slot instruction at A + 1, which always executes.
module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0 // 32 instruction bytes, byte 0 at MSB
) (
    input                   clk,
    input                   rst,      // synchronous, active-high
    output reg        [4:0] pc_out,   // registered copy of the fetch-stage PC (visible for testing)
    output reg              halt_out, // asserted after HALT executes; held until rst
    output reg signed [7:0] r0_out    // mirrors register R0 (updated together with R[0])
);

    // ------------------------------------------------------------------
    // State
    // ------------------------------------------------------------------

    // Register file: four signed 8-bit registers.
    reg signed [7:0] R [0:3];

    // F stage: address of the instruction currently being fetched.
    reg [4:0] pc_f;

    // F -> X pipeline registers.
    reg [7:0] instr_x;
    reg [4:0] pc_x;
    reg       valid_x;          // 0 only for the bubble inserted by reset

    // X -> W pipeline registers.
    reg [7:0]        instr_w;   // not read by any logic below; travels with the result
    reg [4:0]        pc_w;      // not read by any logic below; travels with the result
    reg              valid_w;
    reg [1:0]        rd_w;      // destination register index
    reg signed [7:0] data_w;    // value to be written to R[rd_w]
    reg              we_w;      // W-stage instruction writes the register file
    reg              halt_w;    // W-stage instruction is HALT

    // ------------------------------------------------------------------
    // Helper functions
    // ------------------------------------------------------------------

    // Instruction byte at address 'addr'. IMEM_INIT is declared [0:255], so
    // its MSB is bit 0 and byte N occupies bits [8*N : 8*N+7]; on an
    // ascending vector "base +: 8" selects exactly [base : base+7].
    function [7:0] imem_read;
        input [4:0] addr;
        begin
            imem_read = IMEM_INIT[{addr, 3'b000} +: 8];
        end
    endfunction

    // Index of the register an instruction reads in the X stage.
    function [1:0] get_rs;
        input [7:0] instr;
        begin
            case (instr[7:6])
                2'b00:   get_rs = instr[5:4]; // ADDI reads its own destination
                2'b01:   get_rs = instr[3:2]; // MOV source
                default: get_rs = 2'b00;      // BRcc tests R0; HALT's read is unused
            endcase
        end
    endfunction

    // Sign-extend a 4-bit immediate to 8 bits.
    function signed [7:0] sign_ext_4;
        input [3:0] imm;
        begin
            sign_ext_4 = {{4{imm[3]}}, imm};
        end
    endfunction

    // ------------------------------------------------------------------
    // Combinational datapath
    // ------------------------------------------------------------------

    // Fetch: the instruction is always the byte addressed by pc_f, so the
    // byte at address 0 is presented in the first cycle after reset.
    wire [7:0] instr_f = imem_read(pc_f);

    // W -> X forwarding. data_w is committed to R one clock after it is
    // produced, so the instruction directly behind the producer must take the
    // value from the W registers; anything older is already in R.
    wire [1:0]        rs_x    = get_rs(instr_x);
    wire              fwd_x_w = valid_w && we_w && (rd_w == rs_x);
    wire signed [7:0] opnd_x  = fwd_x_w ? data_w : R[rs_x];
    wire signed [7:0] imm_x   = sign_ext_4(instr_x[3:0]);

    // Branch resolution in X. For BRcc rs_x is 0, so opnd_x is the
    // (forwarded) value of R0.
    reg br_taken_x;
    always @(*) begin
        case (instr_x[5:4])
            2'b00:   br_taken_x = 1'b1;
            2'b01:   br_taken_x = (opnd_x == 8'sd0);
            2'b10:   br_taken_x = (opnd_x <  8'sd0);
            2'b11:   br_taken_x = (opnd_x >  8'sd0);
            default: br_taken_x = 1'b0;
        endcase
    end

    // While a branch is in X its delay slot is in F, so redirecting pc_f here
    // makes the target the instruction fetched right after the delay slot.
    wire       redirect_x = valid_x && (instr_x[7:6] == 2'b10) && br_taken_x;
    wire [4:0] target_x   = pc_x + {instr_x[3], instr_x[3:0]} + 5'd1; // wraps mod 32
    wire [4:0] pc_next    = redirect_x ? target_x : (pc_f + 5'd1);

    // ------------------------------------------------------------------
    // Stage F: program counter (frozen once halt_out is set)
    // ------------------------------------------------------------------
    always @(posedge clk) begin
        if (rst)
            pc_f <= 5'd0;
        else if (!halt_out)
            pc_f <= pc_next;
    end

    // ------------------------------------------------------------------
    // Stage X: latch the fetched instruction
    // ------------------------------------------------------------------
    always @(posedge clk) begin
        if (rst) begin
            instr_x <= 8'h00;
            pc_x    <= 5'd0;
            valid_x <= 1'b0;
        end else if (!halt_out) begin
            instr_x <= instr_f;
            pc_x    <= pc_f;
            valid_x <= 1'b1;
        end
    end

    // ------------------------------------------------------------------
    // Stage W: latch the X-stage result
    // ------------------------------------------------------------------
    always @(posedge clk) begin
        if (rst) begin
            instr_w <= 8'h00;
            pc_w    <= 5'd0;
            valid_w <= 1'b0;
            rd_w    <= 2'b00;
            data_w  <= 8'sd0;
            we_w    <= 1'b0;
            halt_w  <= 1'b0;
        end else if (!halt_out) begin
            instr_w <= instr_x;
            pc_w    <= pc_x;
            valid_w <= valid_x;

            // Defaults describe an instruction with no architectural effect
            // (BRcc or a bubble); the cases below override them.
            rd_w    <= 2'b00;
            data_w  <= 8'sd0;
            we_w    <= 1'b0;
            halt_w  <= 1'b0;

            if (valid_x) begin
                case (instr_x[7:6])
                    2'b00: begin // ADDI
                        rd_w   <= instr_x[5:4];
                        we_w   <= 1'b1;
                        data_w <= opnd_x + imm_x;
                    end
                    2'b01: begin // MOV
                        rd_w   <= instr_x[5:4];
                        we_w   <= 1'b1;
                        data_w <= opnd_x;
                    end
                    2'b11: begin // HALT
                        halt_w <= 1'b1;
                    end
                    default: begin
                        // BRcc: handled by the PC logic, nothing to write back.
                    end
                endcase
            end
        end
    end

    // ------------------------------------------------------------------
    // Register file write (no writes once halted)
    // ------------------------------------------------------------------
    always @(posedge clk) begin
        if (rst) begin
            R[0] <= 8'sd0;
            R[1] <= 8'sd0;
            R[2] <= 8'sd0;
            R[3] <= 8'sd0;
        end else if (!halt_out && valid_w && we_w) begin
            R[rd_w] <= data_w;
        end
    end

    // ------------------------------------------------------------------
    // Observable outputs (all frozen once halt_out is set)
    // ------------------------------------------------------------------
    always @(posedge clk) begin
        if (rst) begin
            pc_out   <= 5'd0;
            halt_out <= 1'b0;
            r0_out   <= 8'sd0;
        end else if (!halt_out) begin
            pc_out <= pc_f;
            // r0_out is written in the same cycle as R[0], so the two agree.
            if (valid_w && we_w && (rd_w == 2'b00))
                r0_out <= data_w;
            if (valid_w && halt_w)
                halt_out <= 1'b1;
        end
    end

endmodule
// Branch convention: The instruction immediately following a BRcc is in the delay slot.
// It is always fetched and executed regardless of whether the branch is taken.
// If the branch is taken, the instruction after the delay slot is the new PC.
// If the branch is not taken, the instruction after the delay slot is the PC+2 of the branch.
// This module implements a 3-stage pipeline (F, X, W) with forwarding from the W stage
// to the X stage to handle data hazards without stalls.
// mcu3 -- small 3-stage (F fetch, X execute, W writeback) processor core.
//
// One-byte instructions, selected by instr[7:6]:
//   00 rd[5:4] imm[3:0]   ADDI : R[rd] = R[rd] + sign_ext(imm)
//   01 rd[5:4] rs[3:2]    MOV  : R[rd] = R[rs]
//   10 cc[5:4] off[3:0]   BRcc : cc 00 always, 01 R0 == 0, 10 R0 < 0, 11 R0 > 0
//   11 ......             HALT
// A taken branch at address A continues at A + sign_ext(off) + 1 (mod 32)
// after the single delay-slot instruction at A + 1, which always executes.
module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0 // 32 instruction bytes, byte 0 at MSB
) (
    input                   clk,
    input                   rst,      // synchronous, active-high
    output reg        [4:0] pc_out,   // registered copy of the fetch-stage PC (visible for testing)
    output reg              halt_out, // asserted after HALT executes; held until rst
    output reg signed [7:0] r0_out    // mirrors register R0 (updated together with R[0])
);

    // ------------------------------------------------------------------
    // State
    // ------------------------------------------------------------------

    // Register file: four signed 8-bit registers.
    reg signed [7:0] R [0:3];

    // F stage: address of the instruction currently being fetched.
    reg [4:0] pc_f;

    // F -> X pipeline registers.
    reg [7:0] instr_x;
    reg [4:0] pc_x;
    reg       valid_x;          // 0 only for the bubble inserted by reset

    // X -> W pipeline registers.
    reg [7:0]        instr_w;   // not read by any logic below; travels with the result
    reg [4:0]        pc_w;      // not read by any logic below; travels with the result
    reg              valid_w;
    reg [1:0]        rd_w;      // destination register index
    reg signed [7:0] data_w;    // value to be written to R[rd_w]
    reg              we_w;      // W-stage instruction writes the register file
    reg              halt_w;    // W-stage instruction is HALT

    // ------------------------------------------------------------------
    // Helper functions
    // ------------------------------------------------------------------

    // Instruction byte at address 'addr'. IMEM_INIT is declared [0:255], so
    // its MSB is bit 0 and byte N occupies bits [8*N : 8*N+7]; on an
    // ascending vector "base +: 8" selects exactly [base : base+7].
    function [7:0] imem_read;
        input [4:0] addr;
        begin
            imem_read = IMEM_INIT[{addr, 3'b000} +: 8];
        end
    endfunction

    // Index of the register an instruction reads in the X stage.
    function [1:0] get_rs;
        input [7:0] instr;
        begin
            case (instr[7:6])
                2'b00:   get_rs = instr[5:4]; // ADDI reads its own destination
                2'b01:   get_rs = instr[3:2]; // MOV source
                default: get_rs = 2'b00;      // BRcc tests R0; HALT's read is unused
            endcase
        end
    endfunction

    // Sign-extend a 4-bit immediate to 8 bits.
    function signed [7:0] sign_ext_4;
        input [3:0] imm;
        begin
            sign_ext_4 = {{4{imm[3]}}, imm};
        end
    endfunction

    // ------------------------------------------------------------------
    // Combinational datapath
    // ------------------------------------------------------------------

    // Fetch: the instruction is always the byte addressed by pc_f, so the
    // byte at address 0 is presented in the first cycle after reset.
    wire [7:0] instr_f = imem_read(pc_f);

    // W -> X forwarding. data_w is committed to R one clock after it is
    // produced, so the instruction directly behind the producer must take the
    // value from the W registers; anything older is already in R.
    wire [1:0]        rs_x    = get_rs(instr_x);
    wire              fwd_x_w = valid_w && we_w && (rd_w == rs_x);
    wire signed [7:0] opnd_x  = fwd_x_w ? data_w : R[rs_x];
    wire signed [7:0] imm_x   = sign_ext_4(instr_x[3:0]);

    // Branch resolution in X. For BRcc rs_x is 0, so opnd_x is the
    // (forwarded) value of R0.
    reg br_taken_x;
    always @(*) begin
        case (instr_x[5:4])
            2'b00:   br_taken_x = 1'b1;
            2'b01:   br_taken_x = (opnd_x == 8'sd0);
            2'b10:   br_taken_x = (opnd_x <  8'sd0);
            2'b11:   br_taken_x = (opnd_x >  8'sd0);
            default: br_taken_x = 1'b0;
        endcase
    end

    // While a branch is in X its delay slot is in F, so redirecting pc_f here
    // makes the target the instruction fetched right after the delay slot.
    wire       redirect_x = valid_x && (instr_x[7:6] == 2'b10) && br_taken_x;
    wire [4:0] target_x   = pc_x + {instr_x[3], instr_x[3:0]} + 5'd1; // wraps mod 32
    wire [4:0] pc_next    = redirect_x ? target_x : (pc_f + 5'd1);

    // ------------------------------------------------------------------
    // Stage F: program counter (frozen once halt_out is set)
    // ------------------------------------------------------------------
    always @(posedge clk) begin
        if (rst)
            pc_f <= 5'd0;
        else if (!halt_out)
            pc_f <= pc_next;
    end

    // ------------------------------------------------------------------
    // Stage X: latch the fetched instruction
    // ------------------------------------------------------------------
    always @(posedge clk) begin
        if (rst) begin
            instr_x <= 8'h00;
            pc_x    <= 5'd0;
            valid_x <= 1'b0;
        end else if (!halt_out) begin
            instr_x <= instr_f;
            pc_x    <= pc_f;
            valid_x <= 1'b1;
        end
    end

    // ------------------------------------------------------------------
    // Stage W: latch the X-stage result
    // ------------------------------------------------------------------
    always @(posedge clk) begin
        if (rst) begin
            instr_w <= 8'h00;
            pc_w    <= 5'd0;
            valid_w <= 1'b0;
            rd_w    <= 2'b00;
            data_w  <= 8'sd0;
            we_w    <= 1'b0;
            halt_w  <= 1'b0;
        end else if (!halt_out) begin
            instr_w <= instr_x;
            pc_w    <= pc_x;
            valid_w <= valid_x;

            // Defaults describe an instruction with no architectural effect
            // (BRcc or a bubble); the cases below override them.
            rd_w    <= 2'b00;
            data_w  <= 8'sd0;
            we_w    <= 1'b0;
            halt_w  <= 1'b0;

            if (valid_x) begin
                case (instr_x[7:6])
                    2'b00: begin // ADDI
                        rd_w   <= instr_x[5:4];
                        we_w   <= 1'b1;
                        data_w <= opnd_x + imm_x;
                    end
                    2'b01: begin // MOV
                        rd_w   <= instr_x[5:4];
                        we_w   <= 1'b1;
                        data_w <= opnd_x;
                    end
                    2'b11: begin // HALT
                        halt_w <= 1'b1;
                    end
                    default: begin
                        // BRcc: handled by the PC logic, nothing to write back.
                    end
                endcase
            end
        end
    end

    // ------------------------------------------------------------------
    // Register file write (no writes once halted)
    // ------------------------------------------------------------------
    always @(posedge clk) begin
        if (rst) begin
            R[0] <= 8'sd0;
            R[1] <= 8'sd0;
            R[2] <= 8'sd0;
            R[3] <= 8'sd0;
        end else if (!halt_out && valid_w && we_w) begin
            R[rd_w] <= data_w;
        end
    end

    // ------------------------------------------------------------------
    // Observable outputs (all frozen once halt_out is set)
    // ------------------------------------------------------------------
    always @(posedge clk) begin
        if (rst) begin
            pc_out   <= 5'd0;
            halt_out <= 1'b0;
            r0_out   <= 8'sd0;
        end else if (!halt_out) begin
            pc_out <= pc_f;
            // r0_out is written in the same cycle as R[0], so the two agree.
            if (valid_w && we_w && (rd_w == 2'b00))
                r0_out <= data_w;
            if (valid_w && halt_w)
                halt_out <= 1'b1;
        end
    end

endmodule
// (Markdown code fence removed; it is not Verilog)