no functional run (did not compile)
// mcu3: tiny 3-stage (Fetch / eXecute / Writeback) pipelined microcontroller.
//
// Instruction format (8 bits, opcode in [7:6]):
//   00 ADDI rd, imm4 : R[rd] <= R[rd] + sign_ext(imm4)    rd=[5:4], imm4=[3:0]
//   01 MOV  rd, rs   : R[rd] <= R[rs]                     rd=[5:4], rs=[3:2]
//   10 BRcc off4     : if cond(R0) PC <= PC + sign_ext(off4) + 1
//                      cond=[5:4] (00 always, 01 R0==0, 10 R0<0, 11 R0>0)
//   11 HALT          : halt_out <= 1, pipeline frozen
//
// Pipeline convention: one branch delay slot. The instruction that follows a
// BRcc is always fetched and executed; a taken branch redirects the fetch that
// comes after the delay slot. "PC" in the branch formula is the address of the
// BRcc itself, and all PC arithmetic wraps modulo 32.
//
// Hazards: the only read-after-write hazard is between the instruction in X
// and the one in W, and it is resolved by forwarding the X/W result into X.
//
// Ports:
//   clk      - clock, all state updates on the rising edge
//   rst      - synchronous, active-high reset (clears PC, registers, pipeline)
//   pc_out   - address of the instruction being fetched in the current cycle
//   halt_out - set when HALT reaches X, then held until reset
//   r0_out   - architectural value of R0 (updated at writeback)
module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0 // 32 instruction bytes, byte 0 at MSB
) (
    input                   clk,
    input                   rst,      // synchronous, active-high
    output reg        [4:0] pc_out,   // current fetched PC (visible for testing)
    output reg              halt_out, // asserted forever after HALT executes
    output reg signed [7:0] r0_out    // current value of register R0
);

    localparam [1:0] OP_ADDI = 2'b00,
                     OP_MOV  = 2'b01,
                     OP_BR   = 2'b10,
                     OP_HALT = 2'b11;

    // Register file: 4 registers, 8-bit signed
    reg signed [7:0] R [0:3];

    // F/X pipeline register: the instruction currently in X and its address.
    reg       fx_valid; // 0 = bubble (only right after reset)
    reg [7:0] fx_instr;
    reg [4:0] fx_pc;

    // X/W pipeline register: the pending register-file write.
    reg              xw_we;
    reg        [1:0] xw_rd;
    reg signed [7:0] xw_data;

    // Sign-extend a 4-bit immediate to 8 bits.
    function signed [7:0] sign_ext_4;
        input [3:0] val;
        begin
            sign_ext_4 = {{4{val[3]}}, val};
        end
    endfunction

    // Evaluate a BRcc condition code against the (signed) value of R0.
    function cond_met;
        input        [1:0] cond;
        input signed [7:0] r0_val;
        begin
            case (cond)
                2'b00:   cond_met = 1'b1;          // always
                2'b01:   cond_met = (r0_val == 0);
                2'b10:   cond_met = (r0_val <  0);
                2'b11:   cond_met = (r0_val >  0);
                default: cond_met = 1'b0;
            endcase
        end
    endfunction

    // ---------------------------------------------------------------- F stage
    // IMEM_INIT is declared [0:255], so bit 0 is the MSB and instruction byte k
    // occupies bits [8k : 8k+7]. An ascending "+:" part-select returns it with
    // the instruction's bit 7 on the left.
    wire [0:255] imem    = IMEM_INIT;
    wire [7:0]   f_instr = imem[{pc_out, 3'b000} +: 8];

    // ---------------------------------------------------------------- X stage
    wire [1:0] x_op   = fx_instr[7:6];
    wire [1:0] x_rd   = fx_instr[5:4];
    wire [1:0] x_cond = fx_instr[5:4]; // same field, BRcc view
    wire [1:0] x_rs   = fx_instr[3:2];
    wire [3:0] x_imm4 = fx_instr[3:0]; // imm4 for ADDI, off4 for BRcc

    // Source register: ADDI reads its own destination, MOV reads rs and BRcc
    // tests R0. HALT does not use the operand.
    wire [1:0] x_src = (x_op == OP_ADDI) ? x_rd :
                       (x_op == OP_MOV)  ? x_rs : 2'b00;

    // W -> X forwarding: the result in X/W has not reached R[] yet.
    wire signed [7:0] x_opnd =
        (xw_we && (xw_rd == x_src)) ? xw_data : R[x_src];

    wire signed [7:0] x_result =
        (x_op == OP_ADDI) ? (x_opnd + sign_ext_4(x_imm4)) : x_opnd;

    wire       x_writes = fx_valid && ((x_op == OP_ADDI) || (x_op == OP_MOV));
    wire       x_halt   = fx_valid && (x_op == OP_HALT);
    wire       x_taken  = fx_valid && (x_op == OP_BR) && cond_met(x_cond, x_opnd);
    wire [4:0] x_target = fx_pc + {x_imm4[3], x_imm4} + 5'd1;

    // ------------------------------------------------------- state update
    always @(posedge clk) begin
        if (rst) begin
            pc_out   <= 5'd0;
            halt_out <= 1'b0;
            r0_out   <= 8'sd0;
            R[0]     <= 8'sd0;
            R[1]     <= 8'sd0;
            R[2]     <= 8'sd0;
            R[3]     <= 8'sd0;
            fx_valid <= 1'b0;
            fx_instr <= 8'h00;
            fx_pc    <= 5'd0;
            xw_we    <= 1'b0;
            xw_rd    <= 2'b00;
            xw_data  <= 8'sd0;
        end else if (!halt_out) begin
            // W stage: commit the older instruction. This also runs in the
            // cycle HALT is in X, so everything before HALT completes.
            if (xw_we) begin
                R[xw_rd] <= xw_data;
                if (xw_rd == 2'b00)
                    r0_out <= xw_data;
            end

            if (x_halt) begin
                // Freeze: drop the instruction fetched behind HALT and stop
                // advancing. pc_out keeps its current value.
                halt_out <= 1'b1;
                fx_valid <= 1'b0;
                xw_we    <= 1'b0;
            end else begin
                // X -> W
                xw_we   <= x_writes;
                xw_rd   <= x_rd;
                xw_data <= x_result;

                // F -> X. While a branch is in X this is its delay slot.
                fx_valid <= 1'b1;
                fx_instr <= f_instr;
                fx_pc    <= pc_out;

                // Next fetch address: redirect only after the delay slot.
                pc_out <= x_taken ? x_target : (pc_out + 5'd1);
            end
        end
    end

endmodule
```verilog
// Pipeline convention: 1 branch delay slot. The instruction immediately
// following a BRcc is always fetched and executed, regardless of whether the
// branch is taken. A taken branch redirects the fetch that comes after the
// delay slot. The branch is resolved in the X stage, in the same cycle in
// which the delay-slot instruction is being fetched.
//
// mcu3: tiny 3-stage (Fetch / eXecute / Writeback) pipelined microcontroller.
//
// Instruction format (8 bits, opcode in [7:6]):
//   00 ADDI rd, imm4 : R[rd] <= R[rd] + sign_ext(imm4)    rd=[5:4], imm4=[3:0]
//   01 MOV  rd, rs   : R[rd] <= R[rs]                     rd=[5:4], rs=[3:2]
//   10 BRcc off4     : if cond(R0) PC <= PC + sign_ext(off4) + 1
//                      cond=[5:4] (00 always, 01 R0==0, 10 R0<0, 11 R0>0)
//   11 HALT          : halt_out <= 1, pipeline frozen
//
// "PC" in the branch formula is the address of the BRcc itself, and all PC
// arithmetic wraps modulo 32.
//
// Hazards: the only read-after-write hazard is between the instruction in X
// and the one in W, and it is resolved by forwarding the X/W result into X.
//
// Ports:
//   clk      - clock, all state updates on the rising edge
//   rst      - synchronous, active-high reset (clears PC, registers, pipeline)
//   pc_out   - address of the instruction being fetched in the current cycle
//   halt_out - set when HALT reaches X, then held until reset
//   r0_out   - architectural value of R0 (updated at writeback)
module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0 // 32 instruction bytes, byte 0 at MSB
) (
    input                   clk,
    input                   rst,      // synchronous, active-high
    output reg        [4:0] pc_out,   // current fetched PC (visible for testing)
    output reg              halt_out, // asserted forever after HALT executes
    output reg signed [7:0] r0_out    // current value of register R0
);

    localparam [1:0] OP_ADDI = 2'b00,
                     OP_MOV  = 2'b01,
                     OP_BR   = 2'b10,
                     OP_HALT = 2'b11;

    // Register file: 4 registers, 8-bit signed
    reg signed [7:0] R [0:3];

    // F/X pipeline register: the instruction currently in X and its address.
    reg       fx_valid; // 0 = bubble (only right after reset)
    reg [7:0] fx_instr;
    reg [4:0] fx_pc;

    // X/W pipeline register: the pending register-file write.
    reg              xw_we;
    reg        [1:0] xw_rd;
    reg signed [7:0] xw_data;

    // Sign-extend a 4-bit immediate to 8 bits.
    function signed [7:0] sign_ext_4;
        input [3:0] val;
        begin
            sign_ext_4 = {{4{val[3]}}, val};
        end
    endfunction

    // Evaluate a BRcc condition code against the (signed) value of R0.
    function cond_met;
        input        [1:0] cond;
        input signed [7:0] r0_val;
        begin
            case (cond)
                2'b00:   cond_met = 1'b1;          // always
                2'b01:   cond_met = (r0_val == 0);
                2'b10:   cond_met = (r0_val <  0);
                2'b11:   cond_met = (r0_val >  0);
                default: cond_met = 1'b0;
            endcase
        end
    endfunction

    // ---------------------------------------------------------------- F stage
    // IMEM_INIT is declared [0:255], so bit 0 is the MSB and instruction byte k
    // occupies bits [8k : 8k+7]. An ascending "+:" part-select returns it with
    // the instruction's bit 7 on the left.
    wire [0:255] imem    = IMEM_INIT;
    wire [7:0]   f_instr = imem[{pc_out, 3'b000} +: 8];

    // ---------------------------------------------------------------- X stage
    wire [1:0] x_op   = fx_instr[7:6];
    wire [1:0] x_rd   = fx_instr[5:4];
    wire [1:0] x_cond = fx_instr[5:4]; // same field, BRcc view
    wire [1:0] x_rs   = fx_instr[3:2];
    wire [3:0] x_imm4 = fx_instr[3:0]; // imm4 for ADDI, off4 for BRcc

    // Source register: ADDI reads its own destination, MOV reads rs and BRcc
    // tests R0. HALT does not use the operand.
    wire [1:0] x_src = (x_op == OP_ADDI) ? x_rd :
                       (x_op == OP_MOV)  ? x_rs : 2'b00;

    // W -> X forwarding: the result in X/W has not reached R[] yet.
    wire signed [7:0] x_opnd =
        (xw_we && (xw_rd == x_src)) ? xw_data : R[x_src];

    wire signed [7:0] x_result =
        (x_op == OP_ADDI) ? (x_opnd + sign_ext_4(x_imm4)) : x_opnd;

    wire       x_writes = fx_valid && ((x_op == OP_ADDI) || (x_op == OP_MOV));
    wire       x_halt   = fx_valid && (x_op == OP_HALT);
    wire       x_taken  = fx_valid && (x_op == OP_BR) && cond_met(x_cond, x_opnd);
    wire [4:0] x_target = fx_pc + {x_imm4[3], x_imm4} + 5'd1;

    // ------------------------------------------------------- state update
    always @(posedge clk) begin
        if (rst) begin
            pc_out   <= 5'd0;
            halt_out <= 1'b0;
            r0_out   <= 8'sd0;
            R[0]     <= 8'sd0;
            R[1]     <= 8'sd0;
            R[2]     <= 8'sd0;
            R[3]     <= 8'sd0;
            fx_valid <= 1'b0;
            fx_instr <= 8'h00;
            fx_pc    <= 5'd0;
            xw_we    <= 1'b0;
            xw_rd    <= 2'b00;
            xw_data  <= 8'sd0;
        end else if (!halt_out) begin
            // W stage: commit the older instruction. This also runs in the
            // cycle HALT is in X, so everything before HALT completes.
            if (xw_we) begin
                R[xw_rd] <= xw_data;
                if (xw_rd == 2'b00)
                    r0_out <= xw_data;
            end

            if (x_halt) begin
                // Freeze: drop the instruction fetched behind HALT and stop
                // advancing. pc_out keeps its current value.
                halt_out <= 1'b1;
                fx_valid <= 1'b0;
                xw_we    <= 1'b0;
            end else begin
                // X -> W
                xw_we   <= x_writes;
                xw_rd   <= x_rd;
                xw_data <= x_result;

                // F -> X. While a branch is in X this is its delay slot.
                fx_valid <= 1'b1;
                fx_instr <= f_instr;
                fx_pc    <= pc_out;

                // Next fetch address: redirect only after the delay slot.
                pc_out <= x_taken ? x_target : (pc_out + 5'd1);
            end
        end
    end

endmodule
```