no functional run (did not compile)
```verilog
// Branch convention: The instruction immediately following a BRcc is always executed (1 delay slot).
// If the branch is taken, the instruction in the delay slot is executed, and the next instruction
// fetched is the branch target. This is standard for simple 3-stage pipelines without prediction.
module mcu3 #(
parameter [0:255] IMEM_INIT = 256'h0 // 32 instruction bytes, byte 0 at MSB
) (
input clk,
input rst, // synchronous, active-high
output reg [4:0] pc_out, // current fetched PC (visible for testing)
output reg halt_out, // asserted forever after HALT executes
output reg signed [7:0] r0_out // current value of register R0
);
// Register File: 4 registers, 8-bit signed
reg signed [7:0] rf [0:3];
// Pipeline Registers
// Stage F (Fetch)
reg [4:0] pc_f;
reg [7:0] instr_f;
reg [4:0] npc_f; // Next PC calculated in F stage
// Stage X (Execute)
reg [4:0] pc_x;
reg [7:0] instr_x;
reg [4:0] npc_x;
reg [1:0] opcode_x;
reg [1:0] rd_x;
reg [1:0] rs_x;
reg [3:0] imm_x;
reg [3:0] off_x;
reg [1:0] cond_x;
reg signed [7:0] alu_result_x;
reg branch_taken_x;
reg is_halt_x;
reg [7:0] rs_val_x;
// Stage W (Writeback)
reg [4:0] pc_w;
reg [7:0] instr_w;
reg [1:0] opcode_w;
reg [1:0] rd_w;
reg signed [7:0] alu_result_w;
reg is_halt_w;
reg branch_taken_w;
reg [4:0] npc_w; // The PC that should be committed if branch was taken
// Forwarding logic signals
reg signed [7:0] rs_fwd;
reg signed [7:0] rd_fwd; // Not strictly needed for ADDI/MOV as rd is destination, but good for completeness if we had 2 src regs. Here only MOV has rs.
reg [1:0] fwd_sel; // 0: RF, 1: EX/MEM (X stage result), 2: MEM/WB (W stage result) - simplified to just X and W for 3 stage
// Internal wires for forwarding decisions
wire [1:0] fwd_src;
// ----------------------------------------------------------------
// Reset and Pipeline Control
// ----------------------------------------------------------------
reg pipeline_flush;
reg halt_internal;
// ----------------------------------------------------------------
// Single clocked process implementing the 3-stage pipeline.
//
// Register naming: a register suffixed _f/_x/_w holds the OUTPUT of
// that stage, i.e. the value latched at the end of the stage.
//
//   F: fetch     npc_f (or a branch target) selects the address;
//                pc_f / instr_f receive the fetched PC and byte.
//   X: execute   decodes instr_f, reads operands (with forwarding),
//                and latches the outcome into the *_x registers.
//   W: writeback commits the *_x registers to rf / r0_out and copies
//                them into the *_w registers as a trace of the last
//                committed instruction.
//
// Instruction encoding as decoded below (opcode = instr[7:6]):
//   00 ADDI  rd = [5:4], imm  = [3:0]   R[rd] <= R[rd] + sext(imm)
//   01 MOV   rd = [5:4], rs   = [3:2]   R[rd] <= R[rs]
//   10 BRcc  cond = [5:4], off = [3:0]  target = PC + sext(off) + 1
//   11 HALT
//
// Hazards:
//   * Data: the only instruction whose result is not yet in rf when X
//     reads its operands is the one held in the *_x registers, so one
//     forwarding path (alu_result_x) is sufficient and no stall is
//     ever needed. pipeline_flush therefore stays at its reset value.
//   * Control: a branch decision is registered in branch_taken_x and
//     redirects the fetch one cycle later, which yields exactly one
//     delay slot (the instruction after a BRcc always executes).
//
// Temporaries are declared in the named block and written with
// blocking assignments before they are read, so they never hold state
// across clock edges. Architectural state uses nonblocking assignments.
// ----------------------------------------------------------------
always @(posedge clk) begin : mcu3_pipeline
    integer          i;             // reset loop index
    reg [4:0]        fetch_addr;    // address fetched this cycle
    reg [4:0]        br_target;     // branch target, wraps modulo 32
    reg [1:0]        dec_op;        // instr_f[7:6]
    reg [1:0]        dec_hi;        // instr_f[5:4]: rd (ADDI/MOV) or cond (BRcc)
    reg [1:0]        dec_rs;        // instr_f[3:2]: rs (MOV)
    reg [3:0]        dec_lo;        // instr_f[3:0]: imm (ADDI) or off (BRcc)
    reg              ahead_writes;  // instruction in *_x writes a register
    reg signed [7:0] rd_cur;        // forwarded R[rd] as seen by X
    reg signed [7:0] rs_cur;        // forwarded R[rs] as seen by X
    reg signed [7:0] r0_cur;        // forwarded R0 as seen by X
    reg signed [7:0] result;        // X-stage result
    reg              taken;         // X-stage branch decision

    if (rst) begin
        // Synchronous reset: every stage holds an all-zero bubble.
        // Opcode 00 with rd = 0 and imm = 0 is "ADDI R0, 0", a no-op.
        pc_f           <= 0;
        instr_f        <= 0;
        npc_f          <= 0;   // first fetch after reset reads address 0
        pc_x           <= 0;
        instr_x        <= 0;
        npc_x          <= 0;
        opcode_x       <= 0;
        rd_x           <= 0;
        rs_x           <= 0;
        imm_x          <= 0;
        off_x          <= 0;
        cond_x         <= 0;
        alu_result_x   <= 0;
        branch_taken_x <= 0;
        is_halt_x      <= 0;
        rs_val_x       <= 0;
        pc_w           <= 0;
        instr_w        <= 0;
        opcode_w       <= 0;
        rd_w           <= 0;
        alu_result_w   <= 0;
        is_halt_w      <= 0;
        branch_taken_w <= 0;
        npc_w          <= 0;
        for (i = 0; i < 4; i = i + 1)
            rf[i] <= 0;
        pc_out         <= 0;
        halt_out       <= 0;
        r0_out         <= 0;
        halt_internal  <= 0;
        pipeline_flush <= 0;
    end else if (halt_internal) begin
        // Halted: all state is frozen and halt_out stays asserted.
        halt_out <= 1;
    end else begin
        // ------------------------------------------------------------
        // Stage W: commit the instruction held in the *_x registers.
        // ------------------------------------------------------------
        pc_w           <= pc_x;
        instr_w        <= instr_x;
        opcode_w       <= opcode_x;
        rd_w           <= rd_x;
        alu_result_w   <= alu_result_x;
        is_halt_w      <= is_halt_x;
        branch_taken_w <= branch_taken_x;
        npc_w          <= npc_x;

        // Only ADDI (00) and MOV (01) write the register file.
        ahead_writes = (opcode_x == 2'b00) || (opcode_x == 2'b01);

        if (is_halt_x) begin
            // HALT reached writeback. The younger instruction currently
            // in X is squashed: nothing below runs, so it never commits.
            halt_internal <= 1;
            halt_out      <= 1;
        end else begin
            if (ahead_writes) begin
                rf[rd_x] <= alu_result_x;
                if (rd_x == 2'd0)
                    r0_out <= alu_result_x;   // keep r0_out equal to rf[0]
            end

            // --------------------------------------------------------
            // Stage X: decode and execute the instruction in instr_f.
            // --------------------------------------------------------
            dec_op = instr_f[7:6];
            dec_hi = instr_f[5:4];
            dec_rs = instr_f[3:2];
            dec_lo = instr_f[3:0];

            // Forward from the instruction one ahead. Its result is
            // being written to rf on this same edge, so rf still holds
            // the old value during this cycle.
            rd_cur = (ahead_writes && (rd_x == dec_hi)) ? alu_result_x : rf[dec_hi];
            rs_cur = (ahead_writes && (rd_x == dec_rs)) ? alu_result_x : rf[dec_rs];
            r0_cur = (ahead_writes && (rd_x == 2'd0))   ? alu_result_x : rf[0];

            // Target is relative to the branch's own PC; 5-bit arithmetic
            // wraps within the 32-entry instruction memory.
            br_target = pc_f + {dec_lo[3], dec_lo} + 5'd1;

            result = 8'd0;
            taken  = 1'b0;
            case (dec_op)
                2'b00: begin // ADDI: rd is both source and destination
                    result = rd_cur + $signed({{4{dec_lo[3]}}, dec_lo});
                end
                2'b01: begin // MOV
                    result = rs_cur;
                end
                2'b10: begin // BRcc: condition is evaluated on R0
                    case (dec_hi)
                        2'b00: taken = 1'b1;                            // always
                        2'b01: taken = (r0_cur == 8'd0);                // R0 == 0
                        2'b10: taken = r0_cur[7];                       // R0 <  0
                        2'b11: taken = !r0_cur[7] && (r0_cur != 8'd0);  // R0 >  0
                    endcase
                    result = {3'b000, br_target};  // target carried in alu_result_x
                end
                default: begin // HALT produces no result
                    result = 8'd0;
                end
            endcase

            pc_x           <= pc_f;
            instr_x        <= instr_f;
            npc_x          <= npc_f;
            opcode_x       <= dec_op;
            // Fields that an opcode does not use are latched as zero.
            rd_x           <= (dec_op == 2'b00 || dec_op == 2'b01) ? dec_hi : 2'd0;
            rs_x           <= (dec_op == 2'b01) ? dec_rs : 2'd0;
            imm_x          <= (dec_op == 2'b00) ? dec_lo : 4'd0;
            off_x          <= (dec_op == 2'b10) ? dec_lo : 4'd0;
            cond_x         <= (dec_op == 2'b10) ? dec_hi : 2'd0;
            alu_result_x   <= result;
            branch_taken_x <= taken;
            is_halt_x      <= (dec_op == 2'b11);
            rs_val_x       <= rs_cur;

            // --------------------------------------------------------
            // Stage F: fetch the next instruction.
            // A taken branch recorded in *_x last cycle redirects the
            // fetch now; the instruction fetched in between (currently
            // in instr_f) is the delay slot and has just executed above.
            // --------------------------------------------------------
            if ((opcode_x == 2'b10) && branch_taken_x)
                fetch_addr = alu_result_x[4:0];
            else
                fetch_addr = npc_f;

            pc_f    <= fetch_addr;
            // IMEM_INIT is declared [0:255], so byte k is bits [8k : 8k+7].
            instr_f <= IMEM_INIT[{fetch_addr, 3'b000} +: 8];
            npc_f   <= fetch_addr + 5'd1;
            pc_out  <= fetch_addr;   // pc_out always equals pc_f
        end
    end
end
// Pipeline Flush Logic
// When does a flush happen?
// In this simple pipeline with forwarding and no stalls, we don't have data hazards that require stalls.
// We have branch hazards.
// The branch decision is made in X stage.
// The instruction in F stage is the delay slot instruction.
// If the branch is taken, the instruction in F stage is the delay slot instruction.
// It should execute.
// The instruction *after* the delay slot (which will be in F stage in the next cycle) should be the target.
// We handled this by updating pc_f directly based on branch_taken_x.
// So we don't need to flush the F stage in the sense of bubbling it.
// We just redirect the fetch.
// What about the W stage?
// The instruction in W stage is I-1.
// The instruction in X stage is I (Branch).
// The instruction in F stage is I+1 (Delay Slot).
// If branch is taken, I+1 executes.
// I (Branch) commits in W stage next cycle.
// So no flush is needed for W or X.
// The only "flush" is the redirection of F.
// So pipeline_flush is always 0?
// Wait, what if we have a HALT in X stage?
// If X stage is HALT, it will set halt_internal in W stage next cycle.
// The instruction in F stage (delay slot) will execute.
// Then the pipeline halts.
// This is correct.
// What if we have a branch in X stage, and the branch target is the same as the next sequential PC?
// Then it's effectively not taken.
// Our logic handles this.
// So pipeline_flush is not needed for data hazards.
// Is it needed for control hazards?
// We resolved control hazards by redirecting F.
// So pipeline_flush <= 0;
// However, if we wanted to implement a stall, we would set pipeline_flush.
// But the problem says no stalls for ALU->ALU.
// And we don't have Load-Use hazards.
// So pipeline_flush is always 0.
// Let's double check the forwarding logic.
// ADDI: R[rd] <= R[rd] + imm
// MOV: R[rd] <= R[rs]
// BRcc: uses R0.
// Forwarding for ADDI:
// If the previous instruction (in W) wrote to R[rd], we use that value.
// This is correct.
// Forwarding for MOV:
// If the previous instruction (in W) wrote to R[rs], we use that value.
// This is correct.
// Forwarding for BRcc:
// If the previous instruction (in W) wrote to R0, we use that value.
// This is correct.
// What if the instruction in X stage is an ADDI that writes to R0, and
```
```verilog
// Branch convention: The instruction immediately following a BRcc is always executed (1 delay slot).
// If the branch is taken, the instruction in the delay slot is executed, and the next instruction
// fetched is the branch target. This is standard for simple 3-stage pipelines without prediction.
module mcu3 #(
parameter [0:255] IMEM_INIT = 256'h0 // 32 instruction bytes, byte 0 at MSB
) (
input clk,
input rst, // synchronous, active-high
output reg [4:0] pc_out, // current fetched PC (visible for testing)
output reg halt_out, // asserted forever after HALT executes
output reg signed [7:0] r0_out // current value of register R0
);
// Register File: 4 registers, 8-bit signed
reg signed [7:0] rf [0:3];
// Pipeline Registers
// Stage F (Fetch)
reg [4:0] pc_f;
reg [7:0] instr_f;
reg [4:0] npc_f; // Next PC calculated in F stage
// Stage X (Execute)
reg [4:0] pc_x;
reg [7:0] instr_x;
reg [4:0] npc_x;
reg [1:0] opcode_x;
reg [1:0] rd_x;
reg [1:0] rs_x;
reg [3:0] imm_x;
reg [3:0] off_x;
reg [1:0] cond_x;
reg signed [7:0] alu_result_x;
reg branch_taken_x;
reg is_halt_x;
reg [7:0] rs_val_x;
// Stage W (Writeback)
reg [4:0] pc_w;
reg [7:0] instr_w;
reg [1:0] opcode_w;
reg [1:0] rd_w;
reg signed [7:0] alu_result_w;
reg is_halt_w;
reg branch_taken_w;
reg [4:0] npc_w; // The PC that should be committed if branch was taken
// Forwarding logic signals
reg signed [7:0] rs_fwd;
reg signed [7:0] rd_fwd; // Not strictly needed for ADDI/MOV as rd is destination, but good for completeness if we had 2 src regs. Here only MOV has rs.
reg [1:0] fwd_sel; // 0: RF, 1: EX/MEM (X stage result), 2: MEM/WB (W stage result) - simplified to just X and W for 3 stage
// Internal wires for forwarding decisions
wire [1:0] fwd_src;
// ----------------------------------------------------------------
// Reset and Pipeline Control
// ----------------------------------------------------------------
reg pipeline_flush;
reg halt_internal;
// ----------------------------------------------------------------
// Single clocked process implementing the 3-stage pipeline.
//
// Register naming: a register suffixed _f/_x/_w holds the OUTPUT of
// that stage, i.e. the value latched at the end of the stage.
//
//   F: fetch     npc_f (or a branch target) selects the address;
//                pc_f / instr_f receive the fetched PC and byte.
//   X: execute   decodes instr_f, reads operands (with forwarding),
//                and latches the outcome into the *_x registers.
//   W: writeback commits the *_x registers to rf / r0_out and copies
//                them into the *_w registers as a trace of the last
//                committed instruction.
//
// Instruction encoding as decoded below (opcode = instr[7:6]):
//   00 ADDI  rd = [5:4], imm  = [3:0]   R[rd] <= R[rd] + sext(imm)
//   01 MOV   rd = [5:4], rs   = [3:2]   R[rd] <= R[rs]
//   10 BRcc  cond = [5:4], off = [3:0]  target = PC + sext(off) + 1
//   11 HALT
//
// Hazards:
//   * Data: the only instruction whose result is not yet in rf when X
//     reads its operands is the one held in the *_x registers, so one
//     forwarding path (alu_result_x) is sufficient and no stall is
//     ever needed. pipeline_flush therefore stays at its reset value.
//   * Control: a branch decision is registered in branch_taken_x and
//     redirects the fetch one cycle later, which yields exactly one
//     delay slot (the instruction after a BRcc always executes).
//
// Temporaries are declared in the named block and written with
// blocking assignments before they are read, so they never hold state
// across clock edges. Architectural state uses nonblocking assignments.
// ----------------------------------------------------------------
always @(posedge clk) begin : mcu3_pipeline
    integer          i;             // reset loop index
    reg [4:0]        fetch_addr;    // address fetched this cycle
    reg [4:0]        br_target;     // branch target, wraps modulo 32
    reg [1:0]        dec_op;        // instr_f[7:6]
    reg [1:0]        dec_hi;        // instr_f[5:4]: rd (ADDI/MOV) or cond (BRcc)
    reg [1:0]        dec_rs;        // instr_f[3:2]: rs (MOV)
    reg [3:0]        dec_lo;        // instr_f[3:0]: imm (ADDI) or off (BRcc)
    reg              ahead_writes;  // instruction in *_x writes a register
    reg signed [7:0] rd_cur;        // forwarded R[rd] as seen by X
    reg signed [7:0] rs_cur;        // forwarded R[rs] as seen by X
    reg signed [7:0] r0_cur;        // forwarded R0 as seen by X
    reg signed [7:0] result;        // X-stage result
    reg              taken;         // X-stage branch decision

    if (rst) begin
        // Synchronous reset: every stage holds an all-zero bubble.
        // Opcode 00 with rd = 0 and imm = 0 is "ADDI R0, 0", a no-op.
        pc_f           <= 0;
        instr_f        <= 0;
        npc_f          <= 0;   // first fetch after reset reads address 0
        pc_x           <= 0;
        instr_x        <= 0;
        npc_x          <= 0;
        opcode_x       <= 0;
        rd_x           <= 0;
        rs_x           <= 0;
        imm_x          <= 0;
        off_x          <= 0;
        cond_x         <= 0;
        alu_result_x   <= 0;
        branch_taken_x <= 0;
        is_halt_x      <= 0;
        rs_val_x       <= 0;
        pc_w           <= 0;
        instr_w        <= 0;
        opcode_w       <= 0;
        rd_w           <= 0;
        alu_result_w   <= 0;
        is_halt_w      <= 0;
        branch_taken_w <= 0;
        npc_w          <= 0;
        for (i = 0; i < 4; i = i + 1)
            rf[i] <= 0;
        pc_out         <= 0;
        halt_out       <= 0;
        r0_out         <= 0;
        halt_internal  <= 0;
        pipeline_flush <= 0;
    end else if (halt_internal) begin
        // Halted: all state is frozen and halt_out stays asserted.
        halt_out <= 1;
    end else begin
        // ------------------------------------------------------------
        // Stage W: commit the instruction held in the *_x registers.
        // ------------------------------------------------------------
        pc_w           <= pc_x;
        instr_w        <= instr_x;
        opcode_w       <= opcode_x;
        rd_w           <= rd_x;
        alu_result_w   <= alu_result_x;
        is_halt_w      <= is_halt_x;
        branch_taken_w <= branch_taken_x;
        npc_w          <= npc_x;

        // Only ADDI (00) and MOV (01) write the register file.
        ahead_writes = (opcode_x == 2'b00) || (opcode_x == 2'b01);

        if (is_halt_x) begin
            // HALT reached writeback. The younger instruction currently
            // in X is squashed: nothing below runs, so it never commits.
            halt_internal <= 1;
            halt_out      <= 1;
        end else begin
            if (ahead_writes) begin
                rf[rd_x] <= alu_result_x;
                if (rd_x == 2'd0)
                    r0_out <= alu_result_x;   // keep r0_out equal to rf[0]
            end

            // --------------------------------------------------------
            // Stage X: decode and execute the instruction in instr_f.
            // --------------------------------------------------------
            dec_op = instr_f[7:6];
            dec_hi = instr_f[5:4];
            dec_rs = instr_f[3:2];
            dec_lo = instr_f[3:0];

            // Forward from the instruction one ahead. Its result is
            // being written to rf on this same edge, so rf still holds
            // the old value during this cycle.
            rd_cur = (ahead_writes && (rd_x == dec_hi)) ? alu_result_x : rf[dec_hi];
            rs_cur = (ahead_writes && (rd_x == dec_rs)) ? alu_result_x : rf[dec_rs];
            r0_cur = (ahead_writes && (rd_x == 2'd0))   ? alu_result_x : rf[0];

            // Target is relative to the branch's own PC; 5-bit arithmetic
            // wraps within the 32-entry instruction memory.
            br_target = pc_f + {dec_lo[3], dec_lo} + 5'd1;

            result = 8'd0;
            taken  = 1'b0;
            case (dec_op)
                2'b00: begin // ADDI: rd is both source and destination
                    result = rd_cur + $signed({{4{dec_lo[3]}}, dec_lo});
                end
                2'b01: begin // MOV
                    result = rs_cur;
                end
                2'b10: begin // BRcc: condition is evaluated on R0
                    case (dec_hi)
                        2'b00: taken = 1'b1;                            // always
                        2'b01: taken = (r0_cur == 8'd0);                // R0 == 0
                        2'b10: taken = r0_cur[7];                       // R0 <  0
                        2'b11: taken = !r0_cur[7] && (r0_cur != 8'd0);  // R0 >  0
                    endcase
                    result = {3'b000, br_target};  // target carried in alu_result_x
                end
                default: begin // HALT produces no result
                    result = 8'd0;
                end
            endcase

            pc_x           <= pc_f;
            instr_x        <= instr_f;
            npc_x          <= npc_f;
            opcode_x       <= dec_op;
            // Fields that an opcode does not use are latched as zero.
            rd_x           <= (dec_op == 2'b00 || dec_op == 2'b01) ? dec_hi : 2'd0;
            rs_x           <= (dec_op == 2'b01) ? dec_rs : 2'd0;
            imm_x          <= (dec_op == 2'b00) ? dec_lo : 4'd0;
            off_x          <= (dec_op == 2'b10) ? dec_lo : 4'd0;
            cond_x         <= (dec_op == 2'b10) ? dec_hi : 2'd0;
            alu_result_x   <= result;
            branch_taken_x <= taken;
            is_halt_x      <= (dec_op == 2'b11);
            rs_val_x       <= rs_cur;

            // --------------------------------------------------------
            // Stage F: fetch the next instruction.
            // A taken branch recorded in *_x last cycle redirects the
            // fetch now; the instruction fetched in between (currently
            // in instr_f) is the delay slot and has just executed above.
            // --------------------------------------------------------
            if ((opcode_x == 2'b10) && branch_taken_x)
                fetch_addr = alu_result_x[4:0];
            else
                fetch_addr = npc_f;

            pc_f    <= fetch_addr;
            // IMEM_INIT is declared [0:255], so byte k is bits [8k : 8k+7].
            instr_f <= IMEM_INIT[{fetch_addr, 3'b000} +: 8];
            npc_f   <= fetch_addr + 5'd1;
            pc_out  <= fetch_addr;   // pc_out always equals pc_f
        end
    end
end
// Pipeline Flush Logic
// When does a flush happen?
// In this simple pipeline with forwarding and no stalls, we don't have data hazards that require stalls.
// We have branch hazards.
// The branch decision is made in X stage.
// The instruction in F stage is the delay slot instruction.
// If the branch is taken, the instruction in F stage is the delay slot instruction.
// It should execute.
// The instruction *after* the delay slot (which will be in F stage in the next cycle) should be the target.
// We handled this by updating pc_f directly based on branch_taken_x.
// So we don't need to flush the F stage in the sense of bubbling it.
// We just redirect the fetch.
// What about the W stage?
// The instruction in W stage is I-1.
// The instruction in X stage is I (Branch).
// The instruction in F stage is I+1 (Delay Slot).
// If branch is taken, I+1 executes.
// I (Branch) commits in W stage next cycle.
// So no flush is needed for W or X.
// The only "flush" is the redirection of F.
// So pipeline_flush is always 0?
// Wait, what if we have a HALT in X stage?
// If X stage is HALT, it will set halt_internal in W stage next cycle.
// The instruction in F stage (delay slot) will execute.
// Then the pipeline halts.
// This is correct.
// What if we have a branch in X stage, and the branch target is the same as the next sequential PC?
// Then it's effectively not taken.
// Our logic handles this.
// So pipeline_flush is not needed for data hazards.
// Is it needed for control hazards?
// We resolved control hazards by redirecting F.
// So pipeline_flush <= 0;
// However, if we wanted to implement a stall, we would set pipeline_flush.
// But the problem says no stalls for ALU->ALU.
// And we don't have Load-Use hazards.
// So pipeline_flush is always 0.
// Let's double check the forwarding logic.
// ADDI: R[rd] <= R[rd] + imm
// MOV: R[rd] <= R[rs]
// BRcc: uses R0.
// Forwarding for ADDI:
// If the previous instruction (in W) wrote to R[rd], we use that value.
// This is correct.
// Forwarding for MOV:
// If the previous instruction (in W) wrote to R[rs], we use that value.
// This is correct.
// Forwarding for BRcc:
// If the previous instruction (in W) wrote to R0, we use that value.
// This is correct.
// What if the instruction in X stage is an ADDI that writes to R0, and