no functional run (did not compile)
```verilog
// Branch convention: the instruction immediately following a BRcc is always
// fetched and executed (one branch delay slot), whether or not the branch is
// taken. The delay-slot instruction's register writeback is committed
// normally; this implementation does not suppress it. When the branch is
// taken, only the fetch that follows the delay slot is redirected to the
// branch target.
module mcu3 #(
parameter [0:255] IMEM_INIT = 256'h0 // 32 instruction bytes, byte 0 at MSB
) (
input clk,
input rst, // synchronous, active-high
output reg [4:0] pc_out, // current fetched PC (visible for testing)
output reg halt_out, // asserted forever after HALT executes
output reg signed [7:0] r0_out // current value of register R0
);
// Register File: 4 registers (R0..R3), 8-bit signed.
// R0 is an ordinary register; its committed value is mirrored on r0_out.
reg signed [7:0] rf [0:3];
// Pipeline Stage Registers
// Each stage carries a PC, the raw instruction byte and a valid flag; the
// valid flag marks whether the stage holds a real instruction or a bubble.
// Stage F (Fetch)
// pc_f / instr_f / valid_f are handed to the X stage as a group on each
// pipeline advance.
reg [4:0] pc_f;
reg [7:0] instr_f;
reg valid_f;
// Stage X (Execute)
reg [4:0] pc_x;
reg [7:0] instr_x;
reg valid_x;
// Decoded fields of the instruction currently in X, produced when the
// instruction moves from F to X.
reg [1:0] rd_x; // destination register index (0 when unused)
reg [1:0] rs_x; // MOV source register index (0 when unused)
reg signed [7:0] result_x; // NOTE(review): no assignment to result_x is visible in this file -- confirm
reg is_branch_x; // instruction in X is a BRcc
reg branch_taken_x; // registered branch flag; cleared on decode of non-branch instructions
reg is_halt_x; // instruction in X is a HALT
// Stage W (Writeback)
// Copies of the X-stage fields taken on each pipeline advance; result_w is
// the value written to rf[rd_w] when the W instruction commits.
reg [4:0] pc_w;
reg [7:0] instr_w;
reg valid_w;
reg [1:0] rd_w;
reg signed [7:0] result_w;
reg is_branch_w;
reg branch_taken_w; // latched copy of the X-stage branch flag; no visible logic reads it
reg is_halt_w;
// sext4: widen a 4-bit two's-complement field to 8 bits by copying its sign
// bit (val[3]) into each of the four upper result bits.
function signed [7:0] sext4;
    input [3:0] val;
    begin
        sext4 = {val[3], val[3], val[3], val[3], val};
    end
endfunction
// Helper: plain register-file read (no forwarding).
//
// Branch-delay-slot convention used by this design: the instruction that
// follows a BRcc always executes and its register write is committed
// normally; a taken branch only redirects the fetch after the delay slot.
//
// R0 is an ordinary register (cleared on reset like R1..R3), not a hardwired
// zero, so every index is read straight from the register file.
//
//   addr    : register index 0..3
//   returns : current committed contents of rf[addr]
function signed [7:0] read_reg;
    input [1:0] addr;
    begin
        read_reg = rf[addr];
    end
endfunction
// Operand read for the instruction currently in X, with forwarding.
//
// ADDI reads R[rd] and MOV reads R[rs]. The only older instruction whose
// result is not yet in the register file is the one in W: its value sits in
// result_w and is written to rf[rd_w] on the next clock edge. If it targets
// `src`, return result_w; otherwise the register file is up to date.
//
// There is deliberately no forwarding path from X: rd_x describes the very
// instruction that is doing the read, so matching against it would make an
// instruction forward from itself.
//
// Branches and HALT never write a register, so they are excluded.
//
//   src     : register index to read
//   returns : the value the X-stage instruction must see for R[src]
function signed [7:0] get_reg_val;
    input [1:0] src;
    begin
        if (valid_w && !is_branch_w && !is_halt_w && (rd_w == src))
            get_reg_val = result_w;
        else
            get_reg_val = rf[src];
    end
endfunction
// Decode Logic
// Instruction byte layout as used by the rest of this file:
//   [7:6] opcode : 00 = ADDI, 01 = MOV, 10 = BRcc, 11 = HALT
//   [5:4] rd (ADDI/MOV) or branch condition (BRcc)
//   [3:0] immediate (ADDI) or branch offset (BRcc); [3:2] is rs for MOV
// Field views of the instruction held in the F stage.
wire [1:0] opcode = instr_f[7:6];
wire [1:0] rd_f = instr_f[5:4];
wire [3:0] imm_f = instr_f[3:0];
wire [1:0] rs_f = instr_f[3:2]; // For MOV
wire [1:0] cond_f = instr_f[5:4]; // For BRcc
wire [3:0] off_f = instr_f[3:0]; // For BRcc
// Execute Stage Logic
// Field views of the instruction held in the X stage; these feed the
// combinational ALU, branch-decision and next-PC logic.
wire [1:0] opcode_x = instr_x[7:6];
wire [1:0] rd_x_wire = instr_x[5:4];
wire [3:0] imm_x = instr_x[3:0];
wire [1:0] rs_x_wire = instr_x[3:2];
wire [1:0] cond_x = instr_x[5:4];
wire [3:0] off_x = instr_x[3:0];
// X-stage ALU (combinational).
//   ADDI (opcode 00): alu_result_x = R[rd] + sext4(imm)
//   MOV  (opcode 01): alu_result_x = R[rs]
//   anything else   : no register result; both outputs are driven to 0
//
// The source operand is taken from result_w when the instruction in W is a
// valid register-writing instruction targeting the same register, otherwise
// from the register file. The forwarding mux is written inline rather than
// through a function call so that every signal it depends on (valid_w, rd_w,
// result_w, rf, ...) appears in this block and is therefore covered by the
// implicit @(*) sensitivity list.
reg signed [7:0] src_val_x;
reg signed [7:0] alu_result_x;
always @(*) begin
    case (opcode_x)
        2'b00: begin // ADDI: read-modify-write of R[rd]
            if (valid_w && !is_branch_w && !is_halt_w && (rd_w == rd_x_wire))
                src_val_x = result_w;
            else
                src_val_x = rf[rd_x_wire];
            alu_result_x = src_val_x + sext4(imm_x);
        end
        2'b01: begin // MOV: copy R[rs]
            if (valid_w && !is_branch_w && !is_halt_w && (rd_w == rs_x_wire))
                src_val_x = result_w;
            else
                src_val_x = rf[rs_x_wire];
            alu_result_x = src_val_x;
        end
        default: begin // BRcc / HALT produce no register value
            src_val_x    = 8'sd0;
            alu_result_x = 8'sd0;
        end
    endcase
end
// Branch decision for the instruction in X (combinational).
//   cond_x 00 : always taken
//   cond_x 01 : taken when R0 == 0
//   cond_x 10 : taken when R0 <  0
//   cond_x 11 : taken when R0 >  0
// branch_taken_x_int is 0 whenever the X instruction is not a BRcc.
//
// R0 is taken from result_w when the instruction in W is about to write R0
// (its value has not reached r0_out yet); otherwise the committed r0_out is
// used. Without this, a branch placed directly after an instruction that
// writes R0 would test the stale value.
reg branch_taken_x_int;
always @(*) begin : branch_decide
    reg signed [7:0] r0_now; // R0 as seen by the branch, after forwarding

    if (valid_w && !is_branch_w && !is_halt_w && (rd_w == 2'b00))
        r0_now = result_w;
    else
        r0_now = r0_out;

    branch_taken_x_int = 1'b0;
    if (opcode_x == 2'b10) begin // BRcc
        case (cond_x)
            2'b00:   branch_taken_x_int = 1'b1;
            2'b01:   branch_taken_x_int = (r0_now == 8'sd0); // Zero
            2'b10:   branch_taken_x_int = (r0_now <  8'sd0); // Neg
            2'b11:   branch_taken_x_int = (r0_now >  8'sd0); // Pos
            default: branch_taken_x_int = 1'b0;
        endcase
    end
end
// Next fetch address (combinational).
//
// A branch is resolved while it is in X. At that moment the F stage holds
// the delay-slot instruction (address pc_x + 1), so the address chosen here
// is the one fetched *after* the delay slot:
//   branch in X, taken     : pc_x + sext4(off) + 1   (ISA: PC + sext(off) + 1)
//   branch in X, not taken : pc_x + 2                (instruction after the delay slot)
//   no branch in X         : pc_f + 1                (sequential fetch)
//
// The taken/not-taken decision comes from branch_taken_x_int, the
// combinational result for the instruction in X; the branch_taken_x register
// is never loaded with a branch outcome. Arithmetic wraps modulo 32 because
// the result is truncated to the 5-bit PC.
reg [4:0] next_pc_f;
always @(*) begin
    if (valid_x && is_branch_x && branch_taken_x_int)
        next_pc_f = pc_x + sext4(off_x) + 5'd1;
    else if (valid_x && is_branch_x)
        next_pc_f = pc_x + 5'd2;
    else
        next_pc_f = pc_f + 5'd1;
end
// HALT handling: pipeline freeze request (combinational).
//
// halt_freeze is asserted once halt_out has been set and stays asserted until
// reset. It is intentionally NOT asserted merely because a HALT is sitting in
// the X stage: stopping the pipeline that early would also hold back the
// older instruction still waiting in W and the update of halt_out itself.
reg halt_freeze;
always @(*) begin
    halt_freeze = !rst && halt_out;
end
// Main sequential logic: reset, writeback commit, HALT and pipeline advance.
//
// Stage model
//   F : instr_f holds the instruction at address pc_f; pc_out mirrors pc_f.
//   X : instr_x is evaluated combinationally (alu_result_x,
//       branch_taken_x_int).
//   W : result_w is written to rf[rd_w] on the following clock edge; r0_out
//       is updated together with rf[0].
//
// Branches are resolved in X while the delay-slot instruction is in F, so the
// delay slot always enters X and commits; only the fetch after it is
// redirected (target = branch PC + sext4(off) + 1).
//
// HALT takes effect in X: on that edge the older instruction in W still
// commits and halt_out is set; from then on no state changes until reset.
//
// Instruction memory: IMEM_INIT is declared [0:255] with byte 0 at the MSB,
// so bit 0 is the leftmost bit and byte k occupies bits [8k : 8k+7].
always @(posedge clk) begin : pipeline_seq
    reg [4:0] fetch_pc; // address loaded into F on this edge (blocking temporary)

    // Address of the next fetch: branch target if the X-stage branch is
    // taken, otherwise the instruction after the one currently in F.
    if (valid_x && is_branch_x && branch_taken_x_int)
        fetch_pc = pc_x + sext4(instr_x[3:0]) + 5'd1;
    else
        fetch_pc = pc_f + 5'd1;

    if (rst) begin
        // Architectural state
        pc_out   <= 5'd0;
        halt_out <= 1'b0;
        r0_out   <= 8'sd0;
        rf[0]    <= 8'sd0;
        rf[1]    <= 8'sd0;
        rf[2]    <= 8'sd0;
        rf[3]    <= 8'sd0;

        // F stage starts out holding instruction 0.
        pc_f    <= 5'd0;
        instr_f <= IMEM_INIT[0:7];
        valid_f <= 1'b1;

        // X and W start as bubbles.
        valid_x        <= 1'b0;
        rd_x           <= 2'b00;
        rs_x           <= 2'b00;
        is_branch_x    <= 1'b0;
        branch_taken_x <= 1'b0;
        is_halt_x      <= 1'b0;

        valid_w        <= 1'b0;
        rd_w           <= 2'b00;
        is_branch_w    <= 1'b0;
        branch_taken_w <= 1'b0;
        is_halt_w      <= 1'b0;
    end else if (!halt_out) begin
        // ---- W: commit the oldest instruction --------------------------
        // The delay-slot instruction commits like any other; only BRcc and
        // HALT have no register result.
        if (valid_w && !is_halt_w && !is_branch_w) begin
            rf[rd_w] <= result_w;
            if (rd_w == 2'b00)
                r0_out <= result_w;
        end

        if (valid_x && is_halt_x) begin
            // ---- HALT executes: stop here, nothing younger advances ----
            halt_out <= 1'b1;
        end else begin
            // ---- W <- X ------------------------------------------------
            pc_w           <= pc_x;
            instr_w        <= instr_x;
            valid_w        <= valid_x;
            rd_w           <= rd_x;
            result_w       <= alu_result_x;       // value computed in X this cycle
            is_branch_w    <= is_branch_x;
            branch_taken_w <= branch_taken_x_int; // decision made in X this cycle
            is_halt_w      <= is_halt_x;

            // ---- X <- F (with decode) ----------------------------------
            pc_x    <= pc_f;
            instr_x <= instr_f;
            valid_x <= valid_f;

            // Defaults; the case below overrides what the opcode needs.
            rd_x           <= 2'b00;
            rs_x           <= 2'b00;
            is_branch_x    <= 1'b0;
            is_halt_x      <= 1'b0;
            branch_taken_x <= 1'b0; // decision is taken from branch_taken_x_int
            if (valid_f) begin
                case (instr_f[7:6])
                    2'b00: begin // ADDI
                        rd_x <= instr_f[5:4];
                    end
                    2'b01: begin // MOV
                        rd_x <= instr_f[5:4];
                        rs_x <= instr_f[3:2];
                    end
                    2'b10: begin // BRcc
                        is_branch_x <= 1'b1;
                    end
                    2'b11: begin // HALT
                        is_halt_x <= 1'b1;
                    end
                    default: begin
                        // unreachable for defined opcode bits; keep defaults
                    end
                endcase
            end

            // ---- F <- fetch --------------------------------------------
            pc_f    <= fetch_pc;
            pc_out  <= fetch_pc;
            instr_f <= IMEM_INIT[{fetch_pc, 3'b000} +: 8]; // byte fetch_pc
            valid_f <= 1'b1;
        end
    end
    // halt_out set: every register holds its value until reset.
end
// Combinational Logic for X Stage Results
// This logic runs every cycle
```
```verilog
// Branch convention: the instruction immediately following a BRcc is always
// fetched and executed (one branch delay slot), whether or not the branch is
// taken. The delay-slot instruction's register writeback is committed
// normally; this implementation does not suppress it. When the branch is
// taken, only the fetch that follows the delay slot is redirected to the
// branch target.
module mcu3 #(
parameter [0:255] IMEM_INIT = 256'h0 // 32 instruction bytes, byte 0 at MSB
) (
input clk,
input rst, // synchronous, active-high
output reg [4:0] pc_out, // current fetched PC (visible for testing)
output reg halt_out, // asserted forever after HALT executes
output reg signed [7:0] r0_out // current value of register R0
);
// Register File: 4 registers (R0..R3), 8-bit signed.
// R0 is an ordinary register; its committed value is mirrored on r0_out.
reg signed [7:0] rf [0:3];
// Pipeline Stage Registers
// Each stage carries a PC, the raw instruction byte and a valid flag; the
// valid flag marks whether the stage holds a real instruction or a bubble.
// Stage F (Fetch)
// pc_f / instr_f / valid_f are handed to the X stage as a group on each
// pipeline advance.
reg [4:0] pc_f;
reg [7:0] instr_f;
reg valid_f;
// Stage X (Execute)
reg [4:0] pc_x;
reg [7:0] instr_x;
reg valid_x;
// Decoded fields of the instruction currently in X, produced when the
// instruction moves from F to X.
reg [1:0] rd_x; // destination register index (0 when unused)
reg [1:0] rs_x; // MOV source register index (0 when unused)
reg signed [7:0] result_x; // NOTE(review): no assignment to result_x is visible in this file -- confirm
reg is_branch_x; // instruction in X is a BRcc
reg branch_taken_x; // registered branch flag; cleared on decode of non-branch instructions
reg is_halt_x; // instruction in X is a HALT
// Stage W (Writeback)
// Copies of the X-stage fields taken on each pipeline advance; result_w is
// the value written to rf[rd_w] when the W instruction commits.
reg [4:0] pc_w;
reg [7:0] instr_w;
reg valid_w;
reg [1:0] rd_w;
reg signed [7:0] result_w;
reg is_branch_w;
reg branch_taken_w; // latched copy of the X-stage branch flag; no visible logic reads it
reg is_halt_w;
// sext4: widen a 4-bit two's-complement field to 8 bits by copying its sign
// bit (val[3]) into each of the four upper result bits.
function signed [7:0] sext4;
    input [3:0] val;
    begin
        sext4 = {val[3], val[3], val[3], val[3], val};
    end
endfunction
// Helper: plain register-file read (no forwarding).
//
// Branch-delay-slot convention used by this design: the instruction that
// follows a BRcc always executes and its register write is committed
// normally; a taken branch only redirects the fetch after the delay slot.
//
// R0 is an ordinary register (cleared on reset like R1..R3), not a hardwired
// zero, so every index is read straight from the register file.
//
//   addr    : register index 0..3
//   returns : current committed contents of rf[addr]
function signed [7:0] read_reg;
    input [1:0] addr;
    begin
        read_reg = rf[addr];
    end
endfunction
// Operand read for the instruction currently in X, with forwarding.
//
// ADDI reads R[rd] and MOV reads R[rs]. The only older instruction whose
// result is not yet in the register file is the one in W: its value sits in
// result_w and is written to rf[rd_w] on the next clock edge. If it targets
// `src`, return result_w; otherwise the register file is up to date.
//
// There is deliberately no forwarding path from X: rd_x describes the very
// instruction that is doing the read, so matching against it would make an
// instruction forward from itself.
//
// Branches and HALT never write a register, so they are excluded.
//
//   src     : register index to read
//   returns : the value the X-stage instruction must see for R[src]
function signed [7:0] get_reg_val;
    input [1:0] src;
    begin
        if (valid_w && !is_branch_w && !is_halt_w && (rd_w == src))
            get_reg_val = result_w;
        else
            get_reg_val = rf[src];
    end
endfunction
// Decode Logic
// Instruction byte layout as used by the rest of this file:
//   [7:6] opcode : 00 = ADDI, 01 = MOV, 10 = BRcc, 11 = HALT
//   [5:4] rd (ADDI/MOV) or branch condition (BRcc)
//   [3:0] immediate (ADDI) or branch offset (BRcc); [3:2] is rs for MOV
// Field views of the instruction held in the F stage.
wire [1:0] opcode = instr_f[7:6];
wire [1:0] rd_f = instr_f[5:4];
wire [3:0] imm_f = instr_f[3:0];
wire [1:0] rs_f = instr_f[3:2]; // For MOV
wire [1:0] cond_f = instr_f[5:4]; // For BRcc
wire [3:0] off_f = instr_f[3:0]; // For BRcc
// Execute Stage Logic
// Field views of the instruction held in the X stage; these feed the
// combinational ALU, branch-decision and next-PC logic.
wire [1:0] opcode_x = instr_x[7:6];
wire [1:0] rd_x_wire = instr_x[5:4];
wire [3:0] imm_x = instr_x[3:0];
wire [1:0] rs_x_wire = instr_x[3:2];
wire [1:0] cond_x = instr_x[5:4];
wire [3:0] off_x = instr_x[3:0];
// X-stage ALU (combinational).
//   ADDI (opcode 00): alu_result_x = R[rd] + sext4(imm)
//   MOV  (opcode 01): alu_result_x = R[rs]
//   anything else   : no register result; both outputs are driven to 0
//
// The source operand is taken from result_w when the instruction in W is a
// valid register-writing instruction targeting the same register, otherwise
// from the register file. The forwarding mux is written inline rather than
// through a function call so that every signal it depends on (valid_w, rd_w,
// result_w, rf, ...) appears in this block and is therefore covered by the
// implicit @(*) sensitivity list.
reg signed [7:0] src_val_x;
reg signed [7:0] alu_result_x;
always @(*) begin
    case (opcode_x)
        2'b00: begin // ADDI: read-modify-write of R[rd]
            if (valid_w && !is_branch_w && !is_halt_w && (rd_w == rd_x_wire))
                src_val_x = result_w;
            else
                src_val_x = rf[rd_x_wire];
            alu_result_x = src_val_x + sext4(imm_x);
        end
        2'b01: begin // MOV: copy R[rs]
            if (valid_w && !is_branch_w && !is_halt_w && (rd_w == rs_x_wire))
                src_val_x = result_w;
            else
                src_val_x = rf[rs_x_wire];
            alu_result_x = src_val_x;
        end
        default: begin // BRcc / HALT produce no register value
            src_val_x    = 8'sd0;
            alu_result_x = 8'sd0;
        end
    endcase
end
// Branch decision for the instruction in X (combinational).
//   cond_x 00 : always taken
//   cond_x 01 : taken when R0 == 0
//   cond_x 10 : taken when R0 <  0
//   cond_x 11 : taken when R0 >  0
// branch_taken_x_int is 0 whenever the X instruction is not a BRcc.
//
// R0 is taken from result_w when the instruction in W is about to write R0
// (its value has not reached r0_out yet); otherwise the committed r0_out is
// used. Without this, a branch placed directly after an instruction that
// writes R0 would test the stale value.
reg branch_taken_x_int;
always @(*) begin : branch_decide
    reg signed [7:0] r0_now; // R0 as seen by the branch, after forwarding

    if (valid_w && !is_branch_w && !is_halt_w && (rd_w == 2'b00))
        r0_now = result_w;
    else
        r0_now = r0_out;

    branch_taken_x_int = 1'b0;
    if (opcode_x == 2'b10) begin // BRcc
        case (cond_x)
            2'b00:   branch_taken_x_int = 1'b1;
            2'b01:   branch_taken_x_int = (r0_now == 8'sd0); // Zero
            2'b10:   branch_taken_x_int = (r0_now <  8'sd0); // Neg
            2'b11:   branch_taken_x_int = (r0_now >  8'sd0); // Pos
            default: branch_taken_x_int = 1'b0;
        endcase
    end
end
// Next fetch address (combinational).
//
// A branch is resolved while it is in X. At that moment the F stage holds
// the delay-slot instruction (address pc_x + 1), so the address chosen here
// is the one fetched *after* the delay slot:
//   branch in X, taken     : pc_x + sext4(off) + 1   (ISA: PC + sext(off) + 1)
//   branch in X, not taken : pc_x + 2                (instruction after the delay slot)
//   no branch in X         : pc_f + 1                (sequential fetch)
//
// The taken/not-taken decision comes from branch_taken_x_int, the
// combinational result for the instruction in X; the branch_taken_x register
// is never loaded with a branch outcome. Arithmetic wraps modulo 32 because
// the result is truncated to the 5-bit PC.
reg [4:0] next_pc_f;
always @(*) begin
    if (valid_x && is_branch_x && branch_taken_x_int)
        next_pc_f = pc_x + sext4(off_x) + 5'd1;
    else if (valid_x && is_branch_x)
        next_pc_f = pc_x + 5'd2;
    else
        next_pc_f = pc_f + 5'd1;
end
// HALT handling: pipeline freeze request (combinational).
//
// halt_freeze is asserted once halt_out has been set and stays asserted until
// reset. It is intentionally NOT asserted merely because a HALT is sitting in
// the X stage: stopping the pipeline that early would also hold back the
// older instruction still waiting in W and the update of halt_out itself.
reg halt_freeze;
always @(*) begin
    halt_freeze = !rst && halt_out;
end
// Main sequential logic: reset, writeback commit, HALT and pipeline advance.
//
// Stage model
//   F : instr_f holds the instruction at address pc_f; pc_out mirrors pc_f.
//   X : instr_x is evaluated combinationally (alu_result_x,
//       branch_taken_x_int).
//   W : result_w is written to rf[rd_w] on the following clock edge; r0_out
//       is updated together with rf[0].
//
// Branches are resolved in X while the delay-slot instruction is in F, so the
// delay slot always enters X and commits; only the fetch after it is
// redirected (target = branch PC + sext4(off) + 1).
//
// HALT takes effect in X: on that edge the older instruction in W still
// commits and halt_out is set; from then on no state changes until reset.
//
// Instruction memory: IMEM_INIT is declared [0:255] with byte 0 at the MSB,
// so bit 0 is the leftmost bit and byte k occupies bits [8k : 8k+7].
always @(posedge clk) begin : pipeline_seq
    reg [4:0] fetch_pc; // address loaded into F on this edge (blocking temporary)

    // Address of the next fetch: branch target if the X-stage branch is
    // taken, otherwise the instruction after the one currently in F.
    if (valid_x && is_branch_x && branch_taken_x_int)
        fetch_pc = pc_x + sext4(instr_x[3:0]) + 5'd1;
    else
        fetch_pc = pc_f + 5'd1;

    if (rst) begin
        // Architectural state
        pc_out   <= 5'd0;
        halt_out <= 1'b0;
        r0_out   <= 8'sd0;
        rf[0]    <= 8'sd0;
        rf[1]    <= 8'sd0;
        rf[2]    <= 8'sd0;
        rf[3]    <= 8'sd0;

        // F stage starts out holding instruction 0.
        pc_f    <= 5'd0;
        instr_f <= IMEM_INIT[0:7];
        valid_f <= 1'b1;

        // X and W start as bubbles.
        valid_x        <= 1'b0;
        rd_x           <= 2'b00;
        rs_x           <= 2'b00;
        is_branch_x    <= 1'b0;
        branch_taken_x <= 1'b0;
        is_halt_x      <= 1'b0;

        valid_w        <= 1'b0;
        rd_w           <= 2'b00;
        is_branch_w    <= 1'b0;
        branch_taken_w <= 1'b0;
        is_halt_w      <= 1'b0;
    end else if (!halt_out) begin
        // ---- W: commit the oldest instruction --------------------------
        // The delay-slot instruction commits like any other; only BRcc and
        // HALT have no register result.
        if (valid_w && !is_halt_w && !is_branch_w) begin
            rf[rd_w] <= result_w;
            if (rd_w == 2'b00)
                r0_out <= result_w;
        end

        if (valid_x && is_halt_x) begin
            // ---- HALT executes: stop here, nothing younger advances ----
            halt_out <= 1'b1;
        end else begin
            // ---- W <- X ------------------------------------------------
            pc_w           <= pc_x;
            instr_w        <= instr_x;
            valid_w        <= valid_x;
            rd_w           <= rd_x;
            result_w       <= alu_result_x;       // value computed in X this cycle
            is_branch_w    <= is_branch_x;
            branch_taken_w <= branch_taken_x_int; // decision made in X this cycle
            is_halt_w      <= is_halt_x;

            // ---- X <- F (with decode) ----------------------------------
            pc_x    <= pc_f;
            instr_x <= instr_f;
            valid_x <= valid_f;

            // Defaults; the case below overrides what the opcode needs.
            rd_x           <= 2'b00;
            rs_x           <= 2'b00;
            is_branch_x    <= 1'b0;
            is_halt_x      <= 1'b0;
            branch_taken_x <= 1'b0; // decision is taken from branch_taken_x_int
            if (valid_f) begin
                case (instr_f[7:6])
                    2'b00: begin // ADDI
                        rd_x <= instr_f[5:4];
                    end
                    2'b01: begin // MOV
                        rd_x <= instr_f[5:4];
                        rs_x <= instr_f[3:2];
                    end
                    2'b10: begin // BRcc
                        is_branch_x <= 1'b1;
                    end
                    2'b11: begin // HALT
                        is_halt_x <= 1'b1;
                    end
                    default: begin
                        // unreachable for defined opcode bits; keep defaults
                    end
                endcase
            end

            // ---- F <- fetch --------------------------------------------
            pc_f    <= fetch_pc;
            pc_out  <= fetch_pc;
            instr_f <= IMEM_INIT[{fetch_pc, 3'b000} +: 8]; // byte fetch_pc
            valid_f <= 1'b1;
        end
    end
    // halt_out set: every register holds its value until reset.
end
// Combinational Logic for X Stage Results
// This logic runs every cycle