no functional run (did not compile)
// mcu3: 3-stage (Fetch / Execute / Writeback) 8-bit micro-controller.
//
// Instruction encoding (one byte, opcode in bits [7:6]):
//   00 ADDI  rd=[5:4] imm=[3:0]  R[rd] <= R[rd] + sign_ext(imm)
//   01 MOV   rd=[5:4] rs=[3:2]   R[rd] <= R[rs]
//   10 BRcc  cc=[5:4] off=[3:0]  if cc(R0): PC <= PC + sign_ext(off) + 1
//            cc: 00 always, 01 R0 == 0, 10 R0 < 0, 11 R0 > 0
//   11 HALT                      stop fetching and raise halt_out
//
// Pipeline behaviour implemented below:
//   * F: the byte addressed by pc_out is read combinationally from IMEM_INIT.
//   * X: the instruction is decoded and executed from the F/X register.
//        Operands come from the register file unless the instruction that is
//        currently in W writes the same register, in which case its result is
//        forwarded (no stalls for back-to-back ADDI / MOV / BRcc-on-R0).
//   * W: the result is committed to the register file (and to r0_out for R0).
//   * Branches resolve in X, so the instruction fetched while the branch is in
//     X (the one at branch PC + 1) always executes: one branch delay slot.
//   * When HALT is in X the instruction behind it is squashed and the PC is
//     held; halt_out rises on the clock edge that retires HALT from W.
module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0 // 32 instruction bytes, byte 0 at MSB
) (
    input                   clk,
    input                   rst,      // synchronous, active-high
    output reg        [4:0] pc_out,   // address of the instruction being fetched
    output reg              halt_out, // set when HALT retires; cleared only by rst
    output reg signed [7:0] r0_out    // committed value of register R0
);
    localparam [1:0] OP_ADDI = 2'b00,
                     OP_MOV  = 2'b01,
                     OP_BR   = 2'b10,
                     OP_HALT = 2'b11;

    // Register file: 4 registers, 8-bit signed.
    reg signed [7:0] rf [0:3];

    // ---------------------------------------------------------------- F stage
    // IMEM_INIT is declared [0:255], so "+: 8" from bit pc*8 selects byte `pc`
    // with byte 0 in the most significant position.
    wire [7:0] instr_f = IMEM_INIT[{pc_out, 3'b000} +: 8];

    // ----------------------------------------------------------- F/X register
    reg       valid_x;   // 0 = bubble (after reset or after HALT)
    reg [7:0] instr_x;
    reg [4:0] pc_x;      // address instr_x was fetched from (branch base)

    // ----------------------------------------------------------- X/W register
    reg              wb_en_w;    // instruction in W writes rf[rd_w]
    reg        [1:0] rd_w;
    reg signed [7:0] wb_data_w;
    reg              is_halt_w;  // instruction in W is HALT

    // ---------------------------------------------------------------- X stage
    // All fields are decoded from instr_x itself so that they always belong to
    // the instruction being executed.
    wire [1:0] op_x  = instr_x[7:6];
    wire [1:0] rd_x  = instr_x[5:4];   // destination, or condition code for BRcc
    wire [1:0] rs_x  = instr_x[3:2];
    wire [3:0] imm_x = instr_x[3:0];   // ADDI immediate, or BRcc offset

    // ADDI reads its own destination register; MOV reads rs.
    wire [1:0] src_x = (op_x == OP_MOV) ? rs_x : rd_x;

    // W -> X forwarding: the older instruction has not yet reached rf.
    wire signed [7:0] src_val_x = (wb_en_w && (rd_w == src_x)) ? wb_data_w : rf[src_x];
    wire signed [7:0] r0_val_x  = (wb_en_w && (rd_w == 2'b00)) ? wb_data_w : rf[0];

    wire signed [7:0] imm_sx_x = {{4{imm_x[3]}}, imm_x};
    wire signed [7:0] result_x = (op_x == OP_MOV) ? src_val_x
                                                   : (src_val_x + imm_sx_x);

    wire writes_x = valid_x && ((op_x == OP_ADDI) || (op_x == OP_MOV));
    wire halt_x   = valid_x && (op_x == OP_HALT);

    // Branch condition is always evaluated on (forwarded) R0.
    reg cond_met_x;
    always @(*) begin
        case (rd_x)
            2'b00:   cond_met_x = 1'b1;
            2'b01:   cond_met_x = (r0_val_x == 0);
            2'b10:   cond_met_x = (r0_val_x <  0);
            default: cond_met_x = (r0_val_x >  0);
        endcase
    end

    wire       take_br_x   = valid_x && (op_x == OP_BR) && cond_met_x;
    // 5-bit arithmetic: the target wraps modulo 32 like the PC itself.
    wire [4:0] br_target_x = pc_x + {imm_x[3], imm_x} + 5'd1;

    // Fetch stops as soon as HALT is decoded and never resumes until reset.
    wire stop = halt_x | is_halt_w | halt_out;

    always @(posedge clk) begin
        if (rst) begin
            pc_out    <= 5'd0;
            halt_out  <= 1'b0;
            r0_out    <= 8'sd0;
            rf[0]     <= 8'sd0;
            rf[1]     <= 8'sd0;
            rf[2]     <= 8'sd0;
            rf[3]     <= 8'sd0;
            valid_x   <= 1'b0;
            instr_x   <= 8'h00;
            pc_x      <= 5'd0;
            wb_en_w   <= 1'b0;
            rd_w      <= 2'b00;
            wb_data_w <= 8'sd0;
            is_halt_w <= 1'b0;
        end else begin
            // W: commit. Only ADDI/MOV set wb_en_w, so BRcc/HALT never touch
            // the register file or r0_out.
            if (wb_en_w) begin
                rf[rd_w] <= wb_data_w;
                if (rd_w == 2'b00)
                    r0_out <= wb_data_w;
            end
            if (is_halt_w)
                halt_out <= 1'b1;

            // X -> W
            wb_en_w   <= writes_x;
            rd_w      <= rd_x;
            wb_data_w <= result_x;
            is_halt_w <= halt_x;

            // F -> X and next PC
            if (stop) begin
                // Squash whatever follows HALT and hold the PC.
                valid_x <= 1'b0;
            end else begin
                valid_x <= 1'b1;
                instr_x <= instr_f;
                pc_x    <= pc_out;
                // The instruction entering X now is the delay slot when a
                // branch is taken; the redirect applies to the fetch after it.
                pc_out  <= take_br_x ? br_target_x : (pc_out + 5'd1);
            end
        end
    end
endmodule
```verilog
// Pipeline convention: Branch delay slot. The instruction immediately following a
// BRcc instruction is always fetched and executed, regardless of whether the
// branch is taken. If the branch is taken, the PC updates to the branch target
// after the delay slot instruction completes. This means the delay slot
// instruction executes in the context of the sequential PC, not the branch target.
// Forwarding is implemented from the Writeback-stage pipeline registers to the
// Execute-stage operand inputs, to handle back-to-back ADDI/MOV dependencies
// without stalls.
// mcu3: 3-stage (Fetch / Execute / Writeback) 8-bit micro-controller.
//
// Instruction encoding (one byte, opcode in bits [7:6]):
//   00 ADDI  rd=[5:4] imm=[3:0]  R[rd] <= R[rd] + sign_ext(imm)
//   01 MOV   rd=[5:4] rs=[3:2]   R[rd] <= R[rs]
//   10 BRcc  cc=[5:4] off=[3:0]  if cc(R0): PC <= PC + sign_ext(off) + 1
//            cc: 00 always, 01 R0 == 0, 10 R0 < 0, 11 R0 > 0
//   11 HALT                      stop fetching and raise halt_out
//
// Pipeline behaviour implemented below:
//   * F: the byte addressed by pc_out is read combinationally from IMEM_INIT.
//   * X: the instruction is decoded and executed from the F/X register.
//        Operands come from the register file unless the instruction that is
//        currently in W writes the same register, in which case its result is
//        forwarded (no stalls for back-to-back ADDI / MOV / BRcc-on-R0).
//   * W: the result is committed to the register file (and to r0_out for R0).
//   * Branches resolve in X, so the instruction fetched while the branch is in
//     X (the one at branch PC + 1) always executes: one branch delay slot.
//   * When HALT is in X the instruction behind it is squashed and the PC is
//     held; halt_out rises on the clock edge that retires HALT from W.
module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0 // 32 instruction bytes, byte 0 at MSB
) (
    input                   clk,
    input                   rst,      // synchronous, active-high
    output reg        [4:0] pc_out,   // address of the instruction being fetched
    output reg              halt_out, // set when HALT retires; cleared only by rst
    output reg signed [7:0] r0_out    // committed value of register R0
);
    localparam [1:0] OP_ADDI = 2'b00,
                     OP_MOV  = 2'b01,
                     OP_BR   = 2'b10,
                     OP_HALT = 2'b11;

    // Register file: 4 registers, 8-bit signed.
    reg signed [7:0] rf [0:3];

    // ---------------------------------------------------------------- F stage
    // IMEM_INIT is declared [0:255], so "+: 8" from bit pc*8 selects byte `pc`
    // with byte 0 in the most significant position.
    wire [7:0] instr_f = IMEM_INIT[{pc_out, 3'b000} +: 8];

    // ----------------------------------------------------------- F/X register
    reg       valid_x;   // 0 = bubble (after reset or after HALT)
    reg [7:0] instr_x;
    reg [4:0] pc_x;      // address instr_x was fetched from (branch base)

    // ----------------------------------------------------------- X/W register
    reg              wb_en_w;    // instruction in W writes rf[rd_w]
    reg        [1:0] rd_w;
    reg signed [7:0] wb_data_w;
    reg              is_halt_w;  // instruction in W is HALT

    // ---------------------------------------------------------------- X stage
    // All fields are decoded from instr_x itself so that they always belong to
    // the instruction being executed.
    wire [1:0] op_x  = instr_x[7:6];
    wire [1:0] rd_x  = instr_x[5:4];   // destination, or condition code for BRcc
    wire [1:0] rs_x  = instr_x[3:2];
    wire [3:0] imm_x = instr_x[3:0];   // ADDI immediate, or BRcc offset

    // ADDI reads its own destination register; MOV reads rs.
    wire [1:0] src_x = (op_x == OP_MOV) ? rs_x : rd_x;

    // W -> X forwarding: the older instruction has not yet reached rf.
    wire signed [7:0] src_val_x = (wb_en_w && (rd_w == src_x)) ? wb_data_w : rf[src_x];
    wire signed [7:0] r0_val_x  = (wb_en_w && (rd_w == 2'b00)) ? wb_data_w : rf[0];

    wire signed [7:0] imm_sx_x = {{4{imm_x[3]}}, imm_x};
    wire signed [7:0] result_x = (op_x == OP_MOV) ? src_val_x
                                                   : (src_val_x + imm_sx_x);

    wire writes_x = valid_x && ((op_x == OP_ADDI) || (op_x == OP_MOV));
    wire halt_x   = valid_x && (op_x == OP_HALT);

    // Branch condition is always evaluated on (forwarded) R0.
    reg cond_met_x;
    always @(*) begin
        case (rd_x)
            2'b00:   cond_met_x = 1'b1;
            2'b01:   cond_met_x = (r0_val_x == 0);
            2'b10:   cond_met_x = (r0_val_x <  0);
            default: cond_met_x = (r0_val_x >  0);
        endcase
    end

    wire       take_br_x   = valid_x && (op_x == OP_BR) && cond_met_x;
    // 5-bit arithmetic: the target wraps modulo 32 like the PC itself.
    wire [4:0] br_target_x = pc_x + {imm_x[3], imm_x} + 5'd1;

    // Fetch stops as soon as HALT is decoded and never resumes until reset.
    wire stop = halt_x | is_halt_w | halt_out;

    always @(posedge clk) begin
        if (rst) begin
            pc_out    <= 5'd0;
            halt_out  <= 1'b0;
            r0_out    <= 8'sd0;
            rf[0]     <= 8'sd0;
            rf[1]     <= 8'sd0;
            rf[2]     <= 8'sd0;
            rf[3]     <= 8'sd0;
            valid_x   <= 1'b0;
            instr_x   <= 8'h00;
            pc_x      <= 5'd0;
            wb_en_w   <= 1'b0;
            rd_w      <= 2'b00;
            wb_data_w <= 8'sd0;
            is_halt_w <= 1'b0;
        end else begin
            // W: commit. Only ADDI/MOV set wb_en_w, so BRcc/HALT never touch
            // the register file or r0_out.
            if (wb_en_w) begin
                rf[rd_w] <= wb_data_w;
                if (rd_w == 2'b00)
                    r0_out <= wb_data_w;
            end
            if (is_halt_w)
                halt_out <= 1'b1;

            // X -> W
            wb_en_w   <= writes_x;
            rd_w      <= rd_x;
            wb_data_w <= result_x;
            is_halt_w <= halt_x;

            // F -> X and next PC
            if (stop) begin
                // Squash whatever follows HALT and hold the PC.
                valid_x <= 1'b0;
            end else begin
                valid_x <= 1'b1;
                instr_x <= instr_f;
                pc_x    <= pc_out;
                // The instruction entering X now is the delay slot when a
                // branch is taken; the redirect applies to the fetch after it.
                pc_out  <= take_br_x ? br_target_x : (pc_out + 5'd1);
            end
        end
    end
endmodule
```