base_4

extracted RTL

module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0
) (
    input               clk,
    input               rst,
    output reg [4:0]    pc_out,
    output reg          halt_out,
    output reg signed [7:0] r0_out
);

    // Pipeline Registers
    // Stage F: Fetch
    reg [4:0] pc_f;
    reg [7:0] instr_f;
    reg [4:0] npc_f; // Next PC candidate (PC+1)

    // Stage X: Decode/Execute
    reg [4:0] pc_x;
    reg [7:0] instr_x;
    reg [4:0] npc_x;
    reg [1:0] opcode_x;
    reg [1:0] rd_x;
    reg [1:0] rs_x;
    reg [3:0] imm_x;
    reg [1:0] cond_x;
    reg [3:0] off_x;
    reg [7:0] rs_val_x;
    reg [7:0] alu_res_x;
    reg       is_branch_x;
    reg       branch_taken_x;
    reg       is_halt_x;
    reg [1:0] rd_wb_x; // Tag for writeback
    reg       wb_en_x; // Enable writeback

    // Stage W: Writeback
    reg [4:0] pc_w;
    reg [7:0] instr_w;
    reg [1:0] opcode_w;
    reg [1:0] rd_w;
    reg [7:0] alu_res_w;
    reg       is_halt_w;
    reg       wb_en_w;

    // Register File
    reg signed [7:0] rf [0:3];

    // Forwarding Logic Signals
    reg [7:0] fwd_val_x;
    reg       fwd_valid_x;

    // IMEM Access
    wire [7:0] imem_data;
    assign imem_data = IMEM_INIT[pc_f*8 +: 8];

    // Reset Logic
    always @(posedge clk) begin
        if (rst) begin
            pc_f <= 5'd0;
            instr_f <= 8'b0;
            npc_f <= 5'd1;

            pc_x <= 5'd0;
            instr_x <= 8'b0;
            npc_x <= 5'd1;
            opcode_x <= 2'b00;
            rd_x <= 2'b00;
            rs_x <= 2'b00;
            imm_x <= 4'b0000;
            cond_x <= 2'b00;
            off_x <= 4'b0000;
            rs_val_x <= 8'b0;
            alu_res_x <= 8'b0;
            is_branch_x <= 1'b0;
            branch_taken_x <= 1'b0;
            is_halt_x <= 1'b0;
            rd_wb_x <= 2'b00;
            wb_en_x <= 1'b0;

            pc_w <= 5'd0;
            instr_w <= 8'b0;
            opcode_w <= 2'b00;
            rd_w <= 2'b00;
            alu_res_w <= 8'b0;
            is_halt_w <= 1'b0;
            wb_en_w <= 1'b0;

            halt_out <= 1'b0;
            pc_out <= 5'd0;
            r0_out <= 8'b0;

            rf[0] <= 8'b0;
            rf[1] <= 8'b0;
            rf[2] <= 8'b0;
            rf[3] <= 8'b0;
        end else begin
            // Update outputs
            pc_out <= pc_f;
            
            // Check for HALT in W stage
            if (is_halt_w) begin
                halt_out <= 1'b1;
            end else if (wb_en_w && rd_w == 2'b00) begin
                r0_out <= alu_res_w;
            end

            // Update Register File
            if (!is_halt_w) begin
                if (wb_en_w) begin
                    case (rd_w)
                        2'b00: rf[0] <= alu_res_w;
                        2'b01: rf[1] <= alu_res_w;
                        2'b10: rf[2] <= alu_res_w;
                        2'b11: rf[3] <= alu_res_w;
                    endcase
                end
            end

            // Pipeline Stage W -> X
            pc_x <= pc_w;
            instr_x <= instr_w;
            npc_x <= npc_x; // NPC is computed in X based on branch, but we need to propagate the *fetch* NPC if no branch? 
                            // Actually, let's restructure NPC handling.
                            // F computes PC+1. X computes Branch Target. W does nothing with PC.
                            // Let's move NPC computation to X and W properly.
            
            // Correct Pipeline Flow for PC:
            // F: Fetch instr at PC_f. Compute PC_f + 1.
            // X: Decode instr. If branch, compute target. Determine if taken.
            // W: Writeback.
            // PC Update:
            // If Halt, freeze.
            // If Branch Taken in X, next PC is Target.
            // Else, next PC is PC_f + 1 (from F).
            
            // Let's refine the registers to handle PC correctly.
            // We need to know the "Next PC" from the previous stage to decide the current PC.
            
            // Re-defining Pipeline Registers for clarity and correctness
            // F Stage
            pc_f <= (is_halt_x) ? pc_x : (branch_taken_x ? npc_x : npc_f); 
            // Wait, if branch is taken in X, the instruction in W is the delay slot. 
            // The instruction after delay slot is fetched at Target.
            // The instruction in X is the branch.
            // The instruction in W is the one fetched at PC_branch.
            // So if Branch Taken:
            //   Next PC (for F) = Branch Target.
            // If Branch Not Taken:
            //   Next PC (for F) = PC_branch + 1.
            
            // Let's restart the pipeline logic update block carefully.
            
            // --- Pipeline Updates ---
            
            // W Stage Update
            pc_w <= pc_x;
            instr_w <= instr_x;
            opcode_w <= opcode_x;
            rd_w <= rd_x;
            alu_res_w <= alu_res_x;
            is_halt_w <= is_halt_x;
            wb_en_w <= wb_en_x;

            // X Stage Update
            pc_x <= pc_f;
            instr_x <= instr_f;
            
            // Decode
            opcode_x <= instr_f[7:6];
            case (instr_f[7:6])
                2'b00: begin // ADDI
                    rd_x <= instr_f[5:4];
                    imm_x <= instr_f[3:0];
                    rs_x <= 2'b00; // Unused
                    cond_x <= 2'b00; // Unused
                    off_x <= 4'b0000; // Unused
                    is_branch_x <= 1'b0;
                    is_halt_x <= 1'b0;
                    wb_en_x <= 1'b1;
                end
                2'b01: begin // MOV
                    rd_x <= instr_f[5:4];
                    rs_x <= instr_f[3:2];
                    imm_x <= 4'b0000; // Unused
                    cond_x <= 2'b00; // Unused
                    off_x <= 4'b0000; // Unused
                    is_branch_x <= 1'b0;
                    is_halt_x <= 1'b0;
                    wb_en_x <= 1'b1;
                end
                2'b10: begin // BRcc
                    cond_x <= instr_f[5:4];
                    off_x <= instr_f[3:0];
                    rd_x <= 2'b00; // Unused
                    rs_x <= 2'b00; // Unused
                    imm_x <= 4'b0000; // Unused
                    is_branch_x <= 1'b1;
                    is_halt_x <= 1'b0;
                    wb_en_x <= 1'b0;
                end
                2'b11: begin // HALT
                    rd_x <= 2'b00;
                    rs_x <= 2'b00;
                    imm_x <= 4'b0000;
                    cond_x <= 2'b00;
                    off_x <= 4'b0000;
                    is_branch_x <= 1'b0;
                    is_halt_x <= 1'b1;
                    wb_en_x <= 1'b0;
                end
                default: begin
                    is_branch_x <= 1'b0;
                    is_halt_x <= 1'b0;
                    wb_en_x <= 1'b0;
                end
            endcase

            // Execute ALU / Branch Logic
            // Forwarding for rs
            // We need to read rs_val_x.
            // If opcode is ADDI, rs is implicit 0? No, ADDI is R[rd] <= R[rd] + imm. So rs is rd.
            // If opcode is MOV, rs is rs.
            
            // Determine source register index
            reg [1:0] src_reg_idx;
            if (opcode_x == 2'b00) src_reg_idx = rd_x; // ADDI reads rd
            else if (opcode_x == 2'b01) src_reg_idx = rs_x; // MOV reads rs
            else src_reg_idx = 2'b00; // Dummy

            // Forwarding Logic
            // Check W stage
            if (wb_en_w && rd_w == src_reg_idx && rd_w != 2'b00) begin // Avoid forwarding to/from R0 if R0 is special? No, R0 is just a reg.
                 fwd_val_x = alu_res_w;
                 fwd_valid_x = 1'b1;
            end else if (wb_en_x && rd_x == src_reg_idx && rd_x != 2'b00) begin
                 // Forward from X stage (current instruction writing to rd, reading same rd? e.g. ADDI R0, R0, imm? No, ADDI reads R[rd] and writes R[rd].
                 // If we are in X stage, the result isn't ready yet.
                 // Standard forwarding: X stage reads from W stage (previous instr) or X stage (current instr)?
                 // Current instr result is not available until end of X.
                 // So we can only forward from W (previous) or RF (older).
                 // Exception: If we need to forward from X to X? No, X produces result at end of cycle.
                 // So we check W stage.
                 fwd_val_x = alu_res_x; // This is from the *previous* cycle's X stage result, which is now in W? No.
                 // Let's trace:
                 // Cycle T: Instr A in W. Instr B in X.
                 // Instr B needs result of Instr A.
                 // Instr A result is in alu_res_w.
                 // So fwd_val_x = alu_res_w is correct for W->X forwarding.
                 
                 // What about X->X? Instr B in X. Instr B reads R[rd] and writes R[rd].
                 // It reads the old value from RF.
                 fwd_valid_x = 1'b0; // Handled below
            end else begin
                fwd_valid_x = 1'b0;
            end
            
            // Re-evaluating Forwarding Logic
            // We need to select the value for rs_val_x.
            // Candidates:
            // 1. Forward from W stage (Instr in W writes to src_reg_idx)
            // 2. Forward from X stage (Instr in X writes to src_reg_idx) -> Only valid if we can use the result immediately? No, ALU result is ready at end of cycle.
            //    So we can't forward from X to X within the same cycle for the ALU operation.
            //    However, if the instruction is ADDI R0, imm, it reads R0.
            //    If the previous instruction (in W) wrote to R0, we use W's result.
            //    If the previous instruction (in X) wrote to R0, we can't use it yet. We use RF.
            
            if (wb_en_w && rd_w == src_reg_idx) begin
                rs_val_x = alu_res_w;
            end else if (wb_en_x && rd_x == src_reg_idx) begin
                // This case is tricky. If we are in X, and we need a value that is being written by the current X instruction?
                // e.g. ADDI R0, R0, 1.
                // We read R0 from RF.
                rs_val_x = rf[src_reg_idx];
            end else begin
                rs_val_x = rf[src_reg_idx];
            end

            // Compute ALU Result
            case (opcode_x)
                2'b00: begin // ADDI
                    alu_res_x = rs_val_x + $signed({{4{imm_x[3]}}, imm_x});
                end
                2'b01: begin // MOV
                    alu_res_x = rs_val_x;
                end
                default: begin
                    alu_res_x = 8'b0;
                end
            end

            // Compute Branch Condition
            if (is_branch_x) begin
                case (cond_x)
                    2'b00: branch_taken_x = 1'b1; // Always
                    2'b01: branch_taken_x = (rs_val_x == 8'b0) ? 1'b1 : 1'b0; // Zero(R0) -> Wait, spec says Zero(R0).
                        // Spec: cond 01=zero(R0).
                        // Does this mean we always check R0? Or R[rs]?
                        // "cond 01=zero(R0)" implies the condition is on R0 specifically.
                        // "cond 10=neg(R0)"
                        // "cond 11=pos(R0)"
                        // This is unusual. Usually it's on the result of the previous ALU or a specific register.
                        // Given the ISA: BRcc off. No register operand.
                        // So it must be checking R0 specifically.
                        branch_taken_x = (rf[0] == 8'b0) ? 1'b1 : 1'b0;
                    2'b10: branch_taken_x = ($signed(rf[0]) < 0) ? 1'b1 : 1'b0;
                    2'b11: branch_taken_x = ($signed(rf[0]) > 0) ? 1'b1 : 1'b0;
                    default: branch_taken_x = 1'b0;
                endcase
                // Compute Branch Target
                // PC <= PC + sign_ext(off,4) + 1
                // PC in X stage is pc_x.
                npc_x = pc_x + $signed({{4{off_x[3]}}, off_x}) + 1;
            end else begin
                branch_taken_x = 1'b0;
                npc_x = pc_x + 1; // Default next PC
            end

            // F Stage Update
            // Determine next PC for F stage
            reg [4:0] next_pc_f;
            
            if (is_halt_x) begin
                // Freeze PC
                next_pc_f = pc_f;
            end else if (branch_taken_x) begin
                // Branch Taken: Next PC is Target
                // Note: The instruction in W is the delay slot.
                // The instruction in X is the branch.
                // The next instruction to fetch is the Target.
                next_pc_f = npc_x;
            end else begin
                // Branch Not Taken or Not Branch: Next PC is PC+1
                next_pc_f = pc_x + 1;
            end
            
            pc_f <= next_pc_f;
            instr_f <= imem_data;
            npc_f <= next_pc_f + 1; // Precompute for next cycle? Not strictly needed if we compute in X.
                                   // But we need npc_f for the "Not Taken" case in the next cycle?
                                   // In the next cycle, pc_x will be next_pc_f.
                                   // If not branch, npc_x = pc_x + 1.
                                   // So we don't need to store npc_f explicitly if we compute it in X.
        end
    end

endmodule

raw model output

// Branch convention: The instruction immediately following a BRcc is always executed (1 delay slot).
// If the branch is taken, the target instruction is fetched in the cycle after the delay slot instruction.
// Forwarding is implemented for the X-stage to read from W-stage or X-stage registers if rd matches rs.

`timescale 1ns / 1ps

module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0
) (
    input               clk,
    input               rst,
    output reg [4:0]    pc_out,
    output reg          halt_out,
    output reg signed [7:0] r0_out
);

    // Pipeline Registers
    // Stage F: Fetch
    reg [4:0] pc_f;
    reg [7:0] instr_f;
    reg [4:0] npc_f; // Next PC candidate (PC+1)

    // Stage X: Decode/Execute
    reg [4:0] pc_x;
    reg [7:0] instr_x;
    reg [4:0] npc_x;
    reg [1:0] opcode_x;
    reg [1:0] rd_x;
    reg [1:0] rs_x;
    reg [3:0] imm_x;
    reg [1:0] cond_x;
    reg [3:0] off_x;
    reg [7:0] rs_val_x;
    reg [7:0] alu_res_x;
    reg       is_branch_x;
    reg       branch_taken_x;
    reg       is_halt_x;
    reg [1:0] rd_wb_x; // Tag for writeback
    reg       wb_en_x; // Enable writeback

    // Stage W: Writeback
    reg [4:0] pc_w;
    reg [7:0] instr_w;
    reg [1:0] opcode_w;
    reg [1:0] rd_w;
    reg [7:0] alu_res_w;
    reg       is_halt_w;
    reg       wb_en_w;

    // Register File
    reg signed [7:0] rf [0:3];

    // Forwarding Logic Signals
    reg [7:0] fwd_val_x;
    reg       fwd_valid_x;

    // IMEM Access
    wire [7:0] imem_data;
    assign imem_data = IMEM_INIT[pc_f*8 +: 8];

    // Reset Logic
    always @(posedge clk) begin
        if (rst) begin
            pc_f <= 5'd0;
            instr_f <= 8'b0;
            npc_f <= 5'd1;

            pc_x <= 5'd0;
            instr_x <= 8'b0;
            npc_x <= 5'd1;
            opcode_x <= 2'b00;
            rd_x <= 2'b00;
            rs_x <= 2'b00;
            imm_x <= 4'b0000;
            cond_x <= 2'b00;
            off_x <= 4'b0000;
            rs_val_x <= 8'b0;
            alu_res_x <= 8'b0;
            is_branch_x <= 1'b0;
            branch_taken_x <= 1'b0;
            is_halt_x <= 1'b0;
            rd_wb_x <= 2'b00;
            wb_en_x <= 1'b0;

            pc_w <= 5'd0;
            instr_w <= 8'b0;
            opcode_w <= 2'b00;
            rd_w <= 2'b00;
            alu_res_w <= 8'b0;
            is_halt_w <= 1'b0;
            wb_en_w <= 1'b0;

            halt_out <= 1'b0;
            pc_out <= 5'd0;
            r0_out <= 8'b0;

            rf[0] <= 8'b0;
            rf[1] <= 8'b0;
            rf[2] <= 8'b0;
            rf[3] <= 8'b0;
        end else begin
            // Update outputs
            pc_out <= pc_f;
            
            // Check for HALT in W stage
            if (is_halt_w) begin
                halt_out <= 1'b1;
            end else if (wb_en_w && rd_w == 2'b00) begin
                r0_out <= alu_res_w;
            end

            // Update Register File
            if (!is_halt_w) begin
                if (wb_en_w) begin
                    case (rd_w)
                        2'b00: rf[0] <= alu_res_w;
                        2'b01: rf[1] <= alu_res_w;
                        2'b10: rf[2] <= alu_res_w;
                        2'b11: rf[3] <= alu_res_w;
                    endcase
                end
            end

            // Pipeline Stage W -> X
            pc_x <= pc_w;
            instr_x <= instr_w;
            npc_x <= npc_x; // NPC is computed in X based on branch, but we need to propagate the *fetch* NPC if no branch? 
                            // Actually, let's restructure NPC handling.
                            // F computes PC+1. X computes Branch Target. W does nothing with PC.
                            // Let's move NPC computation to X and W properly.
            
            // Correct Pipeline Flow for PC:
            // F: Fetch instr at PC_f. Compute PC_f + 1.
            // X: Decode instr. If branch, compute target. Determine if taken.
            // W: Writeback.
            // PC Update:
            // If Halt, freeze.
            // If Branch Taken in X, next PC is Target.
            // Else, next PC is PC_f + 1 (from F).
            
            // Let's refine the registers to handle PC correctly.
            // We need to know the "Next PC" from the previous stage to decide the current PC.
            
            // Re-defining Pipeline Registers for clarity and correctness
            // F Stage
            pc_f <= (is_halt_x) ? pc_x : (branch_taken_x ? npc_x : npc_f); 
            // Wait, if branch is taken in X, the instruction in W is the delay slot. 
            // The instruction after delay slot is fetched at Target.
            // The instruction in X is the branch.
            // The instruction in W is the one fetched at PC_branch.
            // So if Branch Taken:
            //   Next PC (for F) = Branch Target.
            // If Branch Not Taken:
            //   Next PC (for F) = PC_branch + 1.
            
            // Let's restart the pipeline logic update block carefully.
            
            // --- Pipeline Updates ---
            
            // W Stage Update
            pc_w <= pc_x;
            instr_w <= instr_x;
            opcode_w <= opcode_x;
            rd_w <= rd_x;
            alu_res_w <= alu_res_x;
            is_halt_w <= is_halt_x;
            wb_en_w <= wb_en_x;

            // X Stage Update
            pc_x <= pc_f;
            instr_x <= instr_f;
            
            // Decode
            opcode_x <= instr_f[7:6];
            case (instr_f[7:6])
                2'b00: begin // ADDI
                    rd_x <= instr_f[5:4];
                    imm_x <= instr_f[3:0];
                    rs_x <= 2'b00; // Unused
                    cond_x <= 2'b00; // Unused
                    off_x <= 4'b0000; // Unused
                    is_branch_x <= 1'b0;
                    is_halt_x <= 1'b0;
                    wb_en_x <= 1'b1;
                end
                2'b01: begin // MOV
                    rd_x <= instr_f[5:4];
                    rs_x <= instr_f[3:2];
                    imm_x <= 4'b0000; // Unused
                    cond_x <= 2'b00; // Unused
                    off_x <= 4'b0000; // Unused
                    is_branch_x <= 1'b0;
                    is_halt_x <= 1'b0;
                    wb_en_x <= 1'b1;
                end
                2'b10: begin // BRcc
                    cond_x <= instr_f[5:4];
                    off_x <= instr_f[3:0];
                    rd_x <= 2'b00; // Unused
                    rs_x <= 2'b00; // Unused
                    imm_x <= 4'b0000; // Unused
                    is_branch_x <= 1'b1;
                    is_halt_x <= 1'b0;
                    wb_en_x <= 1'b0;
                end
                2'b11: begin // HALT
                    rd_x <= 2'b00;
                    rs_x <= 2'b00;
                    imm_x <= 4'b0000;
                    cond_x <= 2'b00;
                    off_x <= 4'b0000;
                    is_branch_x <= 1'b0;
                    is_halt_x <= 1'b1;
                    wb_en_x <= 1'b0;
                end
                default: begin
                    is_branch_x <= 1'b0;
                    is_halt_x <= 1'b0;
                    wb_en_x <= 1'b0;
                end
            endcase

            // Execute ALU / Branch Logic
            // Forwarding for rs
            // We need to read rs_val_x.
            // If opcode is ADDI, rs is implicit 0? No, ADDI is R[rd] <= R[rd] + imm. So rs is rd.
            // If opcode is MOV, rs is rs.
            
            // Determine source register index
            reg [1:0] src_reg_idx;
            if (opcode_x == 2'b00) src_reg_idx = rd_x; // ADDI reads rd
            else if (opcode_x == 2'b01) src_reg_idx = rs_x; // MOV reads rs
            else src_reg_idx = 2'b00; // Dummy

            // Forwarding Logic
            // Check W stage
            if (wb_en_w && rd_w == src_reg_idx && rd_w != 2'b00) begin // Avoid forwarding to/from R0 if R0 is special? No, R0 is just a reg.
                 fwd_val_x = alu_res_w;
                 fwd_valid_x = 1'b1;
            end else if (wb_en_x && rd_x == src_reg_idx && rd_x != 2'b00) begin
                 // Forward from X stage (current instruction writing to rd, reading same rd? e.g. ADDI R0, R0, imm? No, ADDI reads R[rd] and writes R[rd].
                 // If we are in X stage, the result isn't ready yet.
                 // Standard forwarding: X stage reads from W stage (previous instr) or X stage (current instr)?
                 // Current instr result is not available until end of X.
                 // So we can only forward from W (previous) or RF (older).
                 // Exception: If we need to forward from X to X? No, X produces result at end of cycle.
                 // So we check W stage.
                 fwd_val_x = alu_res_x; // This is from the *previous* cycle's X stage result, which is now in W? No.
                 // Let's trace:
                 // Cycle T: Instr A in W. Instr B in X.
                 // Instr B needs result of Instr A.
                 // Instr A result is in alu_res_w.
                 // So fwd_val_x = alu_res_w is correct for W->X forwarding.
                 
                 // What about X->X? Instr B in X. Instr B reads R[rd] and writes R[rd].
                 // It reads the old value from RF.
                 fwd_valid_x = 1'b0; // Handled below
            end else begin
                fwd_valid_x = 1'b0;
            end
            
            // Re-evaluating Forwarding Logic
            // We need to select the value for rs_val_x.
            // Candidates:
            // 1. Forward from W stage (Instr in W writes to src_reg_idx)
            // 2. Forward from X stage (Instr in X writes to src_reg_idx) -> Only valid if we can use the result immediately? No, ALU result is ready at end of cycle.
            //    So we can't forward from X to X within the same cycle for the ALU operation.
            //    However, if the instruction is ADDI R0, imm, it reads R0.
            //    If the previous instruction (in W) wrote to R0, we use W's result.
            //    If the previous instruction (in X) wrote to R0, we can't use it yet. We use RF.
            
            if (wb_en_w && rd_w == src_reg_idx) begin
                rs_val_x = alu_res_w;
            end else if (wb_en_x && rd_x == src_reg_idx) begin
                // This case is tricky. If we are in X, and we need a value that is being written by the current X instruction?
                // e.g. ADDI R0, R0, 1.
                // We read R0 from RF.
                rs_val_x = rf[src_reg_idx];
            end else begin
                rs_val_x = rf[src_reg_idx];
            end

            // Compute ALU Result
            case (opcode_x)
                2'b00: begin // ADDI
                    alu_res_x = rs_val_x + $signed({{4{imm_x[3]}}, imm_x});
                end
                2'b01: begin // MOV
                    alu_res_x = rs_val_x;
                end
                default: begin
                    alu_res_x = 8'b0;
                end
            end

            // Compute Branch Condition
            if (is_branch_x) begin
                case (cond_x)
                    2'b00: branch_taken_x = 1'b1; // Always
                    2'b01: branch_taken_x = (rs_val_x == 8'b0) ? 1'b1 : 1'b0; // Zero(R0) -> Wait, spec says Zero(R0).
                        // Spec: cond 01=zero(R0).
                        // Does this mean we always check R0? Or R[rs]?
                        // "cond 01=zero(R0)" implies the condition is on R0 specifically.
                        // "cond 10=neg(R0)"
                        // "cond 11=pos(R0)"
                        // This is unusual. Usually it's on the result of the previous ALU or a specific register.
                        // Given the ISA: BRcc off. No register operand.
                        // So it must be checking R0 specifically.
                        branch_taken_x = (rf[0] == 8'b0) ? 1'b1 : 1'b0;
                    2'b10: branch_taken_x = ($signed(rf[0]) < 0) ? 1'b1 : 1'b0;
                    2'b11: branch_taken_x = ($signed(rf[0]) > 0) ? 1'b1 : 1'b0;
                    default: branch_taken_x = 1'b0;
                endcase
                // Compute Branch Target
                // PC <= PC + sign_ext(off,4) + 1
                // PC in X stage is pc_x.
                npc_x = pc_x + $signed({{4{off_x[3]}}, off_x}) + 1;
            end else begin
                branch_taken_x = 1'b0;
                npc_x = pc_x + 1; // Default next PC
            end

            // F Stage Update
            // Determine next PC for F stage
            reg [4:0] next_pc_f;
            
            if (is_halt_x) begin
                // Freeze PC
                next_pc_f = pc_f;
            end else if (branch_taken_x) begin
                // Branch Taken: Next PC is Target
                // Note: The instruction in W is the delay slot.
                // The instruction in X is the branch.
                // The next instruction to fetch is the Target.
                next_pc_f = npc_x;
            end else begin
                // Branch Not Taken or Not Branch: Next PC is PC+1
                next_pc_f = pc_x + 1;
            end
            
            pc_f <= next_pc_f;
            instr_f <= imem_data;
            npc_f <= next_pc_f + 1; // Precompute for next cycle? Not strictly needed if we compute in X.
                                   // But we need npc_f for the "Not Taken" case in the next cycle?
                                   // In the next cycle, pc_x will be next_pc_f.
                                   // If not branch, npc_x = pc_x + 1.
                                   // So we don't need to store npc_f explicitly if we compute it in X.
        end
    end

endmodule

line-item PPA

per-program functional

pipeline shape (heuristic register bank count)

mistakes detected

extracted RTL

raw model output