← all candidates sft

sft_7

line-item PPA

passes_tests2/4
area_um21573
fmax_mhz2074.2
ops_per_sec5.926e+08
switching_alpha0.110
est_power_uW580.75

per-program functional

programR0haltcyclesverdict
load_add_haltr0=6 (want 8)halted@pc=37 cycFAIL
addi_negative_haltr0=-4 (want 5)halted@pc=37 cycFAIL
mov_chainr0=3 (want 3)halted@pc=37 cycPASS
branch_loopr0=-2 (want -2)halted@pc=49 cycPASS

pipeline shape (heuristic register bank count)

F stage regs7
X stage regs17
W stage regs7
unstaged regs0

mistakes detected

(no automated diagnostics fired)

extracted RTL

module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0
) (
    input               clk,
    input               rst,
    output reg [4:0]    pc_out,
    output reg          halt_out,
    output reg signed [7:0] r0_out
);

    // Register file: 4 registers, 8-bit signed
    reg signed [7:0] rf [0:3];

    // Pipeline registers
    // Stage F: Fetch
    reg [4:0] pc_f;
    reg [7:0] instr_f;
    reg [4:0] npc_f; // Next PC for the fetch stage (PC+1)
    reg [4:0] btarget_f; // Branch target if branch
    reg is_branch_f;
    reg [1:0] cond_f;
    reg [3:0] off_f;

    // Stage X: Decode/Execute
    reg [7:0] instr_x;
    reg [4:0] pc_x;
    reg [4:0] npc_x;
    reg [4:0] btarget_x;
    reg is_branch_x;
    reg [1:0] cond_x;
    reg [3:0] off_x;
    reg [1:0] rd_x;
    reg [1:0] rs_x;
    reg signed [7:0] alu_res_x;
    reg is_halt_x;
    reg branch_taken_x; // Result of branch condition

    // Stage W: Writeback
    reg [7:0] instr_w;
    reg [1:0] rd_w;
    reg signed [7:0] data_w;
    reg is_halt_w;
    reg is_write_w; // Valid write to register file

    // Forwarding logic
    // We need to forward to the X stage inputs.
    // The X stage reads rs. It might need data from W (just committing) or X (current ALU result).
    // Priority: W stage > X stage > Register File.
    
    reg signed [7:0] fwd_val_x;
    reg use_fwd_x;

    // Helper: Sign extend 4-bit to 8-bit
    function signed [7:0] sext4;
        input [3:0] val;
        begin
            sext4 = {{4{val[3]}}, val};
        end
    endfunction

    // Helper: Sign extend 4-bit offset to 5-bit for PC addition
    function signed [4:0] sext4_pc;
        input [3:0] val;
        begin
            sext4_pc = {{5{val[3]}}, val};
        end
    endfunction

    // Reset logic
    always @(posedge clk) begin
        if (rst) begin
            pc_f <= 5'd0;
            instr_f <= 8'h0;
            npc_f <= 5'd1;
            btarget_f <= 5'd0;
            is_branch_f <= 1'b0;
            cond_f <= 2'b00;
            off_f <= 4'b0;

            instr_x <= 8'h0;
            pc_x <= 5'd0;
            npc_x <= 5'd1;
            btarget_x <= 5'd0;
            is_branch_x <= 1'b0;
            cond_x <= 2'b00;
            off_x <= 4'b0;
            rd_x <= 2'b00;
            rs_x <= 2'b00;
            alu_res_x <= 8'sh0;
            is_halt_x <= 1'b0;
            branch_taken_x <= 1'b0;

            instr_w <= 8'h0;
            rd_w <= 2'b00;
            data_w <= 8'sh0;
            is_halt_w <= 1'b0;
            is_write_w <= 1'b0;

            for (integer i = 0; i < 4; i = i + 1) rf[i] <= 8'sh0;

            pc_out <= 5'd0;
            halt_out <= 1'b0;
            r0_out <= 8'sh0;
        end else begin
            // Update outputs
            pc_out <= pc_f;
            r0_out <= rf[0];

            // Update Halt
            if (is_halt_w) begin
                halt_out <= 1'b1;
            end else if (halt_out) begin
                halt_out <= 1'b1; // Stay halted
            end else begin
                halt_out <= 1'b0;
            end

            // Update Register File
            if (!halt_out) begin
                if (is_write_w) begin
                    rf[rd_w] <= data_w;
                end
            end

            // Stage W: Writeback
            if (!halt_out) begin
                instr_w <= instr_x;
                rd_w <= rd_x;
                data_w <= alu_res_x;
                is_halt_w <= is_halt_x;
                is_write_w <= (!is_halt_x) && (instr_x[7:6] == 2'b00 || instr_x[7:6] == 2'b01);
            end

            // Stage X: Decode/Execute
            if (!halt_out) begin
                instr_x <= instr_f;
                pc_x <= pc_f;
                npc_x <= npc_f;
                btarget_x <= btarget_f;
                is_branch_x <= is_branch_f;
                cond_x <= cond_f;
                off_x <= off_f;

                // Decode instruction
                case (instr_f[7:6])
                    2'b00: begin // ADDI
                        rd_x <= instr_f[5:4];
                        rs_x <= 2'b00; // Not used
                        // ALU: R[rd] + sign_ext(imm)
                        // Read R[rd] with forwarding
                        if (use_fwd_x) begin
                            alu_res_x <= fwd_val_x + sext4(instr_f[3:0]);
                        end else begin
                            alu_res_x <= rf[instr_f[5:4]] + sext4(instr_f[3:0]);
                        end
                        is_halt_x <= 1'b0;
                        branch_taken_x <= 1'b0;
                    end
                    2'b01: begin // MOV
                        rd_x <= instr_f[5:4];
                        rs_x <= instr_f[3:2];
                        // ALU: R[rs]
                        if (use_fwd_x) begin
                            alu_res_x <= fwd_val_x;
                        end else begin
                            alu_res_x <= rf[instr_f[3:2]];
                        end
                        is_halt_x <= 1'b0;
                        branch_taken_x <= 1'b0;
                    end
                    2'b10: begin // BRcc
                        rd_x <= 2'b00; // Not used
                        rs_x <= 2'b00; // Not used
                        // Evaluate branch condition
                        case (instr_f[5:4])
                            2'b00: branch_taken_x <= 1'b1; // Always
                            2'b01: branch_taken_x <= (rf[0] == 8'sh0) ? 1'b1 : 1'b0; // Zero R0
                            2'b10: branch_taken_x <= (rf[0] < 8'sh0) ? 1'b1 : 1'b0; // Neg R0
                            2'b11: branch_taken_x <= (rf[0] > 8'sh0) ? 1'b1 : 1'b0; // Pos R0
                            default: branch_taken_x <= 1'b0;
                        endcase
                        alu_res_x <= 8'sh0; // Dummy
                        is_halt_x <= 1'b0;
                    end
                    2'b11: begin // HALT
                        rd_x <= 2'b00;
                        rs_x <= 2'b00;
                        alu_res_x <= 8'sh0;
                        is_halt_x <= 1'b1;
                        branch_taken_x <= 1'b0;
                    end
                    default: begin
                        rd_x <= 2'b00;
                        rs_x <= 2'b00;
                        alu_res_x <= 8'sh0;
                        is_halt_x <= 1'b0;
                        branch_taken_x <= 1'b0;
                    end
                endcase
            end

            // Stage F: Fetch
            if (!halt_out) begin
                // Determine next PC for fetch
                // If previous stage (X) was a branch and taken, next PC is branch target.
                // Note: The instruction in the delay slot (fetched by this cycle's F) 
                // is executed but its effects might be discarded if we consider the 
                // "discard delay slot" model. However, the problem says "1 branch delay 
                // slot is acceptable". Standard MIPS-style: delay slot instruction executes.
                // The PC after the branch+delay sequence is target if taken, else npc.
                
                // We need to update pc_f for the *next* fetch.
                // The current pc_f is being used to fetch instr_f.
                // The next pc_f should be:
                // If X stage branch taken: btarget_x
                // Else: npc_x (which is pc_x + 1)
                
                if (is_branch_x && branch_taken_x) begin
                    pc_f <= btarget_x;
                end else begin
                    pc_f <= npc_x;
                end
                
                // Fetch instruction at pc_f
                // IMEM_INIT is 256 bits. Byte 0 is at MSB.
                // PC is byte address.
                // instr = IMEM_INIT[PC*8 +: 8]
                instr_f <= IMEM_INIT[pc_f*8 +: 8];
                
                // Compute next PC (sequential)
                npc_f <= pc_f + 5'd1;
                
                // Decode branch info for next stage
                if (instr_f[7:6] == 2'b10) begin
                    is_branch_f <= 1'b1;
                    cond_f <= instr_f[5:4];
                    off_f <= instr_f[3:0];
                    btarget_f <= pc_f + sext4_pc(instr_f[3:0]) + 5'd1;
                end else begin
                    is_branch_f <= 1'b0;
                    cond_f <= 2'b00;
                    off_f <= 4'b0;
                    btarget_f <= 5'd0;
                end
            end
        end
    end

    // Forwarding Logic for X Stage
    // We need to determine if the source register for the current X instruction
    // matches the destination register of the W or X stage instructions.
    // The X stage instruction is instr_x. The source register is rs_x (for MOV) or rd_x (for ADDI, it reads R[rd]).
    // Wait, ADDI reads R[rd] and writes R[rd]. MOV reads R[rs] and writes R[rd].
    
    // Let's define the source register index for the current X instruction.
    reg [1:0] src_reg_x;
    always @(*) begin
        case (instr_x[7:6])
            2'b00: src_reg_x = instr_x[5:4]; // ADDI reads R[rd]
            2'b01: src_reg_x = instr_x[3:2]; // MOV reads R[rs]
            default: src_reg_x = 2'b00;
        endcase
    end

    // Check W stage for forwarding
    reg fwd_from_w;
    reg signed [7:0] w_data;
    always @(*) begin
        fwd_from_w = 1'b0;
        w_data = 8'sh0;
        if (is_write_w && !is_halt_w) begin
            if (src_reg_x == rd_w) begin
                fwd_from_w = 1'b1;
                w_data = data_w;
            end
        end
    end

    // Check X stage for forwarding
    reg fwd_from_x;
    reg signed [7:0] x_data;
    always @(*) begin
        fwd_from_x = 1'b0;
        x_data = 8'sh0;
        if (!is_halt_x) begin
            // X stage produces a result for rd_x
            // Check if current X instruction's source matches previous X instruction's destination
            if (src_reg_x == rd_x) begin
                fwd_from_x = 1'b1;
                x_data = alu_res_x;
            end
        end
    end

    // Priority: W > X > RF
    always @(*) begin
        use_fwd_x = 1'b0;
        fwd_val_x = 8'sh0;
        if (fwd_from_w) begin
            use_fwd_x = 1'b1;
            fwd_val_x = w_data;
        end else if (fwd_from_x) begin
            use_fwd_x = 1'b1;
            fwd_val_x = x_data;
        end
    end

endmodule

raw model output

```verilog
// Branch convention: 1 delay slot. The instruction immediately following the branch
// is always fetched and executed regardless of the branch outcome. If the branch
// is taken, the instruction in the delay slot is effectively discarded from the
// architectural state (its writeback is suppressed or its effects are ignored),
// but it still consumes a cycle. The PC after the branch sequence is the branch
// target if taken, or PC+2 if not taken.
//
// Forwarding: Full forwarding from W and X stages to X stage ALU inputs.
// This handles back-to-back ADDI/MOV without stalls.

`default_nettype none

module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0
) (
    input               clk,
    input               rst,
    output reg [4:0]    pc_out,
    output reg          halt_out,
    output reg signed [7:0] r0_out
);

    // Register file: 4 registers, 8-bit signed
    reg signed [7:0] rf [0:3];

    // Pipeline registers
    // Stage F: Fetch
    reg [4:0] pc_f;
    reg [7:0] instr_f;
    reg [4:0] npc_f; // Next PC for the fetch stage (PC+1)
    reg [4:0] btarget_f; // Branch target if branch
    reg is_branch_f;
    reg [1:0] cond_f;
    reg [3:0] off_f;

    // Stage X: Decode/Execute
    reg [7:0] instr_x;
    reg [4:0] pc_x;
    reg [4:0] npc_x;
    reg [4:0] btarget_x;
    reg is_branch_x;
    reg [1:0] cond_x;
    reg [3:0] off_x;
    reg [1:0] rd_x;
    reg [1:0] rs_x;
    reg signed [7:0] alu_res_x;
    reg is_halt_x;
    reg branch_taken_x; // Result of branch condition

    // Stage W: Writeback
    reg [7:0] instr_w;
    reg [1:0] rd_w;
    reg signed [7:0] data_w;
    reg is_halt_w;
    reg is_write_w; // Valid write to register file

    // Forwarding logic
    // We need to forward to the X stage inputs.
    // The X stage reads rs. It might need data from W (just committing) or X (current ALU result).
    // Priority: W stage > X stage > Register File.
    
    reg signed [7:0] fwd_val_x;
    reg use_fwd_x;

    // Helper: Sign extend 4-bit to 8-bit
    function signed [7:0] sext4;
        input [3:0] val;
        begin
            sext4 = {{4{val[3]}}, val};
        end
    endfunction

    // Helper: Sign extend 4-bit offset to 5-bit for PC addition
    function signed [4:0] sext4_pc;
        input [3:0] val;
        begin
            sext4_pc = {{5{val[3]}}, val};
        end
    endfunction

    // Reset logic
    always @(posedge clk) begin
        if (rst) begin
            pc_f <= 5'd0;
            instr_f <= 8'h0;
            npc_f <= 5'd1;
            btarget_f <= 5'd0;
            is_branch_f <= 1'b0;
            cond_f <= 2'b00;
            off_f <= 4'b0;

            instr_x <= 8'h0;
            pc_x <= 5'd0;
            npc_x <= 5'd1;
            btarget_x <= 5'd0;
            is_branch_x <= 1'b0;
            cond_x <= 2'b00;
            off_x <= 4'b0;
            rd_x <= 2'b00;
            rs_x <= 2'b00;
            alu_res_x <= 8'sh0;
            is_halt_x <= 1'b0;
            branch_taken_x <= 1'b0;

            instr_w <= 8'h0;
            rd_w <= 2'b00;
            data_w <= 8'sh0;
            is_halt_w <= 1'b0;
            is_write_w <= 1'b0;

            for (integer i = 0; i < 4; i = i + 1) rf[i] <= 8'sh0;

            pc_out <= 5'd0;
            halt_out <= 1'b0;
            r0_out <= 8'sh0;
        end else begin
            // Update outputs
            pc_out <= pc_f;
            r0_out <= rf[0];

            // Update Halt
            if (is_halt_w) begin
                halt_out <= 1'b1;
            end else if (halt_out) begin
                halt_out <= 1'b1; // Stay halted
            end else begin
                halt_out <= 1'b0;
            end

            // Update Register File
            if (!halt_out) begin
                if (is_write_w) begin
                    rf[rd_w] <= data_w;
                end
            end

            // Stage W: Writeback
            if (!halt_out) begin
                instr_w <= instr_x;
                rd_w <= rd_x;
                data_w <= alu_res_x;
                is_halt_w <= is_halt_x;
                is_write_w <= (!is_halt_x) && (instr_x[7:6] == 2'b00 || instr_x[7:6] == 2'b01);
            end

            // Stage X: Decode/Execute
            if (!halt_out) begin
                instr_x <= instr_f;
                pc_x <= pc_f;
                npc_x <= npc_f;
                btarget_x <= btarget_f;
                is_branch_x <= is_branch_f;
                cond_x <= cond_f;
                off_x <= off_f;

                // Decode instruction
                case (instr_f[7:6])
                    2'b00: begin // ADDI
                        rd_x <= instr_f[5:4];
                        rs_x <= 2'b00; // Not used
                        // ALU: R[rd] + sign_ext(imm)
                        // Read R[rd] with forwarding
                        if (use_fwd_x) begin
                            alu_res_x <= fwd_val_x + sext4(instr_f[3:0]);
                        end else begin
                            alu_res_x <= rf[instr_f[5:4]] + sext4(instr_f[3:0]);
                        end
                        is_halt_x <= 1'b0;
                        branch_taken_x <= 1'b0;
                    end
                    2'b01: begin // MOV
                        rd_x <= instr_f[5:4];
                        rs_x <= instr_f[3:2];
                        // ALU: R[rs]
                        if (use_fwd_x) begin
                            alu_res_x <= fwd_val_x;
                        end else begin
                            alu_res_x <= rf[instr_f[3:2]];
                        end
                        is_halt_x <= 1'b0;
                        branch_taken_x <= 1'b0;
                    end
                    2'b10: begin // BRcc
                        rd_x <= 2'b00; // Not used
                        rs_x <= 2'b00; // Not used
                        // Evaluate branch condition
                        case (instr_f[5:4])
                            2'b00: branch_taken_x <= 1'b1; // Always
                            2'b01: branch_taken_x <= (rf[0] == 8'sh0) ? 1'b1 : 1'b0; // Zero R0
                            2'b10: branch_taken_x <= (rf[0] < 8'sh0) ? 1'b1 : 1'b0; // Neg R0
                            2'b11: branch_taken_x <= (rf[0] > 8'sh0) ? 1'b1 : 1'b0; // Pos R0
                            default: branch_taken_x <= 1'b0;
                        endcase
                        alu_res_x <= 8'sh0; // Dummy
                        is_halt_x <= 1'b0;
                    end
                    2'b11: begin // HALT
                        rd_x <= 2'b00;
                        rs_x <= 2'b00;
                        alu_res_x <= 8'sh0;
                        is_halt_x <= 1'b1;
                        branch_taken_x <= 1'b0;
                    end
                    default: begin
                        rd_x <= 2'b00;
                        rs_x <= 2'b00;
                        alu_res_x <= 8'sh0;
                        is_halt_x <= 1'b0;
                        branch_taken_x <= 1'b0;
                    end
                endcase
            end

            // Stage F: Fetch
            if (!halt_out) begin
                // Determine next PC for fetch
                // If previous stage (X) was a branch and taken, next PC is branch target.
                // Note: The instruction in the delay slot (fetched by this cycle's F) 
                // is executed but its effects might be discarded if we consider the 
                // "discard delay slot" model. However, the problem says "1 branch delay 
                // slot is acceptable". Standard MIPS-style: delay slot instruction executes.
                // The PC after the branch+delay sequence is target if taken, else npc.
                
                // We need to update pc_f for the *next* fetch.
                // The current pc_f is being used to fetch instr_f.
                // The next pc_f should be:
                // If X stage branch taken: btarget_x
                // Else: npc_x (which is pc_x + 1)
                
                if (is_branch_x && branch_taken_x) begin
                    pc_f <= btarget_x;
                end else begin
                    pc_f <= npc_x;
                end
                
                // Fetch instruction at pc_f
                // IMEM_INIT is 256 bits. Byte 0 is at MSB.
                // PC is byte address.
                // instr = IMEM_INIT[PC*8 +: 8]
                instr_f <= IMEM_INIT[pc_f*8 +: 8];
                
                // Compute next PC (sequential)
                npc_f <= pc_f + 5'd1;
                
                // Decode branch info for next stage
                if (instr_f[7:6] == 2'b10) begin
                    is_branch_f <= 1'b1;
                    cond_f <= instr_f[5:4];
                    off_f <= instr_f[3:0];
                    btarget_f <= pc_f + sext4_pc(instr_f[3:0]) + 5'd1;
                end else begin
                    is_branch_f <= 1'b0;
                    cond_f <= 2'b00;
                    off_f <= 4'b0;
                    btarget_f <= 5'd0;
                end
            end
        end
    end

    // Forwarding Logic for X Stage
    // We need to determine if the source register for the current X instruction
    // matches the destination register of the W or X stage instructions.
    // The X stage instruction is instr_x. The source register is rs_x (for MOV) or rd_x (for ADDI, it reads R[rd]).
    // Wait, ADDI reads R[rd] and writes R[rd]. MOV reads R[rs] and writes R[rd].
    
    // Let's define the source register index for the current X instruction.
    reg [1:0] src_reg_x;
    always @(*) begin
        case (instr_x[7:6])
            2'b00: src_reg_x = instr_x[5:4]; // ADDI reads R[rd]
            2'b01: src_reg_x = instr_x[3:2]; // MOV reads R[rs]
            default: src_reg_x = 2'b00;
        endcase
    end

    // Check W stage for forwarding
    reg fwd_from_w;
    reg signed [7:0] w_data;
    always @(*) begin
        fwd_from_w = 1'b0;
        w_data = 8'sh0;
        if (is_write_w && !is_halt_w) begin
            if (src_reg_x == rd_w) begin
                fwd_from_w = 1'b1;
                w_data = data_w;
            end
        end
    end

    // Check X stage for forwarding
    reg fwd_from_x;
    reg signed [7:0] x_data;
    always @(*) begin
        fwd_from_x = 1'b0;
        x_data = 8'sh0;
        if (!is_halt_x) begin
            // X stage produces a result for rd_x
            // Check if current X instruction's source matches previous X instruction's destination
            if (src_reg_x == rd_x) begin
                fwd_from_x = 1'b1;
                x_data = alu_res_x;
            end
        end
    end

    // Priority: W > X > RF
    always @(*) begin
        use_fwd_x = 1'b0;
        fwd_val_x = 8'sh0;
        if (fwd_from_w) begin
            use_fwd_x = 1'b1;
            fwd_val_x = w_data;
        end else if (fwd_from_x) begin
            use_fwd_x = 1'b1;
            fwd_val_x = x_data;
        end
    end

endmodule

`default_nettype wire
```