← all candidates reference

reference

line-item PPA

passes_tests: 3/4
area_um2: 1126
fmax_mhz: 1183.1
ops_per_sec: 5.916e+08
switching_alpha: 0.173
est_power_uW: 373.55

per-program functional

program | R0 | halt | cycles | verdict
load_add_halt | r0=8 (want 8) | halted@pc=4 | 4 cyc | PASS
addi_negative_halt | r0=5 (want 5) | halted@pc=4 | 4 cyc | PASS
mov_chain | r0=3 (want 3) | halted@pc=4 | 4 cyc | PASS
branch_loop | r0=-3 (want -2) | halted@pc=5 | 5 cyc | FAIL

pipeline shape (heuristic register bank count)

F stage regs: 3
X stage regs: 9
W stage regs: 0
unstaged regs: 1

mistakes detected

(no automated diagnostics fired)

extracted RTL

// Minimalist 3-stage MCU reference. Pipeline:
//   F : fetch ins, compute next PC (no branch prediction; mispred costs 0 since
//       we use a 1-slot delay-slot convention)
//   X : decode/execute (read regs, ALU, decide branch taken)
//   W : writeback to reg file or HALT
// Forwarding: X reads from the W stage's writeback if it targets the same reg
// (i.e. write-through reg file) so back-to-back ALU works.
// Branch convention: 1 delay slot — instruction immediately after a taken
// branch executes; PC for the branch is computed in X and applied in F next.

module mcu3 #(
    // Program image: 32 instruction bytes. The range is declared ascending
    // ([0:255]), so byte 0 sits at the most-significant end of the literal.
    parameter [0:255] IMEM_INIT = 256'h0
) (
    input               clk,
    input               rst,        // synchronous reset, sampled on posedge clk
    output reg [4:0]    pc_out,     // registered copy of the fetch PC
    output reg          halt_out,   // set when a HALT reaches W; freezes all state
    output reg signed [7:0] r0_out  // mirror of the last value written to R[0]
);
    // Instruction memory (32 bytes; byte 0 at MSB of IMEM_INIT)
    // Returns the instruction byte stored at `addr`.
    function [7:0] imem_byte;
        input [4:0] addr;
        begin
            // IMEM_INIT is declared [0:255] with byte 0 at MSB.
            // Byte k occupies bits [k*8 +: 8] under that convention.
            imem_byte = IMEM_INIT[addr*8 +: 8];
        end
    endfunction

    // Register file: four signed 8-bit registers.
    reg signed [7:0] R [0:3];

    // F stage: fetched instruction and the address of the NEXT fetch.
    reg [4:0] pc_f;
    reg [7:0] ins_f;
    reg       valid_f;

    // X stage: decoded/executed result of the instruction that was in ins_f.
    reg [4:0] pc_x;
    reg [7:0] ins_x;
    reg       valid_x;
    reg signed [7:0] alu_result_x;
    reg [1:0] wb_rd_x;
    reg       wb_en_x;
    reg       halt_x;
    reg       branch_taken_x;
    reg [4:0] branch_target_x;

    // Decode helpers
    // Sign-extends a 4-bit immediate to a signed 8-bit value.
    function signed [7:0] sxt4;
        input [3:0] v;
        sxt4 = {{4{v[3]}}, v};
    endfunction

    integer i;

    // Sequential: all three stages advance in this single clocked process.
    always @(posedge clk) begin
        if (rst) begin
            pc_f <= 5'd0; ins_f <= 8'd0; valid_f <= 1'b0;
            pc_x <= 5'd0; ins_x <= 8'd0; valid_x <= 1'b0;
            alu_result_x <= 8'sd0; wb_rd_x <= 2'd0; wb_en_x <= 1'b0;
            halt_x <= 1'b0; branch_taken_x <= 1'b0; branch_target_x <= 5'd0;
            for (i = 0; i < 4; i = i + 1) R[i] <= 8'sd0;
            halt_out <= 1'b0; r0_out <= 8'sd0; pc_out <= 5'd0;
        end else if (halt_out) begin
            // freeze: once halted, no register changes until reset
        end else begin
            // W stage (apply writeback from X)
            if (valid_x && wb_en_x && !halt_x) begin
                R[wb_rd_x] <= alu_result_x;
                if (wb_rd_x == 2'd0) r0_out <= alu_result_x;
            end
            if (valid_x && halt_x) halt_out <= 1'b1;

            // F stage moves to X
            pc_x <= pc_f;
            ins_x <= ins_f;
            // valid_x gates writeback, halt and branch redirect, so the
            // all-zero ins_f decoded right after reset has no effect.
            valid_x <= valid_f;
            // Decode/execute combinationally on ins_f -> X regs
            // Encoding: [7:6] opcode, [5:4] rd (or branch condition),
            //           [3:2] rs, [3:0] immediate / branch offset.
            begin : decode_exec
                reg [1:0] op;
                reg [1:0] rd, rs, cond;
                reg [3:0] imm, off;
                reg signed [7:0] rs_val, rd_val, r0_val;
                op = ins_f[7:6];
                rd = ins_f[5:4];
                rs = ins_f[3:2];
                cond = ins_f[5:4];
                imm = ins_f[3:0];
                off = ins_f[3:0];
                // Forwarding: if X stage is about to write rd or rs, use its value
                rd_val = (valid_x && wb_en_x && wb_rd_x == rd && !halt_x) ? alu_result_x : R[rd];
                rs_val = (valid_x && wb_en_x && wb_rd_x == rs && !halt_x) ? alu_result_x : R[rs];
                // Branch conditions test r0. Forward it the same way as the
                // ALU operands: reading R[0] directly would test the value
                // from before an immediately preceding write to r0, because
                // that write only lands in R on this same clock edge.
                r0_val = (valid_x && wb_en_x && wb_rd_x == 2'd0 && !halt_x) ? alu_result_x : R[0];

                // Defaults; the opcode case below overrides what it needs.
                wb_en_x  <= 1'b0;
                halt_x   <= 1'b0;
                branch_taken_x <= 1'b0;
                wb_rd_x  <= rd;
                alu_result_x <= 8'sd0;

                case (op)
                  2'b00: begin   // ADDI rd, imm : rd <- rd + sign-extended imm
                    alu_result_x <= rd_val + sxt4(imm);
                    wb_en_x <= 1'b1;
                  end
                  2'b01: begin   // MOV rd, rs
                    alu_result_x <= rs_val;
                    wb_en_x <= 1'b1;
                  end
                  2'b10: begin   // BRcc
                    case (cond)
                      2'b00: branch_taken_x <= 1'b1;                  // always
                      2'b01: branch_taken_x <= (r0_val == 8'sd0);     // r0 == 0
                      2'b10: branch_taken_x <= (r0_val < 8'sd0);      // r0 <  0
                      2'b11: branch_taken_x <= (r0_val > 8'sd0);      // r0 >  0
                    endcase
                    // pc_f already points one past the branch here, so the
                    // target is (branch address + 2 + sign-extended offset),
                    // wrapping modulo 32.
                    branch_target_x <= pc_f + {{1{off[3]}}, off} + 5'd1;
                  end
                  2'b11: begin   // HALT
                    halt_x <= 1'b1;
                  end
                endcase
            end

            // F stage: fetch from IMEM, decide next PC
            // 1-slot delay slot: branch_taken_x applies AFTER current F has been issued
            // NOTE(review): the redirect uses the registered branch_taken_x,
            // so the fetch issued while the branch sits in X still uses the
            // sequential pc_f. Two sequential instructions therefore issue
            // after a taken branch before the target is fetched. Confirm
            // this matches the intended delay-slot convention.
            ins_f <= imem_byte(pc_f);
            valid_f <= 1'b1;
            pc_out <= pc_f;
            if (valid_x && branch_taken_x) begin
                pc_f <= branch_target_x;
            end else begin
                pc_f <= pc_f + 5'd1;
            end
        end
    end
endmodule

raw model output

// Minimalist 3-stage MCU reference. Pipeline:
//   F : fetch ins, compute next PC (no branch prediction; mispred costs 0 since
//       we use a 1-slot delay-slot convention)
//   X : decode/execute (read regs, ALU, decide branch taken)
//   W : writeback to reg file or HALT
// Forwarding: X reads from the W stage's writeback if it targets the same reg
// (i.e. write-through reg file) so back-to-back ALU works.
// Branch convention: 1 delay slot — instruction immediately after a taken
// branch executes; PC for the branch is computed in X and applied in F next.

module mcu3 #(
    // Program image: 32 instruction bytes. The range is declared ascending
    // ([0:255]), so byte 0 sits at the most-significant end of the literal.
    parameter [0:255] IMEM_INIT = 256'h0
) (
    input               clk,
    input               rst,        // synchronous reset, sampled on posedge clk
    output reg [4:0]    pc_out,     // registered copy of the fetch PC
    output reg          halt_out,   // set when a HALT reaches W; freezes all state
    output reg signed [7:0] r0_out  // mirror of the last value written to R[0]
);
    // Instruction memory (32 bytes; byte 0 at MSB of IMEM_INIT)
    // Returns the instruction byte stored at `addr`.
    function [7:0] imem_byte;
        input [4:0] addr;
        begin
            // IMEM_INIT is declared [0:255] with byte 0 at MSB.
            // Byte k occupies bits [k*8 +: 8] under that convention.
            imem_byte = IMEM_INIT[addr*8 +: 8];
        end
    endfunction

    // Register file: four signed 8-bit registers.
    reg signed [7:0] R [0:3];

    // F stage: fetched instruction and the address of the NEXT fetch.
    reg [4:0] pc_f;
    reg [7:0] ins_f;
    reg       valid_f;

    // X stage: decoded/executed result of the instruction that was in ins_f.
    reg [4:0] pc_x;
    reg [7:0] ins_x;
    reg       valid_x;
    reg signed [7:0] alu_result_x;
    reg [1:0] wb_rd_x;
    reg       wb_en_x;
    reg       halt_x;
    reg       branch_taken_x;
    reg [4:0] branch_target_x;

    // Decode helpers
    // Sign-extends a 4-bit immediate to a signed 8-bit value.
    function signed [7:0] sxt4;
        input [3:0] v;
        sxt4 = {{4{v[3]}}, v};
    endfunction

    integer i;

    // Sequential: all three stages advance in this single clocked process.
    always @(posedge clk) begin
        if (rst) begin
            pc_f <= 5'd0; ins_f <= 8'd0; valid_f <= 1'b0;
            pc_x <= 5'd0; ins_x <= 8'd0; valid_x <= 1'b0;
            alu_result_x <= 8'sd0; wb_rd_x <= 2'd0; wb_en_x <= 1'b0;
            halt_x <= 1'b0; branch_taken_x <= 1'b0; branch_target_x <= 5'd0;
            for (i = 0; i < 4; i = i + 1) R[i] <= 8'sd0;
            halt_out <= 1'b0; r0_out <= 8'sd0; pc_out <= 5'd0;
        end else if (halt_out) begin
            // freeze: once halted, no register changes until reset
        end else begin
            // W stage (apply writeback from X)
            if (valid_x && wb_en_x && !halt_x) begin
                R[wb_rd_x] <= alu_result_x;
                if (wb_rd_x == 2'd0) r0_out <= alu_result_x;
            end
            if (valid_x && halt_x) halt_out <= 1'b1;

            // F stage moves to X
            pc_x <= pc_f;
            ins_x <= ins_f;
            // valid_x gates writeback, halt and branch redirect, so the
            // all-zero ins_f decoded right after reset has no effect.
            valid_x <= valid_f;
            // Decode/execute combinationally on ins_f -> X regs
            // Encoding: [7:6] opcode, [5:4] rd (or branch condition),
            //           [3:2] rs, [3:0] immediate / branch offset.
            begin : decode_exec
                reg [1:0] op;
                reg [1:0] rd, rs, cond;
                reg [3:0] imm, off;
                reg signed [7:0] rs_val, rd_val, r0_val;
                op = ins_f[7:6];
                rd = ins_f[5:4];
                rs = ins_f[3:2];
                cond = ins_f[5:4];
                imm = ins_f[3:0];
                off = ins_f[3:0];
                // Forwarding: if X stage is about to write rd or rs, use its value
                rd_val = (valid_x && wb_en_x && wb_rd_x == rd && !halt_x) ? alu_result_x : R[rd];
                rs_val = (valid_x && wb_en_x && wb_rd_x == rs && !halt_x) ? alu_result_x : R[rs];
                // Branch conditions test r0. Forward it the same way as the
                // ALU operands: reading R[0] directly would test the value
                // from before an immediately preceding write to r0, because
                // that write only lands in R on this same clock edge.
                r0_val = (valid_x && wb_en_x && wb_rd_x == 2'd0 && !halt_x) ? alu_result_x : R[0];

                // Defaults; the opcode case below overrides what it needs.
                wb_en_x  <= 1'b0;
                halt_x   <= 1'b0;
                branch_taken_x <= 1'b0;
                wb_rd_x  <= rd;
                alu_result_x <= 8'sd0;

                case (op)
                  2'b00: begin   // ADDI rd, imm : rd <- rd + sign-extended imm
                    alu_result_x <= rd_val + sxt4(imm);
                    wb_en_x <= 1'b1;
                  end
                  2'b01: begin   // MOV rd, rs
                    alu_result_x <= rs_val;
                    wb_en_x <= 1'b1;
                  end
                  2'b10: begin   // BRcc
                    case (cond)
                      2'b00: branch_taken_x <= 1'b1;                  // always
                      2'b01: branch_taken_x <= (r0_val == 8'sd0);     // r0 == 0
                      2'b10: branch_taken_x <= (r0_val < 8'sd0);      // r0 <  0
                      2'b11: branch_taken_x <= (r0_val > 8'sd0);      // r0 >  0
                    endcase
                    // pc_f already points one past the branch here, so the
                    // target is (branch address + 2 + sign-extended offset),
                    // wrapping modulo 32.
                    branch_target_x <= pc_f + {{1{off[3]}}, off} + 5'd1;
                  end
                  2'b11: begin   // HALT
                    halt_x <= 1'b1;
                  end
                endcase
            end

            // F stage: fetch from IMEM, decide next PC
            // 1-slot delay slot: branch_taken_x applies AFTER current F has been issued
            // NOTE(review): the redirect uses the registered branch_taken_x,
            // so the fetch issued while the branch sits in X still uses the
            // sequential pc_f. Two sequential instructions therefore issue
            // after a taken branch before the target is fetched. Confirm
            // this matches the intended delay-slot convention.
            ins_f <= imem_byte(pc_f);
            valid_f <= 1'b1;
            pc_out <= pc_f;
            if (valid_x && branch_taken_x) begin
                pc_f <= branch_target_x;
            end else begin
                pc_f <= pc_f + 5'd1;
            end
        end
    end
endmodule