| program | R0 | halt | cycles | verdict |
|---|---|---|---|---|---|
| load_add_halt | r0=8 (want 8) | halted@pc=4 | 4 cyc | PASS |
| addi_negative_halt | r0=5 (want 5) | halted@pc=4 | 4 cyc | PASS |
| mov_chain | r0=3 (want 3) | halted@pc=4 | 4 cyc | PASS |
| branch_loop | r0=-3 (want -2) | halted@pc=5 | 5 cyc | FAIL |
(no automated diagnostics fired)
// Minimalist 3-stage MCU reference. Pipeline:
// F : fetch ins, compute next PC (no branch prediction; mispred costs 0 since
// we use a 1-slot delay-slot convention)
// X : decode/execute (read regs, ALU, decide branch taken)
// W : writeback to reg file or HALT
// Forwarding: X reads from the W stage's writeback if it targets the same reg
// (i.e. write-through reg file) so back-to-back ALU works.
// Branch convention: 1 delay slot — instruction immediately after a taken
// branch executes; PC for the branch is computed in X and applied in F next.
// mcu3: 3-stage (F / X / W) micro-controller core with a 32-byte instruction
// ROM, four signed 8-bit registers and a single branch delay slot.
//
// Parameters:
//   IMEM_INIT - 32 instruction bytes packed into 256 bits, byte 0 in the
//               most significant byte (the vector is declared [0:255]).
// Ports:
//   clk, rst  - clock and synchronous, active-high reset.
//   pc_out    - address of the instruction fetched in the most recent cycle.
//   halt_out  - set once a HALT retires; the core then freezes until reset.
//   r0_out    - mirror of register R0, updated whenever R0 is written back.
//
// Instruction encoding (ins[7:6] = opcode):
//   00 ADDI rd, imm4   rd = ins[5:4], imm = sign-extended ins[3:0]
//   01 MOV  rd, rs     rd = ins[5:4], rs  = ins[3:2]
//   10 BRcc off4       cond = ins[5:4] (00 always, 01 R0==0, 10 R0<0, 11 R0>0)
//   11 HALT
//
// Branch timing: for a taken branch at address a, the instruction at a+1
// (the delay slot) executes, and the next instruction comes from
// a + 2 + sign_extend(off4).
module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0
) (
    input                   clk,
    input                   rst,
    output reg        [4:0] pc_out,
    output reg              halt_out,
    output reg signed [7:0] r0_out
);
    // Instruction memory read. Bit 0 of IMEM_INIT is the MSB, so byte k
    // occupies bits [k*8 +: 8].
    function [7:0] imem_byte;
        input [4:0] addr;
        begin
            imem_byte = IMEM_INIT[addr*8 +: 8];
        end
    endfunction

    // Sign-extend a 4-bit immediate to 8 bits.
    function signed [7:0] sxt4;
        input [3:0] v;
        sxt4 = {{4{v[3]}}, v};
    endfunction

    // Register file
    reg signed [7:0] R [0:3];

    // F stage. Invariant: pc_f == (address ins_f was fetched from) + 1.
    reg [4:0] pc_f;
    reg [7:0] ins_f;
    reg       valid_f;

    // X stage (results are applied by the W logic on the following edge)
    reg [4:0]        pc_x;
    reg [7:0]        ins_x;
    reg              valid_x;
    reg signed [7:0] alu_result_x;
    reg [1:0]        wb_rd_x;
    reg              wb_en_x;
    reg              halt_x;
    reg              branch_taken_x;
    reg [4:0]        branch_target_x;

    // The instruction held in the X registers writes a register this edge,
    // so its result may be forwarded to the instruction being decoded.
    wire fwd_valid = valid_x && wb_en_x && !halt_x;

    // A taken branch held in the X registers redirects THIS cycle's fetch.
    // The instruction already sitting in ins_f is the single delay slot.
    // Steering only the next pc_f (as before) let a second sequential
    // instruction slip through and execute.
    wire       redirect   = valid_x && branch_taken_x;
    wire [4:0] fetch_addr = redirect ? branch_target_x : pc_f;

    integer i;

    always @(posedge clk) begin
        if (rst) begin
            pc_f <= 5'd0; ins_f <= 8'd0; valid_f <= 1'b0;
            pc_x <= 5'd0; ins_x <= 8'd0; valid_x <= 1'b0;
            alu_result_x <= 8'sd0; wb_rd_x <= 2'd0; wb_en_x <= 1'b0;
            halt_x <= 1'b0; branch_taken_x <= 1'b0; branch_target_x <= 5'd0;
            for (i = 0; i < 4; i = i + 1) R[i] <= 8'sd0;
            halt_out <= 1'b0; r0_out <= 8'sd0; pc_out <= 5'd0;
        end else if (halt_out) begin
            // Halted: hold all state until reset.
        end else begin
            // ---- W: retire the instruction held in the X registers ----
            if (fwd_valid) begin
                R[wb_rd_x] <= alu_result_x;
                if (wb_rd_x == 2'd0) r0_out <= alu_result_x;
            end
            if (valid_x && halt_x) halt_out <= 1'b1;

            // ---- F -> X pipeline registers ----
            pc_x    <= pc_f;
            ins_x   <= ins_f;
            valid_x <= valid_f;

            // ---- X: decode/execute ins_f into the X registers ----
            begin : decode_exec
                reg [1:0] op;
                reg [1:0] rd, rs, cond;
                reg [3:0] imm, off;
                reg signed [7:0] rs_val, rd_val, r0_val;

                op   = ins_f[7:6];
                rd   = ins_f[5:4];
                rs   = ins_f[3:2];
                cond = ins_f[5:4];
                imm  = ins_f[3:0];
                off  = ins_f[3:0];

                // Operand read with forwarding from the retiring instruction.
                rd_val = (fwd_valid && wb_rd_x == rd)   ? alu_result_x : R[rd];
                rs_val = (fwd_valid && wb_rd_x == rs)   ? alu_result_x : R[rs];
                // Branch conditions test R0 and need the same forwarding;
                // otherwise a branch right after a write to R0 sees the
                // stale value.
                r0_val = (fwd_valid && wb_rd_x == 2'd0) ? alu_result_x : R[0];

                // Defaults; overridden per opcode below.
                wb_en_x        <= 1'b0;
                halt_x         <= 1'b0;
                branch_taken_x <= 1'b0;
                wb_rd_x        <= rd;
                alu_result_x   <= 8'sd0;

                case (op)
                    2'b00: begin // ADDI
                        alu_result_x <= rd_val + sxt4(imm);
                        wb_en_x      <= 1'b1;
                    end
                    2'b01: begin // MOV rd, rs
                        alu_result_x <= rs_val;
                        wb_en_x      <= 1'b1;
                    end
                    2'b10: begin // BRcc
                        case (cond)
                            2'b00: branch_taken_x <= 1'b1;
                            2'b01: branch_taken_x <= (r0_val == 8'sd0);
                            2'b10: branch_taken_x <= (r0_val <  8'sd0);
                            2'b11: branch_taken_x <= (r0_val >  8'sd0);
                        endcase
                        // pc_f is this branch's address + 1, so the target is
                        // relative to the instruction after the delay slot.
                        branch_target_x <= pc_f + {{1{off[3]}}, off} + 5'd1;
                    end
                    2'b11: begin // HALT
                        halt_x <= 1'b1;
                    end
                endcase
            end

            // ---- F: fetch from the (possibly redirected) address ----
            ins_f   <= imem_byte(fetch_addr);
            valid_f <= 1'b1;
            pc_out  <= fetch_addr;
            pc_f    <= fetch_addr + 5'd1;
        end
    end
endmodule
// Minimalist 3-stage MCU reference. Pipeline:
// F : fetch ins, compute next PC (no branch prediction; mispred costs 0 since
// we use a 1-slot delay-slot convention)
// X : decode/execute (read regs, ALU, decide branch taken)
// W : writeback to reg file or HALT
// Forwarding: X reads from the W stage's writeback if it targets the same reg
// (i.e. write-through reg file) so back-to-back ALU works.
// Branch convention: 1 delay slot — instruction immediately after a taken
// branch executes; PC for the branch is computed in X and applied in F next.
// mcu3: 3-stage (F / X / W) micro-controller core with a 32-byte instruction
// ROM, four signed 8-bit registers and a single branch delay slot.
//
// Parameters:
//   IMEM_INIT - 32 instruction bytes packed into 256 bits, byte 0 in the
//               most significant byte (the vector is declared [0:255]).
// Ports:
//   clk, rst  - clock and synchronous, active-high reset.
//   pc_out    - address of the instruction fetched in the most recent cycle.
//   halt_out  - set once a HALT retires; the core then freezes until reset.
//   r0_out    - mirror of register R0, updated whenever R0 is written back.
//
// Instruction encoding (ins[7:6] = opcode):
//   00 ADDI rd, imm4   rd = ins[5:4], imm = sign-extended ins[3:0]
//   01 MOV  rd, rs     rd = ins[5:4], rs  = ins[3:2]
//   10 BRcc off4       cond = ins[5:4] (00 always, 01 R0==0, 10 R0<0, 11 R0>0)
//   11 HALT
//
// Branch timing: for a taken branch at address a, the instruction at a+1
// (the delay slot) executes, and the next instruction comes from
// a + 2 + sign_extend(off4).
module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0
) (
    input                   clk,
    input                   rst,
    output reg        [4:0] pc_out,
    output reg              halt_out,
    output reg signed [7:0] r0_out
);
    // Instruction memory read. Bit 0 of IMEM_INIT is the MSB, so byte k
    // occupies bits [k*8 +: 8].
    function [7:0] imem_byte;
        input [4:0] addr;
        begin
            imem_byte = IMEM_INIT[addr*8 +: 8];
        end
    endfunction

    // Sign-extend a 4-bit immediate to 8 bits.
    function signed [7:0] sxt4;
        input [3:0] v;
        sxt4 = {{4{v[3]}}, v};
    endfunction

    // Register file
    reg signed [7:0] R [0:3];

    // F stage. Invariant: pc_f == (address ins_f was fetched from) + 1.
    reg [4:0] pc_f;
    reg [7:0] ins_f;
    reg       valid_f;

    // X stage (results are applied by the W logic on the following edge)
    reg [4:0]        pc_x;
    reg [7:0]        ins_x;
    reg              valid_x;
    reg signed [7:0] alu_result_x;
    reg [1:0]        wb_rd_x;
    reg              wb_en_x;
    reg              halt_x;
    reg              branch_taken_x;
    reg [4:0]        branch_target_x;

    // The instruction held in the X registers writes a register this edge,
    // so its result may be forwarded to the instruction being decoded.
    wire fwd_valid = valid_x && wb_en_x && !halt_x;

    // A taken branch held in the X registers redirects THIS cycle's fetch.
    // The instruction already sitting in ins_f is the single delay slot.
    // Steering only the next pc_f (as before) let a second sequential
    // instruction slip through and execute.
    wire       redirect   = valid_x && branch_taken_x;
    wire [4:0] fetch_addr = redirect ? branch_target_x : pc_f;

    integer i;

    always @(posedge clk) begin
        if (rst) begin
            pc_f <= 5'd0; ins_f <= 8'd0; valid_f <= 1'b0;
            pc_x <= 5'd0; ins_x <= 8'd0; valid_x <= 1'b0;
            alu_result_x <= 8'sd0; wb_rd_x <= 2'd0; wb_en_x <= 1'b0;
            halt_x <= 1'b0; branch_taken_x <= 1'b0; branch_target_x <= 5'd0;
            for (i = 0; i < 4; i = i + 1) R[i] <= 8'sd0;
            halt_out <= 1'b0; r0_out <= 8'sd0; pc_out <= 5'd0;
        end else if (halt_out) begin
            // Halted: hold all state until reset.
        end else begin
            // ---- W: retire the instruction held in the X registers ----
            if (fwd_valid) begin
                R[wb_rd_x] <= alu_result_x;
                if (wb_rd_x == 2'd0) r0_out <= alu_result_x;
            end
            if (valid_x && halt_x) halt_out <= 1'b1;

            // ---- F -> X pipeline registers ----
            pc_x    <= pc_f;
            ins_x   <= ins_f;
            valid_x <= valid_f;

            // ---- X: decode/execute ins_f into the X registers ----
            begin : decode_exec
                reg [1:0] op;
                reg [1:0] rd, rs, cond;
                reg [3:0] imm, off;
                reg signed [7:0] rs_val, rd_val, r0_val;

                op   = ins_f[7:6];
                rd   = ins_f[5:4];
                rs   = ins_f[3:2];
                cond = ins_f[5:4];
                imm  = ins_f[3:0];
                off  = ins_f[3:0];

                // Operand read with forwarding from the retiring instruction.
                rd_val = (fwd_valid && wb_rd_x == rd)   ? alu_result_x : R[rd];
                rs_val = (fwd_valid && wb_rd_x == rs)   ? alu_result_x : R[rs];
                // Branch conditions test R0 and need the same forwarding;
                // otherwise a branch right after a write to R0 sees the
                // stale value.
                r0_val = (fwd_valid && wb_rd_x == 2'd0) ? alu_result_x : R[0];

                // Defaults; overridden per opcode below.
                wb_en_x        <= 1'b0;
                halt_x         <= 1'b0;
                branch_taken_x <= 1'b0;
                wb_rd_x        <= rd;
                alu_result_x   <= 8'sd0;

                case (op)
                    2'b00: begin // ADDI
                        alu_result_x <= rd_val + sxt4(imm);
                        wb_en_x      <= 1'b1;
                    end
                    2'b01: begin // MOV rd, rs
                        alu_result_x <= rs_val;
                        wb_en_x      <= 1'b1;
                    end
                    2'b10: begin // BRcc
                        case (cond)
                            2'b00: branch_taken_x <= 1'b1;
                            2'b01: branch_taken_x <= (r0_val == 8'sd0);
                            2'b10: branch_taken_x <= (r0_val <  8'sd0);
                            2'b11: branch_taken_x <= (r0_val >  8'sd0);
                        endcase
                        // pc_f is this branch's address + 1, so the target is
                        // relative to the instruction after the delay slot.
                        branch_target_x <= pc_f + {{1{off[3]}}, off} + 5'd1;
                    end
                    2'b11: begin // HALT
                        halt_x <= 1'b1;
                    end
                endcase
            end

            // ---- F: fetch from the (possibly redirected) address ----
            ins_f   <= imem_byte(fetch_addr);
            valid_f <= 1'b1;
            pc_out  <= fetch_addr;
            pc_f    <= fetch_addr + 5'd1;
        end
    end
endmodule