no functional run (did not compile)
// mcu3 - 3-stage (F/X/W) pipelined 8-bit microcontroller core.
//
// Instruction encoding (8 bits, opcode in instr[7:6]):
//   00 rd  imm4    ADDI  R[rd] <- R[rd] + sign_extend(imm4)
//   01 rd  rs xx   MOV   R[rd] <- R[rs]              (instr[1:0] ignored)
//   10 cc  off4    BRcc  if cc holds, PC <- PC_of_branch + 1 + sign_extend(off4)
//                        cc: 00 always, 01 R0 == 0, 10 R0 < 0, 11 R0 > 0
//   11 xxxxxx      HALT
//
// Pipeline:
//   F  reads the instruction byte addressed by pc_out.
//   X  decodes, reads operands (forwarding from W), executes, resolves branches.
//   W  commits the result to the register file / r0_out and raises halt_out.
//
// Branch convention: one branch delay slot. A branch is resolved while it is
// in X, at which point the following instruction is already being fetched;
// that instruction always executes, and the branch target (if taken) is
// fetched right after it.
//
// Hazards: the register file is written on the clock edge that ends W, so the
// only instruction whose result is not yet readable from R[] is the one
// currently in W. A single W->X forwarding path therefore removes all
// read-after-write stalls (ADDI operand, MOV source and the R0 branch test).
//
// HALT: once HALT is decoded in X no younger instruction enters X, and pc_out
// stops advancing (it keeps the address that was being fetched at that time).
// halt_out rises when HALT leaves W and stays high until reset.
module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0 // 32 instruction bytes, byte 0 at MSB
) (
    input                   clk,
    input                   rst,      // synchronous, active-high
    output reg [4:0]        pc_out,   // current fetched PC (visible for testing)
    output reg              halt_out, // asserted forever after HALT executes
    output reg signed [7:0] r0_out    // current value of register R0
);
    // Opcodes (instr[7:6]) and branch condition codes (instr[5:4]).
    localparam [1:0] OP_ADDI   = 2'b00,
                     OP_MOV    = 2'b01,
                     OP_BR     = 2'b10,
                     OP_HALT   = 2'b11;
    localparam [1:0] CC_ALWAYS = 2'b00,
                     CC_ZERO   = 2'b01,
                     CC_NEG    = 2'b10;
                     // 2'b11 (R0 > 0) is handled by the case default below.

    // Register file: 4 registers, 8-bit signed.
    reg signed [7:0] R [0:3];

    // F -> X pipeline register.
    reg              valid_x;
    reg [7:0]        instr_x;
    reg [4:0]        pc_x;            // address instr_x was fetched from

    // X -> W pipeline register.
    reg              valid_w;
    reg              is_halt_w;
    reg              is_reg_write_w;
    reg [1:0]        rd_w;
    reg signed [7:0] result_w;

    // ------------------------------------------------------------------
    // F stage: instruction fetch.
    // IMEM_INIT is declared [0:255], so bit 0 is the MSB and byte k occupies
    // bits [8k : 8k+7]; an ascending "+:" select returns it MSB-first.
    // ------------------------------------------------------------------
    wire [0:255] imem      = IMEM_INIT;
    wire [7:0]   imem_data = imem[{pc_out, 3'b000} +: 8];

    // ------------------------------------------------------------------
    // X stage: decode.
    // ------------------------------------------------------------------
    wire [1:0] op_x   = instr_x[7:6];
    wire [1:0] rd_x   = instr_x[5:4];
    wire [1:0] cond_x = instr_x[5:4]; // same field as rd, used by BRcc
    wire [1:0] rs_x   = instr_x[3:2];
    wire [3:0] imm_x  = instr_x[3:0];

    wire is_mov_x       = valid_x && (op_x == OP_MOV);
    wire is_branch_x    = valid_x && (op_x == OP_BR);
    wire is_halt_x      = valid_x && (op_x == OP_HALT);
    wire is_reg_write_x = valid_x && ((op_x == OP_ADDI) || (op_x == OP_MOV));

    // ------------------------------------------------------------------
    // X stage: operand read with W->X forwarding.
    // ------------------------------------------------------------------
    wire fwd_w = valid_w && is_reg_write_w;

    wire signed [7:0] rd_val_x = (fwd_w && (rd_w == rd_x)) ? result_w : R[rd_x];
    wire signed [7:0] rs_val_x = (fwd_w && (rd_w == rs_x)) ? result_w : R[rs_x];
    wire signed [7:0] r0_val_x = (fwd_w && (rd_w == 2'd0)) ? result_w : R[0];

    // ------------------------------------------------------------------
    // X stage: execute.
    // ------------------------------------------------------------------
    wire [7:0]        imm_sext_x = {{4{imm_x[3]}}, imm_x};
    wire signed [7:0] result_x   = is_mov_x ? rs_val_x
                                            : (rd_val_x + imm_sext_x); // wraps mod 256

    // Branch condition on the (forwarded) value of R0. Sign tests use bit 7
    // directly so they do not depend on expression signedness rules.
    reg cond_met_x;
    always @(*) begin
        case (cond_x)
            CC_ALWAYS: cond_met_x = 1'b1;
            CC_ZERO:   cond_met_x = (r0_val_x == 8'd0);
            CC_NEG:    cond_met_x = r0_val_x[7];
            default:   cond_met_x = !r0_val_x[7] && (r0_val_x != 8'd0); // R0 > 0
        endcase
    end

    wire       branch_taken_x  = is_branch_x && cond_met_x;
    // Target = branch PC + 1 + sign_extend(off4), modulo 32.
    wire [4:0] branch_target_x = pc_x + 5'd1 + {imm_x[3], imm_x};

    // ------------------------------------------------------------------
    // Fetch control.
    // ------------------------------------------------------------------
    // Stop feeding X as soon as a HALT is anywhere in the pipe (or retired).
    wire       stop_fetch = halt_out || (valid_w && is_halt_w) || is_halt_x;
    wire [4:0] npc_f      = branch_taken_x ? branch_target_x : (pc_out + 5'd1);

    // ------------------------------------------------------------------
    // State update.
    // ------------------------------------------------------------------
    always @(posedge clk) begin
        if (rst) begin
            pc_out         <= 5'd0;
            halt_out       <= 1'b0;
            r0_out         <= 8'sd0;
            R[0]           <= 8'sd0;
            R[1]           <= 8'sd0;
            R[2]           <= 8'sd0;
            R[3]           <= 8'sd0;
            valid_x        <= 1'b0;
            instr_x        <= 8'd0;
            pc_x           <= 5'd0;
            valid_w        <= 1'b0;
            is_halt_w      <= 1'b0;
            is_reg_write_w <= 1'b0;
            rd_w           <= 2'd0;
            result_w       <= 8'sd0;
        end else begin
            // W: commit. r0_out mirrors R[0] so it is updated in lock-step.
            if (valid_w && is_reg_write_w) begin
                R[rd_w] <= result_w;
                if (rd_w == 2'd0)
                    r0_out <= result_w;
            end
            if (valid_w && is_halt_w)
                halt_out <= 1'b1;

            // X -> W. The is_* flags already include valid_x, so a bubble
            // in X becomes a bubble in W.
            valid_w        <= valid_x;
            is_halt_w      <= is_halt_x;
            is_reg_write_w <= is_reg_write_x;
            rd_w           <= rd_x;
            result_w       <= result_x;

            // F -> X. When stopped, X is fed bubbles and pc_out holds.
            if (stop_fetch) begin
                valid_x <= 1'b0;
            end else begin
                valid_x <= 1'b1;
                instr_x <= imem_data;
                pc_x    <= pc_out;   // PC of the instruction being latched
                pc_out  <= npc_f;
            end
        end
    end
endmodule
```verilog
// mcu3 - 3-stage (F/X/W) pipelined 8-bit microcontroller core.
//
// Instruction encoding (8 bits, opcode in instr[7:6]):
//   00 rd  imm4    ADDI  R[rd] <- R[rd] + sign_extend(imm4)
//   01 rd  rs xx   MOV   R[rd] <- R[rs]              (instr[1:0] ignored)
//   10 cc  off4    BRcc  if cc holds, PC <- PC_of_branch + 1 + sign_extend(off4)
//                        cc: 00 always, 01 R0 == 0, 10 R0 < 0, 11 R0 > 0
//   11 xxxxxx      HALT
//
// Pipeline:
//   F  reads the instruction byte addressed by pc_out.
//   X  decodes, reads operands (forwarding from W), executes, resolves branches.
//   W  commits the result to the register file / r0_out and raises halt_out.
//
// Branch convention: one branch delay slot. A branch is resolved while it is
// in X, at which point the following instruction is already being fetched;
// that instruction always executes, and the branch target (if taken) is
// fetched right after it.
//
// Hazards: the register file is written on the clock edge that ends W, so the
// only instruction whose result is not yet readable from R[] is the one
// currently in W. A single W->X forwarding path therefore removes all
// read-after-write stalls (ADDI operand, MOV source and the R0 branch test).
//
// HALT: once HALT is decoded in X no younger instruction enters X, and pc_out
// stops advancing (it keeps the address that was being fetched at that time).
// halt_out rises when HALT leaves W and stays high until reset.
module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0 // 32 instruction bytes, byte 0 at MSB
) (
    input                   clk,
    input                   rst,      // synchronous, active-high
    output reg [4:0]        pc_out,   // current fetched PC (visible for testing)
    output reg              halt_out, // asserted forever after HALT executes
    output reg signed [7:0] r0_out    // current value of register R0
);
    // Opcodes (instr[7:6]) and branch condition codes (instr[5:4]).
    localparam [1:0] OP_ADDI   = 2'b00,
                     OP_MOV    = 2'b01,
                     OP_BR     = 2'b10,
                     OP_HALT   = 2'b11;
    localparam [1:0] CC_ALWAYS = 2'b00,
                     CC_ZERO   = 2'b01,
                     CC_NEG    = 2'b10;
                     // 2'b11 (R0 > 0) is handled by the case default below.

    // Register file: 4 registers, 8-bit signed.
    reg signed [7:0] R [0:3];

    // F -> X pipeline register.
    reg              valid_x;
    reg [7:0]        instr_x;
    reg [4:0]        pc_x;            // address instr_x was fetched from

    // X -> W pipeline register.
    reg              valid_w;
    reg              is_halt_w;
    reg              is_reg_write_w;
    reg [1:0]        rd_w;
    reg signed [7:0] result_w;

    // ------------------------------------------------------------------
    // F stage: instruction fetch.
    // IMEM_INIT is declared [0:255], so bit 0 is the MSB and byte k occupies
    // bits [8k : 8k+7]; an ascending "+:" select returns it MSB-first.
    // ------------------------------------------------------------------
    wire [0:255] imem      = IMEM_INIT;
    wire [7:0]   imem_data = imem[{pc_out, 3'b000} +: 8];

    // ------------------------------------------------------------------
    // X stage: decode.
    // ------------------------------------------------------------------
    wire [1:0] op_x   = instr_x[7:6];
    wire [1:0] rd_x   = instr_x[5:4];
    wire [1:0] cond_x = instr_x[5:4]; // same field as rd, used by BRcc
    wire [1:0] rs_x   = instr_x[3:2];
    wire [3:0] imm_x  = instr_x[3:0];

    wire is_mov_x       = valid_x && (op_x == OP_MOV);
    wire is_branch_x    = valid_x && (op_x == OP_BR);
    wire is_halt_x      = valid_x && (op_x == OP_HALT);
    wire is_reg_write_x = valid_x && ((op_x == OP_ADDI) || (op_x == OP_MOV));

    // ------------------------------------------------------------------
    // X stage: operand read with W->X forwarding.
    // ------------------------------------------------------------------
    wire fwd_w = valid_w && is_reg_write_w;

    wire signed [7:0] rd_val_x = (fwd_w && (rd_w == rd_x)) ? result_w : R[rd_x];
    wire signed [7:0] rs_val_x = (fwd_w && (rd_w == rs_x)) ? result_w : R[rs_x];
    wire signed [7:0] r0_val_x = (fwd_w && (rd_w == 2'd0)) ? result_w : R[0];

    // ------------------------------------------------------------------
    // X stage: execute.
    // ------------------------------------------------------------------
    wire [7:0]        imm_sext_x = {{4{imm_x[3]}}, imm_x};
    wire signed [7:0] result_x   = is_mov_x ? rs_val_x
                                            : (rd_val_x + imm_sext_x); // wraps mod 256

    // Branch condition on the (forwarded) value of R0. Sign tests use bit 7
    // directly so they do not depend on expression signedness rules.
    reg cond_met_x;
    always @(*) begin
        case (cond_x)
            CC_ALWAYS: cond_met_x = 1'b1;
            CC_ZERO:   cond_met_x = (r0_val_x == 8'd0);
            CC_NEG:    cond_met_x = r0_val_x[7];
            default:   cond_met_x = !r0_val_x[7] && (r0_val_x != 8'd0); // R0 > 0
        endcase
    end

    wire       branch_taken_x  = is_branch_x && cond_met_x;
    // Target = branch PC + 1 + sign_extend(off4), modulo 32.
    wire [4:0] branch_target_x = pc_x + 5'd1 + {imm_x[3], imm_x};

    // ------------------------------------------------------------------
    // Fetch control.
    // ------------------------------------------------------------------
    // Stop feeding X as soon as a HALT is anywhere in the pipe (or retired).
    wire       stop_fetch = halt_out || (valid_w && is_halt_w) || is_halt_x;
    wire [4:0] npc_f      = branch_taken_x ? branch_target_x : (pc_out + 5'd1);

    // ------------------------------------------------------------------
    // State update.
    // ------------------------------------------------------------------
    always @(posedge clk) begin
        if (rst) begin
            pc_out         <= 5'd0;
            halt_out       <= 1'b0;
            r0_out         <= 8'sd0;
            R[0]           <= 8'sd0;
            R[1]           <= 8'sd0;
            R[2]           <= 8'sd0;
            R[3]           <= 8'sd0;
            valid_x        <= 1'b0;
            instr_x        <= 8'd0;
            pc_x           <= 5'd0;
            valid_w        <= 1'b0;
            is_halt_w      <= 1'b0;
            is_reg_write_w <= 1'b0;
            rd_w           <= 2'd0;
            result_w       <= 8'sd0;
        end else begin
            // W: commit. r0_out mirrors R[0] so it is updated in lock-step.
            if (valid_w && is_reg_write_w) begin
                R[rd_w] <= result_w;
                if (rd_w == 2'd0)
                    r0_out <= result_w;
            end
            if (valid_w && is_halt_w)
                halt_out <= 1'b1;

            // X -> W. The is_* flags already include valid_x, so a bubble
            // in X becomes a bubble in W.
            valid_w        <= valid_x;
            is_halt_w      <= is_halt_x;
            is_reg_write_w <= is_reg_write_x;
            rd_w           <= rd_x;
            result_w       <= result_x;

            // F -> X. When stopped, X is fed bubbles and pc_out holds.
            if (stop_fetch) begin
                valid_x <= 1'b0;
            end else begin
                valid_x <= 1'b1;
                instr_x <= imem_data;
                pc_x    <= pc_out;   // PC of the instruction being latched
                pc_out  <= npc_f;
            end
        end
    end
endmodule
```