// NOTE: no functional run (did not compile)
`ifndef MCU3_MODULE_DEFINED
`define MCU3_MODULE_DEFINED
// mcu3: 3-stage (Fetch / eXecute / Writeback) 8-bit micro-controller core.
//
// Instruction encoding (one byte each, 32-entry instruction memory):
//   ADDI rd, imm : 00 rd[1:0] imm[3:0]     R[rd] <= R[rd] + sign_ext(imm)
//   MOV  rd, rs  : 01 rd[1:0] rs[1:0] xx   R[rd] <= R[rs]
//   BRcc off     : 10 cc[1:0] off[3:0]     cc: 00 always, 01 R0==0, 10 R0<0, 11 R0>0
//   HALT         : 11 xxxxxx
//
// Branch convention: exactly one delay slot. The instruction at PC_branch+1 is
// always executed; a taken branch then continues at
// PC_branch + sign_ext(off) + 1, a not-taken branch at PC_branch + 2.
// The redirect is therefore driven from the X stage: while the branch is in X
// the delay slot is already being fetched, so the very next fetch must use the
// target address.
//
// HALT: once HALT is decoded in X, fetch is stopped and younger instructions
// are squashed; older instructions still commit. halt_out rises when HALT
// leaves the W stage and stays high until reset.
//
// The guard macro lets this definition coexist with another guarded copy of
// the same module in one compilation unit.
module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0 // 32 instruction bytes, byte 0 at MSB
) (
    input                   clk,
    input                   rst,      // synchronous, active-high
    output reg        [4:0] pc_out,   // registered copy of the fetch PC (one cycle behind pc_curr)
    output reg              halt_out, // asserted forever after HALT commits
    output reg signed [7:0] r0_out    // registered copy of R0 (one cycle behind the register file)
);
    // Opcodes (instruction bits 7:6).
    localparam [1:0] OP_ADDI = 2'b00,
                     OP_MOV  = 2'b01,
                     OP_BR   = 2'b10,
                     OP_HALT = 2'b11;
    // Branch condition codes (instruction bits 5:4 of BRcc).
    localparam [1:0] CC_ALWAYS = 2'b00,
                     CC_ZERO   = 2'b01,
                     CC_NEG    = 2'b10;   // 2'b11 = R0 > 0

    // Register file: 4 registers, 8-bit signed.
    reg signed [7:0] rf [0:3];
    integer i;

    // ---------------- F stage ----------------
    reg  [4:0]   pc_curr;                       // address being fetched this cycle
    wire [0:255] imem        = IMEM_INIT;       // bit 0 is the MSB, so byte n is bits [8n : 8n+7]
    wire [7:0]   instr_fetch = imem[{pc_curr, 3'b000} +: 8];

    // F -> X pipeline register.
    reg [7:0] f_x_instr;
    reg [4:0] f_x_pc;
    reg       f_x_valid;

    // X -> W pipeline register. x_w_we / x_w_halt are already qualified with
    // the validity of the instruction, so no separate valid bit is needed.
    reg              x_w_we;      // instruction in W writes rf[x_w_rd]
    reg              x_w_halt;    // instruction in W is HALT
    reg        [1:0] x_w_rd;
    reg signed [7:0] x_w_result;

    // ---------------- X stage ----------------
    wire [1:0] x_op  = f_x_instr[7:6];
    wire [1:0] x_rd  = f_x_instr[5:4];          // destination, or cc for BRcc
    wire [1:0] x_rs  = f_x_instr[3:2];
    wire [3:0] x_imm = f_x_instr[3:0];          // ADDI immediate / branch offset

    wire x_writes  = f_x_valid && ((x_op == OP_ADDI) || (x_op == OP_MOV));
    wire x_is_br   = f_x_valid && (x_op == OP_BR);
    wire x_is_halt = f_x_valid && (x_op == OP_HALT);

    // Operand read with W->X forwarding: the instruction in W has not yet
    // updated rf (the write happens at the end of its W cycle), so a matching
    // destination must be bypassed from x_w_result.
    wire        [1:0] x_src     = (x_op == OP_MOV) ? x_rs : x_rd;
    wire signed [7:0] x_src_val = (x_w_we && (x_w_rd == x_src)) ? x_w_result : rf[x_src];
    wire signed [7:0] x_r0_val  = (x_w_we && (x_w_rd == 2'd0))  ? x_w_result : rf[0];

    wire        [7:0] x_imm_sext = {{4{x_imm[3]}}, x_imm};
    wire signed [7:0] x_result   = (x_op == OP_MOV) ? x_src_val
                                                    : (x_src_val + x_imm_sext);

    // Branch condition on the (forwarded) value of R0.
    reg x_cond_met;
    always @(*) begin
        case (x_rd)
            CC_ALWAYS: x_cond_met = 1'b1;
            CC_ZERO:   x_cond_met = (x_r0_val == 8'sd0);
            CC_NEG:    x_cond_met = x_r0_val[7];
            default:   x_cond_met = !x_r0_val[7] && (x_r0_val != 8'sd0); // R0 > 0
        endcase
    end

    wire       x_redirect      = x_is_br && x_cond_met;
    // 5-bit wrap-around arithmetic: PC_branch + sign_ext(off) + 1.
    wire [4:0] x_branch_target = f_x_pc + {x_imm[3], x_imm} + 5'd1;

    // Fetch stops as soon as HALT is seen in X and never restarts.
    wire stop_fetch = halt_out || x_is_halt || x_w_halt;

    always @(posedge clk) begin
        if (rst) begin
            pc_curr    <= 5'd0;
            f_x_instr  <= 8'd0;
            f_x_pc     <= 5'd0;
            f_x_valid  <= 1'b0;
            x_w_we     <= 1'b0;
            x_w_halt   <= 1'b0;
            x_w_rd     <= 2'd0;
            x_w_result <= 8'sd0;
            halt_out   <= 1'b0;
            pc_out     <= 5'd0;
            r0_out     <= 8'sd0;
            for (i = 0; i < 4; i = i + 1)
                rf[i] <= 8'sd0;
        end else begin
            // W stage: commit.
            if (x_w_we)
                rf[x_w_rd] <= x_w_result;
            if (x_w_halt)
                halt_out <= 1'b1;

            // Observation outputs.
            pc_out <= pc_curr;
            r0_out <= rf[0];

            // X -> W.
            x_w_we     <= x_writes;
            x_w_halt   <= x_is_halt;
            x_w_rd     <= x_rd;
            x_w_result <= x_result;

            // F -> X. The fetched instruction is squashed once HALT is in flight.
            f_x_instr <= instr_fetch;
            f_x_pc    <= pc_curr;
            f_x_valid <= !stop_fetch;

            // Next fetch address; pc_curr holds its value while stop_fetch is set.
            if (!stop_fetch) begin
                if (x_redirect)
                    pc_curr <= x_branch_target;
                else
                    pc_curr <= pc_curr + 5'd1;
            end
        end
    end
endmodule
`endif
// Branch convention: The instruction immediately following the branch instruction
// (at PC_branch + 1) is always executed (1 branch delay slot). The branch target
// instruction is fetched at PC_target. If the branch is not taken, the PC continues
// to PC_branch + 2. This matches the semantics: "if (cond_met) PC <= PC + sign_ext(off,4)+1".
// In a pipelined implementation, this means the fetch stage computes the next PC
// based on the branch decision resolved in the Execute stage. The instruction fetched
// during the cycle the branch is decoded is the delay slot instruction.
`ifndef MCU3_MODULE_DEFINED
`define MCU3_MODULE_DEFINED
// mcu3: 3-stage (Fetch / eXecute / Writeback) 8-bit micro-controller core.
//
// Instruction encoding (one byte each, 32-entry instruction memory):
//   ADDI rd, imm : 00 rd[1:0] imm[3:0]     R[rd] <= R[rd] + sign_ext(imm)
//   MOV  rd, rs  : 01 rd[1:0] rs[1:0] xx   R[rd] <= R[rs]
//   BRcc off     : 10 cc[1:0] off[3:0]     cc: 00 always, 01 R0==0, 10 R0<0, 11 R0>0
//   HALT         : 11 xxxxxx
//
// Branch convention: exactly one delay slot. The instruction at PC_branch+1 is
// always executed; a taken branch then continues at
// PC_branch + sign_ext(off) + 1, a not-taken branch at PC_branch + 2.
// The redirect is therefore driven from the X stage: while the branch is in X
// the delay slot is already being fetched, so the very next fetch must use the
// target address.
//
// HALT: once HALT is decoded in X, fetch is stopped and younger instructions
// are squashed; older instructions still commit. halt_out rises when HALT
// leaves the W stage and stays high until reset.
//
// The guard macro lets this definition coexist with another guarded copy of
// the same module in one compilation unit.
module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0 // 32 instruction bytes, byte 0 at MSB
) (
    input                   clk,
    input                   rst,      // synchronous, active-high
    output reg        [4:0] pc_out,   // registered copy of the fetch PC (one cycle behind pc_curr)
    output reg              halt_out, // asserted forever after HALT commits
    output reg signed [7:0] r0_out    // registered copy of R0 (one cycle behind the register file)
);
    // Opcodes (instruction bits 7:6).
    localparam [1:0] OP_ADDI = 2'b00,
                     OP_MOV  = 2'b01,
                     OP_BR   = 2'b10,
                     OP_HALT = 2'b11;
    // Branch condition codes (instruction bits 5:4 of BRcc).
    localparam [1:0] CC_ALWAYS = 2'b00,
                     CC_ZERO   = 2'b01,
                     CC_NEG    = 2'b10;   // 2'b11 = R0 > 0

    // Register file: 4 registers, 8-bit signed.
    reg signed [7:0] rf [0:3];
    integer i;

    // ---------------- F stage ----------------
    reg  [4:0]   pc_curr;                       // address being fetched this cycle
    wire [0:255] imem        = IMEM_INIT;       // bit 0 is the MSB, so byte n is bits [8n : 8n+7]
    wire [7:0]   instr_fetch = imem[{pc_curr, 3'b000} +: 8];

    // F -> X pipeline register.
    reg [7:0] f_x_instr;
    reg [4:0] f_x_pc;
    reg       f_x_valid;

    // X -> W pipeline register. x_w_we / x_w_halt are already qualified with
    // the validity of the instruction, so no separate valid bit is needed.
    reg              x_w_we;      // instruction in W writes rf[x_w_rd]
    reg              x_w_halt;    // instruction in W is HALT
    reg        [1:0] x_w_rd;
    reg signed [7:0] x_w_result;

    // ---------------- X stage ----------------
    wire [1:0] x_op  = f_x_instr[7:6];
    wire [1:0] x_rd  = f_x_instr[5:4];          // destination, or cc for BRcc
    wire [1:0] x_rs  = f_x_instr[3:2];
    wire [3:0] x_imm = f_x_instr[3:0];          // ADDI immediate / branch offset

    wire x_writes  = f_x_valid && ((x_op == OP_ADDI) || (x_op == OP_MOV));
    wire x_is_br   = f_x_valid && (x_op == OP_BR);
    wire x_is_halt = f_x_valid && (x_op == OP_HALT);

    // Operand read with W->X forwarding: the instruction in W has not yet
    // updated rf (the write happens at the end of its W cycle), so a matching
    // destination must be bypassed from x_w_result.
    wire        [1:0] x_src     = (x_op == OP_MOV) ? x_rs : x_rd;
    wire signed [7:0] x_src_val = (x_w_we && (x_w_rd == x_src)) ? x_w_result : rf[x_src];
    wire signed [7:0] x_r0_val  = (x_w_we && (x_w_rd == 2'd0))  ? x_w_result : rf[0];

    wire        [7:0] x_imm_sext = {{4{x_imm[3]}}, x_imm};
    wire signed [7:0] x_result   = (x_op == OP_MOV) ? x_src_val
                                                    : (x_src_val + x_imm_sext);

    // Branch condition on the (forwarded) value of R0.
    reg x_cond_met;
    always @(*) begin
        case (x_rd)
            CC_ALWAYS: x_cond_met = 1'b1;
            CC_ZERO:   x_cond_met = (x_r0_val == 8'sd0);
            CC_NEG:    x_cond_met = x_r0_val[7];
            default:   x_cond_met = !x_r0_val[7] && (x_r0_val != 8'sd0); // R0 > 0
        endcase
    end

    wire       x_redirect      = x_is_br && x_cond_met;
    // 5-bit wrap-around arithmetic: PC_branch + sign_ext(off) + 1.
    wire [4:0] x_branch_target = f_x_pc + {x_imm[3], x_imm} + 5'd1;

    // Fetch stops as soon as HALT is seen in X and never restarts.
    wire stop_fetch = halt_out || x_is_halt || x_w_halt;

    always @(posedge clk) begin
        if (rst) begin
            pc_curr    <= 5'd0;
            f_x_instr  <= 8'd0;
            f_x_pc     <= 5'd0;
            f_x_valid  <= 1'b0;
            x_w_we     <= 1'b0;
            x_w_halt   <= 1'b0;
            x_w_rd     <= 2'd0;
            x_w_result <= 8'sd0;
            halt_out   <= 1'b0;
            pc_out     <= 5'd0;
            r0_out     <= 8'sd0;
            for (i = 0; i < 4; i = i + 1)
                rf[i] <= 8'sd0;
        end else begin
            // W stage: commit.
            if (x_w_we)
                rf[x_w_rd] <= x_w_result;
            if (x_w_halt)
                halt_out <= 1'b1;

            // Observation outputs.
            pc_out <= pc_curr;
            r0_out <= rf[0];

            // X -> W.
            x_w_we     <= x_writes;
            x_w_halt   <= x_is_halt;
            x_w_rd     <= x_rd;
            x_w_result <= x_result;

            // F -> X. The fetched instruction is squashed once HALT is in flight.
            f_x_instr <= instr_fetch;
            f_x_pc    <= pc_curr;
            f_x_valid <= !stop_fetch;

            // Next fetch address; pc_curr holds its value while stop_fetch is set.
            if (!stop_fetch) begin
                if (x_redirect)
                    pc_curr <= x_branch_target;
                else
                    pc_curr <= pc_curr + 5'd1;
            end
        end
    end
endmodule
`endif
// ``` (stray Markdown code fence, commented out so the file parses as Verilog)