// no functional run (did not compile)
// (no automated diagnostics fired)
`ifndef MCU3_MODULE_DEFINED
`define MCU3_MODULE_DEFINED
// -----------------------------------------------------------------------------
// mcu3 - small 3-stage pipelined 8-bit core (Fetch / eXecute / Writeback).
//
// Instruction memory
//   IMEM_INIT is a 256-bit constant declared [0:255]; instruction i occupies
//   bits [8*i : 8*i+7], i.e. instruction 0 is the left-most (most significant)
//   byte of the literal. The 5-bit PC addresses 32 instructions and wraps.
//
// Instruction encoding (instr[7:6] = opcode)
//   00  ADDI rd, imm4   instr[5:4]=rd, instr[3:0]=imm   R[rd] <= R[rd] + sext(imm)
//   01  MOV  rd, rs     instr[5:4]=rd, instr[3:2]=rs    R[rd] <= R[rs]
//   10  BRcc off4       instr[5:4]=cond, instr[3:0]=off PC <= PC + sext(off) + 1
//                       cond: 00 always, 01 R0==0, 10 R0<0, 11 R0>0
//   11  HALT
//
// Pipeline
//   F : pc_f addresses IMEM combinationally.
//   X : decode, operand read (with forwarding from W), ALU, branch resolution.
//   W : register-file write.
//   A branch is resolved in X while the following instruction is already being
//   fetched, so that following instruction always executes (one delay slot)
//   and the branch target is fetched in the cycle after it.
//
// Outputs
//   pc_out   : fetch PC, registered (lags pc_f by one clock).
//   r0_out   : updated together with R0 whenever the W stage writes R0.
//   halt_out : set on the clock after HALT reaches W; cleared only by rst.
//
// Reset is synchronous and active high.
// NOTE(review): output timing and the frozen PC value after HALT follow what
// the previous revision of this file appeared to intend - confirm against the
// ISA specification.
// -----------------------------------------------------------------------------
module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0
) (
    input                   clk,
    input                   rst,
    output reg        [4:0] pc_out,
    output reg              halt_out,
    output reg signed [7:0] r0_out
);

    // Opcodes (instr[7:6]).
    localparam [1:0] OP_ADDI = 2'b00;
    localparam [1:0] OP_MOV  = 2'b01;
    localparam [1:0] OP_BR   = 2'b10;
    localparam [1:0] OP_HALT = 2'b11;

    // Branch condition codes (instr[5:4] of a BRcc).
    localparam [1:0] CC_ALWAYS = 2'b00;
    localparam [1:0] CC_ZERO   = 2'b01;
    localparam [1:0] CC_NEG    = 2'b10;
    localparam [1:0] CC_POS    = 2'b11;

    // ------------------------------------------------------------------ state
    // F stage
    reg  [4:0] pc_f;        // address currently presented to IMEM
    reg        stopped;     // sticky: a HALT has been decoded, fetch is frozen

    // F/X pipeline register
    reg  [4:0] pc_x;        // PC of the instruction in X
    reg  [7:0] instr_x;     // instruction in X
    reg        valid_x;     // 0 = bubble (after reset or after HALT)

    // X/W pipeline register
    reg        [1:0] rd_w;       // destination register of the instruction in W
    reg signed [7:0] alu_res_w;  // value to be written
    reg              wb_en_w;    // instruction in W writes the register file
    reg              is_halt_w;  // instruction in W is HALT

    // Architectural register file R0..R3
    reg signed [7:0] rf [0:3];

    // ------------------------------------------------------------ F: fetch
    wire [7:0] imem_data;
    assign imem_data = IMEM_INIT[pc_f*8 +: 8];

    // ----------------------------------------------------------- X: decode
    wire [1:0] opcode_x = instr_x[7:6];
    wire [1:0] rd_x     = instr_x[5:4];
    wire [1:0] rs_x     = instr_x[3:2];
    wire [1:0] cond_x   = instr_x[5:4];

    wire is_addi_x   = valid_x && (opcode_x == OP_ADDI);
    wire is_mov_x    = valid_x && (opcode_x == OP_MOV);
    wire is_branch_x = valid_x && (opcode_x == OP_BR);
    wire is_halt_x   = valid_x && (opcode_x == OP_HALT);
    wire wb_en_x     = is_addi_x || is_mov_x;

    // ---------------------------------------------- X: operands + forwarding
    // ADDI reads its own destination register, MOV reads rs.
    wire [1:0] src_idx_x = is_mov_x ? rs_x : rd_x;

    // The instruction in W has not updated rf yet (the write happens on the
    // coming clock edge), so its result must be bypassed when it targets the
    // register X wants to read. Older results are already in rf.
    wire fwd_src_x = wb_en_w && (rd_w == src_idx_x);
    wire fwd_r0_x  = wb_en_w && (rd_w == 2'b00);

    wire signed [7:0] src_val_x = fwd_src_x ? alu_res_w : rf[src_idx_x];
    wire signed [7:0] r0_val_x  = fwd_r0_x  ? alu_res_w : rf[0];

    // -------------------------------------------------------------- X: ALU
    wire signed [7:0] imm_sext_x = {{4{instr_x[3]}}, instr_x[3:0]};
    wire signed [7:0] alu_res_x  = is_mov_x ? src_val_x
                                            : (src_val_x + imm_sext_x);

    // ----------------------------------------------------------- X: branch
    // Conditions always test R0 (BRcc carries no register operand).
    reg cond_met_x;
    always @* begin
        case (cond_x)
            CC_ALWAYS: cond_met_x = 1'b1;
            CC_ZERO:   cond_met_x = (r0_val_x == 8'sd0);
            CC_NEG:    cond_met_x = (r0_val_x <  8'sd0);
            CC_POS:    cond_met_x = (r0_val_x >  8'sd0);
            default:   cond_met_x = 1'b0;
        endcase
    end

    wire       branch_taken_x = is_branch_x && cond_met_x;
    // target = PC(branch) + sext(off) + 1, modulo 32
    wire [4:0] br_target_x    = pc_x + {instr_x[3], instr_x[3:0]} + 5'd1;

    // ----------------------------------------------------------- sequential
    always @(posedge clk) begin
        if (rst) begin
            pc_f      <= 5'd0;
            stopped   <= 1'b0;
            pc_x      <= 5'd0;
            instr_x   <= 8'b0;
            valid_x   <= 1'b0;
            rd_w      <= 2'b00;
            alu_res_w <= 8'sd0;
            wb_en_w   <= 1'b0;
            is_halt_w <= 1'b0;
            halt_out  <= 1'b0;
            pc_out    <= 5'd0;
            r0_out    <= 8'sd0;
            rf[0]     <= 8'sd0;
            rf[1]     <= 8'sd0;
            rf[2]     <= 8'sd0;
            rf[3]     <= 8'sd0;
        end else begin
            pc_out <= pc_f;

            // ---- W: commit
            if (is_halt_w)
                halt_out <= 1'b1;
            if (wb_en_w) begin
                rf[rd_w] <= alu_res_w;
                if (rd_w == 2'b00)
                    r0_out <= alu_res_w;
            end

            // ---- X -> W
            rd_w      <= rd_x;
            alu_res_w <= alu_res_x;
            wb_en_w   <= wb_en_x;
            is_halt_w <= is_halt_x;

            // ---- F -> X and next fetch address
            if (is_halt_x || stopped) begin
                // HALT decoded: drop the instruction being fetched, keep
                // feeding bubbles into X and hold pc_f. Instructions older
                // than the HALT still complete through W.
                stopped <= 1'b1;
                valid_x <= 1'b0;
            end else begin
                pc_x    <= pc_f;
                instr_x <= imem_data;
                valid_x <= 1'b1;
                // The instruction fetched this cycle (the delay slot when X
                // holds a taken branch) always proceeds; only the *next*
                // fetch address is redirected.
                pc_f    <= branch_taken_x ? br_target_x : (pc_f + 5'd1);
            end
        end
    end

endmodule
`endif
// Branch convention: The instruction immediately following a BRcc is always executed (1 delay slot).
// If the branch is taken, the target instruction is fetched in the cycle after the delay slot instruction.
// Forwarding is implemented for the X-stage to read from W-stage or X-stage registers if rd matches rs.
`timescale 1ns / 1ps
`ifndef MCU3_MODULE_DEFINED
`define MCU3_MODULE_DEFINED
// -----------------------------------------------------------------------------
// mcu3 - small 3-stage pipelined 8-bit core (Fetch / eXecute / Writeback).
//
// Instruction memory
//   IMEM_INIT is a 256-bit constant declared [0:255]; instruction i occupies
//   bits [8*i : 8*i+7], i.e. instruction 0 is the left-most (most significant)
//   byte of the literal. The 5-bit PC addresses 32 instructions and wraps.
//
// Instruction encoding (instr[7:6] = opcode)
//   00  ADDI rd, imm4   instr[5:4]=rd, instr[3:0]=imm   R[rd] <= R[rd] + sext(imm)
//   01  MOV  rd, rs     instr[5:4]=rd, instr[3:2]=rs    R[rd] <= R[rs]
//   10  BRcc off4       instr[5:4]=cond, instr[3:0]=off PC <= PC + sext(off) + 1
//                       cond: 00 always, 01 R0==0, 10 R0<0, 11 R0>0
//   11  HALT
//
// Pipeline
//   F : pc_f addresses IMEM combinationally.
//   X : decode, operand read (with forwarding from W), ALU, branch resolution.
//   W : register-file write.
//   A branch is resolved in X while the following instruction is already being
//   fetched, so that following instruction always executes (one delay slot)
//   and the branch target is fetched in the cycle after it.
//
// Outputs
//   pc_out   : fetch PC, registered (lags pc_f by one clock).
//   r0_out   : updated together with R0 whenever the W stage writes R0.
//   halt_out : set on the clock after HALT reaches W; cleared only by rst.
//
// Reset is synchronous and active high.
// NOTE(review): output timing and the frozen PC value after HALT follow what
// the previous revision of this file appeared to intend - confirm against the
// ISA specification.
// -----------------------------------------------------------------------------
module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0
) (
    input                   clk,
    input                   rst,
    output reg        [4:0] pc_out,
    output reg              halt_out,
    output reg signed [7:0] r0_out
);

    // Opcodes (instr[7:6]).
    localparam [1:0] OP_ADDI = 2'b00;
    localparam [1:0] OP_MOV  = 2'b01;
    localparam [1:0] OP_BR   = 2'b10;
    localparam [1:0] OP_HALT = 2'b11;

    // Branch condition codes (instr[5:4] of a BRcc).
    localparam [1:0] CC_ALWAYS = 2'b00;
    localparam [1:0] CC_ZERO   = 2'b01;
    localparam [1:0] CC_NEG    = 2'b10;
    localparam [1:0] CC_POS    = 2'b11;

    // ------------------------------------------------------------------ state
    // F stage
    reg  [4:0] pc_f;        // address currently presented to IMEM
    reg        stopped;     // sticky: a HALT has been decoded, fetch is frozen

    // F/X pipeline register
    reg  [4:0] pc_x;        // PC of the instruction in X
    reg  [7:0] instr_x;     // instruction in X
    reg        valid_x;     // 0 = bubble (after reset or after HALT)

    // X/W pipeline register
    reg        [1:0] rd_w;       // destination register of the instruction in W
    reg signed [7:0] alu_res_w;  // value to be written
    reg              wb_en_w;    // instruction in W writes the register file
    reg              is_halt_w;  // instruction in W is HALT

    // Architectural register file R0..R3
    reg signed [7:0] rf [0:3];

    // ------------------------------------------------------------ F: fetch
    wire [7:0] imem_data;
    assign imem_data = IMEM_INIT[pc_f*8 +: 8];

    // ----------------------------------------------------------- X: decode
    wire [1:0] opcode_x = instr_x[7:6];
    wire [1:0] rd_x     = instr_x[5:4];
    wire [1:0] rs_x     = instr_x[3:2];
    wire [1:0] cond_x   = instr_x[5:4];

    wire is_addi_x   = valid_x && (opcode_x == OP_ADDI);
    wire is_mov_x    = valid_x && (opcode_x == OP_MOV);
    wire is_branch_x = valid_x && (opcode_x == OP_BR);
    wire is_halt_x   = valid_x && (opcode_x == OP_HALT);
    wire wb_en_x     = is_addi_x || is_mov_x;

    // ---------------------------------------------- X: operands + forwarding
    // ADDI reads its own destination register, MOV reads rs.
    wire [1:0] src_idx_x = is_mov_x ? rs_x : rd_x;

    // The instruction in W has not updated rf yet (the write happens on the
    // coming clock edge), so its result must be bypassed when it targets the
    // register X wants to read. Older results are already in rf.
    wire fwd_src_x = wb_en_w && (rd_w == src_idx_x);
    wire fwd_r0_x  = wb_en_w && (rd_w == 2'b00);

    wire signed [7:0] src_val_x = fwd_src_x ? alu_res_w : rf[src_idx_x];
    wire signed [7:0] r0_val_x  = fwd_r0_x  ? alu_res_w : rf[0];

    // -------------------------------------------------------------- X: ALU
    wire signed [7:0] imm_sext_x = {{4{instr_x[3]}}, instr_x[3:0]};
    wire signed [7:0] alu_res_x  = is_mov_x ? src_val_x
                                            : (src_val_x + imm_sext_x);

    // ----------------------------------------------------------- X: branch
    // Conditions always test R0 (BRcc carries no register operand).
    reg cond_met_x;
    always @* begin
        case (cond_x)
            CC_ALWAYS: cond_met_x = 1'b1;
            CC_ZERO:   cond_met_x = (r0_val_x == 8'sd0);
            CC_NEG:    cond_met_x = (r0_val_x <  8'sd0);
            CC_POS:    cond_met_x = (r0_val_x >  8'sd0);
            default:   cond_met_x = 1'b0;
        endcase
    end

    wire       branch_taken_x = is_branch_x && cond_met_x;
    // target = PC(branch) + sext(off) + 1, modulo 32
    wire [4:0] br_target_x    = pc_x + {instr_x[3], instr_x[3:0]} + 5'd1;

    // ----------------------------------------------------------- sequential
    always @(posedge clk) begin
        if (rst) begin
            pc_f      <= 5'd0;
            stopped   <= 1'b0;
            pc_x      <= 5'd0;
            instr_x   <= 8'b0;
            valid_x   <= 1'b0;
            rd_w      <= 2'b00;
            alu_res_w <= 8'sd0;
            wb_en_w   <= 1'b0;
            is_halt_w <= 1'b0;
            halt_out  <= 1'b0;
            pc_out    <= 5'd0;
            r0_out    <= 8'sd0;
            rf[0]     <= 8'sd0;
            rf[1]     <= 8'sd0;
            rf[2]     <= 8'sd0;
            rf[3]     <= 8'sd0;
        end else begin
            pc_out <= pc_f;

            // ---- W: commit
            if (is_halt_w)
                halt_out <= 1'b1;
            if (wb_en_w) begin
                rf[rd_w] <= alu_res_w;
                if (rd_w == 2'b00)
                    r0_out <= alu_res_w;
            end

            // ---- X -> W
            rd_w      <= rd_x;
            alu_res_w <= alu_res_x;
            wb_en_w   <= wb_en_x;
            is_halt_w <= is_halt_x;

            // ---- F -> X and next fetch address
            if (is_halt_x || stopped) begin
                // HALT decoded: drop the instruction being fetched, keep
                // feeding bubbles into X and hold pc_f. Instructions older
                // than the HALT still complete through W.
                stopped <= 1'b1;
                valid_x <= 1'b0;
            end else begin
                pc_x    <= pc_f;
                instr_x <= imem_data;
                valid_x <= 1'b1;
                // The instruction fetched this cycle (the delay slot when X
                // holds a taken branch) always proceeds; only the *next*
                // fetch address is redirected.
                pc_f    <= branch_taken_x ? br_target_x : (pc_f + 5'd1);
            end
        end
    end

endmodule
`endif