no functional run (did not compile)
// mcu3: 3-stage (F/X/W) pipelined 8-bit microcontroller core.
//
// Instruction format (8 bits, opcode in bits [7:6]):
//   00 ADDI rd, imm4    rd <- rd + sign_extend(imm4)     rd = [5:4], imm4 = [3:0]
//   01 MOV  rd, rs      rd <- rs                         rd = [5:4], rs   = [3:2]
//   10 BRcc cond, off4  if cond(R0): pc <- pc + 1 + sign_extend(off4)
//                       cond = [5:4]: 00 always, 01 R0 == 0, 10 R0 < 0, 11 R0 > 0
//   11 HALT
//
// Pipeline convention:
//   - F reads the instruction at pc_out combinationally from IMEM_INIT.
//   - X decodes, reads operands, computes the result and resolves branches.
//   - W commits the result to the register file on the clock edge that ends
//     the W cycle.
//   - Forwarding: the only instruction whose result is not yet in the register
//     file while X is reading is the one sitting in W, so a single W->X bypass
//     covers every read-after-write hazard (ADDI/MOV operands and the R0 value
//     tested by BRcc).
//   - Branch delay slot: a branch is resolved in X while the next sequential
//     instruction is already in F; that instruction always executes. The
//     branch target (if taken) is fetched right after the delay slot.
//   - HALT: when HALT is seen in X, the younger instruction in F is discarded
//     and pc_out is set to the HALT instruction's own PC. The older instruction
//     in W still commits. halt_out rises on the clock edge that ends HALT's W
//     cycle, after which all state is frozen until reset.
module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0 // 32 instruction bytes, byte 0 at MSB
) (
    input                   clk,
    input                   rst,       // synchronous, active-high
    output reg        [4:0] pc_out,    // current fetched PC (visible for testing)
    output reg              halt_out,  // asserted forever after HALT executes
    output reg signed [7:0] r0_out     // current value of register R0
);

    // Opcode encodings (instruction bits [7:6]).
    localparam [1:0] OP_ADDI = 2'b00;
    localparam [1:0] OP_MOV  = 2'b01;
    localparam [1:0] OP_BR   = 2'b10;
    localparam [1:0] OP_HALT = 2'b11;

    // Register file: 4 registers, 8-bit signed. r0_out is kept as an exact
    // mirror of rf[0] (both are written by the same statement group below).
    reg signed [7:0] rf [0:3];

    // F/X pipeline register: the instruction currently in the X stage.
    reg       fx_valid;   // 0 while the pipeline is still filling / after HALT
    reg [4:0] fx_pc;
    reg [7:0] fx_instr;

    // X/W pipeline register: the instruction currently in the W stage.
    reg              xw_wen;     // instruction writes a register
    reg        [1:0] xw_rd;      // destination register index
    reg signed [7:0] xw_result;  // value to commit
    reg              xw_halt;    // instruction is HALT

    // Evaluate a BRcc condition code against the (forwarded) value of R0.
    function cond_met;
        input        [1:0] cond;
        input signed [7:0] r0;
        begin
            case (cond)
                2'b00:   cond_met = 1'b1;                      // always
                2'b01:   cond_met = (r0 == 8'sd0);             // zero
                2'b10:   cond_met = r0[7];                     // negative (sign bit)
                default: cond_met = !r0[7] && (r0 != 8'sd0);   // strictly positive
            endcase
        end
    endfunction

    // ------------------------------------------------------------------
    // F stage
    // ------------------------------------------------------------------
    // IMEM_INIT is declared [0:255], so bit index 0 is the MSB and byte N
    // occupies bits [8N : 8N+7]; an ascending "+:" select picks exactly that.
    wire [0:255] imem    = IMEM_INIT;
    wire [7:0]   f_instr = imem[{pc_out, 3'b000} +: 8];

    // ------------------------------------------------------------------
    // X stage (purely combinational on the F/X register)
    // ------------------------------------------------------------------
    wire [1:0] x_op  = fx_instr[7:6];
    wire [1:0] x_rd  = fx_instr[5:4];   // destination, or condition code for BRcc
    wire [1:0] x_rs  = fx_instr[3:2];
    wire [3:0] x_imm = fx_instr[3:0];   // ADDI immediate / BRcc offset

    wire x_is_addi = fx_valid && (x_op == OP_ADDI);
    wire x_is_mov  = fx_valid && (x_op == OP_MOV);
    wire x_is_br   = fx_valid && (x_op == OP_BR);
    wire x_is_halt = fx_valid && (x_op == OP_HALT);
    wire x_wen     = x_is_addi || x_is_mov;

    // Operand reads with W->X bypass.
    wire signed [7:0] x_rd_val = (xw_wen && (xw_rd == x_rd)) ? xw_result : rf[x_rd];
    wire signed [7:0] x_rs_val = (xw_wen && (xw_rd == x_rs)) ? xw_result : rf[x_rs];
    wire signed [7:0] x_r0_val = (xw_wen && (xw_rd == 2'd0)) ? xw_result : r0_out;

    wire signed [7:0] x_imm_ext = {{4{x_imm[3]}}, x_imm};
    wire signed [7:0] x_result  = x_is_mov ? x_rs_val : (x_rd_val + x_imm_ext);

    // Branch resolution. The target is relative to the instruction after the
    // branch and wraps modulo 32 (5-bit PC).
    wire       x_taken  = x_is_br && cond_met(x_rd, x_r0_val);
    wire [4:0] x_target = fx_pc + 5'd1 + {x_imm[3], x_imm};
    wire [4:0] next_pc  = x_taken ? x_target : (pc_out + 5'd1);

    // ------------------------------------------------------------------
    // Sequential state
    // ------------------------------------------------------------------
    always @(posedge clk) begin
        if (rst) begin
            pc_out    <= 5'd0;
            halt_out  <= 1'b0;
            r0_out    <= 8'sd0;
            rf[0]     <= 8'sd0;
            rf[1]     <= 8'sd0;
            rf[2]     <= 8'sd0;
            rf[3]     <= 8'sd0;
            fx_valid  <= 1'b0;
            fx_pc     <= 5'd0;
            fx_instr  <= 8'd0;
            xw_wen    <= 1'b0;
            xw_rd     <= 2'd0;
            xw_result <= 8'sd0;
            xw_halt   <= 1'b0;
        end else if (xw_halt) begin
            // HALT is in W: every older instruction has already committed.
            // Raise halt_out and hold all other state. xw_halt stays set, so
            // this branch is taken on every later cycle until reset.
            halt_out <= 1'b1;
        end else begin
            // W: commit the result of the instruction in W.
            if (xw_wen) begin
                rf[xw_rd] <= xw_result;
                if (xw_rd == 2'd0)
                    r0_out <= xw_result;
            end

            // X -> W
            xw_wen    <= x_wen;
            xw_rd     <= x_rd;
            xw_result <= x_result;
            xw_halt   <= x_is_halt;

            // F -> X and next PC
            if (x_is_halt) begin
                // Discard the instruction fetched behind HALT and park the
                // visible PC on the HALT instruction itself.
                fx_valid <= 1'b0;
                pc_out   <= fx_pc;
            end else begin
                fx_valid <= 1'b1;
                fx_pc    <= pc_out;
                fx_instr <= f_instr;
                pc_out   <= next_pc;
            end
        end
    end

endmodule
// mcu3: 3-stage (F/X/W) pipelined 8-bit microcontroller core.
//
// Instruction format (8 bits, opcode in bits [7:6]):
//   00 ADDI rd, imm4    rd <- rd + sign_extend(imm4)     rd = [5:4], imm4 = [3:0]
//   01 MOV  rd, rs      rd <- rs                         rd = [5:4], rs   = [3:2]
//   10 BRcc cond, off4  if cond(R0): pc <- pc + 1 + sign_extend(off4)
//                       cond = [5:4]: 00 always, 01 R0 == 0, 10 R0 < 0, 11 R0 > 0
//   11 HALT
//
// Pipeline convention:
//   - F reads the instruction at pc_out combinationally from IMEM_INIT.
//   - X decodes, reads operands, computes the result and resolves branches.
//   - W commits the result to the register file on the clock edge that ends
//     the W cycle.
//   - Forwarding: the only instruction whose result is not yet in the register
//     file while X is reading is the one sitting in W, so a single W->X bypass
//     covers every read-after-write hazard (ADDI/MOV operands and the R0 value
//     tested by BRcc).
//   - Branch delay slot: a branch is resolved in X while the next sequential
//     instruction is already in F; that instruction always executes. The
//     branch target (if taken) is fetched right after the delay slot.
//   - HALT: when HALT is seen in X, the younger instruction in F is discarded
//     and pc_out is set to the HALT instruction's own PC. The older instruction
//     in W still commits. halt_out rises on the clock edge that ends HALT's W
//     cycle, after which all state is frozen until reset.
module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0 // 32 instruction bytes, byte 0 at MSB
) (
    input                   clk,
    input                   rst,       // synchronous, active-high
    output reg        [4:0] pc_out,    // current fetched PC (visible for testing)
    output reg              halt_out,  // asserted forever after HALT executes
    output reg signed [7:0] r0_out     // current value of register R0
);

    // Opcode encodings (instruction bits [7:6]).
    localparam [1:0] OP_ADDI = 2'b00;
    localparam [1:0] OP_MOV  = 2'b01;
    localparam [1:0] OP_BR   = 2'b10;
    localparam [1:0] OP_HALT = 2'b11;

    // Register file: 4 registers, 8-bit signed. r0_out is kept as an exact
    // mirror of rf[0] (both are written by the same statement group below).
    reg signed [7:0] rf [0:3];

    // F/X pipeline register: the instruction currently in the X stage.
    reg       fx_valid;   // 0 while the pipeline is still filling / after HALT
    reg [4:0] fx_pc;
    reg [7:0] fx_instr;

    // X/W pipeline register: the instruction currently in the W stage.
    reg              xw_wen;     // instruction writes a register
    reg        [1:0] xw_rd;      // destination register index
    reg signed [7:0] xw_result;  // value to commit
    reg              xw_halt;    // instruction is HALT

    // Evaluate a BRcc condition code against the (forwarded) value of R0.
    function cond_met;
        input        [1:0] cond;
        input signed [7:0] r0;
        begin
            case (cond)
                2'b00:   cond_met = 1'b1;                      // always
                2'b01:   cond_met = (r0 == 8'sd0);             // zero
                2'b10:   cond_met = r0[7];                     // negative (sign bit)
                default: cond_met = !r0[7] && (r0 != 8'sd0);   // strictly positive
            endcase
        end
    endfunction

    // ------------------------------------------------------------------
    // F stage
    // ------------------------------------------------------------------
    // IMEM_INIT is declared [0:255], so bit index 0 is the MSB and byte N
    // occupies bits [8N : 8N+7]; an ascending "+:" select picks exactly that.
    wire [0:255] imem    = IMEM_INIT;
    wire [7:0]   f_instr = imem[{pc_out, 3'b000} +: 8];

    // ------------------------------------------------------------------
    // X stage (purely combinational on the F/X register)
    // ------------------------------------------------------------------
    wire [1:0] x_op  = fx_instr[7:6];
    wire [1:0] x_rd  = fx_instr[5:4];   // destination, or condition code for BRcc
    wire [1:0] x_rs  = fx_instr[3:2];
    wire [3:0] x_imm = fx_instr[3:0];   // ADDI immediate / BRcc offset

    wire x_is_addi = fx_valid && (x_op == OP_ADDI);
    wire x_is_mov  = fx_valid && (x_op == OP_MOV);
    wire x_is_br   = fx_valid && (x_op == OP_BR);
    wire x_is_halt = fx_valid && (x_op == OP_HALT);
    wire x_wen     = x_is_addi || x_is_mov;

    // Operand reads with W->X bypass.
    wire signed [7:0] x_rd_val = (xw_wen && (xw_rd == x_rd)) ? xw_result : rf[x_rd];
    wire signed [7:0] x_rs_val = (xw_wen && (xw_rd == x_rs)) ? xw_result : rf[x_rs];
    wire signed [7:0] x_r0_val = (xw_wen && (xw_rd == 2'd0)) ? xw_result : r0_out;

    wire signed [7:0] x_imm_ext = {{4{x_imm[3]}}, x_imm};
    wire signed [7:0] x_result  = x_is_mov ? x_rs_val : (x_rd_val + x_imm_ext);

    // Branch resolution. The target is relative to the instruction after the
    // branch and wraps modulo 32 (5-bit PC).
    wire       x_taken  = x_is_br && cond_met(x_rd, x_r0_val);
    wire [4:0] x_target = fx_pc + 5'd1 + {x_imm[3], x_imm};
    wire [4:0] next_pc  = x_taken ? x_target : (pc_out + 5'd1);

    // ------------------------------------------------------------------
    // Sequential state
    // ------------------------------------------------------------------
    always @(posedge clk) begin
        if (rst) begin
            pc_out    <= 5'd0;
            halt_out  <= 1'b0;
            r0_out    <= 8'sd0;
            rf[0]     <= 8'sd0;
            rf[1]     <= 8'sd0;
            rf[2]     <= 8'sd0;
            rf[3]     <= 8'sd0;
            fx_valid  <= 1'b0;
            fx_pc     <= 5'd0;
            fx_instr  <= 8'd0;
            xw_wen    <= 1'b0;
            xw_rd     <= 2'd0;
            xw_result <= 8'sd0;
            xw_halt   <= 1'b0;
        end else if (xw_halt) begin
            // HALT is in W: every older instruction has already committed.
            // Raise halt_out and hold all other state. xw_halt stays set, so
            // this branch is taken on every later cycle until reset.
            halt_out <= 1'b1;
        end else begin
            // W: commit the result of the instruction in W.
            if (xw_wen) begin
                rf[xw_rd] <= xw_result;
                if (xw_rd == 2'd0)
                    r0_out <= xw_result;
            end

            // X -> W
            xw_wen    <= x_wen;
            xw_rd     <= x_rd;
            xw_result <= x_result;
            xw_halt   <= x_is_halt;

            // F -> X and next PC
            if (x_is_halt) begin
                // Discard the instruction fetched behind HALT and park the
                // visible PC on the HALT instruction itself.
                fx_valid <= 1'b0;
                pc_out   <= fx_pc;
            end else begin
                fx_valid <= 1'b1;
                fx_pc    <= pc_out;
                fx_instr <= f_instr;
                pc_out   <= next_pc;
            end
        end
    end

endmodule
```