Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
42e5753
split execution into epochs
nicole-graus Jun 5, 2026
af01024
Add an initial-memory image
nicole-graus Jun 5, 2026
83c4a45
build a single epoch's traces
nicole-graus Jun 5, 2026
e6f93b4
Add an is_final flag for halt
nicole-graus Jun 5, 2026
5b3fb3b
Make the HALT table optional
nicole-graus Jun 5, 2026
9125571
Add a register_init parameter to VmAirs::new for the REGISTER preproc…
nicole-graus Jun 5, 2026
e298a7b
reject a non-final epoch that contains the program-terminating instru…
nicole-graus Jun 8, 2026
9884ac4
Add local-to-global boundary and process epochs to emit the boundary set
nicole-graus Jun 8, 2026
cc9499c
Add local-to-global air table
nicole-graus Jun 8, 2026
8048eb4
Add cross-epoch local-to-global memory linkage
nicole-graus Jun 8, 2026
49b1112
Add memory_bus_interactions to emit epoch init/fini tokens
nicole-graus Jun 9, 2026
ba58258
Wire the local-to-global table as the epoch-local Memory-bus bookend
nicole-graus Jun 9, 2026
aa5a023
Add prove_and_verify_continuation
nicole-graus Jun 9, 2026
14b2450
stream epochs one at a time and drop traces after proving
nicole-graus Jun 10, 2026
f007bdc
add bench_continuation
nicole-graus Jun 10, 2026
d0fa022
Add multi-pass array asm program (1 MiB footprint, ~20M steps) as a w…
nicole-graus Jun 10, 2026
45b8205
Add a count mode to bench_continuation that reports a program's cycle…
nicole-graus Jun 10, 2026
bd63d56
Merge remote-tracking branch 'origin/main' into continuations-local-t…
nicole-graus Jun 11, 2026
d5e7aa5
l2g val to a single byte column
nicole-graus Jun 11, 2026
677c537
Merge remote-tracking branch 'origin/main' into continuations-local-t…
nicole-graus Jun 11, 2026
e498057
Thread private inputs so we can bench ethrex program
nicole-graus Jun 11, 2026
7ee7ff2
Use the static preprocessed bitwise commitment
nicole-graus Jun 11, 2026
0423466
Avoid redundant per-epoch work (skip page and carry the memory)
nicole-graus Jun 11, 2026
360c88f
add global_memory for init-elf binding
nicole-graus Jun 12, 2026
ddfd5b8
add clasification into stack vs data/heap in bench_continuation
nicole-graus Jun 12, 2026
6a318b9
store memory in dense per-page arrays instead of per-cell HashMaps
nicole-graus Jun 16, 2026
301cc99
update doc
nicole-graus Jun 17, 2026
3ca8d22
Range-check the local-to-global continuation table columns
nicole-graus Jun 17, 2026
9740f48
slim range-check since memw already does it
nicole-graus Jun 18, 2026
ddd6c01
make fini_epoch constant, add MU selector for padding rows, add epoch…
nicole-graus Jun 18, 2026
7a86d5e
Gate the local-to-global MU selector on the GlobalMemory bus only and…
nicole-graus Jun 18, 2026
6e7aaa7
Revert the local-to-global MU wiring to Design X (MU gates every L2G …
nicole-graus Jun 19, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
353 changes: 353 additions & 0 deletions docs/continuations_l2g_design.md

Large diffs are not rendered by default.

36 changes: 36 additions & 0 deletions executor/programs/asm/array_multipass_20M.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
.attribute 5, "rv64i2p1"
.globl main
main:
# Multi-pass array: P passes over an N-word array, each element
# load+add+store. Touches a LARGE distinct RAM footprint (N words)
# and REUSES it every pass (so each cell is touched in multiple
# epochs) -> worst-case stress for the local-to-global table.
#
# Footprint = N words = 4*N bytes (here 262144 words = 1 MiB).
# Steps ~= P * N * 6 (here 13 * 262144 * 6 ~= 20.4M).
#
# Tuning knobs:
# t5 init (N) -> distinct footprint (bytes = 4*N)
# t6 init (P) -> number of passes (cross-epoch reuse)
# keep P*N*6 ~= target step count.

li t3, 1 # increment k
li t6, 13 # P = passes
li t0, 0x40000000 # BASE = array address (free RAM)

.outer:
mv t1, t0 # ptr = BASE
li t5, 262144 # N = words per pass
.inner:
lw t4, 0(t1) # t4 = a[i]
add t4, t4, t3 # a[i] += k
sw t4, 0(t1) # a[i] = t4
addi t1, t1, 4 # ptr += 4
addi t5, t5, -1 # i--
bnez t5, .inner
addi t6, t6, -1 # pass--
bnez t6, .outer

li a0, 0
li a7, 93
ecall
57 changes: 56 additions & 1 deletion executor/src/vm/execution.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,20 @@ pub struct ExecutionResult {
/// Size of each log chunk - balances memory usage vs callback overhead
const CHUNK_SIZE: usize = 100_000;

/// Default number of cycles (instructions) per continuation epoch.
pub const DEFAULT_EPOCH_SIZE: usize = 100_000;

/// Result of executing one continuation epoch: the logs produced during the
/// epoch and the VM state at the epoch boundary. The boundary state is the
/// starting state of the next epoch.
#[derive(Debug)]
pub struct EpochExecution {
pub logs: Vec<Log>,
pub end_pc: u64,
pub end_registers: Registers,
pub end_memory: Memory,
}

/// Executor state for chunked execution
pub struct Executor {
memory: Memory,
Expand Down Expand Up @@ -57,13 +71,34 @@ impl Executor {

/// Resume execution and return next logs. Returns None when program is finished.
pub fn resume(&mut self) -> Result<Option<&[Log]>, ExecutorError> {
self.resume_with_limit(CHUNK_SIZE)
}

/// Current program counter (0 once the program has halted).
pub fn pc(&self) -> u64 {
self.pc
}

/// Current register state.
pub fn registers(&self) -> &Registers {
&self.registers
}

/// Current memory state.
pub fn memory(&self) -> &Memory {
&self.memory
}

/// Resume execution, running at most `limit` cycles, and return the logs
/// produced. Returns None when the program is finished.
pub fn resume_with_limit(&mut self, limit: usize) -> Result<Option<&[Log]>, ExecutorError> {
if self.pc == 0 {
return Ok(None);
}

self.logs.clear();

while self.pc != 0 && self.logs.len() < CHUNK_SIZE {
while self.pc != 0 && self.logs.len() < limit {
if !self.pc.is_multiple_of(4) {
return Err(ExecutorError::InstructionAddressMisaligned(self.pc));
}
Expand Down Expand Up @@ -117,6 +152,26 @@ impl Executor {
instructions: self.instructions.into_instruction_map(),
})
}

/// Run to completion, splitting execution into epochs of at most `epoch_size`
/// cycles. Each epoch captures its logs and the VM state at the epoch
/// boundary, which is the starting state of the next epoch. Consumes the
/// executor.
pub fn run_epochs(mut self, epoch_size: usize) -> Result<Vec<EpochExecution>, ExecutorError> {
assert!(epoch_size > 0, "epoch_size must be greater than zero");

let mut epochs = Vec::new();
while let Some(logs) = self.resume_with_limit(epoch_size)? {
let logs = logs.to_vec();
epochs.push(EpochExecution {
logs,
end_pc: self.pc,
end_registers: self.registers.clone(),
end_memory: self.memory.clone(),
});
}
Ok(epochs)
}
}

fn load_program(segments: &[crate::elf::Segment], memory: &mut Memory) -> Result<(), MemoryError> {
Expand Down
14 changes: 13 additions & 1 deletion executor/src/vm/memory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ pub const MAX_PRIVATE_INPUT_SIZE: u64 = 6700000;
/// Must match `PRIVATE_INPUT_START` in `syscalls/src/syscalls.rs`.
pub const PRIVATE_INPUT_START_INDEX: u64 = 0xFF000000;

#[derive(Default, Debug)]
#[derive(Default, Debug, Clone)]
pub struct Memory {
cells: U64HashMap<[u8; 4]>,
/// Bytes committed to public output via `commit_public_output`. The
Expand Down Expand Up @@ -80,6 +80,18 @@ impl Memory {
entry[(address % 4) as usize] = value;
}

/// Iterate over all stored bytes as `(address, value)` pairs. Cells are
/// stored as 4-byte words; each word expands into its four byte addresses.
/// Used to snapshot memory at an epoch boundary.
pub fn iter_bytes(&self) -> impl Iterator<Item = (u64, u8)> + '_ {
self.cells.iter().flat_map(|(&addr, bytes)| {
bytes
.iter()
.enumerate()
.map(move |(i, &b)| (addr + i as u64, b))
})
}

pub fn load_word(&self, address: u64) -> Result<u32, MemoryError> {
if address.is_multiple_of(4) {
let bytes = self.cells.get(&address).cloned().unwrap_or_default();
Expand Down
2 changes: 1 addition & 1 deletion executor/src/vm/registers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use std::fmt::Display;

pub const STACK_TOP: u64 = 0xFFFFFFFFFFFFFFF0; // 64-bit max (Multiple of 16 for RV64 ABI)

#[derive(Debug)]
#[derive(Debug, Clone)]
/// Holds the current value of all 32 registers
/// Register zero is implicit as it cannot hold any value other than zero
pub struct Registers([u64; 31]);
Expand Down
41 changes: 41 additions & 0 deletions executor/tests/asm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -923,3 +923,44 @@ fn test_keccak() {
assert_eq!(result.return_values.memory_values, expected_bytes);
assert_eq!(result.return_values.register_values.0, 0);
}

#[test]
fn test_run_epochs_splits_execution_into_n_cycle_epochs() {
let elf_data = std::fs::read("./program_artifacts/asm/basic_program.elf").unwrap();
let program = Elf::load(&elf_data).unwrap();

// Reference: full single-pass run.
let full = Executor::new(&program, vec![]).unwrap().run().unwrap();

// Pick an epoch size that splits this program into a few epochs, whatever
// its exact length.
let total_cycles = full.logs.len();
assert!(total_cycles >= 2);
let epoch_size = (total_cycles / 3).max(1);

let epochs = Executor::new(&program, vec![])
.unwrap()
.run_epochs(epoch_size)
.unwrap();

// The program is long enough to span several epochs.
assert!(epochs.len() >= 2);

// Concatenated epoch logs reproduce the full run's instruction stream.
let concat: Vec<u64> = epochs
.iter()
.flat_map(|e| e.logs.iter().map(|l| l.current_pc))
.collect();
let expected: Vec<u64> = full.logs.iter().map(|l| l.current_pc).collect();
assert_eq!(concat, expected);

// Every epoch except the last runs exactly `epoch_size` cycles.
for epoch in &epochs[..epochs.len() - 1] {
assert_eq!(epoch.logs.len(), epoch_size);
}
let last = epochs.last().unwrap();
assert!(!last.logs.is_empty() && last.logs.len() <= epoch_size);

// The program finished, so the final epoch's boundary pc is 0.
assert_eq!(last.end_pc, 0);
}
4 changes: 4 additions & 0 deletions prover/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,7 @@ harness = false
[[bench]]
name = "profile_vm_prover"
harness = false

[[bench]]
name = "bench_continuation"
harness = false
134 changes: 134 additions & 0 deletions prover/benches/bench_continuation.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
//! Peak-memory benchmark: monolithic proving vs continuation (streaming-epoch)
//! proving, for large programs.
//!
//! This is a plain one-shot binary (`harness = false`), not a Criterion bench:
//! Criterion measures time over many iterations, whereas the point here is the
//! peak resident set of a SINGLE prove. Wrap it in the OS timer to capture RSS,
//! on Linux:
//! /usr/bin/time -v <binary> main <elf_path>
//! /usr/bin/time -v <binary> cont <elf_path> 65536
//!
//! Build + locate the binary:
//! cargo build --release --bench bench_continuation
//! ls target/release/deps/bench_continuation-* # the executable (no .d)
//!
//! Args:
//! <mode> "count", "main" (monolithic prove) or "cont" (continuation)
//! <elf_path> path to a compiled ELF artifact
//! [epoch_size] epoch length in cycles for "cont" (default 65536)
//!
//! Env:
//! BENCH_PRIVATE_INPUT optional path to a private-input file (e.g. an
//! ethrex ProgramInput .bin). Empty if unset.

use std::time::Instant;

fn main() {
let args: Vec<String> = std::env::args().collect();
if args.len() < 3 {
eprintln!("usage: bench_continuation <count|main|cont> <elf_path> [epoch_size]");
std::process::exit(2);
}
let mode = args[1].as_str();
let elf_path = &args[2];
let elf = std::fs::read(elf_path).expect("failed to read ELF");
let private_inputs: Vec<u8> = match std::env::var("BENCH_PRIVATE_INPUT") {
Ok(path) if !path.is_empty() => {
std::fs::read(&path).expect("failed to read BENCH_PRIVATE_INPUT file")
}
_ => Vec::new(),
};

let start = Instant::now();
match mode {
"count" => {
// Count cycles by running the executor to completion (no proving).
// Cycle count is a linear proxy for monolithic proving memory.
use executor::elf::Elf;
use executor::vm::execution::Executor;
let program = Elf::load(&elf).expect("bad ELF");
let result = Executor::new(&program, private_inputs)
.expect("executor")
.run()
.expect("execution failed");
println!("cycles = {}", result.logs.len());
}
"footprint" => {
// Run to completion, then classify the touched memory by region so we
// can see how much of the footprint is stack (contiguous, near
// STACK_TOP) vs the rest (ELF data / heap / private input, low
// addresses). Tells us whether a stack-specific Vec store would help.
use executor::elf::Elf;
use executor::vm::execution::Executor;
use executor::vm::registers::STACK_TOP;
let program = Elf::load(&elf).expect("bad ELF");
let mut ex = Executor::new(&program, private_inputs).expect("executor");
while ex.pc() != 0 {
match ex.resume_with_limit(usize::MAX).expect("execution failed") {
Some(_) => {}
None => break,
}
}
// Stack lives in the top half of the address space (grows down from
// STACK_TOP); ELF data / heap / input are in the low addresses.
const STACK_THRESHOLD: u64 = 1 << 63;
let (mut stack, mut other) = (0u64, 0u64);
let (mut min_stack, mut min_other, mut max_other) = (u64::MAX, u64::MAX, 0u64);
for (addr, _) in ex.memory().iter_bytes() {
if addr >= STACK_THRESHOLD {
stack += 1;
min_stack = min_stack.min(addr);
} else {
other += 1;
min_other = min_other.min(addr);
max_other = max_other.max(addr);
}
}
let total = stack + other;
let pct = |n: u64| 100.0 * n as f64 / total.max(1) as f64;
println!("footprint: {total} touched bytes");
if stack > 0 {
let span = STACK_TOP - min_stack + 1;
println!(
" stack: {stack} bytes ({:.1}%), range [{:#x}..={:#x}], span {span} bytes, density {:.1}%",
pct(stack),
min_stack,
STACK_TOP,
100.0 * stack as f64 / span as f64,
);
}
if other > 0 {
println!(
" other (data/heap/input): {other} bytes ({:.1}%), range [{:#x}..={:#x}]",
pct(other),
min_other,
max_other,
);
}
}
"main" => {
lambda_vm_prover::prove_with_inputs(&elf, &private_inputs)
.expect("monolithic prove failed");
println!("main prove ok ({} bytes ELF)", elf.len());
}
"cont" => {
let epoch_size: usize = args
.get(3)
.map(|s| s.parse().expect("bad epoch_size"))
.unwrap_or(65536);
let ok = lambda_vm_prover::continuation::prove_and_verify_continuation(
&elf,
&private_inputs,
epoch_size,
)
.expect("continuation failed");
assert!(ok, "continuation did not verify");
println!("cont prove+verify ok (epoch_size={epoch_size})");
}
other => {
eprintln!("unknown mode {other:?}; use count|footprint|main|cont");
std::process::exit(2);
}
}
println!("elapsed {:.2}s", start.elapsed().as_secs_f64());
}
Loading