yetanotherco · nicole-graus · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026
diff --git a/docs/continuations_l2g_design.md b/docs/continuations_l2g_design.md
diff --git a/executor/programs/asm/array_multipass_20M.s b/executor/programs/asm/array_multipass_20M.s
@@ -0,0 +1,36 @@
+	.attribute	5, "rv64i2p1"
+	.globl	main
+main:
+	# Multi-pass array: P passes over an N-word array, each element
+	# load+add+store. Touches a LARGE distinct RAM footprint (N words)
+	# and REUSES it every pass (so each cell is touched in multiple
+	# epochs) -> worst-case stress for the local-to-global table.
+	#
+	# Footprint = N words = 4*N bytes (here 262144 words = 1 MiB).
+	# Steps ~= P * N * 6 (6 instructions per inner iteration; here 13 * 262144 * 6 ~= 20.4M).
+	#
+	# Tuning knobs:
+	#   t5 init (N)  -> distinct footprint (bytes = 4*N)
+	#   t6 init (P)  -> number of passes (cross-epoch reuse)
+	#   keep P*N*6 ~= target step count.
+
+	li	t3, 1			# increment k
+	li	t6, 13			# P = passes (counts down to 0)
+	li	t0, 0x40000000		# BASE = array address (free RAM)
+
+.outer:
+	mv	t1, t0			# ptr = BASE (restart at the array base each pass)
+	li	t5, 262144		# N = words per pass (reloaded each pass, counts down)
+.inner:
+	lw	t4, 0(t1)		# t4 = a[i] (32-bit word load)
+	add	t4, t4, t3		# a[i] += k
+	sw	t4, 0(t1)		# a[i] = t4 (32-bit word store)
+	addi	t1, t1, 4		# ptr += 4
+	addi	t5, t5, -1		# remaining words in this pass--
+	bnez	t5, .inner		# loop until the pass has covered N words
+	addi	t6, t6, -1		# pass--
+	bnez	t6, .outer		# start another pass while passes remain
+
+	li	a0, 0			# exit code 0
+	li	a7, 93			# syscall 93 = exit in the Linux RISC-V ABI (assumed to match this VM)
+	ecall
diff --git a/executor/src/vm/execution.rs b/executor/src/vm/execution.rs
@@ -30,6 +30,20 @@ pub struct ExecutionResult {
 /// Size of each log chunk - balances memory usage vs callback overhead
 const CHUNK_SIZE: usize = 100_000;
 
+/// Default number of cycles (instructions) per continuation epoch.
+pub const DEFAULT_EPOCH_SIZE: usize = 100_000;
+
+/// Output of one continuation epoch: the `logs` it produced plus the VM state
+/// (`end_pc`, `end_registers`, `end_memory`) at the epoch boundary, which is the
+/// starting state of the next epoch. `end_pc` is 0 once the program has halted.
+#[derive(Debug)]
+pub struct EpochExecution {
+    pub logs: Vec<Log>,
+    pub end_pc: u64,
+    pub end_registers: Registers,
+    pub end_memory: Memory,
+}
+
 /// Executor state for chunked execution
 pub struct Executor {
     memory: Memory,
@@ -57,13 +71,34 @@ impl Executor {
 
     /// Resume execution and return next logs. Returns None when program is finished.
     pub fn resume(&mut self) -> Result<Option<&[Log]>, ExecutorError> {
+        self.resume_with_limit(CHUNK_SIZE)
+    }
+
+    /// Program counter of the next instruction to execute; 0 means the program has halted.
+    pub fn pc(&self) -> u64 {
+        self.pc
+    }
+
+    /// Read-only view of the current register state.
+    pub fn registers(&self) -> &Registers {
+        &self.registers
+    }
+
+    /// Read-only view of the current memory state.
+    pub fn memory(&self) -> &Memory {
+        &self.memory
+    }
+
+    /// Resume execution, running at most `limit` cycles, and return the logs
+    /// produced. Returns None when the program is finished.
+    pub fn resume_with_limit(&mut self, limit: usize) -> Result<Option<&[Log]>, ExecutorError> {
         if self.pc == 0 {
             return Ok(None);
         }
 
         self.logs.clear();
 
-        while self.pc != 0 && self.logs.len() < CHUNK_SIZE {
+        while self.pc != 0 && self.logs.len() < limit {
             if !self.pc.is_multiple_of(4) {
                 return Err(ExecutorError::InstructionAddressMisaligned(self.pc));
             }
@@ -117,6 +152,26 @@ impl Executor {
             instructions: self.instructions.into_instruction_map(),
         })
     }
+
+    /// Executes the program to completion as a sequence of epochs, each at most
+    /// `epoch_size` cycles long. Every epoch records its logs together with a
+    /// copy of the pc, registers and memory at its end boundary (the state the
+    /// next epoch starts from). Consumes the executor; panics if `epoch_size` is 0.
+    pub fn run_epochs(mut self, epoch_size: usize) -> Result<Vec<EpochExecution>, ExecutorError> {
+        assert!(epoch_size > 0, "epoch_size must be greater than zero");
+
+        let mut epochs = Vec::new();
+        loop {
+            // Own the logs right away so the borrow of `self` ends before the snapshot.
+            let Some(logs) = self.resume_with_limit(epoch_size)?.map(<[Log]>::to_vec) else {
+                break;
+            };
+            let (end_pc, end_registers) = (self.pc, self.registers.clone());
+            let end_memory = self.memory.clone();
+            epochs.push(EpochExecution { logs, end_pc, end_registers, end_memory });
+        }
+        Ok(epochs)
+    }
 }
 
 fn load_program(segments: &[crate::elf::Segment], memory: &mut Memory) -> Result<(), MemoryError> {

diff --git a/executor/src/vm/memory.rs b/executor/src/vm/memory.rs
@@ -50,7 +50,7 @@ pub const MAX_PRIVATE_INPUT_SIZE: u64 = 6700000;
 /// Must match `PRIVATE_INPUT_START` in `syscalls/src/syscalls.rs`.
 pub const PRIVATE_INPUT_START_INDEX: u64 = 0xFF000000;
 
-#[derive(Default, Debug)]
+#[derive(Default, Debug, Clone)]
 pub struct Memory {
     cells: U64HashMap<[u8; 4]>,
     /// Bytes committed to public output via `commit_public_output`. The
@@ -80,6 +80,18 @@ impl Memory {
         entry[(address % 4) as usize] = value;
     }
 
+    /// Yields every stored byte as an `(address, value)` pair. Each cell is a
+    /// 4-byte word keyed by its address, so a word contributes its four
+    /// consecutive byte addresses. Used to snapshot memory at an epoch boundary.
+    pub fn iter_bytes(&self) -> impl Iterator<Item = (u64, u8)> + '_ {
+        self.cells.iter().flat_map(|(&base, word)| {
+            // Pair the word's bytes, in order, with their offsets 0, 1, 2, 3.
+            word.iter()
+                .zip(0u64..)
+                .map(move |(&byte, offset)| (base + offset, byte))
+        })
+    }
+
     pub fn load_word(&self, address: u64) -> Result<u32, MemoryError> {
         if address.is_multiple_of(4) {
             let bytes = self.cells.get(&address).cloned().unwrap_or_default();

diff --git a/executor/src/vm/registers.rs b/executor/src/vm/registers.rs
@@ -2,7 +2,7 @@ use std::fmt::Display;
 
 pub const STACK_TOP: u64 = 0xFFFFFFFFFFFFFFF0; // 64-bit max (Multiple of 16 for RV64 ABI)
 
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 /// Holds the current value of all 32 registers
 /// Register zero is implicit as it cannot hold any value other than zero
 pub struct Registers([u64; 31]);

diff --git a/executor/tests/asm.rs b/executor/tests/asm.rs
@@ -923,3 +923,44 @@ fn test_keccak() {
     assert_eq!(result.return_values.memory_values, expected_bytes);
     assert_eq!(result.return_values.register_values.0, 0);
 }
+
+#[test]
+fn test_run_epochs_splits_execution_into_n_cycle_epochs() {
+    let elf_data = std::fs::read("./program_artifacts/asm/basic_program.elf").unwrap();
+    let program = Elf::load(&elf_data).unwrap();
+
+    // Reference: full single-pass run.
+    let full = Executor::new(&program, vec![]).unwrap().run().unwrap();
+
+    // Pick an epoch size that splits this program into a few epochs, whatever
+    // its exact length.
+    let total_cycles = full.logs.len();
+    assert!(total_cycles >= 2);
+    let epoch_size = (total_cycles / 3).max(1);
+
+    let epochs = Executor::new(&program, vec![])
+        .unwrap()
+        .run_epochs(epoch_size)
+        .unwrap();
+    assert!(epochs.len() >= 2, "program should span several epochs");
+
+    // Concatenated epoch logs reproduce the full run's instruction stream.
+    let concat: Vec<u64> = epochs
+        .iter()
+        .flat_map(|e| e.logs.iter().map(|l| l.current_pc))
+        .collect();
+    let expected: Vec<u64> = full.logs.iter().map(|l| l.current_pc).collect();
+    assert_eq!(concat, expected);
+
+    // Every epoch but the last is full, and its boundary pc is where the next one resumes.
+    for pair in epochs.windows(2) {
+        assert_eq!(pair[0].logs.len(), epoch_size);
+        assert_ne!(pair[0].end_pc, 0);
+        assert_eq!(pair[0].end_pc, pair[1].logs[0].current_pc);
+    }
+    let last = epochs.last().unwrap();
+    assert!(!last.logs.is_empty() && last.logs.len() <= epoch_size);
+
+    // The program finished, so the final epoch's boundary pc is 0.
+    assert_eq!(last.end_pc, 0);
+}
diff --git a/prover/Cargo.toml b/prover/Cargo.toml
@@ -42,3 +42,7 @@ harness = false
 [[bench]]
 name = "profile_vm_prover"
 harness = false
+
+[[bench]]
+name = "bench_continuation"
+harness = false
diff --git a/prover/benches/bench_continuation.rs b/prover/benches/bench_continuation.rs
@@ -0,0 +1,134 @@
+//! Peak-memory benchmark: monolithic proving vs continuation (streaming-epoch)
+//! proving, for large programs.
+//!
+//! This is a plain one-shot binary (`harness = false`), not a Criterion bench:
+//! Criterion measures time over many iterations, whereas the point here is the
+//! peak resident set of a SINGLE prove. Wrap it in the OS timer to capture RSS,
+//! on Linux:
+//!     /usr/bin/time -v <binary> main <elf_path>
+//!     /usr/bin/time -v <binary> cont <elf_path> 65536
+//!
+//! Build + locate the binary:
+//!     cargo build --release --bench bench_continuation
+//!     ls target/release/deps/bench_continuation-*   # the executable (no .d)
+//!
+//! Args:
+//!     <mode>        "count", "footprint", "main" (monolithic prove) or "cont" (continuation)
+//!     <elf_path>    path to a compiled ELF artifact
+//!     [epoch_size]  epoch length in cycles for "cont" (default 65536)
+//!
+//! Env:
+//!     BENCH_PRIVATE_INPUT  optional path to a private-input file (e.g. an
+//!                          ethrex ProgramInput .bin). Empty if unset.
+
+use std::time::Instant;
+
+fn main() {
+    let args: Vec<String> = std::env::args().collect();
+    if args.len() < 3 {
+        eprintln!("usage: bench_continuation <count|footprint|main|cont> <elf_path> [epoch_size]");
+        std::process::exit(2);
+    }
+    let mode = args[1].as_str();
+    let elf_path = &args[2];
+    let elf = std::fs::read(elf_path).expect("failed to read ELF");
+    let private_inputs: Vec<u8> = match std::env::var("BENCH_PRIVATE_INPUT") {
+        Ok(path) if !path.is_empty() => {
+            std::fs::read(&path).expect("failed to read BENCH_PRIVATE_INPUT file")
+        }
+        _ => Vec::new(),
+    };
+
+    let start = Instant::now();
+    match mode {
+        "count" => {
+            // Count cycles by running the executor to completion (no proving).
+            // Cycle count is a linear proxy for monolithic proving memory.
+            use executor::elf::Elf;
+            use executor::vm::execution::Executor;
+            let program = Elf::load(&elf).expect("bad ELF");
+            let result = Executor::new(&program, private_inputs)
+                .expect("executor")
+                .run()
+                .expect("execution failed");
+            println!("cycles = {}", result.logs.len());
+        }
+        "footprint" => {
+            // Run to completion, then classify the touched memory by region so we
+            // can see how much of the footprint is stack (contiguous, near
+            // STACK_TOP) vs the rest (ELF data / heap / private input, low
+            // addresses). Tells us whether a stack-specific Vec store would help.
+            use executor::elf::Elf;
+            use executor::vm::execution::Executor;
+            use executor::vm::registers::STACK_TOP;
+            let program = Elf::load(&elf).expect("bad ELF");
+            let mut ex = Executor::new(&program, private_inputs).expect("executor");
+            // Only the final memory matters here, so drain the run in bounded log chunks.
+            while ex.resume().expect("execution failed").is_some() {}
+            // Stack lives in the top half of the address space (grows down from
+            // STACK_TOP); ELF data / heap / input are in the low addresses.
+            const STACK_THRESHOLD: u64 = 1 << 63;
+            let (mut stack, mut other) = (0u64, 0u64);
+            let (mut min_stack, mut max_stack) = (u64::MAX, 0u64);
+            let (mut min_other, mut max_other) = (u64::MAX, 0u64);
+            for (addr, _) in ex.memory().iter_bytes() {
+                if addr >= STACK_THRESHOLD {
+                    stack += 1;
+                    min_stack = min_stack.min(addr);
+                    max_stack = max_stack.max(addr);
+                } else {
+                    other += 1;
+                    min_other = min_other.min(addr);
+                    max_other = max_other.max(addr);
+                }
+            }
+            let total = stack + other;
+            let pct = |n: u64| 100.0 * n as f64 / total.max(1) as f64;
+            println!("footprint: {total} touched bytes");
+            if stack > 0 {
+                // Cover bytes at/above STACK_TOP too, so the span can never underflow.
+                let top = max_stack.max(STACK_TOP);
+                let span = top - min_stack + 1;
+                println!(
+                    "  stack: {stack} bytes ({:.1}%), range [{:#x}..={:#x}], span {span} bytes, density {:.1}%",
+                    pct(stack),
+                    min_stack,
+                    top,
+                    100.0 * stack as f64 / span as f64,
+                );
+            }
+            if other > 0 {
+                println!(
+                    "  other (data/heap/input): {other} bytes ({:.1}%), range [{:#x}..={:#x}]",
+                    pct(other),
+                    min_other,
+                    max_other,
+                );
+            }
+        }
+        "main" => {
+            lambda_vm_prover::prove_with_inputs(&elf, &private_inputs)
+                .expect("monolithic prove failed");
+            println!("main prove ok ({} bytes ELF)", elf.len());
+        }
+        "cont" => {
+            let epoch_size: usize = args
+                .get(3)
+                .map(|s| s.parse().expect("bad epoch_size"))
+                .unwrap_or(65536);
+            let ok = lambda_vm_prover::continuation::prove_and_verify_continuation(
+                &elf,
+                &private_inputs,
+                epoch_size,
+            )
+            .expect("continuation failed");
+            assert!(ok, "continuation did not verify");
+            println!("cont prove+verify ok (epoch_size={epoch_size})");
+        }
+        other => {
+            eprintln!("unknown mode {other:?}; use count|footprint|main|cont");
+            std::process::exit(2);
+        }
+    }
+    println!("elapsed {:.2}s", start.elapsed().as_secs_f64());
+}