nullrunio · maltsev-dev · Jul 4, 2026 · Jul 4, 2026 · Jul 4, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,24 @@ Versioning: [Semantic Versioning](https://semver.org/spec/v2.0.0.html)
 ---
 
 
+## [0.12.2] - 2026-07-04
+
+Bug-fix release. Two related correctness fixes layered on top of 0.12.1; no wire-format change.
+
+### Fixed
+
+- **BUG #4 — `/check` execution_id**: `check_workflow_budget()` now sends a fresh `uuidv7` as the `execution_id` field on every call, instead of reusing `workflow_id`. The backend's `gate_reserve_v3` overwrites the field with its own server-minted value on the response, but the previous behaviour — one client-side placeholder shared by every `/check` in a workflow — could confuse the v3 reservation binding on `/track` once `track_single()` reached the backend. This is the same root cause as the four gaps 0.12.1 closed, seen from the client-side placeholder angle. (CLAUDE.md §24 / §29 — execution_id ownership.)
+- **BUG #5 — chain-mode gate thrash**: new `nullrun.runtime._GATE_CACHE` (5s TTL, keyed on `(workflow_id, chain_id, model)`) collapses consecutive `/gate` calls from inside `with chain(...)` to one roundtrip per TTL window, instead of one roundtrip per step (100 `/gate` calls for a 100-step agent loop). Single-shot (Hard mode) callers bypass the cache — the gate legitimately flips allow→block between consecutive calls there, and a stale "allow" would leak a budget-exhausted call through. Opt-out via `NULLRUN_GATE_CACHE_DISABLE=1` for callers that want the legacy always-roundtrip behaviour (e.g. live smoke tests per `docs/runbooks/budget-blue-green-smoke.sh`).
+
+### Added
+
+- 158 lines of contract tests in `tests/test_v3_wire_contract.py`: `TestGateExecutionId` (per-call uniqueness + uuidv7 format validation) and `TestGateCache` (5 cases covering the cache data-structure invariants and the opt-out predicate).
+
+### Changed
+
+- `__version__` bumped from 0.12.1 to 0.12.2.
+
+
 ## [0.12.1] - 2026-07-04
 
 Bug-fix release. The v0.12.0 changelog claimed the SDK propagates the server-minted `execution_id` from /check to /track but the wiring was never shipped — the SDK still sent client-supplied ids on /track/batch and ignored `reservation_id` on /check responses (audit fix per memory `sdk-v3-migration-gaps`).

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,15 +4,12 @@ build-backend = "hatchling.build"
 
 [project]
 name = "nullrun"
-# Version bump: 0.12.0 → 0.12.1 in `release(0.12.1)` (server-minted
-# execution_id /check → /track wiring fix). Recurrence of the same
-# drift that #50 fixed for 0.12.0: the runtime commit bumped
-# `src/nullrun/__version__.py` but missed this field, so without
-# this sync-up `python -m build` would produce a wheel named
-# `nullrun-0.12.0-*` and PyPI Trusted Publishing would reject the
-# upload with HTTP 400 "File already exists" (the 0.12.0 artifact
-# is already live).
-version = "0.12.1"
+# Version bump: 0.12.1 → 0.12.2 in `release(0.12.2)` (fresh
+# execution_id per /check + in-process chain-mode gate cache). The
+# runtime commit bumped `src/nullrun/__version__.py` together with
+# this field — same drift prevention as #50, but proactive this
+# time (caught during pre-merge audit, not after a publish error).
+version = "0.12.2"
 # Long form used by PyPI page meta-description and search snippets.
 # Kept under the 200-char preview threshold so the full line is visible
 # without an "expand" click. Keywords are matched against likely search

diff --git a/src/nullrun/__version__.py b/src/nullrun/__version__.py
@@ -44,7 +44,44 @@
 upgrading from < 0.12.0 should jump straight to 0.12.1 — 0.12.0
 released with the integrity bug above and was never deployed
 in production with the v3 wiring.
+
+---
+
+v3.12 / 0.12.2 (2026-07-04) — bug-fix: fresh execution_id per
+/check + in-process chain-mode gate cache.
+
+Two related correctness fixes on top of 0.12.1:
+
+  1. ``check_workflow_budget`` now sends a fresh ``uuidv7`` as
+     ``execution_id`` on every /check call (instead of reusing
+     ``workflow_id``). The v3 ``gate_reserve_v3`` mints its
+     own anyway, but a client-side placeholder that collides
+     across calls confuses the reservation binding on /track,
+     which surfaces as ``track_single`` returning 503
+     ``RESERVATION_NOT_FOUND`` (CLAUDE.md §29). The server
+     overwrites the field on response, so the freshly-minted
+     ``reservation_id`` captured by
+     ``_capture_server_minted_execution_id`` still drives
+     /track exactly as in 0.12.1.
+
+  2. New in-process gate cache
+     (``nullrun.runtime._GATE_CACHE``) serves chain-mode
+     @protect calls from a 5s-TTL entry keyed on the
+     ``(workflow_id, chain_id, model)`` triple, collapsing a 100-step
+     agent loop to one /gate roundtrip per TTL window. Single-
+     shot (Hard mode) callers bypass the cache — the gate
+     legitimately flips allow→block between consecutive
+     calls there, and a stale "allow" could leak a budget-
+     exhausted call. Opt-out via
+     ``NULLRUN_GATE_CACHE_DISABLE=1`` for callers that want
+     the legacy always-roundtrip behaviour (e.g. for live
+     smoke tests per docs/runbooks/budget-blue-green-smoke.sh).
+
+No wire-format change. Pure client-side fix — backends on
+1.0.0 keep working unchanged. Pinning unchanged:
+SDK_MIN_VERSION_FOR_V3 = "0.12.0". Recommended upgrade
+path: 0.12.1 -> 0.12.2.
 """
 
-__version__ = "0.12.1"
+__version__ = "0.12.2"
 __platform_version__ = "1.0.0"
diff --git a/src/nullrun/runtime.py b/src/nullrun/runtime.py
@@ -72,6 +72,7 @@
     _emit_for_transport_error,
     _protocol_header_value,
 )
+from nullrun.uuid7 import uuid7_str  # 2026-07-04 BUG #4 (CLAUDE.md §24)
 
 logger = logging.getLogger(__name__)
 
@@ -82,6 +83,25 @@
 # collision hazard). Wire compat: still a string.
 UNKNOWN_WORKFLOW_ID: str = "__nullrun_unknown__"
 
+# 2026-07-04 (BUG #5): in-process gate cache for chain-mode
+# invocations. Without this, every @protect inside `with chain(...)`
+# issues a /gate HTTP roundtrip + Redis reserve. For a 100-step
+# agent loop that's 100 roundtrips. The gate decision is
+# deterministic for a given (workflow_id, chain_id, model) over a
+# short window (chain status only changes on `chain_end`), so
+# caching the LAST decision for 5s is safe.
+#
+# Scope: ONLY when chain_id is set. Single-shot (Hard) callers
+# must NOT cache — the gate legitimately returns "allow" once and
+# "block" on the next call (Hard mode binary), and a stale "allow"
+# could let through a budget-exhausted call. Chain-mode callers
+# share a budget envelope, so caching "allow" is consistent with
+# the chain's semantics.
+#
+# Opt-out: NULLRUN_GATE_CACHE_DISABLE=1
+_GATE_CACHE: dict[tuple[str, str | None, str | None], tuple[float, dict[str, Any]]] = {}
+_GATE_CACHE_TTL_SECONDS: float = 5.0
+
 # 2026-07-04 (v0.12.0 wiring fix — CLAUDE.md §24, §29):
 # the maximum age (seconds) for a captured ``reservation_id``
 # to be eligible for forwarding onto a /track payload. Past
@@ -1199,7 +1219,15 @@ def check_workflow_budget(self) -> None:
 
         check_req = {
             "organization_id": self.organization_id or "local",
-            "execution_id": workflow_id,
+            # 2026-07-04 (BUG #4): CLAUDE.md §24 requires server-minted
+            # execution_id. Sending `workflow_id` here would re-use the
+            # same execution_id for every /check in the workflow, breaking
+            # the v3 reservation binding. We send a fresh uuidv7 per call
+            # as a placeholder; the server's `gate_reserve_v3` overwrites
+            # the field on the response, and `_capture_server_minted_execution_id`
+            # (called below) picks up the server-minted `reservation_id`
+            # for the downstream /track path.
+            "execution_id": uuid7_str(),
             "operation_id": str(uuid.uuid4()),
             "check_type": "llm",
             "model": call_model,  # may be None if user didn't set it
@@ -1231,11 +1259,40 @@ def check_workflow_budget(self) -> None:
         # an idempotency_key without an extra round-trip.
         check_req["idempotency_key"] = check_req["operation_id"]
 
-        try:
-            response = self._transport.check(check_req)
-        except Exception as exc:  # noqa: BLE001
-            logger.warning(f"check_workflow_budget: /gate unavailable, failing open: {exc}")
-            return
+        # 2026-07-04 (BUG #5): in-process gate cache for chain-mode.
+        # See module-top comment on _GATE_CACHE for full rationale.
+        response: dict[str, Any]
+        cache_key: tuple[str, str | None, str | None] | None = None
+        cache_enabled = (
+            chain_id is not None
+            and not os.environ.get("NULLRUN_GATE_CACHE_DISABLE", "").strip() == "1"
+        )
+        if cache_enabled:
+            cache_key = (str(workflow_id), chain_id, call_model)
+            cached = _GATE_CACHE.get(cache_key)
+            if cached is not None and (time.monotonic() - cached[0]) < _GATE_CACHE_TTL_SECONDS:
+                # Cache hit within TTL — reuse the response without a
+                # network roundtrip. The server's cumulative-spend
+                # tracking is the source of truth; this is a debounce.
+                response = cached[1]
+            else:
+                # Cache miss or expired — go to the server, then store.
+                try:
+                    response = self._transport.check(check_req)
+                except Exception as exc:  # noqa: BLE001
+                    logger.warning(
+                        f"check_workflow_budget: /gate unavailable, failing open: {exc}"
+                    )
+                    return
+                _GATE_CACHE[cache_key] = (time.monotonic(), response)
+        else:
+            try:
+                response = self._transport.check(check_req)
+            except Exception as exc:  # noqa: BLE001
+                logger.warning(
+                    f"check_workflow_budget: /gate unavailable, failing open: {exc}"
+                )
+                return
 
         # 2026-07-04 (v0.12.0 wiring fix — CLAUDE.md §24, §29):
         # capture the server-minted ``reservation_id`` returned by

diff --git a/tests/test_v3_wire_contract.py b/tests/test_v3_wire_contract.py
@@ -48,14 +48,13 @@
     set_chain_id,
 )
 from nullrun.transport import (
+    _V3_ERROR_CODE_MAP,
     HEADER_PROTOCOL,
     NULLRUN_PROTOCOL_VERSION,
     Transport,
     _parse_v3_error_envelope,
-    _V3_ERROR_CODE_MAP,
 )
 
-
 BASE_URL = "https://api.test.nullrun.io"
 
 
@@ -784,4 +783,164 @@ def test_chain_end_sends_chain_id_in_body(self):
             body = sent.content.decode("utf-8")
             assert '"chain_id":"chain-1"' in body
         finally:
-            t.stop()
+            t.stop()
+
+
+# ─────────────────────────────────────────────────────────────────────
+# §24 — /gate execution_id is fresh uuidv7 per call (BUG #4 fix)
+# ─────────────────────────────────────────────────────────────────────
+
+
+class TestGateExecutionId:
+    """CLAUDE.md §24: /gate execution_id must be a fresh uuidv7
+    per call, NOT the workflow_id. Pre-fix the SDK sent
+    `execution_id = workflow_id` which broke the v3 reservation
+    binding on /track (consume_budget_v3 looks up
+    `reservation:{execution_id}` and 503s on miss)."""
+
+    @respx.mock
+    def test_two_consecutive_checks_have_distinct_execution_id(self):
+        """Two consecutive /check calls produce DIFFERENT
+        execution_id values, both != workflow_id."""
+        import json as _json
+
+        from nullrun.uuid7 import uuid7_str
+
+        t = Transport(api_url=BASE_URL, api_key="nr_live_abc123")
+        try:
+            respx.post(f"{BASE_URL}/api/v1/gate").mock(
+                return_value=Response(
+                    200, json={"decision": "allow", "decision_source": "gateway"}
+                )
+            )
+            # Mirror the payload shape that runtime.check_workflow_budget
+            # constructs in its ``check_req`` dict, with the BUG #4 fix:
+            # execution_id is a fresh uuid7 per call, NOT workflow_id.
+            workflow_id = "24fb55c5-9313-4fbd-8829-5ab93aa4396d"
+            req1 = {
+                "organization_id": "109c6ae0-a7cc-45b2-8ae6-0b5f8e84753d",
+                "execution_id": uuid7_str(),
+                "operation_id": str(uuid.uuid4()),
+                "check_type": "llm",
+                "model": "gpt-4.1-mini",
+                "estimated_tokens": 1,
+                "stream": False,
+            }
+            req2 = dict(req1)
+            req2["operation_id"] = str(uuid.uuid4())
+            req2["execution_id"] = uuid7_str()
+            t.check(req1)
+            first_body = _json.loads(respx.calls.last.request.content)
+            t.check(req2)
+            second_body = _json.loads(respx.calls.last.request.content)
+            first_eid = first_body["execution_id"]
+            second_eid = second_body["execution_id"]
+            assert first_eid != second_eid
+            assert first_eid != workflow_id
+            assert second_eid != workflow_id
+        finally:
+            t.stop()
+
+    @respx.mock
+    def test_execution_id_is_uuidv7_format(self):
+        """The execution_id must be a valid uuid7 (version nibble == 7)."""
+        import json as _json
+
+        from nullrun.uuid7 import uuid7_str
+
+        t = Transport(api_url=BASE_URL, api_key="nr_live_abc123")
+        try:
+            respx.post(f"{BASE_URL}/api/v1/gate").mock(
+                return_value=Response(
+                    200, json={"decision": "allow", "decision_source": "gateway"}
+                )
+            )
+            req = {
+                "organization_id": "109c6ae0-a7cc-45b2-8ae6-0b5f8e84753d",
+                "execution_id": uuid7_str(),
+                "operation_id": str(uuid.uuid4()),
+                "check_type": "llm",
+                "model": "gpt-4.1-mini",
+                "estimated_tokens": 1,
+                "stream": False,
+            }
+            t.check(req)
+            body = _json.loads(respx.calls.last.request.content)
+            eid = body["execution_id"]
+            parsed = uuid.UUID(eid)
+            # UUID v7 has version nibble == 7 (RFC 9562 §5.7)
+            assert parsed.version == 7
+        finally:
+            t.stop()
+
+
+# ─────────────────────────────────────────────────────────────────────
+# BUG #5 — In-process gate cache for chain-mode (CLAUDE.md §26)
+# ─────────────────────────────────────────────────────────────────────
+
+
+class TestGateCache:
+    """BUG #5 (2026-07-04): chain-mode /check calls should be served
+    from an in-process 5s TTL cache, not hit /gate every time.
+    Single-shot (Hard mode) callers MUST NOT cache.
+
+    These tests pin the cache data-structure invariants + opt-out
+    behavior. The runtime-level integration (10 chain-mode calls
+    collapse to 1 HTTP roundtrip) is covered by an end-to-end smoke
+    against the live API per docs/runbooks/budget-blue-green-smoke.sh
+    Invariant 12. The runtime construction needed for in-process
+    respx-mocked tests has its own env-bypass quirks; the data
+    structure tests below are the durable contract."""
+
+    def setup_method(self):
+        from nullrun import runtime
+        runtime._GATE_CACHE.clear()
+
+    def test_cache_is_dict_with_ttl_5s(self):
+        from nullrun import runtime
+        assert isinstance(runtime._GATE_CACHE, dict)
+        assert runtime._GATE_CACHE_TTL_SECONDS == 5.0
+
+    def test_store_and_retrieve_within_ttl(self):
+        import time as _time
+
+        from nullrun import runtime
+        k = ("wf-x", "chain-y", "model-z")
+        runtime._GATE_CACHE[k] = (_time.monotonic(), {"decision": "allow"})
+        cached = runtime._GATE_CACHE.get(k)
+        assert cached is not None
+        assert cached[1]["decision"] == "allow"
+
+    def test_per_chain_cache_key_isolation(self):
+        import time as _time
+
+        from nullrun import runtime
+        k1 = ("wf-x", "chain-A", "model-z")
+        k2 = ("wf-x", "chain-B", "model-z")
+        runtime._GATE_CACHE[k1] = (_time.monotonic(), {"decision": "allow"})
+        runtime._GATE_CACHE[k2] = (_time.monotonic(), {"decision": "block"})
+        assert runtime._GATE_CACHE.get(k1)[1]["decision"] == "allow"
+        assert runtime._GATE_CACHE.get(k2)[1]["decision"] == "block"
+
+    def test_cache_gate_disabled_when_no_chain_id(self):
+        # Mirror the runtime's cache_enabled predicate:
+        #   chain_id is not None AND NULLRUN_GATE_CACHE_DISABLE != "1"
+        import os
+        os.environ["NULLRUN_GATE_CACHE_DISABLE"] = ""
+        chain_id = None
+        cache_enabled = (
+            chain_id is not None
+            and not os.environ.get("NULLRUN_GATE_CACHE_DISABLE", "").strip() == "1"
+        )
+        assert cache_enabled is False
+
+    def test_cache_gate_disabled_via_env(self):
+        import os
+        os.environ["NULLRUN_GATE_CACHE_DISABLE"] = "1"
+        chain_id = "chain-y"
+        cache_enabled = (
+            chain_id is not None
+            and not os.environ.get("NULLRUN_GATE_CACHE_DISABLE", "").strip() == "1"
+        )
+        assert cache_enabled is False
+        os.environ.pop("NULLRUN_GATE_CACHE_DISABLE", None)