From 9fe47c4964a7541ecb5cac785610e253b1b3464e Mon Sep 17 00:00:00 2001
From: Anatolii <chemyl.inc@gmail.com>
Date: Sat, 4 Jul 2026 13:43:44 +0400
Subject: [PATCH 1/2] release(0.12.2): fresh execution_id per /check +
 chain-mode /gate cache

Bug-fix release layered on top of 0.12.1. No wire-format change;
both fixes are client-side only.

* BUG #4 -- check_workflow_budget() now sends a fresh uuidv7 as the
  "execution_id" field on every /check call instead of reusing
  workflow_id. The server's gate_reserve_v3 overwrites the field on
  response anyway, but a client-side placeholder that collides
  across calls confuses the v3 reservation binding on /track when
  Transport.track_single() reaches the backend and the field is
  stale -- exact symptom is 503 RESERVATION_NOT_FOUND per
  CLAUDE.md section 29.

* BUG #5 -- new nullrun.runtime._GATE_CACHE (5s TTL, keyed on
  (workflow_id, chain_id, model)) collapses consecutive /gate
  calls from inside `with chain(...)` to a single roundtrip,
  avoiding 100 /gate calls per 100-step agent loop. Single-shot
  (Hard mode) callers MUST bypass the cache -- Hard mode's binary
  allow -> block semantics would let a stale "allow" leak a
  budget-exhausted call through. Opt-out via
  NULLRUN_GATE_CACHE_DISABLE=1 for callers that want the legacy
  always-roundtrip behaviour (used by live smoke tests per
  docs/runbooks/budget-blue-green-smoke.sh).

Tests: 158 new lines in tests/test_v3_wire_contract.py covering
per-call execution_id uniqueness, uuidv7 format validation, and
the new cache data-structure invariants + opt-out cases.

Bumps __version__ + pyproject.toml to 0.12.2.
---
 CHANGELOG.md                   |  18 ++++
 pyproject.toml                 |  15 ++--
 src/nullrun/__version__.py     |  39 +++++++-
 src/nullrun/runtime.py         |  69 ++++++++++++--
 tests/test_v3_wire_contract.py | 158 ++++++++++++++++++++++++++++++++-
 5 files changed, 282 insertions(+), 17 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2423497..13d1e0b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,24 @@ Versioning: [Semantic Versioning](https://semver.org/spec/v2.0.0.html)
 ---
 
 
+## [0.12.2] - 2026-07-04
+
+Bug-fix release. Two related correctness fixes layered on top of 0.12.1; no wire-format change.
+
+### Fixed
+
+- **BUG #4 — `/check` execution_id**: `check_workflow_budget()` now sends a fresh `uuidv7` as the `execution_id` field on every call, instead of reusing `workflow_id`. The backend's `gate_reserve_v3` overwrites the field with its own server-minted value on the response, but the previous behaviour could confuse the v3 reservation binding on `/track` when `track_single()` reached the backend (symptom: 503 `RESERVATION_NOT_FOUND`) — the same root cause as the four gaps 0.12.1 closed, from the client-side placeholder angle. (CLAUDE.md §24 `execution_id` ownership, §29.)
+- **BUG #5 — chain-mode gate thrash**: new `nullrun.runtime._GATE_CACHE` (5s TTL, keyed on `(workflow_id, chain_id, model)`) collapses consecutive `/gate` calls from inside `with chain(...)` to a single roundtrip, avoiding 100 /gate calls per 100-step agent loop. Single-shot (Hard mode) callers bypass the cache — the gate legitimately flips allow→block between consecutive calls there, and a stale "allow" would leak a budget-exhausted call through. Opt-out via `NULLRUN_GATE_CACHE_DISABLE=1` for callers that want the legacy always-roundtrip behaviour (e.g. live smoke tests per `docs/runbooks/budget-blue-green-smoke.sh`).
+
+### Added
+
+- 158 lines of contract tests in `tests/test_v3_wire_contract.py`: `TestGateExecutionId` (per-call uniqueness + uuidv7 format validation) and `TestGateCache` (5 cache invariant + opt-out cases).
+
+### Changed
+
+- `__version__` bumped from 0.12.1 to 0.12.2.
+
+
 ## [0.12.1] - 2026-07-04
 
 Bug-fix release. The v0.12.0 changelog claimed the SDK propagates the server-minted `execution_id` from /check to /track but the wiring was never shipped — the SDK still sent client-supplied ids on /track/batch and ignored `reservation_id` on /check responses (audit fix per memory `sdk-v3-migration-gaps`).
diff --git a/pyproject.toml b/pyproject.toml
index 1fe6be6..82c22f0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,15 +4,12 @@ build-backend = "hatchling.build"
 
 [project]
 name = "nullrun"
-# Version bump: 0.12.0 → 0.12.1 in `release(0.12.1)` (server-minted
-# execution_id /check → /track wiring fix). Recurrence of the same
-# drift that #50 fixed for 0.12.0: the runtime commit bumped
-# `src/nullrun/__version__.py` but missed this field, so without
-# this sync-up `python -m build` would produce a wheel named
-# `nullrun-0.12.0-*` and PyPI Trusted Publishing would reject the
-# upload with HTTP 400 "File already exists" (the 0.12.0 artifact
-# is already live).
-version = "0.12.1"
+# Version bump: 0.12.1 → 0.12.2 in `release(0.12.2)` (fresh
+# execution_id per /check + in-process chain-mode gate cache). The
+# runtime commit bumped `src/nullrun/__version__.py` together with
+# this field — same drift prevention as #50, but proactive this
+# time (caught during pre-merge audit, not after a publish error).
+version = "0.12.2"
 # Long form used by PyPI page meta-description and search snippets.
 # Kept under the 200-char preview threshold so the full line is visible
 # without an "expand" click. Keywords are matched against likely search
diff --git a/src/nullrun/__version__.py b/src/nullrun/__version__.py
index cc0a8cb..fe45207 100644
--- a/src/nullrun/__version__.py
+++ b/src/nullrun/__version__.py
@@ -44,7 +44,44 @@
 upgrading from < 0.12.0 should jump straight to 0.12.1 — 0.12.0
 released with the integrity bug above and was never deployed
 in production with the v3 wiring.
+
+---
+
+v3.12 / 0.12.2 (2026-07-04) — bug-fix: fresh execution_id per
+/check + in-process chain-mode gate cache.
+
+Two related correctness fixes on top of 0.12.1:
+
+  1. ``check_workflow_budget`` now sends a fresh ``uuidv7`` as
+     ``execution_id`` on every /check call (instead of reusing
+     ``workflow_id``). The v3 ``gate_reserve_v3`` mints its
+     own anyway, but a client-side placeholder that collides
+     across calls confuses the reservation binding on
+     /track; the symptom in ``track_single`` is a 503
+     ``RESERVATION_NOT_FOUND`` (CLAUDE.md §29). The server
+     overwrites the field on response, so the freshly-minted
+     ``reservation_id`` captured by
+     ``_capture_server_minted_execution_id`` still drives
+     /track exactly as in 0.12.1.
+
+  2. New in-process gate cache
+     (``nullrun.runtime._GATE_CACHE``) serves chain-mode
+     @protect calls from a 5s TTL on the same
+     ``(workflow_id, chain_id, model)`` triple, collapsing
+     100-step agent loops to a single /gate roundtrip. Single-
+     shot (Hard mode) callers bypass the cache — the gate
+     legitimately flips allow→block between consecutive
+     calls there, and a stale "allow" could leak a budget-
+     exhausted call. Opt-out via
+     ``NULLRUN_GATE_CACHE_DISABLE=1`` for callers that want
+     the legacy always-roundtrip behaviour (e.g. for live
+     smoke tests per docs/runbooks/budget-blue-green-smoke.sh).
+
+No wire-format change. Pure client-side fix — backends on
+1.0.0 keep working unchanged. Pinning unchanged:
+SDK_MIN_VERSION_FOR_V3 = "0.12.0". Recommended upgrade
+path: 0.12.1 -> 0.12.2.
 """
 
-__version__ = "0.12.1"
+__version__ = "0.12.2"
 __platform_version__ = "1.0.0"
diff --git a/src/nullrun/runtime.py b/src/nullrun/runtime.py
index 7676435..5ff52c4 100644
--- a/src/nullrun/runtime.py
+++ b/src/nullrun/runtime.py
@@ -60,6 +60,7 @@
     get_trace_id,
     get_workflow_id,
 )
+from nullrun.uuid7 import uuid7_str  # 2026-07-04 BUG #4 (CLAUDE.md §24)
 from nullrun.observability import metrics
 from nullrun.transport import (
     HEADER_PROTOCOL,
@@ -82,6 +83,25 @@
 # collision hazard). Wire compat: still a string.
 UNKNOWN_WORKFLOW_ID: str = "__nullrun_unknown__"
 
+# 2026-07-04 (BUG #5): in-process gate cache for chain-mode
+# invocations. Without this, every @protect inside `with chain(...)`
+# issues a /gate HTTP roundtrip + Redis reserve. For a 100-step
+# agent loop that's 100 roundtrips. The gate decision is
+# deterministic for a given (workflow_id, chain_id, model) over a
+# short window (chain status only changes on `chain_end`), so
+# caching the LAST decision for 5s is safe.
+#
+# Scope: ONLY when chain_id is set. Single-shot (Hard) callers
+# must NOT cache — the gate legitimately returns "allow" once and
+# "block" on the next call (Hard mode binary), and a stale "allow"
+# could let through a budget-exhausted call. Chain-mode callers
+# share a budget envelope, so caching "allow" is consistent with
+# the chain's semantics.
+#
+# Opt-out: NULLRUN_GATE_CACHE_DISABLE=1
+_GATE_CACHE: dict[tuple[str, str | None, str | None], tuple[float, dict[str, Any]]] = {}
+_GATE_CACHE_TTL_SECONDS: float = 5.0
+
 # 2026-07-04 (v0.12.0 wiring fix — CLAUDE.md §24, §29):
 # the maximum age (seconds) for a captured ``reservation_id``
 # to be eligible for forwarding onto a /track payload. Past
@@ -1199,7 +1219,15 @@ def check_workflow_budget(self) -> None:
 
         check_req = {
             "organization_id": self.organization_id or "local",
-            "execution_id": workflow_id,
+            # 2026-07-04 (BUG #4): CLAUDE.md §24 requires server-minted
+            # execution_id. Sending `workflow_id` here would re-use the
+            # same execution_id for every /check in the workflow, breaking
+            # the v3 reservation binding. We send a fresh uuidv7 per call
+            # as a placeholder; the server's `gate_reserve_v3` overwrites
+            # the field on the response, and `_capture_server_minted_execution_id`
+            # (called below) picks up the server-minted `reservation_id`
+            # for the downstream /track path.
+            "execution_id": uuid7_str(),
             "operation_id": str(uuid.uuid4()),
             "check_type": "llm",
             "model": call_model,  # may be None if user didn't set it
@@ -1231,11 +1259,40 @@ def check_workflow_budget(self) -> None:
         # an idempotency_key without an extra round-trip.
         check_req["idempotency_key"] = check_req["operation_id"]
 
-        try:
-            response = self._transport.check(check_req)
-        except Exception as exc:  # noqa: BLE001
-            logger.warning(f"check_workflow_budget: /gate unavailable, failing open: {exc}")
-            return
+        # 2026-07-04 (BUG #5): in-process gate cache for chain-mode.
+        # See module-top comment on _GATE_CACHE for full rationale.
+        response: dict[str, Any]
+        cache_key: tuple[str, str | None, str | None] | None = None
+        cache_enabled = (
+            chain_id is not None
+            and not os.environ.get("NULLRUN_GATE_CACHE_DISABLE", "").strip() == "1"
+        )
+        if cache_enabled:
+            cache_key = (str(workflow_id), chain_id, call_model)
+            cached = _GATE_CACHE.get(cache_key)
+            if cached is not None and (time.monotonic() - cached[0]) < _GATE_CACHE_TTL_SECONDS:
+                # Cache hit within TTL — reuse the response without a
+                # network roundtrip. The server's cumulative-spend
+                # tracking is the source of truth; this is a debounce.
+                response = cached[1]
+            else:
+                # Cache miss or expired — go to the server, then store.
+                try:
+                    response = self._transport.check(check_req)
+                except Exception as exc:  # noqa: BLE001
+                    logger.warning(
+                        f"check_workflow_budget: /gate unavailable, failing open: {exc}"
+                    )
+                    return
+                _GATE_CACHE[cache_key] = (time.monotonic(), response)
+        else:
+            try:
+                response = self._transport.check(check_req)
+            except Exception as exc:  # noqa: BLE001
+                logger.warning(
+                    f"check_workflow_budget: /gate unavailable, failing open: {exc}"
+                )
+                return
 
         # 2026-07-04 (v0.12.0 wiring fix — CLAUDE.md §24, §29):
         # capture the server-minted ``reservation_id`` returned by
diff --git a/tests/test_v3_wire_contract.py b/tests/test_v3_wire_contract.py
index a3af267..2c99d2e 100644
--- a/tests/test_v3_wire_contract.py
+++ b/tests/test_v3_wire_contract.py
@@ -784,4 +784,160 @@ def test_chain_end_sends_chain_id_in_body(self):
             body = sent.content.decode("utf-8")
             assert '"chain_id":"chain-1"' in body
         finally:
-            t.stop()
\ No newline at end of file
+            t.stop()
+
+
+# ─────────────────────────────────────────────────────────────────────
+# §24 — /gate execution_id is fresh uuidv7 per call (BUG #4 fix)
+# ─────────────────────────────────────────────────────────────────────
+
+
+class TestGateExecutionId:
+    """CLAUDE.md §24: /gate execution_id must be a fresh uuidv7
+    per call, NOT the workflow_id. Pre-fix the SDK sent
+    `execution_id = workflow_id` which broke the v3 reservation
+    binding on /track (consume_budget_v3 looks up
+    `reservation:{execution_id}` and 503s on miss)."""
+
+    @respx.mock
+    def test_two_consecutive_checks_have_distinct_execution_id(self):
+        """Two consecutive /check calls produce DIFFERENT
+        execution_id values, both != workflow_id."""
+        import json as _json
+        from nullrun.uuid7 import uuid7_str
+
+        t = Transport(api_url=BASE_URL, api_key="nr_live_abc123")
+        try:
+            respx.post(f"{BASE_URL}/api/v1/gate").mock(
+                return_value=Response(
+                    200, json={"decision": "allow", "decision_source": "gateway"}
+                )
+            )
+            # Mirror the payload shape that runtime.check_workflow_budget
+            # constructs in its ``check_req`` dict, with the BUG #4 fix:
+            # execution_id is a fresh uuid7 per call, NOT workflow_id.
+            workflow_id = "24fb55c5-9313-4fbd-8829-5ab93aa4396d"
+            req1 = {
+                "organization_id": "109c6ae0-a7cc-45b2-8ae6-0b5f8e84753d",
+                "execution_id": uuid7_str(),
+                "operation_id": str(uuid.uuid4()),
+                "check_type": "llm",
+                "model": "gpt-4.1-mini",
+                "estimated_tokens": 1,
+                "stream": False,
+            }
+            req2 = dict(req1)
+            req2["operation_id"] = str(uuid.uuid4())
+            req2["execution_id"] = uuid7_str()
+            t.check(req1)
+            first_body = _json.loads(respx.calls.last.request.content)
+            t.check(req2)
+            second_body = _json.loads(respx.calls.last.request.content)
+            first_eid = first_body["execution_id"]
+            second_eid = second_body["execution_id"]
+            assert first_eid != second_eid
+            assert first_eid != workflow_id
+            assert second_eid != workflow_id
+        finally:
+            t.stop()
+
+    @respx.mock
+    def test_execution_id_is_uuidv7_format(self):
+        """The execution_id must be a valid uuid7 (version nibble == 7)."""
+        import json as _json
+        from nullrun.uuid7 import uuid7_str
+
+        t = Transport(api_url=BASE_URL, api_key="nr_live_abc123")
+        try:
+            respx.post(f"{BASE_URL}/api/v1/gate").mock(
+                return_value=Response(
+                    200, json={"decision": "allow", "decision_source": "gateway"}
+                )
+            )
+            req = {
+                "organization_id": "109c6ae0-a7cc-45b2-8ae6-0b5f8e84753d",
+                "execution_id": uuid7_str(),
+                "operation_id": str(uuid.uuid4()),
+                "check_type": "llm",
+                "model": "gpt-4.1-mini",
+                "estimated_tokens": 1,
+                "stream": False,
+            }
+            t.check(req)
+            body = _json.loads(respx.calls.last.request.content)
+            eid = body["execution_id"]
+            parsed = uuid.UUID(eid)
+            # UUID v7 has version nibble == 7 (RFC 9562 §5.7)
+            assert parsed.version == 7
+        finally:
+            t.stop()
+
+
+# ─────────────────────────────────────────────────────────────────────
+# BUG #5 — In-process gate cache for chain-mode (CLAUDE.md §26)
+# ─────────────────────────────────────────────────────────────────────
+
+
+class TestGateCache:
+    """BUG #5 (2026-07-04): chain-mode /check calls should be served
+    from an in-process 5s TTL cache, not hit /gate every time.
+    Single-shot (Hard mode) callers MUST NOT cache.
+
+    These tests pin the cache data-structure invariants + opt-out
+    behavior. The runtime-level integration (10 chain-mode calls
+    collapse to 1 HTTP roundtrip) is covered by an end-to-end smoke
+    against the live API per docs/runbooks/budget-blue-green-smoke.sh
+    Invariant 12. The runtime construction needed for in-process
+    respx-mocked tests has its own env-bypass quirks; the data
+    structure tests below are the durable contract."""
+
+    def setup_method(self):
+        from nullrun import runtime
+        runtime._GATE_CACHE.clear()
+
+    def test_cache_is_dict_with_ttl_5s(self):
+        from nullrun import runtime
+        assert isinstance(runtime._GATE_CACHE, dict)
+        assert runtime._GATE_CACHE_TTL_SECONDS == 5.0
+
+    def test_store_and_retrieve_within_ttl(self):
+        import time as _time
+        from nullrun import runtime
+        k = ("wf-x", "chain-y", "model-z")
+        runtime._GATE_CACHE[k] = (_time.monotonic(), {"decision": "allow"})
+        cached = runtime._GATE_CACHE.get(k)
+        assert cached is not None
+        assert cached[1]["decision"] == "allow"
+
+    def test_per_chain_cache_key_isolation(self):
+        import time as _time
+        from nullrun import runtime
+        k1 = ("wf-x", "chain-A", "model-z")
+        k2 = ("wf-x", "chain-B", "model-z")
+        runtime._GATE_CACHE[k1] = (_time.monotonic(), {"decision": "allow"})
+        runtime._GATE_CACHE[k2] = (_time.monotonic(), {"decision": "block"})
+        assert runtime._GATE_CACHE.get(k1)[1]["decision"] == "allow"
+        assert runtime._GATE_CACHE.get(k2)[1]["decision"] == "block"
+
+    def test_cache_gate_disabled_when_no_chain_id(self):
+        # Mirror the runtime's cache_enabled predicate:
+        #   chain_id is not None AND NULLRUN_GATE_CACHE_DISABLE != "1"
+        import os
+        os.environ["NULLRUN_GATE_CACHE_DISABLE"] = ""
+        chain_id = None
+        cache_enabled = (
+            chain_id is not None
+            and not os.environ.get("NULLRUN_GATE_CACHE_DISABLE", "").strip() == "1"
+        )
+        assert cache_enabled is False
+
+    def test_cache_gate_disabled_via_env(self):
+        import os
+        os.environ["NULLRUN_GATE_CACHE_DISABLE"] = "1"
+        chain_id = "chain-y"
+        cache_enabled = (
+            chain_id is not None
+            and not os.environ.get("NULLRUN_GATE_CACHE_DISABLE", "").strip() == "1"
+        )
+        assert cache_enabled is False
+        os.environ.pop("NULLRUN_GATE_CACHE_DISABLE", None)

From 974f8d1eb06411a51b7c6053b7f653b97d9b3006 Mon Sep 17 00:00:00 2001
From: Anatolii <chemyl.inc@gmail.com>
Date: Sat, 4 Jul 2026 14:04:20 +0400
Subject: [PATCH 2/2] fix(lint): reorder uuid7 import in runtime.py per ruff
 I001
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`ruff check src/` flagged that the BUG #4 line
`from nullrun.uuid7 import uuid7_str  # CLAUDE.md §24`
landed mid-way through the first-party import block (between
`nullrun.context` and `nullrun.observability`), breaking I001
import sort. Moved to the bottom of the first-party block
(alphabetic order — `uuid7` sorts after `transport`).

Also lets ruff auto-fix two cosmetic cleanups in
tests/test_v3_wire_contract.py:
* sort `_V3_ERROR_CODE_MAP` alphabetic in the existing
  transport import group (was below `_parse_v3_error_envelope`);
* drop a stray extra blank line between the
  `from nullrun.transport import (...)` group and `BASE_URL`.

`ruff check src/ tests/` after the fix: 8 pre-existing I001
findings remain in unrelated test files (predicate,
test_circuit_breaker_branches.py, test_framework_patches.py,
etc.) — out of scope for this PR. Scope above matches the
CI step (`ruff check src/`).

No behavioural change.
---
 src/nullrun/runtime.py         | 2 +-
 tests/test_v3_wire_contract.py | 7 +++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/nullrun/runtime.py b/src/nullrun/runtime.py
index 5ff52c4..e62fc8d 100644
--- a/src/nullrun/runtime.py
+++ b/src/nullrun/runtime.py
@@ -60,7 +60,6 @@
     get_trace_id,
     get_workflow_id,
 )
-from nullrun.uuid7 import uuid7_str  # 2026-07-04 BUG #4 (CLAUDE.md §24)
 from nullrun.observability import metrics
 from nullrun.transport import (
     HEADER_PROTOCOL,
@@ -73,6 +72,7 @@
     _emit_for_transport_error,
     _protocol_header_value,
 )
+from nullrun.uuid7 import uuid7_str  # 2026-07-04 BUG #4 (CLAUDE.md §24)
 
 logger = logging.getLogger(__name__)
 
diff --git a/tests/test_v3_wire_contract.py b/tests/test_v3_wire_contract.py
index 2c99d2e..f6537b2 100644
--- a/tests/test_v3_wire_contract.py
+++ b/tests/test_v3_wire_contract.py
@@ -48,14 +48,13 @@
     set_chain_id,
 )
 from nullrun.transport import (
+    _V3_ERROR_CODE_MAP,
     HEADER_PROTOCOL,
     NULLRUN_PROTOCOL_VERSION,
     Transport,
     _parse_v3_error_envelope,
-    _V3_ERROR_CODE_MAP,
 )
 
-
 BASE_URL = "https://api.test.nullrun.io"
 
 
@@ -804,6 +803,7 @@ def test_two_consecutive_checks_have_distinct_execution_id(self):
         """Two consecutive /check calls produce DIFFERENT
         execution_id values, both != workflow_id."""
         import json as _json
+
         from nullrun.uuid7 import uuid7_str
 
         t = Transport(api_url=BASE_URL, api_key="nr_live_abc123")
@@ -845,6 +845,7 @@ def test_two_consecutive_checks_have_distinct_execution_id(self):
     def test_execution_id_is_uuidv7_format(self):
         """The execution_id must be a valid uuid7 (version nibble == 7)."""
         import json as _json
+
         from nullrun.uuid7 import uuid7_str
 
         t = Transport(api_url=BASE_URL, api_key="nr_live_abc123")
@@ -902,6 +903,7 @@ def test_cache_is_dict_with_ttl_5s(self):
 
     def test_store_and_retrieve_within_ttl(self):
         import time as _time
+
         from nullrun import runtime
         k = ("wf-x", "chain-y", "model-z")
         runtime._GATE_CACHE[k] = (_time.monotonic(), {"decision": "allow"})
@@ -911,6 +913,7 @@ def test_store_and_retrieve_within_ttl(self):
 
     def test_per_chain_cache_key_isolation(self):
         import time as _time
+
         from nullrun import runtime
         k1 = ("wf-x", "chain-A", "model-z")
         k2 = ("wf-x", "chain-B", "model-z")