From 3f9b447d11ee2da8b6f46c45e8e02c27ce8a861e Mon Sep 17 00:00:00 2001 From: Andy Staples Date: Tue, 19 May 2026 15:27:04 -0600 Subject: [PATCH 1/9] History export extension - V1 --- CHANGELOG.md | 25 + docs/features.md | 156 +++++ .../plan-orchestrationHistoryExport.prompt.md | 438 ++++++++++++++ .../extensions/history_export/__init__.py | 80 +++ .../extensions/history_export/_constants.py | 34 ++ .../extensions/history_export/_logging.py | 19 + .../extensions/history_export/activities.py | 204 +++++++ .../extensions/history_export/azure_blob.py | 152 +++++ .../extensions/history_export/client.py | 363 ++++++++++++ .../extensions/history_export/entity.py | 287 +++++++++ .../extensions/history_export/exceptions.py | 57 ++ .../extensions/history_export/models.py | 543 ++++++++++++++++++ .../extensions/history_export/orchestrator.py | 306 ++++++++++ .../history_export/serialization.py | 181 ++++++ .../extensions/history_export/transitions.py | 84 +++ .../extensions/history_export/writer.py | 95 +++ examples/history_export/README.md | 38 ++ examples/history_export/app.py | 126 ++++ pyproject.toml | 3 + tests/durabletask/extensions/__init__.py | 0 .../extensions/history_export/__init__.py | 0 .../history_export/_test_helpers.py | 52 ++ .../history_export/test_activities.py | 232 ++++++++ .../test_azure_blob_writer_e2e.py | 144 +++++ .../extensions/history_export/test_client.py | 298 ++++++++++ .../extensions/history_export/test_entity.py | 289 ++++++++++ .../extensions/history_export/test_models.py | 209 +++++++ .../history_export/test_orchestrator.py | 226 ++++++++ .../history_export/test_serialization.py | 185 ++++++ .../test_transitions_and_exceptions.py | 133 +++++ 30 files changed, 4959 insertions(+) create mode 100644 docs/plan-orchestrationHistoryExport.prompt.md create mode 100644 durabletask/extensions/history_export/__init__.py create mode 100644 durabletask/extensions/history_export/_constants.py create mode 100644 
durabletask/extensions/history_export/_logging.py create mode 100644 durabletask/extensions/history_export/activities.py create mode 100644 durabletask/extensions/history_export/azure_blob.py create mode 100644 durabletask/extensions/history_export/client.py create mode 100644 durabletask/extensions/history_export/entity.py create mode 100644 durabletask/extensions/history_export/exceptions.py create mode 100644 durabletask/extensions/history_export/models.py create mode 100644 durabletask/extensions/history_export/orchestrator.py create mode 100644 durabletask/extensions/history_export/serialization.py create mode 100644 durabletask/extensions/history_export/transitions.py create mode 100644 durabletask/extensions/history_export/writer.py create mode 100644 examples/history_export/README.md create mode 100644 examples/history_export/app.py create mode 100644 tests/durabletask/extensions/__init__.py create mode 100644 tests/durabletask/extensions/history_export/__init__.py create mode 100644 tests/durabletask/extensions/history_export/_test_helpers.py create mode 100644 tests/durabletask/extensions/history_export/test_activities.py create mode 100644 tests/durabletask/extensions/history_export/test_azure_blob_writer_e2e.py create mode 100644 tests/durabletask/extensions/history_export/test_client.py create mode 100644 tests/durabletask/extensions/history_export/test_entity.py create mode 100644 tests/durabletask/extensions/history_export/test_models.py create mode 100644 tests/durabletask/extensions/history_export/test_orchestrator.py create mode 100644 tests/durabletask/extensions/history_export/test_serialization.py create mode 100644 tests/durabletask/extensions/history_export/test_transitions_and_exceptions.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d1c2f5e..f3e3f2ce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,31 @@ adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
ADDED +- Added `durabletask.extensions.history_export` for exporting the event history of + terminal orchestrations to an external destination. Includes + `ExportHistoryClient`, a per-job `ExportHistoryJobClient` returned by + `get_job_client(...)`, and `list_jobs(...)` for enumerating jobs by status + or last-modified window. Ships with a bundled `AzureBlobHistoryExportWriter` + (installed with `pip install durabletask[history-export-azure]`) and a + `HistoryWriter` protocol for plugging in custom destinations. Supports both + `ExportMode.BATCH` (export a window and complete) and `ExportMode.CONTINUOUS` + (tail terminal instances indefinitely until stopped via `delete_job`). + Exported blobs are self-describing: each blob carries an explicit + `schema_version`, the orchestration's `OrchestrationState` metadata, and + the full ordered event list. The export workflow retries each instance up + to 3 times with exponential backoff (15s/30s/60s), retries failed batches + up to 3 times, caps in-flight exports via `max_parallel_exports` + (default 32), continues-as-new every 5 page cycles to bound orchestrator + history, and re-fetches entity state at the top of every page loop so + external delete or mark-failed signals stop the orchestrator cleanly. + Job state lives in a durable entity with an explicit state-transition + matrix (PENDING / ACTIVE / COMPLETED / FAILED); invalid transitions raise + `ExportJobInvalidTransitionError`. Persisted entity state uses a + versioned, schema-stable JSON shape (`STATE_SCHEMA_VERSION`) with no + embedded Python type metadata. Each export job's driving orchestrator + uses a deterministic instance ID (`export-job-{job_id}`, exposed via + `orchestrator_instance_id_for(...)`) so callers can correlate a job ID + with its orchestrator for logging, monitoring, and restart. 
- Added `ReplaySafeLogger` and `OrchestrationContext.create_replay_safe_logger()` for suppressing duplicate log messages during orchestrator replay - Added `GrpcChannelOptions` and `GrpcRetryPolicyOptions` for configuring diff --git a/docs/features.md b/docs/features.md index d23b7c75..6c8f837b 100644 --- a/docs/features.md +++ b/docs/features.md @@ -412,6 +412,162 @@ class MyPayloadStore(PayloadStore): See the [large payload example](../examples/large_payload/) for a complete working sample. +### Orchestration history export + +The optional `durabletask.extensions.history_export` package provides a workflow for exporting the +full event history of terminal orchestrations to an external destination (for example Azure Blob +Storage). It is modeled after the .NET SDK's `ExportHistory` package. + +An export job scans a time window of terminal instances, fetches each instance's history through +the standard client API, serializes it, and writes it through a pluggable `HistoryWriter`. Job +state is owned by a durable entity so progress survives worker restarts. + +#### Installation + +The core extension has no extra dependencies beyond the SDK. 
The bundled Azure Blob writer requires +an optional dependency: + +```bash +pip install durabletask[history-export-azure] +``` + +#### Configuring an export job + +```python +from datetime import datetime, timedelta, timezone + +from durabletask import client, worker +from durabletask.extensions.history_export import ( + ExportDestination, + ExportFormat, + ExportFormatKind, + ExportHistoryClient, + ExportJobCreationOptions, + ExportMode, +) +from durabletask.extensions.history_export.azure_blob import ( + AzureBlobHistoryExportWriter, + AzureBlobHistoryExportWriterOptions, +) + +writer = AzureBlobHistoryExportWriter( + AzureBlobHistoryExportWriterOptions( + container_name="orchestration-history", + connection_string="DefaultEndpointsProtocol=https;...", + ) +) +dt_client = client.TaskHubGrpcClient(host_address="localhost:4001") +export_client = ExportHistoryClient(dt_client, writer) + +with worker.TaskHubGrpcWorker(host_address="localhost:4001") as w: + export_client.register_worker(w) + w.start() + + now = datetime.now(timezone.utc) + desc = export_client.create_job(ExportJobCreationOptions( + mode=ExportMode.BATCH, + completed_time_from=now - timedelta(days=1), + completed_time_to=now, + destination=ExportDestination(container="orchestration-history", prefix="2026-05"), + format=ExportFormat(kind=ExportFormatKind.JSONL_GZIP), + max_instances_per_batch=100, + )) + final = export_client.wait_for_job(desc.job_id, timeout=600) + print(final.status, final.exported_instances, final.failed_instances) +``` + +#### Output formats + +| `ExportFormatKind` | Per-instance blob extension | Content-Type | Content-Encoding | +|---|---|---|---| +| `JSON` | `.json` | `application/json` | (none) | +| `JSONL_GZIP` | `.jsonl.gz` | `application/x-ndjson` | `gzip` | + +The JSONL format prepends a metadata line and writes one event per line, which streams well for +large histories. 
+ +#### Modes + +Two `ExportMode` values are supported: + +- `BATCH` exports a fixed time window (`completed_time_from` .. `completed_time_to`) and then + marks the job `Completed`. This is the default and is appropriate for one-off backfills. +- `CONTINUOUS` tails terminal instances indefinitely, sleeping between empty pages. The job + has no natural completion; stop it by calling `export_client.delete_job(job_id)` (or signalling + `mark_failed`). The orchestrator re-reads entity state at the top of each page loop, so the + next iteration after the delete observes the missing entity and exits cleanly. + +#### Listing and managing jobs + +Use `list_jobs(ExportJobQuery(...))` to enumerate existing jobs, optionally filtered by status +or last-modified window: + +```python +from durabletask.extensions.history_export import ExportJobQuery, ExportJobStatus + +for desc in export_client.list_jobs( + ExportJobQuery(status=[ExportJobStatus.FAILED]) +): + print(desc.job_id, desc.last_error) +``` + +Use `get_job_client(job_id)` for a per-job convenience wrapper that exposes `describe()`, +`wait(timeout=...)`, and `delete()` directly: + +```python +job_client = export_client.get_job_client(desc.job_id) +final = job_client.wait(timeout=600) +print(final.status.value, final.exported_instances) +job_client.delete() +``` + +#### Custom destinations + +The Azure Blob writer is one implementation of the +`HistoryWriter` extension point. 
Implement the protocol (no +inheritance required — it's a `typing.Protocol`) to send exports to +any destination (S3, GCS, SFTP, local filesystem, a database, etc.): + +```python +from typing import Optional + +from durabletask.extensions.history_export import HistoryWriter + + +class LocalFileSystemHistoryWriter: + def __init__(self, root_dir: str) -> None: + self._root = root_dir + + def write( + self, + *, + instance_id: str, + blob_name: str, + payload: bytes, + content_type: str, + content_encoding: Optional[str], + ) -> None: + import os + path = os.path.join(self._root, blob_name) + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, "wb") as fp: + fp.write(payload) + + +export_client = ExportHistoryClient( + dt_client, LocalFileSystemHistoryWriter("/var/exports") +) +``` + +> [!TIP] +> The bundled `AzureBlobHistoryExportWriter` lives in the optional +> `durabletask.extensions.history_export.azure_blob` submodule and +> requires `pip install durabletask[history-export-azure]`. The +> core history-export package has no third-party runtime +> dependencies — only the bundled destination does. Future +> first-party destinations (S3, GCS, etc.) will be packaged as +> additional optional extras using the same pattern. 
+ ### Logging configuration Both the TaskHubGrpcWorker and TaskHubGrpcClient (as well as DurableTaskSchedulerWorker and diff --git a/docs/plan-orchestrationHistoryExport.prompt.md b/docs/plan-orchestrationHistoryExport.prompt.md new file mode 100644 index 00000000..971b0d6e --- /dev/null +++ b/docs/plan-orchestrationHistoryExport.prompt.md @@ -0,0 +1,438 @@ +# Plan: Python History Export Parity + +Add orchestration history export to durabletask-python in two layers: first expose the existing sidecar capabilities that are already present in protobuf but missing from the Python client and test backend; then build a higher-level export job workflow modeled on durabletask-dotnet ExportHistory, with Azure Blob as the first destination via an optional extension. This keeps the core SDK transport-focused while still achieving full feature parity. + +## Target Scope + +**Full parity path**: core retrieval/list/rewind APIs plus a higher-level job-based export workflow modeled on the .NET implementation. + +**Recommended packaging split**: Core SDK owns retrieval/list/rewind and generic history serialization helpers; Azure Blob export workflow lives behind optional dependencies in an extension-style module. + +**Initial destination scope**: Azure Blob only, matching durabletask-dotnet's current export package. Do not generalize destination providers until a second real provider exists. + +**Initial format scope**: JSON and JSONL gzip with explicit schema versioning. Defer CSV, Parquet, and import/replay-from-export features. + +## Phase 1: Core History Foundations + +### 1.1 Add Core Client APIs +**Files**: [durabletask/client.py](durabletask/client.py#L213) and [durabletask/client.py](durabletask/client.py#L428) + +Add sync and async methods to both `TaskHubGrpcClient` and `AsyncTaskHubGrpcClient`: +- `get_orchestration_history(instance_id) -> Iterable[HistoryEvent]` — stream all HistoryEvent messages for an instance. 
+- `list_instance_ids(runtime_status, completed_time_from, completed_time_to, page_size) -> Page[List[str]]` — paginate terminal instance IDs by completion-time window and status filter. +- `rewind_orchestration(instance_id, reason) -> None` — rewind a failed orchestration (if backend supports). + +Implementation should: +- Reuse the existing gRPC stubs already declared in [durabletask/internal/orchestrator_service_pb2_grpc.py](durabletask/internal/orchestrator_service_pb2_grpc.py). +- Handle gRPC error codes (NOT_FOUND, UNIMPLEMENTED, CANCELLED, INTERNAL) and map to Python exceptions. +- For streamed history, aggregate HistoryChunk messages and yield or return individual HistoryEvent objects. +- De-externalize nested payload tokens if a payload_store is configured (reuse logic from [durabletask/payload/helpers.py](durabletask/payload/helpers.py)). +- Log operations consistently with existing client methods. + +### 1.2 Implement In-Memory Backend History Support +**Files**: [durabletask/testing/in_memory_backend.py](durabletask/testing/in_memory_backend.py#L1086) + +Implement two currently-stubbed gRPC servicer methods: +- `StreamInstanceHistory(request: StreamInstanceHistoryRequest, context)` — yield HistoryChunk messages containing events from the instance's history list, paginating or chunking as needed. +- `ListInstanceIds(request: ListInstanceIdsRequest, context)` — iterate stored instances, filter by terminal status and completion-time window, and return a paginated response with continuation token. + +Optionally decide whether to implement `RewindInstance`: +- **Recommendation**: Mark as explicitly unsupported initially (abort with UNIMPLEMENTED); add later if demand is high or if rewound instances are needed for tests. +- If implemented, reset the instance's history to exclude the failed events and restart with a new execution ID. 
+ +### 1.3 Add History Helper Utilities +**Files**: New module `durabletask/internal/history_helpers.py` or extend [durabletask/payload/helpers.py](durabletask/payload/helpers.py) + +Provide internal helpers for: +- **Payload de-externalization in history**: walk nested HistoryEvent fields and replace payload tokens with original data if a store is configured. +- **Event-to-dict conversion**: convert a HistoryEvent protobuf to a serializable dict for JSON export (used later). +- **Event filtering**: filter a list of HistoryEvent by type, timestamp range, or other criteria (optional; can be deferred to export layer). + +### 1.4 Settle on Public History Return Type +**Decision Point** + +Options: +1. **Raw protobuf** (recommended for Phase 1): Return `Iterable[pb.HistoryEvent]` to callers. Low risk of churn, users who need export can call helper utilities. Matches .NET baseline. +2. **Python dataclass wrapper** (higher initial investment): Define a `HistoryEventData` class and convert all HistoryEvent messages to it. Better UX but requires more upfront design. +3. **Both** (post-Phase 1): Start with raw protobuf; add a Python wrapper class in Phase 2 if export code needs the conversion anyway. + +**Recommendation**: Start with raw protobuf. Keep the public API minimal and transport-focused. Add serialization helpers (internal) that the export layer can use. + +### 1.5 Update Tests +**Files**: [tests/durabletask/test_client.py](tests/durabletask/test_client.py), [tests/durabletask/test_orchestration_executor.py](tests/durabletask/test_orchestration_executor.py), new test file for history retrieval + +Add tests for: +- **Client API tests**: Verify `get_orchestration_history()`, `list_instance_ids()` make correct gRPC requests, handle streaming/pagination, de-externalize payloads (reuse FakePayloadStore from large_payload tests), and map errors. 
+- **Backend tests**: Verify in-memory history streaming returns events in order, ListInstanceIds paginates correctly by status/time, and continuation tokens work. +- **Error handling**: Verify NOT_FOUND, UNIMPLEMENTED, CANCELLED, INTERNAL errors are mapped appropriately. + +### 1.6 Update Core Changelog +**Files**: [CHANGELOG.md](CHANGELOG.md#L7) + +Under `## Unreleased`, add: +``` +ADDED + +- Added `get_orchestration_history(instance_id)` and async variant to both gRPC client classes for streaming instance history events. +- Added `list_instance_ids(runtime_status, completed_time_from, completed_time_to, ...)` to support filtering terminal instances by completion time and status. Supports pagination via continuation tokens. +- Added `rewind_orchestration(instance_id, reason)` and async variant for rewinding failed orchestrations (backend support may vary). +- In-memory backend now implements `StreamInstanceHistory` and `ListInstanceIds` gRPC methods for testing. +- Added internal history utility functions for payload de-externalization and event serialization. +``` + +--- + +## Phase 2: Export Job Workflow + +### 2.1 Package Structure +**New module**: `durabletask/extensions/history_export/` or a new separate package (if isolated from core) + +**Recommendation**: Follow the existing extension pattern. 
Place under `durabletask/extensions/` as a submodule with optional (but recommended) Azure Blob dependencies: + +``` +durabletask/ +└── extensions/ + └── history_export/ + ├── __init__.py + ├── client.py # ExportHistoryClient, ExportHistoryJobClient + ├── models/ + │ ├── __init__.py + │ ├── export_job_state.py + │ ├── export_checkpoint.py + │ ├── export_destination.py + │ ├── export_filter.py + │ ├── export_format.py + │ ├── export_failure.py + │ ├── export_job_description.py + │ ├── export_job_configuration.py + │ ├── export_job_status.py # Enum: Active, Completed, Failed + │ ├── export_mode.py # Enum: Batch, Continuous + │ └── export_job_creation_options.py + ├── entity.py # ExportJob durable entity + ├── serialization.py # JSON/JSONL serialization logic + └── orchestrations/ + ├── __init__.py + ├── export_job_orchestrator.py + └── activities/ + ├── __init__.py + ├── list_terminal_instances.py + ├── export_instance_history.py + └── helpers.py + +tests/ +└── durabletask/ + └── extensions/ + └── history_export/ + ├── test_export_client.py + ├── test_export_models.py + ├── test_export_entity.py + ├── test_export_orchestrator.py + ├── test_export_activities.py + └── test_serialization.py +``` + +### 2.2 Models and Data Types +**Files**: `durabletask/extensions/history_export/models/` + +Define the following (inspired by durabletask-dotnet ExportHistory models): + +```python +# export_mode.py +class ExportMode(Enum): + BATCH = 1 # Export a fixed time window, then complete + CONTINUOUS = 2 # Tail terminal instances continuously + +# export_format.py +class ExportFormatKind(Enum): + JSON = 1 # Array of events, uncompressed + JSONL = 2 # One event per line, gzip compressed + +@dataclass +class ExportFormat: + kind: ExportFormatKind = ExportFormatKind.JSONL + schema_version: str = "1.0" + +# export_destination.py +@dataclass +class ExportDestination: + container: str # Azure Blob container name + prefix: Optional[str] = None # Optional blob prefix + +# export_filter.py 
+@dataclass +class ExportFilter: + completed_time_from: datetime # Inclusive lower bound + completed_time_to: Optional[datetime] = None # Inclusive upper bound + runtime_status: Optional[List[OrchestrationStatus]] = None # Filter by status + +# export_checkpoint.py +@dataclass +class ExportCheckpoint: + last_instance_key: Optional[str] = None # Continuation token for ListInstanceIds + +# export_failure.py +@dataclass +class ExportFailure: + instance_id: str + reason: str + attempt_count: int + last_attempt: datetime + +# export_job_status.py +class ExportJobStatus(Enum): + ACTIVE = "Active" + COMPLETED = "Completed" + FAILED = "Failed" + +# export_job_state.py +@dataclass +class ExportJobState: + status: ExportJobStatus + config: Optional['ExportJobConfiguration'] = None + checkpoint: Optional[ExportCheckpoint] = None + created_at: Optional[datetime] = None + last_modified_at: Optional[datetime] = None + last_checkpoint_time: Optional[datetime] = None + last_error: Optional[str] = None + scanned_instances: int = 0 + exported_instances: int = 0 + orchestrator_instance_id: Optional[str] = None + +# export_job_configuration.py +@dataclass +class ExportJobConfiguration: + mode: ExportMode + filter: ExportFilter + destination: ExportDestination + format: ExportFormat + max_parallel_exports: int = 32 + max_instances_per_batch: int = 100 + +# export_job_creation_options.py +@dataclass +class ExportJobCreationOptions: + mode: ExportMode + completed_time_from: datetime + completed_time_to: Optional[datetime] # Required for Batch, None for Continuous + destination: Optional[ExportDestination] + job_id: Optional[str] = None + format: ExportFormat = field(default_factory=lambda: ExportFormat()) + runtime_status: Optional[List[OrchestrationStatus]] = None # Defaults to terminal statuses + max_instances_per_batch: int = 100 + +# export_job_description.py +@dataclass +class ExportJobDescription: + job_id: str + status: ExportJobStatus + created_at: Optional[datetime] + 
last_modified_at: Optional[datetime] + config: Optional[ExportJobConfiguration] + orchestrator_instance_id: Optional[str] + scanned_instances: int + exported_instances: int + last_error: Optional[str] + checkpoint: Optional[ExportCheckpoint] + last_checkpoint_time: Optional[datetime] +``` + +### 2.3 Durable Entity for Job State +**Files**: `durabletask/extensions/history_export/entity.py` + +Implement `ExportJob` as a durable entity with operations: +- `create(context, creation_options: ExportJobCreationOptions)` — initialize job state and validate transitions. +- `get(context, _=None) -> ExportJobState` — fetch current state. +- `run(context, _=None)` — signal to start the export orchestrator. +- `commit_checkpoint(context, request: CommitCheckpointRequest)` — update progress, checkpoint, status. +- `mark_as_completed(context, _=None)` — transition to Completed. +- `mark_as_failed(context, error_message)` — transition to Failed. +- `delete(context, _=None)` — delete the entity. + +Include transition validation: define which operations are valid from which states (similar to ExportJobTransitions in .NET). + +### 2.4 Export Client +**Files**: `durabletask/extensions/history_export/client.py` + +Provide two public classes: + +```python +class ExportHistoryClient: + def __init__(self, durable_task_client: TaskHubGrpcClient, storage_options: ExportHistoryStorageOptions): + ... + + async def create_job_async(self, options: ExportJobCreationOptions) -> ExportHistoryJobClient: + """Create a new export job.""" + ... + + async def get_job_async(self, job_id: str) -> ExportJobDescription: + """Fetch a job by ID.""" + ... + + async def list_jobs_async(self, filter: Optional[ExportJobQuery] = None) -> AsyncIterable[ExportJobDescription]: + """List all export jobs, optionally filtered.""" + ... + + def get_job_client(self, job_id: str) -> ExportHistoryJobClient: + """Get a client for a specific job.""" + ... 
+ +class ExportHistoryJobClient: + def __init__(self, job_id: str, ...): + ... + + async def create_async(self, options: ExportJobCreationOptions) -> None: + """Create the export job.""" + ... + + async def describe_async(self) -> ExportJobDescription: + """Get job status.""" + ... + + async def delete_async(self) -> None: + """Delete the job and terminate its orchestrator.""" + ... +``` + +Implementation: +- Use durable entities via `durable_task_client.signal_entity()` and `get_entity()` to manage job state. +- Wrap the entity ID as `ExportJob@{job_id}`. +- Schedule an orchestrator named `ExportJobOrchestrator` with a fixed instance ID pattern (e.g., `ExportJob-{job_id}`) to ensure one orchestrator per job. + +### 2.5 Activities +**Files**: `durabletask/extensions/history_export/orchestrations/activities/` + +Implement two activities: + +#### 2.5.1 ListTerminalInstancesActivity +Input: `ListTerminalInstancesRequest(completed_time_from, completed_time_to, runtime_status, last_instance_key, max_instances_per_batch)` +Output: `InstancePage(instance_ids: List[str], next_checkpoint: ExportCheckpoint)` + +Logic: +- Call the core client's `list_instance_ids()` with the filter parameters. +- Return a page of instance IDs and a checkpoint for the next call. + +#### 2.5.2 ExportInstanceHistoryActivity +Input: `ExportRequest(instance_id, destination, format)` +Output: `ExportResult(instance_id, success, error)` + +Logic: +- Fetch the instance's history using the core client's `get_orchestration_history()`. +- Fetch metadata using `get_orchestration_state()`. +- Serialize the history and metadata to the specified format (JSON or JSONL gzip). +- Upload to Azure Blob Storage. +- Return success/failure result. + +### 2.6 Export Orchestrator +**Files**: `durabletask/extensions/history_export/orchestrations/export_job_orchestrator.py` + +Orchestrate the export workflow: + +Input: `ExportJobRunRequest(job_entity_id, processed_cycles)` + +Logic: +1. Fetch job state from entity. 
If not Active, exit. +2. Call `ListTerminalInstancesActivity` to get a page of instance IDs. +3. If no instances and mode is Continuous, sleep and retry. If Batch, exit. +4. Call `ExportInstanceHistoryActivity` for each instance in parallel (bounded by `max_parallel_exports`). +5. With exponential backoff, retry failed exports up to 3 times. +6. Commit checkpoint if successful; record failures and stay at current checkpoint if batch fails. +7. If processed_cycles > 5, continue-as-new to reset history and prevent bloat. +8. Mark job as Completed or Failed based on final result. + +### 2.7 Serialization +**Files**: `durabletask/extensions/history_export/serialization.py` + +Provide functions: + +```python +def serialize_history( + instance_id: str, + metadata: OrchestrationState, + history: Iterable[pb.HistoryEvent], + format: ExportFormat, +) -> bytes: + """Serialize history and metadata to JSON or JSONL gzip.""" + ... + +def event_to_dict(event: pb.HistoryEvent) -> dict: + """Convert protobuf HistoryEvent to serializable dict.""" + ... +``` + +Implementation: +- For JSON: return an array of event dicts with metadata. +- For JSONL: return one event per line, gzip compressed. +- Preserve all event fields and handle polymorphic event types (use protobuf reflection or explicit converters). +- Skip internal fields (e.g., timestamps for WorkItem processing). + +### 2.8 Azure Blob Storage Upload +**Files**: Same activity file or new `durabletask/extensions/history_export/orchestrations/azure_storage.py` + +Use `azure.storage.blob.BlobClient` to upload serialized data: +- Generate a deterministic blob name (e.g., hash of completed time + instance ID). +- Include instance ID as blob metadata. +- Handle connection strings from `ExportHistoryStorageOptions`. + +### 2.9 Tests +**Files**: `tests/durabletask/extensions/history_export/` + +Add tests for: +- Export client creation, job listing, job description. 
+- Export job entity lifecycle (create, run, checkpoint, complete, fail, delete). +- Activity logic (ListTerminalInstancesActivity, ExportInstanceHistoryActivity). +- Orchestrator flow (paging, retries, continues-as-new, checkpoint commits). +- Serialization (JSON and JSONL formats, large nested payloads, polymorphic events). +- Azure Blob integration (mocked or using Azure Test Containers if the repo has that pattern). +- Batch vs. Continuous modes. +- Error transitions and recovery. + +### 2.10 Update Extension Changelog +**Files**: New `durabletask/extensions/history_export/CHANGELOG.md` or extend core [CHANGELOG.md](CHANGELOG.md) + +Document the new export extension, models, and client APIs. + +--- + +## Verification Checklist + +### Phase 1 +- [ ] Unit tests for `get_orchestration_history()`, `list_instance_ids()`, `rewind_orchestration()` (sync and async). +- [ ] Backend tests for in-memory StreamInstanceHistory and ListInstanceIds. +- [ ] Error mapping tests (NOT_FOUND, UNIMPLEMENTED, CANCELLED, INTERNAL). +- [ ] Payload de-externalization tests (use FakePayloadStore). +- [ ] Pylance diagnostics on modified client and backend files. +- [ ] `flake8` on modified Python files. +- [ ] Targeted pytest for client, backend, and large_payload tests. + +### Phase 2 +- [ ] Unit tests for all model classes (dataclass validation, enum values). +- [ ] Entity tests (create, get, signal operations, state transitions). +- [ ] Activity tests (ListTerminalInstancesActivity, ExportInstanceHistoryActivity). +- [ ] Orchestrator tests (paging, retries, continues-as-new, checkpointing). +- [ ] Serialization tests (JSON, JSONL gzip, nested payloads, polymorphic events). +- [ ] Integration tests (end-to-end export with mock or test Azure Blob). +- [ ] Batch and Continuous mode tests. +- [ ] Pylance diagnostics on export extension files. +- [ ] `flake8` on all new files. +- [ ] Targeted pytest for export extension tests. 
+- [ ] Optional: Azure Test Containers integration (if repo has pattern). + +--- + +## Further Considerations + +1. **Public history return type**: Keeping raw protobuf HistoryEvent in Phase 1 allows Phase 2 serialization logic to reuse the message introspection without forcing a new public model. Consider wrapping in Phase 2 if users ask. + +2. **Export package naming**: Avoid adding many methods directly to the core `TaskHubGrpcClient`. Instead, keep export under a clear namespace (`extensions.history_export` or a separate distribution) to signal that it is optional and not part of the core SDK. + +3. **Continuous export semantics**: Follow the .NET pattern closely: + - Tail terminal instances from a completion-time watermark. + - Persist checkpoint to entity state to survive orchestrator restarts. + - Use periodic continue-as-new or equivalent restart behavior if Python orchestrator history gets too large. + - Sleep between empty pages to avoid busy-waiting. + +4. **Dependency management**: The export extension should declare optional Azure Storage dependencies (e.g., `pip install durabletask[history-export]` pulls in `azure-storage-blob`). + +5. **Rewind support**: Rewind is a lower-priority feature. Consider leaving it unsupported in the in-memory backend for now and adding it only if users need it or if it's required for export testing. + +6. **Future extensions**: Design the destination abstraction (ExportDestination, upload logic) so it can evolve to support other backends (S3, GCS, SFTP, local filesystem) without core changes. For now, ship only Azure Blob. diff --git a/durabletask/extensions/history_export/__init__.py b/durabletask/extensions/history_export/__init__.py new file mode 100644 index 00000000..2214e9d7 --- /dev/null +++ b/durabletask/extensions/history_export/__init__.py @@ -0,0 +1,80 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Orchestration history export for Durable Task. 
+ +This optional extension package provides a workflow for exporting +orchestration history from terminal instances to a configured +destination, modeled after the durabletask-dotnet ``ExportHistory`` +package. + +The core building blocks (models, durable entity, activities, +orchestrator, and the public client) live in this package and have +no required runtime dependencies beyond the core SDK. Specific +destinations (for example Azure Blob Storage) may require optional +dependencies; see the destination module documentation for details. +""" + +from durabletask.extensions.history_export._constants import ( + ENTITY_NAME, + ORCHESTRATOR_NAME, + orchestrator_instance_id_for, +) +from durabletask.extensions.history_export.activities import ( + HistoryExportContext, +) +from durabletask.extensions.history_export.client import ( + ExportHistoryClient, + ExportHistoryJobClient, +) +from durabletask.extensions.history_export.exceptions import ( + ExportJobError, + ExportJobInvalidTransitionError, + ExportJobNotFoundError, +) +from durabletask.extensions.history_export.models import ( + STATE_SCHEMA_VERSION, + ExportCheckpoint, + ExportDestination, + ExportFailure, + ExportFilter, + ExportFormat, + ExportFormatKind, + ExportJobConfiguration, + ExportJobCreationOptions, + ExportJobDescription, + ExportJobQuery, + ExportJobState, + ExportJobStatus, + ExportMode, +) +from durabletask.extensions.history_export.writer import HistoryWriter + +__all__ = [ + "ENTITY_NAME", + "ORCHESTRATOR_NAME", + "STATE_SCHEMA_VERSION", + "ExportCheckpoint", + "ExportDestination", + "ExportFailure", + "ExportFilter", + "ExportFormat", + "ExportFormatKind", + "ExportHistoryClient", + "ExportHistoryJobClient", + "ExportJobConfiguration", + "ExportJobCreationOptions", + "ExportJobDescription", + "ExportJobError", + "ExportJobInvalidTransitionError", + "ExportJobNotFoundError", + "ExportJobQuery", + "ExportJobState", + "ExportJobStatus", + "ExportMode", + "HistoryExportContext", + "HistoryWriter", 
+ "orchestrator_instance_id_for", +] + +PACKAGE_NAME = "durabletask.extensions.history_export" diff --git a/durabletask/extensions/history_export/_constants.py b/durabletask/extensions/history_export/_constants.py new file mode 100644 index 00000000..c9f98694 --- /dev/null +++ b/durabletask/extensions/history_export/_constants.py @@ -0,0 +1,34 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Stable, cross-cutting constants for the history-export extension. + +These constants are imported by both the entity and the orchestrator +modules. Keeping them in their own module avoids the circular import +that would arise if either of those modules imported the other. +""" + +from __future__ import annotations + +ENTITY_NAME = "ExportJobEntity" +"""Logical name of the export-job durable entity.""" + +ORCHESTRATOR_NAME = "export_job_orchestrator" +"""Function-derived name of the export-job orchestrator.""" + +ORCHESTRATOR_INSTANCE_ID_PREFIX = "export-job-" +"""Prefix applied to deterministic orchestrator instance IDs.""" + + +def orchestrator_instance_id_for(job_id: str) -> str: + """Return the deterministic orchestrator instance ID for *job_id*. + + All export-job orchestrators share a stable instance-ID pattern so + that public clients can reliably correlate a job ID with the + orchestrator driving it (for logs, monitoring, restart, etc.). + Matches the .NET ``ExportHistoryConstants.GetOrchestratorInstanceId`` + pattern. + """ + if not job_id: + raise ValueError("job_id must be a non-empty string") + return f"{ORCHESTRATOR_INSTANCE_ID_PREFIX}{job_id}" diff --git a/durabletask/extensions/history_export/_logging.py b/durabletask/extensions/history_export/_logging.py new file mode 100644 index 00000000..b001c4ca --- /dev/null +++ b/durabletask/extensions/history_export/_logging.py @@ -0,0 +1,19 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Shared logger for the history-export extension. 
+ +Submodules that emit log records should import :data:`logger` from +this module rather than calling :func:`logging.getLogger` themselves. +This keeps every emit attributed to the same logger name so that +callers can configure / filter the extension's output in one place. +""" + +from __future__ import annotations + +import logging + +logger = logging.getLogger("durabletask.extensions.history_export") +"""Module-wide logger for the history-export extension.""" + +__all__ = ["logger"] diff --git a/durabletask/extensions/history_export/activities.py b/durabletask/extensions/history_export/activities.py new file mode 100644 index 00000000..f5a390b9 --- /dev/null +++ b/durabletask/extensions/history_export/activities.py @@ -0,0 +1,204 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Activities for the history export workflow. + +Two activities cooperate to drive an export job: + +* ``list_terminal_instances`` — wraps + :meth:`TaskHubGrpcClient.list_instance_ids` to fetch one page of + terminal instance IDs that match the job's filter. + +* ``export_instance_history`` — fetches the full history for a single + instance via :meth:`TaskHubGrpcClient.get_orchestration_history`, + serializes it with the configured format, and writes the resulting + blob through a :class:`HistoryWriter`. + +The client and writer are not serializable, so they cannot be passed +through orchestrator inputs. Instead, the public client registers a +module-level :class:`HistoryExportContext` once at worker startup. +The activities resolve their dependencies from that context at +execution time. This is acceptable because activities run in-process +within the worker that registered them. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass +from datetime import datetime +from typing import Any, List, Mapping, Optional + +from durabletask import client as client_module +from durabletask import task + +from durabletask.extensions.history_export.models import ( + ExportFormat, + ExportFormatKind, + _dt_from_iso, +) +from durabletask.extensions.history_export.serialization import ( + content_encoding_for, + content_type_for, + file_extension_for, + orchestration_state_to_dict, + serialize_history, +) +from durabletask.extensions.history_export.writer import HistoryWriter + + +# The activity name registered with the worker is simply ``fn.__name__`` +# (see :func:`durabletask.task.get_name`). These constants exist so +# downstream code (the orchestrator, tests) can refer to the names +# symbolically without re-deriving them from the function objects. +LIST_TERMINAL_INSTANCES_ACTIVITY = "list_terminal_instances" +EXPORT_INSTANCE_HISTORY_ACTIVITY = "export_instance_history" + + +@dataclass +class HistoryExportContext: + """Runtime dependencies shared by all history-export activities.""" + + client: client_module.TaskHubGrpcClient + writer: HistoryWriter + + +_context: Optional[HistoryExportContext] = None + + +def bind_context(context: HistoryExportContext) -> None: + """Install the runtime dependencies for the history-export activities.""" + global _context + _context = context + + +def clear_context() -> None: + """Remove the bound context. 
Useful for tests.""" + global _context + _context = None + + +def _require_context() -> HistoryExportContext: + if _context is None: + raise RuntimeError( + "history-export activities invoked without a bound context; " + "call bind_context(HistoryExportContext(...)) before starting the worker" + ) + return _context + + +# ---------------------------------------------------------------------- +# Activity bodies +# ---------------------------------------------------------------------- + +def list_terminal_instances(_: task.ActivityContext, input: Mapping[str, Any]) -> dict: + """Activity: fetch one page of terminal instance IDs.""" + ctx = _require_context() + + runtime_status_names: Optional[List[str]] = input.get("runtime_status") + completed_time_from = _dt_from_iso(input.get("completed_time_from")) + completed_time_to = _dt_from_iso(input.get("completed_time_to")) + page_size = input.get("page_size") + continuation_token = input.get("continuation_token") + + if completed_time_from is None: + raise ValueError("list_terminal_instances requires 'completed_time_from'") + + runtime_status: Optional[List[client_module.OrchestrationStatus]] = None + if runtime_status_names is not None: + runtime_status = [ + client_module.OrchestrationStatus[name] for name in runtime_status_names + ] + + page = ctx.client.list_instance_ids( + runtime_status=runtime_status, + completed_time_from=completed_time_from, + completed_time_to=completed_time_to, + page_size=page_size, + continuation_token=continuation_token, + ) + + return { + "instance_ids": list(page.items), + "continuation_token": page.continuation_token, + } + + +def export_instance_history(_: task.ActivityContext, input: Mapping[str, Any]) -> dict: + """Activity: serialize and write one instance's history.""" + ctx = _require_context() + + instance_id = input["instance_id"] + fmt = ExportFormat._from_dict(input.get("format") or { + "kind": ExportFormatKind.JSONL_GZIP.value, + "schema_version": "1.0", + }) + destination = 
input.get("destination") or {} + prefix = destination.get("prefix") + + try: + events = ctx.client.get_orchestration_history(instance_id) + # Fetch the orchestration's terminal metadata too so the + # exported blob is self-describing (matches the .NET behavior). + state = ctx.client.get_orchestration_state( + instance_id, fetch_payloads=True, + ) + metadata = orchestration_state_to_dict(state) if state is not None else None + payload = serialize_history( + events, + instance_id=instance_id, + fmt=fmt, + metadata=metadata, + ) + blob_name = _blob_name_for(instance_id=instance_id, prefix=prefix, fmt=fmt) + ctx.writer.write( + instance_id=instance_id, + blob_name=blob_name, + payload=payload, + content_type=content_type_for(fmt), + content_encoding=content_encoding_for(fmt), + ) + except Exception as ex: # noqa: BLE001 - reported back via return value + return { + "instance_id": instance_id, + "success": False, + "error": f"{type(ex).__name__}: {ex}", + } + + return {"instance_id": instance_id, "success": True, "error": None} + + +# ---------------------------------------------------------------------- +# Helpers +# ---------------------------------------------------------------------- + +def _blob_name_for(*, instance_id: str, prefix: Optional[str], fmt: ExportFormat) -> str: + ext = file_extension_for(fmt) + safe_id = instance_id.replace("/", "_") + if prefix: + return f"{prefix.rstrip('/')}/{safe_id}{ext}" + return f"{safe_id}{ext}" + + +def register(worker_instance) -> None: + """Convenience helper to register both activities on *worker*.""" + worker_instance.add_activity(list_terminal_instances) + worker_instance.add_activity(export_instance_history) + + +# Used by the orchestrator to build a fresh activity input from the +# resolved job configuration without leaking model objects. 
+def build_list_activity_input( + *, + runtime_status_names: Optional[List[str]], + completed_time_from: datetime, + completed_time_to: Optional[datetime], + page_size: int, + continuation_token: Optional[str], +) -> dict: + return { + "runtime_status": runtime_status_names, + "completed_time_from": completed_time_from.isoformat(), + "completed_time_to": completed_time_to.isoformat() if completed_time_to else None, + "page_size": page_size, + "continuation_token": continuation_token, + } diff --git a/durabletask/extensions/history_export/azure_blob.py b/durabletask/extensions/history_export/azure_blob.py new file mode 100644 index 00000000..3a8a7f2c --- /dev/null +++ b/durabletask/extensions/history_export/azure_blob.py @@ -0,0 +1,152 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Azure Blob Storage destination for history exports. + +This optional module implements the +:class:`~durabletask.extensions.history_export.writer.HistoryWriter` +protocol on top of ``azure-storage-blob``. + +Install the dependency with:: + + pip install durabletask[history-export-azure] + +The writer is synchronous, matching the synchronous activity execution +model used by the rest of the extension. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any, Optional + +try: + from azure.core.exceptions import ResourceExistsError + from azure.storage.blob import BlobServiceClient, ContentSettings +except ImportError as exc: # pragma: no cover - import-time guard + raise ImportError( + "The 'azure-storage-blob' package is required for the Azure Blob " + "history-export writer. Install it with: " + "pip install durabletask[history-export-azure]" + ) from exc + + +@dataclass +class AzureBlobHistoryExportWriterOptions: + """Configuration for :class:`AzureBlobHistoryExportWriter`. + + Provide either *connection_string*, or both *account_url* and + *credential*. 
+ + Attributes: + container_name: Azure Blob container that exports are written + to. The container is created on first use if it does not + already exist. + connection_string: Azure Storage connection string. Mutually + exclusive with *account_url*. + account_url: Azure Storage account URL + (e.g. ``https://<account>.blob.core.windows.net``). Use + together with *credential* for token-based auth. + credential: A ``TokenCredential`` instance (e.g. + ``DefaultAzureCredential``). + api_version: Optional Azure Storage API version override + (useful for Azurite compatibility). + create_container_if_not_exists: When ``True`` (the default), + ensure the container exists on the first write. + """ + + container_name: str + connection_string: Optional[str] = None + account_url: Optional[str] = None + credential: Any = field(default=None, repr=False) + api_version: Optional[str] = None + create_container_if_not_exists: bool = True + + def __post_init__(self) -> None: + if not self.container_name: + raise ValueError("container_name is required") + if not self.connection_string and not self.account_url: + raise ValueError( + "Either 'connection_string' or 'account_url' (with 'credential') " + "must be provided" + ) + + +class AzureBlobHistoryExportWriter: + """Writes exported history blobs to Azure Blob Storage.""" + + def __init__(self, options: AzureBlobHistoryExportWriterOptions) -> None: + self._options = options + extra: dict = {} + if options.api_version: + extra["api_version"] = options.api_version + + if options.connection_string: + self._service = BlobServiceClient.from_connection_string( + options.connection_string, **extra + ) + else: + assert options.account_url is not None + self._service = BlobServiceClient( + account_url=options.account_url, + credential=options.credential, + **extra, + ) + + self._container_ready = False + + # ------------------------------------------------------------------ + # Context-manager / cleanup helpers + # 
------------------------------------------------------------------ + + def close(self) -> None: + self._service.close() + + def __enter__(self) -> "AzureBlobHistoryExportWriter": + return self + + def __exit__(self, *args: object) -> None: + self.close() + + # ------------------------------------------------------------------ + # HistoryWriter protocol + # ------------------------------------------------------------------ + + def write( + self, + *, + instance_id: str, + blob_name: str, + payload: bytes, + content_type: str, + content_encoding: Optional[str], + ) -> None: + del instance_id # included by the protocol but not needed here + self._ensure_container() + container_client = self._service.get_container_client( + self._options.container_name + ) + content_settings = ContentSettings( + content_type=content_type, + content_encoding=content_encoding or "", + ) + container_client.upload_blob( + name=blob_name, + data=payload, + overwrite=True, + content_settings=content_settings, + ) + + # ------------------------------------------------------------------ + # Internals + # ------------------------------------------------------------------ + + def _ensure_container(self) -> None: + if self._container_ready or not self._options.create_container_if_not_exists: + self._container_ready = True + return + try: + self._service.create_container(self._options.container_name) + except ResourceExistsError: + pass + self._container_ready = True diff --git a/durabletask/extensions/history_export/client.py b/durabletask/extensions/history_export/client.py new file mode 100644 index 00000000..c4ee75c0 --- /dev/null +++ b/durabletask/extensions/history_export/client.py @@ -0,0 +1,363 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Public client API for the history export extension. 
+ +The :class:`ExportHistoryClient` wraps a :class:`TaskHubGrpcClient` +and a +:class:`~durabletask.extensions.history_export.writer.HistoryWriter` +to expose a small, typed surface for creating, inspecting, listing, +and deleting export jobs. Most callers will pair it with the +per-job :class:`ExportHistoryJobClient` returned by +:meth:`ExportHistoryClient.get_job_client`. + +Typical usage:: + + from durabletask import client, worker + from durabletask.extensions.history_export import ( + ExportDestination, + ExportFormat, + ExportFormatKind, + ExportHistoryClient, + ExportJobCreationOptions, + ExportMode, + ) + from durabletask.extensions.history_export.azure_blob import ( + AzureBlobHistoryExportWriter, + AzureBlobHistoryExportWriterOptions, + ) + + writer = AzureBlobHistoryExportWriter( + AzureBlobHistoryExportWriterOptions( + container_name="exports", + connection_string="UseDevelopmentStorage=true", + ) + ) + dt_client = client.TaskHubGrpcClient(host_address="localhost:4001") + export_client = ExportHistoryClient(dt_client, writer) + + with worker.TaskHubGrpcWorker(host_address="localhost:4001") as w: + export_client.register_worker(w) + w.start() + + desc = export_client.create_job(ExportJobCreationOptions( + mode=ExportMode.BATCH, + completed_time_from=datetime(2026, 1, 1, tzinfo=timezone.utc), + completed_time_to=datetime(2026, 2, 1, tzinfo=timezone.utc), + destination=ExportDestination(container="exports", prefix="january"), + )) + job_client = export_client.get_job_client(desc.job_id) + final = job_client.wait(timeout=300) +""" + +from __future__ import annotations + +import json +import time +import uuid +from datetime import datetime, timezone +from typing import Iterator, Optional + +from durabletask import client as client_module +from durabletask import entities + +from durabletask.extensions.history_export._constants import ( + ENTITY_NAME, + ORCHESTRATOR_NAME, + orchestrator_instance_id_for, +) +from durabletask.extensions.history_export._logging 
import logger +from durabletask.extensions.history_export.activities import ( + HistoryExportContext, + bind_context, + register as _register_activities, +) +from durabletask.extensions.history_export.writer import HistoryWriter +from durabletask.extensions.history_export.entity import ExportJobEntity +from durabletask.extensions.history_export.exceptions import ( + ExportJobNotFoundError, +) +from durabletask.extensions.history_export.models import ( + ExportJobCreationOptions, + ExportJobDescription, + ExportJobQuery, + ExportJobStatus, +) +from durabletask.extensions.history_export.orchestrator import ( + export_job_orchestrator, +) + + +_TERMINAL_STATUSES = frozenset({ExportJobStatus.COMPLETED, ExportJobStatus.FAILED}) +_ENTITY_ID_PREFIX = f"@{ENTITY_NAME.lower()}@" + + +__all__ = ["ExportHistoryClient", "ExportHistoryJobClient"] + + +class ExportHistoryClient: + """Public façade for creating and inspecting export jobs.""" + + def __init__( + self, + durable_task_client: client_module.TaskHubGrpcClient, + writer: HistoryWriter, + ) -> None: + self._client = durable_task_client + self._writer = writer + + # ------------------------------------------------------------------ + # Worker wiring + # ------------------------------------------------------------------ + + def register_worker(self, worker_instance) -> None: + """Register the entity, activities, and orchestrator on *worker*. + + Also binds the activity execution context so the activities + can find the underlying client and writer at runtime. Call + this once per worker before :meth:`start`. 
+ """ + worker_instance.add_entity(ExportJobEntity, name=ENTITY_NAME) + _register_activities(worker_instance) + worker_instance.add_orchestrator(export_job_orchestrator) + bind_context(HistoryExportContext(client=self._client, writer=self._writer)) + + # ------------------------------------------------------------------ + # Job lifecycle + # ------------------------------------------------------------------ + + def create_job( + self, + options: ExportJobCreationOptions, + *, + job_id: Optional[str] = None, + ) -> ExportJobDescription: + """Create a new export job and start its driving orchestrator. + + The entity is created in :attr:`ExportJobStatus.PENDING` and + immediately signalled with ``run``, which schedules the + driving orchestrator from inside the entity using a + deterministic instance ID (``export-job-{job_id}``). This + matches the .NET ``ExportJob.Run`` pattern: callers can + correlate a job with its orchestrator by ID alone and may + safely re-create a previously-terminated job. + """ + config = options.to_configuration() + resolved_job_id = job_id or options.job_id or uuid.uuid4().hex + entity_id = entities.EntityInstanceId(ENTITY_NAME, resolved_job_id) + created_at = datetime.now(timezone.utc) + config_dict = config._to_dict() + + # Signal create first; the entity will validate the transition + # and persist PENDING. Then signal run; the entity will + # schedule the orchestrator and transition to ACTIVE. Both + # signals are processed in FIFO order by the entity dispatcher. 
+ self._client.signal_entity( + entity_id, + "create", + input={ + "config": config_dict, + "created_at": created_at.isoformat(), + }, + ) + self._client.signal_entity(entity_id, "run") + + logger.info( + "Submitted export job %r; orchestrator instance ID will be %s", + resolved_job_id, orchestrator_instance_id_for(resolved_job_id), + ) + return ExportJobDescription( + job_id=resolved_job_id, + status=ExportJobStatus.PENDING, + created_at=created_at, + last_modified_at=created_at, + config=config, + orchestrator_instance_id=orchestrator_instance_id_for(resolved_job_id), + scanned_instances=0, + exported_instances=0, + failed_instances=0, + last_error=None, + checkpoint=None, + last_checkpoint_time=None, + ) + + def get_job(self, job_id: str) -> Optional[ExportJobDescription]: + """Look up an export job by ID. Returns ``None`` if not found.""" + entity_id = entities.EntityInstanceId(ENTITY_NAME, job_id) + meta = self._client.get_entity(entity_id, include_state=True) + if meta is None: + return None + raw = meta.get_state(str) + if not raw: + return None + try: + state = json.loads(raw) + except (TypeError, ValueError): + return None + if not isinstance(state, dict): + return None + return ExportJobDescription._from_state_dict(job_id, state) + + def list_jobs( + self, + query: Optional[ExportJobQuery] = None, + ) -> Iterator[ExportJobDescription]: + """Enumerate export jobs. + + Filters from *query* (status, last-modified window) are + applied client-side after fetching pages from the backend. + Yields one :class:`ExportJobDescription` per matching job. 
+ """ + if query is None: + query = ExportJobQuery() + + entity_query = client_module.EntityQuery( + instance_id_starts_with=_ENTITY_ID_PREFIX, + last_modified_from=query.last_modified_from, + last_modified_to=query.last_modified_to, + include_state=query.include_state, + page_size=query.page_size, + ) + status_filter = set(query.status) if query.status else None + + for meta in self._client.get_all_entities(entity_query): + # The query may catch unrelated entities if some other + # extension picks the same prefix; guard with an + # explicit entity-name check. + if meta.id.entity != ENTITY_NAME.lower(): + continue + raw = meta.get_state(str) if meta.includes_state else None + if not raw: + continue + try: + state = json.loads(raw) + except (TypeError, ValueError): + continue + if not isinstance(state, dict): + continue + try: + desc = ExportJobDescription._from_state_dict(meta.id.key, state) + except (KeyError, ValueError): + continue + if status_filter is not None and desc.status not in status_filter: + continue + yield desc + + def wait_for_job( + self, + job_id: str, + *, + timeout: float = 300.0, + poll_interval: float = 1.0, + ) -> ExportJobDescription: + """Poll until the job reaches a terminal status or *timeout* elapses. + + Raises: + TimeoutError: If the job is still pending/active after + *timeout* seconds. + ExportJobNotFoundError: If the job cannot be found at all. 
+ """ + if timeout <= 0: + raise ValueError("timeout must be positive") + if poll_interval <= 0: + raise ValueError("poll_interval must be positive") + + deadline = time.monotonic() + timeout + last: Optional[ExportJobDescription] = None + while True: + desc = self.get_job(job_id) + if desc is not None: + last = desc + if desc.status in _TERMINAL_STATUSES: + return desc + if time.monotonic() >= deadline: + if last is None: + raise ExportJobNotFoundError(job_id) + raise TimeoutError( + f"Export job '{job_id}' did not reach a terminal status " + f"within {timeout}s (last status: {last.status.value})" + ) + time.sleep(poll_interval) + + def delete_job(self, job_id: str) -> None: + """Delete the export-job entity, clearing its state. + + The driving orchestrator will detect the deletion at its next + loop iteration (via :meth:`OrchestrationContext.call_entity`) + and exit cleanly without issuing further signals. + + This does NOT delete blobs already written to the destination. + """ + entity_id = entities.EntityInstanceId(ENTITY_NAME, job_id) + self._client.signal_entity(entity_id, "delete") + + # ------------------------------------------------------------------ + # Convenience + # ------------------------------------------------------------------ + + def get_job_client(self, job_id: str) -> "ExportHistoryJobClient": + """Return a per-job façade for *job_id*.""" + return ExportHistoryJobClient(self, job_id) + + # ------------------------------------------------------------------ + # Diagnostics + # ------------------------------------------------------------------ + + @property + def entity_name(self) -> str: + return ENTITY_NAME + + @property + def orchestrator_name(self) -> str: + return ORCHESTRATOR_NAME + + @property + def writer(self) -> HistoryWriter: + return self._writer + + @property + def underlying_client(self) -> client_module.TaskHubGrpcClient: + return self._client + + +class ExportHistoryJobClient: + """Per-job convenience façade returned by 
:meth:`ExportHistoryClient.get_job_client`. + + All methods are thin pass-throughs to the parent client; the + class exists so callers can pass around a single object that + encapsulates a job ID rather than re-typing it at every call + site. + """ + + def __init__(self, parent: ExportHistoryClient, job_id: str) -> None: + if not job_id: + raise ValueError("job_id must be a non-empty string") + self._parent = parent + self._job_id = job_id + + @property + def job_id(self) -> str: + return self._job_id + + @property + def orchestrator_instance_id(self) -> str: + return orchestrator_instance_id_for(self._job_id) + + def describe(self) -> Optional[ExportJobDescription]: + """Fetch the latest description, or ``None`` if the job is missing.""" + return self._parent.get_job(self._job_id) + + def wait( + self, + *, + timeout: float = 300.0, + poll_interval: float = 1.0, + ) -> ExportJobDescription: + """Poll until terminal; see :meth:`ExportHistoryClient.wait_for_job`.""" + return self._parent.wait_for_job( + self._job_id, timeout=timeout, poll_interval=poll_interval, + ) + + def delete(self) -> None: + """Delete the export job; see :meth:`ExportHistoryClient.delete_job`.""" + self._parent.delete_job(self._job_id) diff --git a/durabletask/extensions/history_export/entity.py b/durabletask/extensions/history_export/entity.py new file mode 100644 index 00000000..cddc4f38 --- /dev/null +++ b/durabletask/extensions/history_export/entity.py @@ -0,0 +1,287 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Durable entity that owns the state of a single export job. + +The entity persists its state through the SDK's default JSON encoder. 
+To avoid embedding Python type metadata in the persisted payload (a +known deserialization-attack vector), the on-disk shape is owned by +:class:`~durabletask.extensions.history_export.models.ExportJobState`, +a versioned dataclass whose ``_to_dict`` / ``_from_dict`` methods +produce and consume pure JSON primitives keyed by literal field names +plus an explicit ``schema_version``. + +Operations +---------- +``create`` + Initialise a fresh export job, or reset a terminal job back to + :attr:`ExportJobStatus.PENDING`. Refuses to overwrite an active + job (raises :class:`ExportJobInvalidTransitionError`). +``get`` + Returns the persisted state dict, or ``None`` if the entity has + not been created (or has been deleted). +``run`` + Schedules the driving orchestrator (with a deterministic instance + ID derived from the job ID) and transitions the job to + :attr:`ExportJobStatus.ACTIVE`. Idempotent so the client may + safely signal it more than once. +``commit_checkpoint`` + Applies an incremental update after a single export page. When + ``mark_failed_on_batch`` is true *and* ``failures`` is non-empty, + transitions the job to :attr:`ExportJobStatus.FAILED` and + records the failure summary as ``last_error``. +``mark_completed`` + Transitions the job to :attr:`ExportJobStatus.COMPLETED`. +``mark_failed`` + Transitions the job to :attr:`ExportJobStatus.FAILED` and + records ``payload["reason"]`` as the terminal error. +``delete`` + Clears all entity state. 
+""" + +from __future__ import annotations + +from datetime import datetime, timezone +from typing import Any, List, Mapping, Optional + +from durabletask import entities + +from durabletask.extensions.history_export._constants import ( + ENTITY_NAME, + ORCHESTRATOR_INSTANCE_ID_PREFIX, + ORCHESTRATOR_NAME, + orchestrator_instance_id_for, +) +from durabletask.extensions.history_export._logging import logger +from durabletask.extensions.history_export.models import ( + ExportFailure, + ExportJobConfiguration, + ExportJobState, + ExportJobStatus, + _dt_from_iso, +) +from durabletask.extensions.history_export.transitions import ( + assert_valid_transition, +) + + +__all__ = [ + "ENTITY_NAME", + "ORCHESTRATOR_INSTANCE_ID_PREFIX", + "ExportJobEntity", + "orchestrator_instance_id_for", + "register", +] + + +def _utcnow() -> datetime: + return datetime.now(timezone.utc) + + +def _summarize_failures(failures: List[ExportFailure], *, limit: int = 10) -> str: + if not failures: + return "" + head = "; ".join(f"{f.instance_id}: {f.reason}" for f in failures[:limit]) + if len(failures) > limit: + head += f"; ... 
class ExportJobEntity(entities.DurableEntity):
    """Durable entity that owns the lifecycle state of one export job.

    The entity key is the job ID.  State is persisted as the plain dict
    produced by :meth:`ExportJobState._to_dict`; every operation loads
    that dict, mutates a typed :class:`ExportJobState`, and writes it
    back through :meth:`_save`.  Each state-changing operation first
    calls ``assert_valid_transition`` with the operation name, the
    current status and the target status.
    """

    # ----- state helpers --------------------------------------------

    def _load(self) -> Optional[ExportJobState]:
        """Return the persisted state, or ``None`` when no state is stored.

        Raises:
            TypeError: If the stored state is not a ``dict``.
        """
        raw = self.get_state()
        if raw is None:
            return None
        if not isinstance(raw, dict):
            raise TypeError(
                f"Unexpected entity state type {type(raw).__name__!r}; expected dict"
            )
        return ExportJobState._from_dict(raw)

    def _save(self, state: ExportJobState) -> dict[str, Any]:
        """Stamp ``last_modified_at``, persist *state*, and return the stored dict."""
        state.last_modified_at = _utcnow()
        persisted = state._to_dict()
        self.set_state(persisted)
        return persisted

    def _current_status(self) -> Optional[ExportJobStatus]:
        """Return the persisted status, or ``None`` when no state is stored."""
        state = self._load()
        return state.status if state is not None else None

    def _job_id(self) -> str:
        """Return the job ID, which is this entity's key."""
        return self.entity_context.entity_id.key

    # ----- operations ------------------------------------------------

    def create(self, payload: Mapping[str, Any]) -> dict[str, Any]:
        """Initialise the job in ``PENDING`` status from *payload*.

        Args:
            payload: Mapping with a required ``"config"`` entry (an
                ``ExportJobConfiguration._to_dict()`` shape) and an
                optional ISO-8601 ``"created_at"`` string.

        Returns:
            The persisted state dict.

        Raises:
            ValueError: If ``"config"`` is missing or empty.
        """
        job_id = self._job_id()
        current = self._current_status()
        assert_valid_transition(
            "create", current, ExportJobStatus.PENDING, job_id=job_id,
        )

        config_dict = payload.get("config")
        if not config_dict:
            raise ValueError("create payload requires 'config'")
        config = ExportJobConfiguration._from_dict(config_dict)

        created_at_raw = payload.get("created_at")
        created_at = _dt_from_iso(created_at_raw) if created_at_raw else _utcnow()
        # Type-narrowing only: both branches above yield a datetime.
        assert created_at is not None

        state = ExportJobState(
            status=ExportJobStatus.PENDING,
            config=config,
            created_at=created_at,
            last_modified_at=created_at,
        )
        logger.info(
            "Created export job %r in status %s", job_id, state.status.value,
        )
        return self._save(state)

    def get(self, _: Any = None) -> Optional[dict[str, Any]]:
        """Return the current state dict, or ``None`` when no state is stored."""
        state = self._load()
        return state._to_dict() if state is not None else None

    def run(self, _: Any = None) -> Optional[dict[str, Any]]:
        """Move the job to ``ACTIVE``, scheduling the orchestrator if still ``PENDING``.

        Returns:
            The persisted state dict.

        Raises:
            ValueError: If the job has not been created.
            Exception: Whatever ``schedule_new_orchestration`` raised, after
                the state has been written as ``FAILED``.
        """
        state = self._load()
        if state is None:
            raise ValueError("Cannot run uninitialized export job")
        job_id = self._job_id()
        assert_valid_transition(
            "run", state.status, ExportJobStatus.ACTIVE, job_id=job_id,
        )

        # The entity itself schedules the driving orchestrator. The
        # client is therefore decoupled from the orchestrator's name
        # and input shape.
        if state.status is ExportJobStatus.PENDING:
            instance_id = orchestrator_instance_id_for(job_id)
            try:
                self.entity_context.schedule_new_orchestration(
                    ORCHESTRATOR_NAME,
                    input={"job_id": job_id, "config": state.config._to_dict()},
                    instance_id=instance_id,
                )
                state.orchestrator_instance_id = instance_id
                logger.info(
                    "Scheduled orchestrator %s for job %r with instance ID %s",
                    ORCHESTRATOR_NAME, job_id, instance_id,
                )
            except Exception as ex:  # noqa: BLE001
                state.status = ExportJobStatus.FAILED
                state.last_error = (
                    f"Failed to schedule orchestrator: {type(ex).__name__}: {ex}"
                )
                logger.exception(
                    "Failed to schedule orchestrator for export job %r", job_id,
                )
                # NOTE(review): if the runtime rolls entity state back when an
                # operation raises, this FAILED write will not survive the
                # re-raise below - confirm against the worker's behaviour.
                self._save(state)
                raise

        state.status = ExportJobStatus.ACTIVE
        state.last_error = None
        return self._save(state)

    def commit_checkpoint(self, payload: Mapping[str, Any]) -> Optional[dict[str, Any]]:
        """Apply one page's progress counters and optional failure list.

        Args:
            payload: Mapping that may carry ``scanned_delta``,
                ``exported_delta``, ``failed_delta`` (non-negative ints,
                default 0), ``last_instance_key`` (applied only when the
                key is present), ``checkpoint_time`` (ISO-8601 string),
                ``failures`` (list of ``ExportFailure._to_dict()`` shapes)
                and ``mark_failed_on_batch``.

        Returns:
            The persisted state dict.

        Raises:
            ValueError: If the job has not been created or a delta is negative.
        """
        state = self._load()
        if state is None:
            raise ValueError("Cannot commit_checkpoint on uninitialized export job")
        job_id = self._job_id()

        # commit_checkpoint may transition ACTIVE -> ACTIVE (no-op) or
        # ACTIVE -> FAILED (when the orchestrator signals a persistent
        # batch failure). The transitions matrix covers both.
        scanned_delta = int(payload.get("scanned_delta", 0))
        exported_delta = int(payload.get("exported_delta", 0))
        failed_delta = int(payload.get("failed_delta", 0))
        if scanned_delta < 0 or exported_delta < 0 or failed_delta < 0:
            raise ValueError("checkpoint deltas must be non-negative")

        failures_data = payload.get("failures") or []
        new_failures = [ExportFailure._from_dict(f) for f in failures_data]
        # The job only fails when the caller asked for it AND supplied failures.
        will_fail = bool(payload.get("mark_failed_on_batch")) and bool(new_failures)
        target = ExportJobStatus.FAILED if will_fail else ExportJobStatus.ACTIVE
        assert_valid_transition(
            "commit_checkpoint", state.status, target, job_id=job_id,
        )

        state.scanned_instances += scanned_delta
        state.exported_instances += exported_delta
        state.failed_instances += failed_delta

        # Key presence (not truthiness) decides, so an explicit None resets
        # the continuation token.
        if "last_instance_key" in payload:
            state.checkpoint.last_instance_key = payload.get("last_instance_key")

        checkpoint_time_raw = payload.get("checkpoint_time")
        checkpoint_time = (
            _dt_from_iso(checkpoint_time_raw) if checkpoint_time_raw else _utcnow()
        )
        state.last_checkpoint_time = checkpoint_time

        if new_failures:
            state.failures.extend(new_failures)

        if will_fail:
            state.status = ExportJobStatus.FAILED
            summary = _summarize_failures(new_failures)
            state.last_error = (
                f"Batch export failed after retries. Failures: {summary}"
                if summary
                else "Batch export failed after retries."
            )
            logger.warning(
                "Export job %r marked FAILED after batch retries (%d failures)",
                job_id, len(new_failures),
            )

        return self._save(state)

    def mark_completed(self, _: Any = None) -> Optional[dict[str, Any]]:
        """Move the job to ``COMPLETED`` and clear ``last_error``.

        Raises:
            ValueError: If the job has not been created.
        """
        state = self._load()
        if state is None:
            raise ValueError("Cannot mark_completed on uninitialized export job")
        job_id = self._job_id()
        assert_valid_transition(
            "mark_completed", state.status, ExportJobStatus.COMPLETED,
            job_id=job_id,
        )
        state.status = ExportJobStatus.COMPLETED
        state.last_error = None
        logger.info("Export job %r marked COMPLETED", job_id)
        return self._save(state)

    def mark_failed(
        self, payload: Optional[Mapping[str, Any]] = None
    ) -> Optional[dict[str, Any]]:
        """Move the job to ``FAILED``, recording an optional reason.

        Args:
            payload: Optional mapping; its ``"reason"`` entry, when present
                and not ``None``, is stored as ``last_error``.

        Raises:
            ValueError: If the job has not been created.
        """
        state = self._load()
        if state is None:
            raise ValueError("Cannot mark_failed on uninitialized export job")
        job_id = self._job_id()
        assert_valid_transition(
            "mark_failed", state.status, ExportJobStatus.FAILED, job_id=job_id,
        )
        reason = ""
        if payload is not None:
            raw_reason = payload.get("reason")
            # An explicit None means "no reason"; str(None) would otherwise
            # persist the literal text "None" as the error.
            reason = "" if raw_reason is None else str(raw_reason)
        state.status = ExportJobStatus.FAILED
        state.last_error = reason or None
        logger.info("Export job %r marked FAILED: %s", job_id, reason or "(no reason)")
        return self._save(state)

    def delete(self, _: Any = None) -> None:  # type: ignore[override]
        """Log the deletion and delegate to the base class's ``delete``."""
        # The base class's delete() calls set_state(None) which is
        # exactly what we want for export-job cleanup. ``delete`` is
        # always valid regardless of current status.
        logger.info("Export job %r deleted", self._job_id())
        super().delete()


def register(worker_instance, *, name: str = ENTITY_NAME) -> None:
    """Convenience helper to register :class:`ExportJobEntity` on *worker*."""
    worker_instance.add_entity(ExportJobEntity, name=name)
class ExportJobStatus(Enum):
    """Lifecycle status of an export job.

    The enum values double as the persisted ``status`` field in the
    export-job entity state. Adding a new value here is a public-API
    change because consumers may switch on the result of
    :meth:`ExportHistoryClient.get_job`.

    Status meanings
    ---------------
    ``PENDING``
        The export-job entity has processed ``create`` and persisted
        the configuration, but its ``run`` operation has not yet
        scheduled the driving orchestrator. ``run`` moves the job to
        :attr:`ACTIVE` (see the .NET ``ExportJob.Run`` pattern).
    ``ACTIVE``
        The job is running and the driving orchestrator is making
        progress through pages of terminal instances.
    ``COMPLETED``
        The orchestrator finished a batch successfully.
    ``FAILED``
        The orchestrator threw, or a page of exports exhausted its
        retries.
    """

    PENDING = "Pending"
    ACTIVE = "Active"
    COMPLETED = "Completed"
    FAILED = "Failed"
def _dt_to_iso(value: Optional[datetime]) -> Optional[str]:
    """Serialize *value* as an ISO-8601 string normalized to UTC.

    Naive datetimes are taken to already be in UTC; aware datetimes are
    converted to UTC.  ``None`` passes through unchanged.
    """
    if value is None:
        return None
    if value.tzinfo is None:
        value = value.replace(tzinfo=timezone.utc)
    else:
        value = value.astimezone(timezone.utc)
    return value.isoformat()


def _dt_from_iso(value: Optional[str]) -> Optional[datetime]:
    """Parse an ISO-8601 string into a timezone-aware datetime.

    A trailing ``Z`` (UTC designator) is accepted on every supported
    Python version.  Strings without an offset are interpreted as UTC;
    strings with an explicit offset keep that offset.  ``None`` passes
    through unchanged.

    Raises:
        ValueError: If *value* is not a valid ISO-8601 timestamp.
    """
    if value is None:
        return None
    # datetime.fromisoformat only understands the "Z" suffix from Python
    # 3.11 onwards; rewrite it as an explicit offset so timestamps emitted
    # by other SDKs parse everywhere.
    if value.endswith(("Z", "z")):
        value = value[:-1] + "+00:00"
    parsed = datetime.fromisoformat(value)
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed
+ """ + + container: str + prefix: Optional[str] = None + + def __post_init__(self) -> None: + if not self.container: + raise ValueError("destination.container must be a non-empty string") + + def _to_dict(self) -> dict[str, Any]: + return {"container": self.container, "prefix": self.prefix} + + @classmethod + def _from_dict(cls, data: Mapping[str, Any]) -> "ExportDestination": + return cls(container=data["container"], prefix=data.get("prefix")) + + +@dataclass +class ExportFilter: + """Filter applied when selecting instances to export. + + Attributes: + completed_time_from: Inclusive lower bound on instance + completion time. Required. + completed_time_to: Exclusive upper bound on instance + completion time. Required for batch mode. + runtime_status: Restrict to a specific set of terminal + statuses. Defaults to ``COMPLETED``, ``FAILED``, and + ``TERMINATED``. + """ + + completed_time_from: datetime + completed_time_to: Optional[datetime] = None + runtime_status: Optional[List[OrchestrationStatus]] = None + + def effective_runtime_status(self) -> List[OrchestrationStatus]: + """Return the runtime statuses to use, applying the default.""" + if self.runtime_status is None: + return list(_DEFAULT_TERMINAL_STATUSES) + return list(self.runtime_status) + + def _to_dict(self) -> dict[str, Any]: + return { + "completed_time_from": _dt_to_iso(self.completed_time_from), + "completed_time_to": _dt_to_iso(self.completed_time_to), + "runtime_status": ( + [s.name for s in self.runtime_status] + if self.runtime_status is not None + else None + ), + } + + @classmethod + def _from_dict(cls, data: Mapping[str, Any]) -> "ExportFilter": + statuses = data.get("runtime_status") + completed_from = _dt_from_iso(data.get("completed_time_from")) + if completed_from is None: + raise ValueError("completed_time_from is required") + return cls( + completed_time_from=completed_from, + completed_time_to=_dt_from_iso(data.get("completed_time_to")), + runtime_status=( + [OrchestrationStatus[name] 
@dataclass
class ExportJobConfiguration:
    """Fully resolved settings that a running export job operates with.

    Construction validates that both sizing limits are strictly positive
    and that batch-mode jobs carry an upper completion-time bound.
    """

    mode: ExportMode
    filter: ExportFilter
    destination: ExportDestination
    format: ExportFormat = field(default_factory=ExportFormat)
    max_instances_per_batch: int = 100
    max_parallel_exports: int = 32

    def __post_init__(self) -> None:
        # Sizing limits are checked in declaration order so the first
        # offending field is the one reported.
        for limit_name in ("max_instances_per_batch", "max_parallel_exports"):
            if getattr(self, limit_name) <= 0:
                raise ValueError(f"{limit_name} must be positive")

        # Only batch mode needs a closed time window.
        if self.mode != ExportMode.BATCH:
            return
        if self.filter.completed_time_to is None:
            raise ValueError(
                "completed_time_to is required for batch mode exports"
            )

    def _to_dict(self) -> dict[str, Any]:
        """Return the JSON-primitive form of this configuration."""
        serialized = {"mode": self.mode.value}
        serialized["filter"] = self.filter._to_dict()
        serialized["destination"] = self.destination._to_dict()
        serialized["format"] = self.format._to_dict()
        serialized["max_instances_per_batch"] = self.max_instances_per_batch
        serialized["max_parallel_exports"] = self.max_parallel_exports
        return serialized

    @classmethod
    def _from_dict(cls, data: Mapping[str, Any]) -> "ExportJobConfiguration":
        """Rebuild a configuration from the dict produced by :meth:`_to_dict`."""
        mode = ExportMode(data["mode"])
        export_filter = ExportFilter._from_dict(data["filter"])
        destination = ExportDestination._from_dict(data["destination"])
        # A missing or empty format falls back to the gzip JSONL kind.
        format_data = data.get("format") or {"kind": ExportFormatKind.JSONL_GZIP.value}
        export_format = ExportFormat._from_dict(format_data)
        batch_size = int(data.get("max_instances_per_batch", 100))
        parallelism = int(data.get("max_parallel_exports", 32))
        return cls(
            mode=mode,
            filter=export_filter,
            destination=destination,
            format=export_format,
            max_instances_per_batch=batch_size,
            max_parallel_exports=parallelism,
        )
+ """ + + status: Optional[List["ExportJobStatus"]] = None + last_modified_from: Optional[datetime] = None + last_modified_to: Optional[datetime] = None + page_size: Optional[int] = None + include_state: bool = True + + +@dataclass +class ExportJobCreationOptions: + """User-supplied options for creating a new export job.""" + + mode: ExportMode + completed_time_from: datetime + destination: ExportDestination + completed_time_to: Optional[datetime] = None + runtime_status: Optional[List[OrchestrationStatus]] = None + format: ExportFormat = field(default_factory=ExportFormat) + job_id: Optional[str] = None + max_instances_per_batch: int = 100 + max_parallel_exports: int = 32 + + def to_configuration(self) -> ExportJobConfiguration: + """Resolve into a fully-populated :class:`ExportJobConfiguration`.""" + return ExportJobConfiguration( + mode=self.mode, + filter=ExportFilter( + completed_time_from=self.completed_time_from, + completed_time_to=self.completed_time_to, + runtime_status=self.runtime_status, + ), + destination=self.destination, + format=self.format, + max_instances_per_batch=self.max_instances_per_batch, + max_parallel_exports=self.max_parallel_exports, + ) + + +# ---------------------------------------------------------------------- +# Persisted entity state +# ---------------------------------------------------------------------- +# +# The export-job entity persists its state through the SDK's default JSON +# encoder, which only handles JSON-primitive types. Rather than encoding +# Python class metadata into the persisted payload (which is a known +# deserialization-attack vector), we serialize through an explicit, named, +# schema-versioned shape defined by :class:`ExportJobState`. +# +# Every persisted dict carries a ``schema_version`` string. Loading code +# dispatches on that version, never on a class name or module path. 
STATE_SCHEMA_VERSION = "1.0"
"""The schema version emitted by :meth:`ExportJobState._to_dict`.

Increment this when the persisted shape changes in a non-backward-compatible
way and add a new branch in :meth:`ExportJobState._from_dict`.
"""


@dataclass
class ExportJobState:
    """Schema-versioned, typed representation of the entity's stored state.

    Writing goes through :meth:`_to_dict` and reading through
    :meth:`_from_dict`.  The serialized dict holds JSON primitives and
    the nested dicts returned by the model ``_to_dict`` methods; it
    carries a ``schema_version`` string and no Python type metadata.
    """

    status: ExportJobStatus
    config: ExportJobConfiguration
    created_at: datetime
    last_modified_at: datetime
    orchestrator_instance_id: Optional[str] = None
    checkpoint: ExportCheckpoint = field(default_factory=ExportCheckpoint)
    last_checkpoint_time: Optional[datetime] = None
    last_error: Optional[str] = None
    scanned_instances: int = 0
    exported_instances: int = 0
    failed_instances: int = 0
    failures: List[ExportFailure] = field(default_factory=list)

    # ------------------------------------------------------------------
    # Serialization
    # ------------------------------------------------------------------

    def _to_dict(self) -> dict[str, Any]:
        """Return the persisted (JSON-primitive) form of this state."""
        serialized = {
            "schema_version": STATE_SCHEMA_VERSION,
            "status": self.status.value,
            "config": self.config._to_dict(),
            "checkpoint": self.checkpoint._to_dict(),
        }
        # Timestamps are written as ISO-8601 strings (or None).
        serialized["created_at"] = _dt_to_iso(self.created_at)
        serialized["last_modified_at"] = _dt_to_iso(self.last_modified_at)
        serialized["last_checkpoint_time"] = _dt_to_iso(self.last_checkpoint_time)
        serialized["last_error"] = self.last_error
        serialized["scanned_instances"] = self.scanned_instances
        serialized["exported_instances"] = self.exported_instances
        serialized["failed_instances"] = self.failed_instances
        serialized["orchestrator_instance_id"] = self.orchestrator_instance_id
        serialized["failures"] = [failure._to_dict() for failure in self.failures]
        return serialized

    @classmethod
    def _from_dict(cls, data: Mapping[str, Any]) -> "ExportJobState":
        """Rebuild a state object from a persisted dict.

        Raises:
            ValueError: If the schema version is unsupported, ``config``
                is missing, or either required timestamp is absent.
        """
        found_version = data.get("schema_version", "1.0")
        if found_version != STATE_SCHEMA_VERSION:
            raise ValueError(
                f"Unsupported export job state schema_version={found_version!r}; "
                f"expected {STATE_SCHEMA_VERSION!r}"
            )

        raw_config = data.get("config")
        if not raw_config:
            raise ValueError("persisted state is missing 'config'")

        created = _dt_from_iso(data.get("created_at"))
        modified = _dt_from_iso(data.get("last_modified_at"))
        if created is None or modified is None:
            raise ValueError(
                "persisted state must include 'created_at' and 'last_modified_at'"
            )

        raw_checkpoint = data.get("checkpoint")
        raw_failures = data.get("failures") or []

        status = ExportJobStatus(data["status"])
        config = ExportJobConfiguration._from_dict(raw_config)
        orchestrator_id = data.get("orchestrator_instance_id")
        # An absent checkpoint means "not started": use the empty default.
        if raw_checkpoint is not None:
            checkpoint = ExportCheckpoint._from_dict(raw_checkpoint)
        else:
            checkpoint = ExportCheckpoint()
        checkpoint_time = _dt_from_iso(data.get("last_checkpoint_time"))
        error_text = data.get("last_error")
        scanned = int(data.get("scanned_instances", 0))
        exported = int(data.get("exported_instances", 0))
        failed = int(data.get("failed_instances", 0))
        failures = [ExportFailure._from_dict(entry) for entry in raw_failures]

        return cls(
            status=status,
            config=config,
            created_at=created,
            last_modified_at=modified,
            orchestrator_instance_id=orchestrator_id,
            checkpoint=checkpoint,
            last_checkpoint_time=checkpoint_time,
            last_error=error_text,
            scanned_instances=scanned,
            exported_instances=exported,
            failed_instances=failed,
            failures=failures,
        )

    # ------------------------------------------------------------------
    # Factory
    # ------------------------------------------------------------------

    @classmethod
    def new(
        cls,
        config: ExportJobConfiguration,
        *,
        created_at: datetime,
        orchestrator_instance_id: Optional[str] = None,
    ) -> "ExportJobState":
        """Build the initial ``ACTIVE`` state for a job created at *created_at*."""
        initial_fields = {
            "status": ExportJobStatus.ACTIVE,
            "config": config,
            "created_at": created_at,
            # A brand-new job has not been modified since creation.
            "last_modified_at": created_at,
            "orchestrator_instance_id": orchestrator_instance_id,
        }
        return cls(**initial_fields)
/dev/null +++ b/durabletask/extensions/history_export/orchestrator.py @@ -0,0 +1,306 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Orchestrator that drives one export job from start to completion. + +Mirrors the .NET ``ExportJobOrchestrator`` design: + +1. Re-fetch the export-job entity state at the top of every loop + iteration via :meth:`OrchestrationContext.call_entity`. If the + job no longer exists (deleted) or is no longer ACTIVE (externally + marked failed/completed), the orchestrator exits cleanly without + issuing any further signals. +2. Ask ``list_terminal_instances`` for one page. +3. Fan out ``export_instance_history`` across the page, respecting + the configured ``max_parallel_exports`` cap, with a per-activity + retry policy. +4. If any individual export still failed after its retries, retry + the *whole page* up to ``MAX_BATCH_RETRY_ATTEMPTS`` times with + exponential backoff. +5. Signal the entity with ``commit_checkpoint`` carrying the page + totals. On persistent batch failure, the signal also carries the + failure list and ``mark_failed_on_batch=True``. +6. In :attr:`ExportMode.BATCH`, break out of the loop when there is + no next page. In :attr:`ExportMode.CONTINUOUS`, sleep for + ``CONTINUOUS_IDLE_DELAY`` on empty pages and continue tailing + forever (until an external stop is observed via step 1). +7. Continue-as-new every ``CONTINUE_AS_NEW_FREQUENCY`` pages to + keep the orchestrator history bounded. +8. In BATCH mode only: on a clean exit, signal ``mark_completed``. + In CONTINUOUS mode, the orchestrator does not mark the job + completed — the job lifecycle is owned by the caller. 
+""" + +from __future__ import annotations + +from datetime import timedelta +from typing import Any, List, Mapping, Optional + +from durabletask import task + +from durabletask.extensions.history_export._constants import ( + ENTITY_NAME, + ORCHESTRATOR_NAME, +) +from durabletask.extensions.history_export._logging import logger +from durabletask.extensions.history_export.activities import ( + EXPORT_INSTANCE_HISTORY_ACTIVITY, + LIST_TERMINAL_INSTANCES_ACTIVITY, + build_list_activity_input, +) +from durabletask.extensions.history_export.models import ( + ExportJobConfiguration, + ExportJobStatus, + ExportMode, +) + + +__all__ = [ + "CONTINUE_AS_NEW_FREQUENCY", + "CONTINUOUS_IDLE_DELAY", + "EXPORT_ACTIVITY_RETRY_POLICY", + "MAX_BATCH_RETRY_ATTEMPTS", + "ORCHESTRATOR_NAME", + "export_job_orchestrator", + "register", +] + + +# Per-activity retry policy applied to ``export_instance_history``. +# Mirrors the .NET defaults (3 attempts, 15s/30s/60s backoff). +EXPORT_ACTIVITY_RETRY_POLICY = task.RetryPolicy( + first_retry_interval=timedelta(seconds=15), + max_number_of_attempts=3, + backoff_coefficient=2.0, + max_retry_interval=timedelta(seconds=60), +) + +# Number of *page cycles* the orchestrator processes before issuing +# continue-as-new to bound its own history. +CONTINUE_AS_NEW_FREQUENCY = 5 + +# Number of times to retry a whole batch (a page worth of exports) if +# any individual export ultimately fails. +MAX_BATCH_RETRY_ATTEMPTS = 3 + +# Default sleep between empty pages in CONTINUOUS mode. +CONTINUOUS_IDLE_DELAY = timedelta(minutes=1) + +# Default backoff between batch retries. Tests override this via +# :data:`_BATCH_RETRY_BACKOFF_OVERRIDE` to keep runtimes short. +_DEFAULT_BATCH_RETRY_FIRST = timedelta(seconds=60) +_DEFAULT_BATCH_RETRY_MAX = timedelta(seconds=300) + +# Test seams: monkey-patch to small values to keep test runs fast. 
_BATCH_RETRY_BACKOFF_OVERRIDE: Optional[timedelta] = None
_CONTINUOUS_IDLE_DELAY_OVERRIDE: Optional[timedelta] = None


def _batch_retry_delay(attempt: int) -> timedelta:
    """Return the wait before re-running a page after failed attempt *attempt*.

    The override, when set, is returned as-is.  Otherwise the delay
    doubles per attempt starting from ``_DEFAULT_BATCH_RETRY_FIRST`` and
    is capped at ``_DEFAULT_BATCH_RETRY_MAX``.
    """
    if _BATCH_RETRY_BACKOFF_OVERRIDE is not None:
        return _BATCH_RETRY_BACKOFF_OVERRIDE
    seconds = min(
        int(_DEFAULT_BATCH_RETRY_FIRST.total_seconds() * (2 ** (attempt - 1))),
        int(_DEFAULT_BATCH_RETRY_MAX.total_seconds()),
    )
    return timedelta(seconds=seconds)


def _continuous_idle_delay() -> timedelta:
    """Return the sleep between empty pages in CONTINUOUS mode.

    The override, when set, wins over ``CONTINUOUS_IDLE_DELAY``.
    """
    # Compare against None rather than relying on truthiness: a zero
    # timedelta is falsy, so ``override or default`` would silently ignore
    # an explicit ``timedelta(0)`` override.
    if _CONTINUOUS_IDLE_DELAY_OVERRIDE is not None:
        return _CONTINUOUS_IDLE_DELAY_OVERRIDE
    return CONTINUOUS_IDLE_DELAY


def export_job_orchestrator(ctx: task.OrchestrationContext, input: Mapping[str, Any]):
    """Drive a single export job through the page → fan-out → checkpoint loop.

    Input schema::

        {
            "job_id": str,
            "config": ExportJobConfiguration._to_dict(),
            "checkpoint": ExportCheckpoint._to_dict() (optional),
            "processed_cycles": int (optional, used for continue-as-new),
        }

    Returns:
        ``None`` after requesting continue-as-new; otherwise a dict with
        ``job_id``, ``status`` and ``totals``.  ``status`` is
        ``"Completed"`` on a clean BATCH finish, ``"Cancelled"`` when the
        entity no longer exists, or the entity's own status when it is
        no longer ACTIVE.

    Raises:
        RuntimeError: When a page still has failed exports after
            ``MAX_BATCH_RETRY_ATTEMPTS`` attempts.  Any exception is
            reported to the entity via ``mark_failed`` and re-raised.
    """
    job_id = input["job_id"]
    config = ExportJobConfiguration._from_dict(input["config"])
    initial_checkpoint = input.get("checkpoint") or {"last_instance_key": None}
    processed_cycles = int(input.get("processed_cycles", 0))

    entity_id = task.EntityInstanceId(ENTITY_NAME, job_id)
    runtime_status_names = [s.name for s in config.filter.effective_runtime_status()]
    continuation_token = initial_checkpoint.get("last_instance_key")

    # Totals cover this orchestrator generation only; they restart at zero
    # after continue-as-new.
    totals = {"scanned": 0, "exported": 0, "failed": 0}

    try:
        while True:
            processed_cycles += 1
            if processed_cycles > CONTINUE_AS_NEW_FREQUENCY:
                # Restart with the current continuation token so history
                # stays bounded; the cycle counter resets.
                ctx.continue_as_new({
                    "job_id": job_id,
                    "config": input["config"],
                    "checkpoint": {"last_instance_key": continuation_token},
                    "processed_cycles": 0,
                })
                return None

            # Step 1: re-check the entity's view of the world. This
            # lets external state changes (delete, mark_failed) cancel
            # the orchestrator without us having to drain a backlog.
            current_state = yield ctx.call_entity(entity_id, "get")
            if current_state is None:
                logger.info(
                    "Export job %r entity has been deleted; exiting orchestrator",
                    job_id,
                )
                return {"job_id": job_id, "status": "Cancelled", "totals": totals}
            if current_state.get("status") != ExportJobStatus.ACTIVE.value:
                logger.info(
                    "Export job %r entity status is %s; exiting orchestrator",
                    job_id, current_state.get("status"),
                )
                return {
                    "job_id": job_id,
                    "status": current_state.get("status"),
                    "totals": totals,
                }

            # Step 2: fetch one page of terminal instance IDs.
            list_input = build_list_activity_input(
                runtime_status_names=runtime_status_names,
                completed_time_from=config.filter.completed_time_from,
                completed_time_to=config.filter.completed_time_to,
                page_size=config.max_instances_per_batch,
                continuation_token=continuation_token,
            )
            page = yield ctx.call_activity(
                LIST_TERMINAL_INSTANCES_ACTIVITY, input=list_input
            )

            instance_ids = page.get("instance_ids") or []
            scanned_delta = len(instance_ids)
            exported_delta = 0
            failed_delta = 0
            batch_failures: List[dict] = []

            if instance_ids:
                # Steps 3-4: export the page, re-running the WHOLE page
                # (not just the failed instances) on each batch retry.
                batch_succeeded = False
                results: List[dict] = []
                for attempt in range(1, MAX_BATCH_RETRY_ATTEMPTS + 1):
                    results = yield from _run_page(
                        ctx,
                        instance_ids=instance_ids,
                        config=config,
                        max_parallel=config.max_parallel_exports,
                    )
                    failed_results = [r for r in results if not r.get("success")]
                    if not failed_results:
                        batch_succeeded = True
                        break
                    # No backoff after the final attempt.
                    if attempt < MAX_BATCH_RETRY_ATTEMPTS:
                        delay = _batch_retry_delay(attempt)
                        yield ctx.create_timer(ctx.current_utc_datetime + delay)

                # Counters reflect the results of the last attempt only.
                exported_delta = sum(1 for r in results if r.get("success"))
                failed_delta = sum(1 for r in results if not r.get("success"))
                batch_failures = [
                    {
                        "instance_id": r["instance_id"],
                        "reason": r.get("error") or "Unknown error",
                        "attempt_count": MAX_BATCH_RETRY_ATTEMPTS,
                        "last_attempt": ctx.current_utc_datetime.isoformat(),
                    }
                    for r in results
                    if not r.get("success")
                ]

                if not batch_succeeded:
                    # Persistent failure: report the failures without
                    # advancing the checkpoint, then abort the job.
                    ctx.signal_entity(
                        entity_id,
                        "commit_checkpoint",
                        {
                            "scanned_delta": 0,
                            "exported_delta": 0,
                            "failed_delta": failed_delta,
                            "failures": batch_failures,
                            "mark_failed_on_batch": True,
                        },
                    )
                    totals["scanned"] += scanned_delta
                    totals["exported"] += exported_delta
                    totals["failed"] += failed_delta
                    # NOTE(review): the handler below also signals
                    # ``mark_failed`` after this checkpoint has asked the
                    # entity to fail - confirm the entity accepts both.
                    raise RuntimeError(
                        f"Export job '{job_id}' batch failed after "
                        f"{MAX_BATCH_RETRY_ATTEMPTS} attempts; "
                        f"{failed_delta} instances could not be exported."
                    )

            # Step 5: commit this page's totals and the next token.
            next_token = page.get("continuation_token")
            ctx.signal_entity(
                entity_id,
                "commit_checkpoint",
                {
                    "scanned_delta": scanned_delta,
                    "exported_delta": exported_delta,
                    "failed_delta": failed_delta,
                    "last_instance_key": next_token,
                },
            )

            totals["scanned"] += scanned_delta
            totals["exported"] += exported_delta
            totals["failed"] += failed_delta

            if not next_token:
                if config.mode is ExportMode.CONTINUOUS:
                    # Tail mode: sleep, then loop back and re-check.
                    yield ctx.create_timer(
                        ctx.current_utc_datetime + _continuous_idle_delay()
                    )
                    continuation_token = None
                    continue
                break

            continuation_token = next_token

        # Reaching here means BATCH mode finished its window cleanly.
        ctx.signal_entity(entity_id, "mark_completed")
        return {"job_id": job_id, "status": "Completed", "totals": totals}

    except Exception as ex:  # noqa: BLE001 - reported back via mark_failed
        ctx.signal_entity(
            entity_id,
            "mark_failed",
            {"reason": f"{type(ex).__name__}: {ex}"},
        )
        raise


def _run_page(ctx, *, instance_ids, config, max_parallel):
    """Fan out export activities for a single page, bounded by *max_parallel*.

    This is a generator meant to be driven with ``yield from``.  The page
    is processed in consecutive chunks of at most *max_parallel*
    instances; each chunk is awaited with ``task.when_all`` before the
    next one starts.  Returns the activity results for the whole page, in
    page order.
    """
    destination = config.destination._to_dict()
    fmt = config.format._to_dict()

    results: List[dict] = []
    for start in range(0, len(instance_ids), max_parallel):
        chunk = instance_ids[start:start + max_parallel]
        chunk_tasks = [
            ctx.call_activity(
                EXPORT_INSTANCE_HISTORY_ACTIVITY,
                input={
                    "instance_id": instance_id,
                    "format": fmt,
                    "destination": destination,
                },
                retry_policy=EXPORT_ACTIVITY_RETRY_POLICY,
            )
            for instance_id in chunk
        ]
        chunk_results = yield task.when_all(chunk_tasks)
        results.extend(chunk_results)
    return results


def register(worker_instance) -> None:
    """Convenience helper to register the orchestrator on *worker*."""
    worker_instance.add_orchestrator(export_job_orchestrator)
def event_to_dict(event: history.HistoryEvent) -> dict:
    """Convert a :class:`history.HistoryEvent` into a JSON-safe dict.

    A discriminator field ``event_type`` (the event's class name) is
    added so downstream consumers can distinguish event subclasses
    without inspecting their fields.
    """
    payload = event.to_dict()
    # Insert the discriminator first so the resulting dict orders it
    # near the front of the JSON object even before any sorting.
    return {"event_type": type(event).__name__, **payload}


def _failure_details_to_dict(failure) -> dict[str, Any]:
    """Flatten *failure* and every nested ``inner_failure`` into plain dicts."""
    failure_dict: dict[str, Any] = {
        "message": failure.message,
        "error_type": failure.error_type,
        "stack_trace": failure.stack_trace,
    }
    # ``inner_failure`` is read defensively because the attribute may be
    # absent; recursing keeps the whole chain rather than only one level.
    inner = getattr(failure, "inner_failure", None)
    if isinstance(inner, task.FailureDetails):
        failure_dict["inner_failure"] = _failure_details_to_dict(inner)
    return failure_dict


def orchestration_state_to_dict(
    state: client_module.OrchestrationState,
) -> dict[str, Any]:
    """Convert an :class:`OrchestrationState` into a JSON-safe dict.

    All fields are mapped to literal-named primitives. No Python
    class names or module paths appear in the resulting dict.

    ``failure_details`` is ``None`` when the state carries no failure;
    otherwise it holds ``message``, ``error_type``, ``stack_trace`` and,
    when present, a nested ``inner_failure`` of the same shape.
    """
    failure = state.failure_details
    failure_dict: Optional[dict[str, Any]] = None
    if failure is not None:
        failure_dict = _failure_details_to_dict(failure)
    return {
        "instance_id": state.instance_id,
        "name": state.name,
        "runtime_status": state.runtime_status.name,
        "created_at": _dt_to_iso(state.created_at),
        "last_updated_at": _dt_to_iso(state.last_updated_at),
        "serialized_input": state.serialized_input,
        "serialized_output": state.serialized_output,
        "serialized_custom_status": state.serialized_custom_status,
        "failure_details": failure_dict,
    }


def _dump_json(value) -> str:
    """Serialize *value* as compact JSON with sorted keys and raw Unicode."""
    return json.dumps(
        value,
        sort_keys=True,
        separators=(",", ":"),
        ensure_ascii=False,
    )


def serialize_history(
    events: Sequence[history.HistoryEvent],
    *,
    instance_id: str,
    fmt: ExportFormat,
    metadata: Optional[Mapping[str, Any]] = None,
) -> bytes:
    """Serialize a list of history events for a single instance.

    Args:
        events: The ordered list of events to serialize.
        instance_id: The orchestration instance the events belong to.
        fmt: The output format (kind + schema version).
        metadata: Optional dict produced by
            :func:`orchestration_state_to_dict` that will be embedded
            in the serialized output (top-level ``metadata`` field for
            JSON; embedded in the first JSONL line for JSONL).

    Returns:
        The serialized bytes, ready to be written to the destination.

    Raises:
        ValueError: ``fmt.kind`` is not a supported format kind.
    """
    if fmt.kind is ExportFormatKind.JSON:
        document: dict[str, Any] = {
            "schema_version": fmt.schema_version,
            "instance_id": instance_id,
            "events": [event_to_dict(e) for e in events],
        }
        if metadata is not None:
            document["metadata"] = dict(metadata)
        return _dump_json(document).encode("utf-8")

    if fmt.kind is ExportFormatKind.JSONL_GZIP:
        return _gzip_jsonl(events, instance_id=instance_id, fmt=fmt, metadata=metadata)

    raise ValueError(f"Unsupported export format kind: {fmt.kind!r}")


def _gzip_jsonl(
    events: Iterable[history.HistoryEvent],
    *,
    instance_id: str,
    fmt: ExportFormat,
    metadata: Optional[Mapping[str, Any]] = None,
) -> bytes:
    """Return gzip-compressed JSONL: one header line, then one line per event."""
    # Build the uncompressed JSONL document first so the test surface
    # can decode the bytes deterministically.
    header: dict[str, Any] = {
        "schema_version": fmt.schema_version,
        "instance_id": instance_id,
        "kind": "metadata",
    }
    if metadata is not None:
        header["metadata"] = dict(metadata)
    lines = [_dump_json(header)]
    lines.extend(_dump_json(event_to_dict(e)) for e in events)
    raw = ("\n".join(lines) + "\n").encode("utf-8")
    # mtime=0 keeps the gzip header deterministic across runs.
    return gzip.compress(raw, mtime=0)


def content_type_for(fmt: ExportFormat) -> str:
    """Return the appropriate HTTP-style content type for *fmt*.

    Raises:
        ValueError: ``fmt.kind`` is not a supported format kind.
    """
    if fmt.kind is ExportFormatKind.JSON:
        return "application/json"
    if fmt.kind is ExportFormatKind.JSONL_GZIP:
        return "application/x-ndjson"
    raise ValueError(f"Unsupported export format kind: {fmt.kind!r}")


def content_encoding_for(fmt: ExportFormat) -> Optional[str]:
    """Return the appropriate ``Content-Encoding`` for *fmt*, if any."""
    if fmt.kind is ExportFormatKind.JSONL_GZIP:
        return "gzip"
    return None


def file_extension_for(fmt: ExportFormat) -> str:
    """Return the file extension to append to exported blobs.

    Raises:
        ValueError: ``fmt.kind`` is not a supported format kind.
    """
    if fmt.kind is ExportFormatKind.JSON:
        return ".json"
    if fmt.kind is ExportFormatKind.JSONL_GZIP:
        return ".jsonl.gz"
    raise ValueError(f"Unsupported export format kind: {fmt.kind!r}")
# Lookup table keyed by (operation name, current status or None for a
# brand-new entity); each value is the set of statuses that operation may
# move the job to.
TRANSITIONS: Mapping[tuple[str, Optional[ExportJobStatus]], frozenset[ExportJobStatus]] = {
    # ``create`` starts a fresh job and may also revive a terminal one.
    ("create", None): frozenset({ExportJobStatus.PENDING}),
    ("create", ExportJobStatus.FAILED): frozenset({ExportJobStatus.PENDING}),
    ("create", ExportJobStatus.COMPLETED): frozenset({ExportJobStatus.PENDING}),

    # ``run`` activates a pending job; repeating it on an active job is
    # accepted so duplicate signals do not crash the entity.
    ("run", ExportJobStatus.PENDING): frozenset({ExportJobStatus.ACTIVE}),
    ("run", ExportJobStatus.ACTIVE): frozenset({ExportJobStatus.ACTIVE}),

    # ``commit_checkpoint`` normally leaves the job ACTIVE; with
    # ``mark_failed_on_batch`` it may move the job to FAILED instead.
    ("commit_checkpoint", ExportJobStatus.ACTIVE): frozenset({
        ExportJobStatus.ACTIVE,
        ExportJobStatus.FAILED,
    }),

    ("mark_completed", ExportJobStatus.ACTIVE): frozenset({ExportJobStatus.COMPLETED}),

    # ``mark_failed`` is accepted from PENDING as well, for failures that
    # happen between create and run.
    ("mark_failed", ExportJobStatus.PENDING): frozenset({ExportJobStatus.FAILED}),
    ("mark_failed", ExportJobStatus.ACTIVE): frozenset({ExportJobStatus.FAILED}),
}


def is_valid_transition(
    operation: str,
    from_status: Optional[ExportJobStatus],
    to_status: ExportJobStatus,
) -> bool:
    """Tell whether *operation* may move a job from *from_status* to *to_status*."""
    # Unknown (operation, status) pairs fall back to an empty target set.
    allowed = TRANSITIONS.get((operation, from_status), frozenset())
    return to_status in allowed


def assert_valid_transition(
    operation: str,
    from_status: Optional[ExportJobStatus],
    to_status: ExportJobStatus,
    *,
    job_id: Optional[str] = None,
) -> None:
    """Raise :class:`ExportJobInvalidTransitionError` unless the transition is allowed."""
    if is_valid_transition(operation, from_status, to_status):
        return
    current = None if from_status is None else from_status.value
    raise ExportJobInvalidTransitionError(
        operation=operation,
        from_status=current,
        to_status=to_status.value,
        job_id=job_id,
    )


__all__ = ["TRANSITIONS", "is_valid_transition", "assert_valid_transition"]
@runtime_checkable
class HistoryWriter(Protocol):
    """Destination-agnostic interface for writing one exported blob.

    This is a structural protocol: any object exposing a compatible
    keyword-only ``write(...)`` method satisfies it without inheriting
    from this class. Because the protocol is ``@runtime_checkable``,
    ``isinstance(obj, HistoryWriter)`` works, but such a check only
    verifies that a ``write`` attribute exists, not its signature.

    Implementations are expected to be **synchronous** and thread-safe
    if a single instance is shared across activity workers.
    """

    def write(
        self,
        *,
        instance_id: str,
        blob_name: str,
        payload: bytes,
        content_type: str,
        content_encoding: Optional[str],
    ) -> None:
        """Persist one exported blob.

        All arguments are keyword-only.

        Args:
            instance_id: The orchestration instance whose history this
                payload represents. Provided so destinations may use
                it as a key, metadata, or sharding hint.
            blob_name: Destination-relative path / key, including any
                configured destination prefix and file extension.
            payload: The serialized history bytes. Already compressed
                if the configured format calls for it.
            content_type: The HTTP-style content type appropriate for
                the configured format (e.g. ``application/json``).
            content_encoding: ``"gzip"`` for the JSONL_GZIP format,
                ``None`` for uncompressed formats. Destinations that
                model HTTP-style headers (such as Azure Blob Storage)
                should persist this on the blob; destinations that
                cannot represent it may ignore it.

        Returns:
            ``None``.
        """
        ...


# The protocol is the only public name of this module.
__all__ = ["HistoryWriter"]
# Single source of truth for the in-memory backend's port.
PORT = 50300
HOST = f"localhost:{PORT}"
CONTAINER_NAME = os.getenv("EXPORT_CONTAINER", "history-export-sample")
AZURITE_CONN_STR = os.getenv(
    "STORAGE_CONNECTION_STRING", "UseDevelopmentStorage=true"
)


# --------------- Activities & orchestrator (synthetic workload) ---------------

def square(_: task.ActivityContext, n: int) -> int:
    """Return *n* squared; the activity context is unused."""
    return n * n


def sample_orchestrator(ctx: task.OrchestrationContext, n: int):
    """Run the ``square`` activity once and return its result."""
    result = yield ctx.call_activity(square, input=n)
    return result


# --------------- Main ---------------

def _seed_orchestrations(dt_client) -> None:
    """Run five sample orchestrations to completion so there is history to export."""
    print("\nSeeding sample orchestrations...")
    for n in range(1, 6):
        sid = dt_client.schedule_new_orchestration(sample_orchestrator, input=n)
        state = dt_client.wait_for_orchestration_completion(sid, timeout=30)
        # Raise instead of ``assert`` so the check survives ``python -O``.
        if not state or state.runtime_status != client.OrchestrationStatus.COMPLETED:
            raise RuntimeError(f"Sample orchestration {sid!r} did not complete")
    time.sleep(0.5)


def _print_job_summary(final) -> None:
    """Print the status and counters of the finished export job."""
    print("\nFinal job status:")
    print(f" status: {final.status.value}")
    print(f" scanned_instances: {final.scanned_instances}")
    print(f" exported_instances: {final.exported_instances}")
    print(f" failed_instances: {final.failed_instances}")
    if final.last_error:
        print(f" last_error: {final.last_error}")


def _run_export(writer) -> None:
    """Start a worker, seed history, run one BATCH export job and report it."""
    dt_client = client.TaskHubGrpcClient(host_address=HOST)
    export_client = ExportHistoryClient(dt_client, writer)

    with worker.TaskHubGrpcWorker(host_address=HOST) as w:
        # Register the workload orchestrator and activity.
        w.add_orchestrator(sample_orchestrator)
        w.add_activity(square)

        # Register the export-job entity, activities, and orchestrator.
        export_client.register_worker(w)
        w.start()

        # Seed some terminal instances to export.
        _seed_orchestrations(dt_client)

        # Create an export job for the seeded window.
        now = datetime.now(timezone.utc)
        print("\nCreating export job...")
        desc = export_client.create_job(
            ExportJobCreationOptions(
                mode=ExportMode.BATCH,
                completed_time_from=now - timedelta(hours=1),
                completed_time_to=now + timedelta(hours=1),
                destination=ExportDestination(container=CONTAINER_NAME, prefix="sample-run"),
                format=ExportFormat(kind=ExportFormatKind.JSONL_GZIP),
                max_instances_per_batch=10,
            )
        )
        print(f" job_id: {desc.job_id}")
        print(f" orchestrator_instance_id: {desc.orchestrator_instance_id}")

        final = export_client.wait_for_job(desc.job_id, timeout=120, poll_interval=0.5)
        _print_job_summary(final)


def main() -> None:
    """Run the sample end to end against an in-memory backend and blob storage."""
    print(f"Using container: {CONTAINER_NAME}")
    # Never echo the connection string itself: for a real storage account
    # it contains the account key.
    storage_target = (
        "Azurite (development storage)"
        if AZURITE_CONN_STR == "UseDevelopmentStorage=true"
        else "custom account from STORAGE_CONNECTION_STRING"
    )
    print(f"Using storage connection: {storage_target}")

    backend = create_test_backend(port=PORT)
    try:
        writer = AzureBlobHistoryExportWriter(
            AzureBlobHistoryExportWriterOptions(
                container_name=CONTAINER_NAME,
                connection_string=AZURITE_CONN_STR,
                api_version="2024-08-04",
            )
        )
        try:
            _run_export(writer)
        finally:
            # Close the writer even when the export run fails.
            writer.close()
    finally:
        backend.stop()
        backend.reset()


if __name__ == "__main__":
    main()
"https://github.com/microsoft/durabletask-python" diff --git a/tests/durabletask/extensions/__init__.py b/tests/durabletask/extensions/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/durabletask/extensions/history_export/__init__.py b/tests/durabletask/extensions/history_export/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/durabletask/extensions/history_export/_test_helpers.py b/tests/durabletask/extensions/history_export/_test_helpers.py new file mode 100644 index 00000000..3f01a646 --- /dev/null +++ b/tests/durabletask/extensions/history_export/_test_helpers.py @@ -0,0 +1,52 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Shared fixtures and helpers for the history-export test package. + +The most useful helper here is :func:`wait_until`, which lets tests +poll for an asynchronous state change (e.g. an entity processing a +signal) instead of sleeping for a fixed duration. Polling is both +faster on the happy path and much less flaky than a single fixed +sleep, and it lets every test fail fast when the expected condition +never materializes. +""" + +from __future__ import annotations + +import time +from typing import Any, Callable, Optional, TypeVar + +T = TypeVar("T") + + +def wait_until( + predicate: Callable[[], Optional[T]], + *, + timeout: float = 10.0, + interval: float = 0.05, + description: str = "condition", +) -> T: + """Poll *predicate* until it returns a truthy value or *timeout* elapses. + + The predicate is called immediately and then every *interval* + seconds. Returns the first truthy value the predicate produced. + Raises :class:`AssertionError` if *timeout* elapses with no truthy + return — the message includes *description* and the last value. 
+ """ + deadline = time.monotonic() + timeout + last: Any = None + while True: + try: + value = predicate() + except Exception as ex: # noqa: BLE001 + value = None + last = f"" + if value: + return value # type: ignore[return-value] + last = value if last is None else last + if time.monotonic() >= deadline: + raise AssertionError( + f"Timed out after {timeout}s waiting for {description}; " + f"last value was {last!r}" + ) + time.sleep(interval) diff --git a/tests/durabletask/extensions/history_export/test_activities.py b/tests/durabletask/extensions/history_export/test_activities.py new file mode 100644 index 00000000..d329b659 --- /dev/null +++ b/tests/durabletask/extensions/history_export/test_activities.py @@ -0,0 +1,232 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""E2E tests for history-export activities against the in-memory backend. + +Tests share a single backend + worker per module to avoid paying the +worker start/shutdown cost on every test. The bound activity context +is module-global, so individual tests swap their own +:class:`HistoryExportContext` (writer + client) in via ``bind_context`` +at the top of the test, and restore a clean slate via ``clear_context`` +on teardown. 
PORT = 50261
HOST = f"localhost:{PORT}"


class _InMemoryWriter:
    """Writer that keeps every written blob in a dict keyed by blob name."""

    def __init__(self) -> None:
        self._lock = threading.Lock()
        self.blobs: dict[str, dict] = {}

    def write(self, *, instance_id, blob_name, payload, content_type, content_encoding):
        with self._lock:
            self.blobs[blob_name] = {
                "instance_id": instance_id,
                "payload": payload,
                "content_type": content_type,
                "content_encoding": content_encoding,
            }


def _echo_orchestrator(ctx: task.OrchestrationContext, input):
    """Test orchestrator: complete immediately with the input as output."""
    return input


def _list_then_export(ctx: task.OrchestrationContext, input):
    """Test orchestrator: list one page, then export each returned id."""
    page = yield ctx.call_activity(LIST_TERMINAL_INSTANCES_ACTIVITY, input=input["list"])
    results = []
    for instance_id in page["instance_ids"]:
        export_input = {
            "instance_id": instance_id,
            "format": input["format"],
            "destination": input["destination"],
        }
        result = yield ctx.call_activity(EXPORT_INSTANCE_HISTORY_ACTIVITY, input=export_input)
        results.append(result)
    return {"page": page, "results": results}


def _orchestration_input(fmt: ExportFormat, dest: ExportDestination) -> dict:
    """Build the ``_list_then_export`` input for COMPLETED instances within ±1 hour of now."""
    now = datetime.now(timezone.utc)
    return {
        "list": {
            "runtime_status": ["COMPLETED"],
            "completed_time_from": (now - timedelta(hours=1)).isoformat(),
            "completed_time_to": (now + timedelta(hours=1)).isoformat(),
            "page_size": 50,
            "continuation_token": None,
        },
        "format": fmt._to_dict(),
        "destination": dest._to_dict(),
    }


def _run_list_then_export(c, fmt: ExportFormat, dest: ExportDestination):
    """Schedule ``_list_then_export`` and return its final state (never ``None``)."""
    run_id = c.schedule_new_orchestration(
        _list_then_export, input=_orchestration_input(fmt, dest)
    )
    state = c.wait_for_orchestration_completion(
        run_id, timeout=30, fetch_payloads=True
    )
    assert state is not None
    return state


@pytest.fixture(scope="module")
def backend():
    b = create_test_backend(port=PORT)
    yield b
    b.stop()
    b.reset()


@pytest.fixture(scope="module")
def w(backend):
    w_ = worker.TaskHubGrpcWorker(host_address=HOST)
    w_.add_orchestrator(_echo_orchestrator)
    w_.add_orchestrator(_list_then_export)
    register_activities(w_)
    w_.start()
    yield w_
    w_.stop()


@pytest.fixture(scope="module")
def c(w):
    return client.TaskHubGrpcClient(host_address=HOST)


@pytest.fixture(autouse=True)
def _isolate_context():
    """Each test must explicitly bind its own context."""
    clear_context()
    yield
    clear_context()


@pytest.fixture(scope="module")
def seeded_ids(c, w):
    """Three completed orchestrations shared across tests."""
    ids: list[str] = []
    for value in ["a", "b", "c"]:
        sid = c.schedule_new_orchestration(_echo_orchestrator, input=value)
        state = c.wait_for_orchestration_completion(sid, timeout=30)
        assert state is not None
        assert state.runtime_status == client.OrchestrationStatus.COMPLETED
        ids.append(sid)
    return ids


# ---------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------


def test_activities_list_and_export_to_in_memory_writer(c, seeded_ids):
    writer = _InMemoryWriter()
    bind_context(HistoryExportContext(client=c, writer=writer))

    state = _run_list_then_export(
        c,
        ExportFormat(kind=ExportFormatKind.JSONL_GZIP),
        ExportDestination(container="exports", prefix="run-1"),
    )

    assert state.runtime_status == client.OrchestrationStatus.COMPLETED, state.failure_details
    output = json.loads(state.serialized_output or "null")
    listed_ids = output["page"]["instance_ids"]
    assert set(seeded_ids).issubset(set(listed_ids))
    assert all(r["success"] for r in output["results"]), output["results"]

    blob_names = {b["instance_id"]: name for name, b in writer.blobs.items()}
    for sid in seeded_ids:
        assert sid in blob_names
        name = blob_names[sid]
        assert name.startswith("run-1/")
        assert name.endswith(".jsonl.gz")
        entry = writer.blobs[name]
        assert entry["content_type"] == "application/x-ndjson"
        assert entry["content_encoding"] == "gzip"
        raw = gzip.decompress(entry["payload"]).decode("utf-8")
        lines = raw.strip().split("\n")
        assert len(lines) >= 2  # metadata + at least one event
        meta = json.loads(lines[0])
        assert meta["instance_id"] == sid


def test_export_activity_reports_failure_when_writer_raises(c, seeded_ids):
    class FailingWriter:
        def write(self, **_):
            raise RuntimeError("disk full")

    bind_context(HistoryExportContext(client=c, writer=FailingWriter()))

    state = _run_list_then_export(
        c,
        ExportFormat(kind=ExportFormatKind.JSON),
        ExportDestination(container="exports"),
    )

    assert state.runtime_status == client.OrchestrationStatus.COMPLETED, state.failure_details
    output = json.loads(state.serialized_output or "null")
    failures = [r for r in output["results"] if not r["success"]]
    assert failures, "expected at least one failure"
    assert all("disk full" in r["error"] for r in failures)


def test_activities_require_bound_context(c):
    # Do NOT bind a context. The activities should raise.
    state = _run_list_then_export(
        c,
        ExportFormat(kind=ExportFormatKind.JSON),
        ExportDestination(container="exports"),
    )

    assert state.runtime_status == client.OrchestrationStatus.FAILED
    assert state.failure_details is not None
    assert "without a bound context" in (state.failure_details.message or "")
azure_blob = pytest.importorskip("azure.storage.blob")


AZURITE_CONN_STR = "UseDevelopmentStorage=true"
AZURITE_API_VERSION = "2024-08-04"
TEST_CONTAINER = f"e2e-history-export-{uuid.uuid4().hex[:8]}"


def _blob_service_client(**client_kwargs):
    """Return a ``BlobServiceClient`` for the Azurite connection string."""
    return azure_blob.BlobServiceClient.from_connection_string(
        AZURITE_CONN_STR, api_version=AZURITE_API_VERSION, **client_kwargs
    )


def _blob_client(blob_name: str):
    """Return a client for *blob_name* inside the module's test container."""
    container = _blob_service_client().get_container_client(TEST_CONTAINER)
    return container.get_blob_client(blob_name)


def _azurite_is_running() -> bool:
    """Probe the blob endpoint once; any error means Azurite is unavailable."""
    try:
        # This runs at import time (see ``pytestmark``), so disable retries
        # and shorten the connect timeout to keep collection fast when
        # Azurite is down.
        svc = _blob_service_client(retry_total=0, connection_timeout=2)
        next(iter(svc.list_containers(results_per_page=1)), None)
        return True
    except Exception:
        return False


pytestmark = [
    pytest.mark.azurite,
    pytest.mark.skipif(
        not _azurite_is_running(),
        reason="Azurite blob service is not running on 127.0.0.1:10000",
    ),
]


@pytest.fixture(scope="module")
def writer():
    w = AzureBlobHistoryExportWriter(
        AzureBlobHistoryExportWriterOptions(
            container_name=TEST_CONTAINER,
            connection_string=AZURITE_CONN_STR,
            api_version=AZURITE_API_VERSION,
        )
    )
    yield w
    w.close()
    try:
        _blob_service_client().delete_container(TEST_CONTAINER)
    except Exception:
        # Best-effort cleanup: a leftover container must not fail the run.
        pass


def test_options_validate_required_fields():
    with pytest.raises(ValueError):
        AzureBlobHistoryExportWriterOptions(container_name="")
    with pytest.raises(ValueError):
        AzureBlobHistoryExportWriterOptions(container_name="c")


def test_write_json_blob(writer):
    fmt = ExportFormat(kind=ExportFormatKind.JSON)
    payload = serialize_history([], instance_id="inst-json", fmt=fmt)
    blob_name = f"json/{uuid.uuid4().hex}.json"
    writer.write(
        instance_id="inst-json",
        blob_name=blob_name,
        payload=payload,
        content_type=content_type_for(fmt),
        content_encoding=content_encoding_for(fmt),
    )

    blob = _blob_client(blob_name)
    props = blob.get_blob_properties()
    assert props.content_settings.content_type == "application/json"
    downloaded = blob.download_blob().readall()
    doc = json.loads(downloaded)
    assert doc["instance_id"] == "inst-json"
    assert doc["events"] == []


def test_write_jsonl_gzip_blob(writer):
    fmt = ExportFormat(kind=ExportFormatKind.JSONL_GZIP)
    payload = serialize_history([], instance_id="inst-gz", fmt=fmt)
    blob_name = f"gz/{uuid.uuid4().hex}.jsonl.gz"
    writer.write(
        instance_id="inst-gz",
        blob_name=blob_name,
        payload=payload,
        content_type=content_type_for(fmt),
        content_encoding=content_encoding_for(fmt),
    )

    blob = _blob_client(blob_name)
    props = blob.get_blob_properties()
    assert props.content_settings.content_type == "application/x-ndjson"
    assert props.content_settings.content_encoding == "gzip"
    downloaded = blob.download_blob().readall()
    # The Azure SDK may auto-decompress when Content-Encoding: gzip is set;
    # accept either the gzipped bytes or the decoded text.
    if downloaded[:2] == b"\x1f\x8b":
        text = gzip.decompress(downloaded).decode("utf-8")
    else:
        text = downloaded.decode("utf-8")
    first = json.loads(text.strip().split("\n")[0])
    assert first["instance_id"] == "inst-gz"
    assert first["kind"] == "metadata"
+ + +def _echo(ctx: task.OrchestrationContext, input): + return input + + +@pytest.fixture(scope="module") +def backend(): + b = create_test_backend(port=PORT) + yield b + b.stop() + b.reset() + clear_context() + + +@pytest.fixture(scope="module") +def writer() -> _InMemoryWriter: + return _InMemoryWriter() + + +@pytest.fixture(scope="module") +def dt_client(backend): + return client.TaskHubGrpcClient(host_address=HOST) + + +@pytest.fixture(scope="module") +def export_client(dt_client, writer): + return ExportHistoryClient(dt_client, writer) + + +@pytest.fixture(scope="module") +def w(backend, export_client): + w_ = worker.TaskHubGrpcWorker(host_address=HOST) + w_.add_orchestrator(_echo) + export_client.register_worker(w_) + w_.start() + yield w_ + w_.stop() + + +@pytest.fixture(scope="module") +def seeded_terminal_instances(dt_client, w): + """Three seeded terminal orchestrations shared across tests.""" + ids: list[str] = [] + for v in ["x", "y", "z"]: + sid = dt_client.schedule_new_orchestration(_echo, input=v) + state = dt_client.wait_for_orchestration_completion(sid, timeout=30) + assert state and state.runtime_status == client.OrchestrationStatus.COMPLETED + ids.append(sid) + return ids + + +# --------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------- + + +def test_create_get_and_wait_for_job_end_to_end( + dt_client, export_client, writer, seeded_terminal_instances, +): + now = datetime.now(timezone.utc) + desc = export_client.create_job( + ExportJobCreationOptions( + mode=ExportMode.BATCH, + completed_time_from=now - timedelta(hours=1), + completed_time_to=now + timedelta(hours=1), + destination=ExportDestination(container="exports", prefix="run-G"), + format=ExportFormat(kind=ExportFormatKind.JSONL_GZIP), + max_instances_per_batch=2, + ) + ) + + assert desc.job_id + assert desc.status == ExportJobStatus.PENDING + assert desc.config is not None + assert 
desc.orchestrator_instance_id == f"export-job-{desc.job_id}" + + final = export_client.wait_for_job(desc.job_id, timeout=30, poll_interval=0.1) + + assert final.status == ExportJobStatus.COMPLETED + assert final.exported_instances >= len(seeded_terminal_instances) + assert final.failed_instances == 0 + assert final.last_error is None + + matching_blobs = [ + (name, entry) for name, entry in writer.blobs.items() + if name.startswith("run-G/") + ] + assert len(matching_blobs) >= len(seeded_terminal_instances) + for name, entry in matching_blobs: + assert name.endswith(".jsonl.gz") + raw = gzip.decompress(entry["payload"]).decode("utf-8") + first = json.loads(raw.strip().split("\n")[0]) + assert first["kind"] == "metadata" + + +def test_get_job_returns_none_for_unknown_id(export_client): + assert export_client.get_job("does-not-exist") is None + + +def test_wait_for_job_raises_lookup_when_job_never_exists(export_client): + with pytest.raises(ExportJobNotFoundError): + export_client.wait_for_job("never-created", timeout=0.5, poll_interval=0.1) + with pytest.raises(LookupError): + export_client.wait_for_job("never-created", timeout=0.5, poll_interval=0.1) + + +def test_wait_for_job_times_out_when_status_stays_active( + dt_client, writer, monkeypatch, +): + local_client = ExportHistoryClient(dt_client, writer) + fake_desc = ExportJobCreationOptions( + mode=ExportMode.BATCH, + completed_time_from=datetime.now(timezone.utc) - timedelta(hours=1), + completed_time_to=datetime.now(timezone.utc) + timedelta(hours=1), + destination=ExportDestination(container="c"), + format=ExportFormat(kind=ExportFormatKind.JSON), + ).to_configuration() + + from durabletask.extensions.history_export.models import ExportJobDescription + + def fake_get_job(self, job_id): + return ExportJobDescription( + job_id=job_id, + status=ExportJobStatus.ACTIVE, + created_at=datetime.now(timezone.utc), + last_modified_at=datetime.now(timezone.utc), + config=fake_desc, + orchestrator_instance_id="orch-1", + 
scanned_instances=0, + exported_instances=0, + failed_instances=0, + last_error=None, + checkpoint=None, + last_checkpoint_time=None, + ) + + monkeypatch.setattr(ExportHistoryClient, "get_job", fake_get_job) + + with pytest.raises(TimeoutError): + local_client.wait_for_job("stuck-job", timeout=0.3, poll_interval=0.05) + + +def test_delete_job_clears_entity_state(dt_client, export_client): + now = datetime.now(timezone.utc) + desc = export_client.create_job( + ExportJobCreationOptions( + mode=ExportMode.BATCH, + completed_time_from=now - timedelta(hours=1), + completed_time_to=now + timedelta(hours=1), + destination=ExportDestination(container="exports"), + format=ExportFormat(kind=ExportFormatKind.JSON), + max_instances_per_batch=10, + ) + ) + + final = export_client.wait_for_job(desc.job_id, timeout=15, poll_interval=0.1) + assert final.status == ExportJobStatus.COMPLETED + + export_client.delete_job(desc.job_id) + wait_until( + lambda: export_client.get_job(desc.job_id) is None, + description="job to disappear after delete", + timeout=5.0, + ) + + +def test_list_jobs_returns_created_jobs_and_supports_status_filter(export_client): + now = datetime.now(timezone.utc) + completed_ids: list[str] = [] + for _ in range(3): + d = export_client.create_job( + ExportJobCreationOptions( + mode=ExportMode.BATCH, + completed_time_from=now - timedelta(hours=1), + completed_time_to=now + timedelta(hours=1), + destination=ExportDestination(container="exports"), + format=ExportFormat(kind=ExportFormatKind.JSON), + max_instances_per_batch=10, + ) + ) + export_client.wait_for_job(d.job_id, timeout=15, poll_interval=0.1) + completed_ids.append(d.job_id) + + all_jobs = list(export_client.list_jobs()) + seen = {j.job_id for j in all_jobs} + for jid in completed_ids: + assert jid in seen + + completed_only = list( + export_client.list_jobs( + ExportJobQuery(status=[ExportJobStatus.COMPLETED]) + ) + ) + assert {j.job_id for j in completed_only} >= set(completed_ids) + assert all(j.status 
== ExportJobStatus.COMPLETED for j in completed_only) + + failed_only = list( + export_client.list_jobs( + ExportJobQuery(status=[ExportJobStatus.FAILED]) + ) + ) + assert all(j.status == ExportJobStatus.FAILED for j in failed_only) + + +def test_export_history_job_client_round_trip(export_client): + now = datetime.now(timezone.utc) + desc = export_client.create_job( + ExportJobCreationOptions( + mode=ExportMode.BATCH, + completed_time_from=now - timedelta(hours=1), + completed_time_to=now + timedelta(hours=1), + destination=ExportDestination(container="exports"), + format=ExportFormat(kind=ExportFormatKind.JSON), + max_instances_per_batch=10, + ) + ) + job_client = export_client.get_job_client(desc.job_id) + assert isinstance(job_client, ExportHistoryJobClient) + assert job_client.job_id == desc.job_id + assert job_client.orchestrator_instance_id == orchestrator_instance_id_for( + desc.job_id + ) + + final = job_client.wait(timeout=15, poll_interval=0.1) + assert final.status == ExportJobStatus.COMPLETED + + snap = job_client.describe() + assert snap is not None + assert snap.status == ExportJobStatus.COMPLETED + + job_client.delete() + wait_until( + lambda: job_client.describe() is None, + description="job to disappear after delete", + timeout=5.0, + ) + + +def test_export_history_job_client_rejects_empty_job_id(export_client): + with pytest.raises(ValueError): + export_client.get_job_client("") diff --git a/tests/durabletask/extensions/history_export/test_entity.py b/tests/durabletask/extensions/history_export/test_entity.py new file mode 100644 index 00000000..4d8399ed --- /dev/null +++ b/tests/durabletask/extensions/history_export/test_entity.py @@ -0,0 +1,289 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""E2E tests for :class:`ExportJobEntity` against the in-memory backend. + +Tests share a single backend + worker per module to avoid paying the +worker start/shutdown cost on every test. 
Each test uses a unique +job ID so there is no cross-test interference. +""" + +from __future__ import annotations + +import json +from datetime import datetime, timezone +from typing import Callable, Optional + +import pytest + +from durabletask import client, entities, task, worker +from durabletask.extensions.history_export import ( + ExportDestination, + ExportJobCreationOptions, + ExportJobStatus, + ExportMode, + orchestrator_instance_id_for, +) +from durabletask.extensions.history_export.entity import ( + ENTITY_NAME, + ExportJobEntity, +) +from durabletask.testing import create_test_backend + +from tests.durabletask.extensions.history_export._test_helpers import wait_until + + +PORT = 50260 +HOST = f"localhost:{PORT}" + +_WINDOW_START = datetime(2025, 1, 1, tzinfo=timezone.utc) +_WINDOW_END = datetime(2025, 1, 2, tzinfo=timezone.utc) + + +def _no_op_orchestrator(ctx: task.OrchestrationContext, _input): + # The entity's run() op schedules an orchestrator named + # ``export_job_orchestrator``. These tests focus on entity + # behaviour, so register a no-op stub under that canonical name. 
+ return None + + +@pytest.fixture(scope="module") +def backend(): + b = create_test_backend(port=PORT) + yield b + b.stop() + b.reset() + + +@pytest.fixture(scope="module") +def w(backend): + def export_job_orchestrator(ctx: task.OrchestrationContext, _input): + return None + w_ = worker.TaskHubGrpcWorker(host_address=HOST) + w_.add_entity(ExportJobEntity, name=ENTITY_NAME) + w_.add_orchestrator(export_job_orchestrator) + w_.start() + yield w_ + w_.stop() + + +@pytest.fixture(scope="module") +def c(w): + return client.TaskHubGrpcClient(host_address=HOST) + + +# --------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------- + + +def _create_payload() -> dict: + cfg = ExportJobCreationOptions( + mode=ExportMode.BATCH, + completed_time_from=_WINDOW_START, + completed_time_to=_WINDOW_END, + destination=ExportDestination(container="exports", prefix="run-1"), + ).to_configuration() + return {"config": cfg._to_dict()} + + +def _state_dict(metadata) -> dict: + raw = metadata.get_state(str) + assert raw is not None + return json.loads(raw) + + +def _wait_for_state( + c: client.TaskHubGrpcClient, + entity_id: entities.EntityInstanceId, + predicate: Callable[[dict], bool], + *, + description: str, + timeout: float = 5.0, +) -> dict: + """Poll the entity until its state satisfies *predicate*.""" + def _check() -> Optional[dict]: + meta = c.get_entity(entity_id, include_state=True) + if meta is None: + return None + raw = meta.get_state(str) + if not raw: + return None + try: + state = json.loads(raw) + except (TypeError, ValueError): + return None + return state if predicate(state) else None + + return wait_until(_check, timeout=timeout, description=description) + + +def _wait_for_status( + c: client.TaskHubGrpcClient, + entity_id: entities.EntityInstanceId, + expected: ExportJobStatus, + *, + timeout: float = 5.0, +) -> dict: + return _wait_for_state( + c, + entity_id, + lambda s: 
s.get("status") == expected.value, + description=f"entity {entity_id} to reach status {expected.value}", + timeout=timeout, + ) + + +# --------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------- + + +def test_create_persists_pending_status(c) -> None: + entity_id = entities.EntityInstanceId(ENTITY_NAME, "job-1") + c.signal_entity(entity_id, "create", input=_create_payload()) + + state = _wait_for_status(c, entity_id, ExportJobStatus.PENDING) + assert state["schema_version"] == "1.0" + assert state["status"] == ExportJobStatus.PENDING.value + assert state["orchestrator_instance_id"] is None + assert state["config"]["destination"]["container"] == "exports" + assert state["failures"] == [] + + +def test_run_transitions_to_active_and_records_orchestrator_instance_id(c) -> None: + entity_id = entities.EntityInstanceId(ENTITY_NAME, "job-1b") + c.signal_entity(entity_id, "create", input=_create_payload()) + c.signal_entity(entity_id, "run") + + state = _wait_for_status(c, entity_id, ExportJobStatus.ACTIVE) + assert state["orchestrator_instance_id"] == orchestrator_instance_id_for("job-1b") + + +def test_create_on_active_job_is_rejected_and_state_unchanged(c) -> None: + entity_id = entities.EntityInstanceId(ENTITY_NAME, "job-1c") + c.signal_entity(entity_id, "create", input=_create_payload()) + c.signal_entity(entity_id, "run") + _wait_for_status(c, entity_id, ExportJobStatus.ACTIVE) + + c.signal_entity(entity_id, "commit_checkpoint", input={"scanned_delta": 7}) + _wait_for_state( + c, entity_id, + lambda s: s.get("scanned_instances") == 7, + description="scanned_instances to reach 7", + ) + + # A second create on an ACTIVE job is rejected by the transitions + # matrix. The signal is one-way so we don't see the exception + # client-side, but the state remains ACTIVE with progress intact. 
+ c.signal_entity(entity_id, "create", input=_create_payload()) + state = _wait_for_state( + c, entity_id, + lambda s: ( + s.get("status") == ExportJobStatus.ACTIVE.value + and s.get("scanned_instances") == 7 + ), + description="state to remain ACTIVE with scanned_instances=7", + ) + assert state["scanned_instances"] == 7 + + +def test_create_after_failure_resets_to_pending(c) -> None: + entity_id = entities.EntityInstanceId(ENTITY_NAME, "job-1d") + c.signal_entity(entity_id, "create", input=_create_payload()) + c.signal_entity(entity_id, "run") + _wait_for_status(c, entity_id, ExportJobStatus.ACTIVE) + + c.signal_entity(entity_id, "mark_failed", input={"reason": "boom"}) + _wait_for_status(c, entity_id, ExportJobStatus.FAILED) + + c.signal_entity(entity_id, "create", input=_create_payload()) + _wait_for_status(c, entity_id, ExportJobStatus.PENDING) + + +def test_commit_checkpoint_requires_active_status(c) -> None: + entity_id = entities.EntityInstanceId(ENTITY_NAME, "job-2") + c.signal_entity(entity_id, "create", input=_create_payload()) + c.signal_entity(entity_id, "run") + _wait_for_status(c, entity_id, ExportJobStatus.ACTIVE) + + c.signal_entity( + entity_id, + "commit_checkpoint", + input={ + "scanned_delta": 10, + "exported_delta": 8, + "failed_delta": 2, + "last_instance_key": "ts|inst-9", + }, + ) + c.signal_entity( + entity_id, + "commit_checkpoint", + input={"scanned_delta": 5, "exported_delta": 5}, + ) + + state = _wait_for_state( + c, entity_id, + lambda s: s.get("scanned_instances") == 15, + description="scanned_instances to reach 15", + ) + assert state["status"] == ExportJobStatus.ACTIVE.value + assert state["scanned_instances"] == 15 + assert state["exported_instances"] == 13 + assert state["failed_instances"] == 2 + assert state["checkpoint"]["last_instance_key"] == "ts|inst-9" + + +def test_commit_checkpoint_records_failures_and_marks_failed(c) -> None: + entity_id = entities.EntityInstanceId(ENTITY_NAME, "job-2b") + c.signal_entity(entity_id, 
"create", input=_create_payload()) + c.signal_entity(entity_id, "run") + _wait_for_status(c, entity_id, ExportJobStatus.ACTIVE) + + c.signal_entity( + entity_id, + "commit_checkpoint", + input={ + "scanned_delta": 0, + "exported_delta": 0, + "failed_delta": 2, + "failures": [ + { + "instance_id": "inst-a", + "reason": "timeout", + "attempt_count": 3, + "last_attempt": "2026-01-01T00:00:00+00:00", + }, + { + "instance_id": "inst-b", + "reason": "boom", + "attempt_count": 3, + "last_attempt": "2026-01-01T00:00:00+00:00", + }, + ], + "mark_failed_on_batch": True, + }, + ) + + state = _wait_for_status(c, entity_id, ExportJobStatus.FAILED) + assert state["failed_instances"] == 2 + assert len(state["failures"]) == 2 + assert "inst-a: timeout" in state["last_error"] + + +def test_mark_completed_sets_status(c) -> None: + entity_id = entities.EntityInstanceId(ENTITY_NAME, "job-3") + c.signal_entity(entity_id, "create", input=_create_payload()) + c.signal_entity(entity_id, "run") + c.signal_entity(entity_id, "mark_completed") + state = _wait_for_status(c, entity_id, ExportJobStatus.COMPLETED) + assert state["last_error"] is None + + +def test_mark_failed_records_reason_from_pending(c) -> None: + entity_id = entities.EntityInstanceId(ENTITY_NAME, "job-4") + c.signal_entity(entity_id, "create", input=_create_payload()) + c.signal_entity(entity_id, "mark_failed", input={"reason": "boom"}) + state = _wait_for_status(c, entity_id, ExportJobStatus.FAILED) + assert state["last_error"] == "boom" diff --git a/tests/durabletask/extensions/history_export/test_models.py b/tests/durabletask/extensions/history_export/test_models.py new file mode 100644 index 00000000..078933ca --- /dev/null +++ b/tests/durabletask/extensions/history_export/test_models.py @@ -0,0 +1,209 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +"""Tests for the history-export public data models.""" + +from datetime import datetime, timezone, timedelta + +import pytest + +from durabletask.client import OrchestrationStatus +from durabletask.extensions.history_export import ( + ExportCheckpoint, + ExportDestination, + ExportFailure, + ExportFilter, + ExportFormat, + ExportFormatKind, + ExportJobConfiguration, + ExportJobCreationOptions, + ExportJobDescription, + ExportJobStatus, + ExportMode, +) +from durabletask.extensions.history_export.models import ( + STATE_SCHEMA_VERSION, + ExportJobState, +) + + +_WINDOW_START = datetime(2025, 1, 1, tzinfo=timezone.utc) +_WINDOW_END = datetime(2025, 1, 2, tzinfo=timezone.utc) + + +def _basic_destination() -> ExportDestination: + return ExportDestination(container="exports", prefix="run-1") + + +def _basic_options() -> ExportJobCreationOptions: + return ExportJobCreationOptions( + mode=ExportMode.BATCH, + completed_time_from=_WINDOW_START, + completed_time_to=_WINDOW_END, + destination=_basic_destination(), + ) + + +class TestDestination: + def test_requires_container(self) -> None: + with pytest.raises(ValueError): + ExportDestination(container="") + + +class TestConfigurationValidation: + def test_batch_requires_completed_time_to(self) -> None: + with pytest.raises(ValueError, match="completed_time_to"): + ExportJobConfiguration( + mode=ExportMode.BATCH, + filter=ExportFilter(completed_time_from=_WINDOW_START), + destination=_basic_destination(), + ) + + def test_max_instances_per_batch_must_be_positive(self) -> None: + with pytest.raises(ValueError, match="max_instances_per_batch"): + ExportJobConfiguration( + mode=ExportMode.BATCH, + filter=ExportFilter( + completed_time_from=_WINDOW_START, + completed_time_to=_WINDOW_END, + ), + destination=_basic_destination(), + max_instances_per_batch=0, + ) + + def test_valid_config(self) -> None: + cfg = _basic_options().to_configuration() + assert cfg.mode is ExportMode.BATCH + assert cfg.filter.completed_time_to == 
_WINDOW_END + assert cfg.format.kind is ExportFormatKind.JSONL_GZIP + assert cfg.format.schema_version == "1.0" + assert cfg.max_instances_per_batch == 100 + assert cfg.max_parallel_exports == 32 + + def test_max_parallel_exports_must_be_positive(self) -> None: + with pytest.raises(ValueError, match="max_parallel_exports"): + ExportJobConfiguration( + mode=ExportMode.BATCH, + filter=ExportFilter( + completed_time_from=_WINDOW_START, + completed_time_to=_WINDOW_END, + ), + destination=_basic_destination(), + max_parallel_exports=0, + ) + + +class TestFilterDefaults: + def test_default_runtime_statuses(self) -> None: + f = ExportFilter(completed_time_from=_WINDOW_START, completed_time_to=_WINDOW_END) + statuses = f.effective_runtime_status() + assert OrchestrationStatus.COMPLETED in statuses + assert OrchestrationStatus.FAILED in statuses + assert OrchestrationStatus.TERMINATED in statuses + assert OrchestrationStatus.RUNNING not in statuses + + def test_explicit_runtime_statuses(self) -> None: + f = ExportFilter( + completed_time_from=_WINDOW_START, + completed_time_to=_WINDOW_END, + runtime_status=[OrchestrationStatus.COMPLETED], + ) + assert f.effective_runtime_status() == [OrchestrationStatus.COMPLETED] + + +class TestRoundTrip: + def test_configuration_round_trip(self) -> None: + cfg = ExportJobConfiguration( + mode=ExportMode.BATCH, + filter=ExportFilter( + completed_time_from=_WINDOW_START, + completed_time_to=_WINDOW_END, + runtime_status=[OrchestrationStatus.COMPLETED, OrchestrationStatus.FAILED], + ), + destination=ExportDestination(container="c", prefix="p"), + format=ExportFormat(kind=ExportFormatKind.JSON, schema_version="1.0"), + max_instances_per_batch=25, + ) + restored = ExportJobConfiguration._from_dict(cfg._to_dict()) + assert restored == cfg + + def test_checkpoint_round_trip(self) -> None: + cp = ExportCheckpoint(last_instance_key="abc|xyz") + assert ExportCheckpoint._from_dict(cp._to_dict()) == cp + + def test_failure_round_trip(self) -> None: + 
f = ExportFailure( + instance_id="i1", + reason="boom", + attempt_count=3, + last_attempt=_WINDOW_END, + ) + assert ExportFailure._from_dict(f._to_dict()) == f + + def test_naive_datetimes_are_treated_as_utc(self) -> None: + naive = datetime(2025, 1, 1, 12, 0, 0) + f = ExportFilter(completed_time_from=naive, completed_time_to=_WINDOW_END) + d = f._to_dict() + restored = ExportFilter._from_dict(d) + assert restored.completed_time_from == naive.replace(tzinfo=timezone.utc) + + +class TestDescriptionFromState: + def test_from_new_state(self) -> None: + cfg = _basic_options().to_configuration() + created = _WINDOW_END + timedelta(minutes=5) + state = ExportJobState.new( + cfg, + created_at=created, + orchestrator_instance_id="orch-1", + ) + desc = ExportJobDescription._from_state_dict("job-1", state._to_dict()) + + assert desc.job_id == "job-1" + assert desc.status is ExportJobStatus.ACTIVE + assert desc.created_at == created + assert desc.last_modified_at == created + assert desc.orchestrator_instance_id == "orch-1" + assert desc.scanned_instances == 0 + assert desc.exported_instances == 0 + assert desc.failed_instances == 0 + assert desc.last_error is None + assert desc.config is not None + assert desc.config.mode is ExportMode.BATCH + assert desc.checkpoint is not None + assert desc.checkpoint.last_instance_key is None + assert desc.failures == [] + + +class TestExportJobStateRoundTrip: + def test_state_round_trip_preserves_schema_version(self) -> None: + cfg = _basic_options().to_configuration() + created = _WINDOW_END + state = ExportJobState.new(cfg, created_at=created) + d = state._to_dict() + assert d["schema_version"] == STATE_SCHEMA_VERSION + assert "__class__" not in d # no Python type metadata + assert "__type__" not in d + restored = ExportJobState._from_dict(d) + assert restored == state + + def test_unknown_schema_version_is_rejected(self) -> None: + cfg = _basic_options().to_configuration() + state = ExportJobState.new(cfg, created_at=_WINDOW_END) + bad 
= state._to_dict() + bad["schema_version"] = "99.0" + with pytest.raises(ValueError, match="schema_version"): + ExportJobState._from_dict(bad) + + def test_state_carries_failures(self) -> None: + cfg = _basic_options().to_configuration() + f = ExportFailure( + instance_id="i", + reason="r", + attempt_count=1, + last_attempt=_WINDOW_END, + ) + state = ExportJobState.new(cfg, created_at=_WINDOW_END) + state.failures.append(f) + restored = ExportJobState._from_dict(state._to_dict()) + assert restored.failures == [f] diff --git a/tests/durabletask/extensions/history_export/test_orchestrator.py b/tests/durabletask/extensions/history_export/test_orchestrator.py new file mode 100644 index 00000000..aacebe4f --- /dev/null +++ b/tests/durabletask/extensions/history_export/test_orchestrator.py @@ -0,0 +1,226 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""E2E tests for :func:`export_job_orchestrator`. + +Tests share a single backend per module to keep total runtime low. +The "context bound" tests use their own worker because they need a +fresh registration; the happy-path and cancellation tests share a +module-scoped worker. 
+""" + +from __future__ import annotations + +import gzip +import json +import threading +from datetime import datetime, timedelta, timezone + +import pytest + +from durabletask import client, task, worker +from durabletask.extensions.history_export import ( + ExportDestination, + ExportFormat, + ExportFormatKind, + ExportHistoryClient, + ExportJobCreationOptions, + ExportJobStatus, + ExportMode, +) +from durabletask.extensions.history_export.activities import clear_context +from durabletask.extensions.history_export import orchestrator as orch_mod +from durabletask.testing import create_test_backend + +from tests.durabletask.extensions.history_export._test_helpers import wait_until + + +PORT = 50262 +HOST = f"localhost:{PORT}" + + +class _InMemoryWriter: + def __init__(self) -> None: + self._lock = threading.Lock() + self.blobs: dict[str, dict] = {} + + def write(self, *, instance_id, blob_name, payload, content_type, content_encoding): + with self._lock: + self.blobs[blob_name] = { + "instance_id": instance_id, + "payload": payload, + "content_type": content_type, + "content_encoding": content_encoding, + } + + +def _echo(ctx: task.OrchestrationContext, input): + return input + + +@pytest.fixture(scope="module", autouse=True) +def _retry_overrides(): + # Tighten retry/idle timings so tests don't sleep for minutes. 
+ orch_mod._BATCH_RETRY_BACKOFF_OVERRIDE = timedelta(milliseconds=100) + orch_mod._CONTINUOUS_IDLE_DELAY_OVERRIDE = timedelta(milliseconds=200) + try: + yield + finally: + orch_mod._BATCH_RETRY_BACKOFF_OVERRIDE = None + orch_mod._CONTINUOUS_IDLE_DELAY_OVERRIDE = None + + +@pytest.fixture(scope="module") +def backend(): + b = create_test_backend(port=PORT) + yield b + b.stop() + b.reset() + + +@pytest.fixture(scope="module") +def writer() -> _InMemoryWriter: + return _InMemoryWriter() + + +@pytest.fixture(scope="module") +def dt_client(backend): + return client.TaskHubGrpcClient(host_address=HOST) + + +@pytest.fixture(scope="module") +def export_client(dt_client, writer): + return ExportHistoryClient(dt_client, writer) + + +@pytest.fixture(scope="module") +def w(backend, export_client): + w_ = worker.TaskHubGrpcWorker(host_address=HOST) + w_.add_orchestrator(_echo) + export_client.register_worker(w_) + w_.start() + yield w_ + w_.stop() + + +@pytest.fixture(scope="module") +def seeded_ids(dt_client, w): + ids: list[str] = [] + for v in ["a", "b", "c", "d", "e"]: + sid = dt_client.schedule_new_orchestration(_echo, input=v) + state = dt_client.wait_for_orchestration_completion(sid, timeout=30) + assert state and state.runtime_status == client.OrchestrationStatus.COMPLETED + ids.append(sid) + return ids + + +# --------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------- + + +def test_orchestrator_exports_all_terminal_instances_and_marks_completed( + dt_client, export_client, writer, seeded_ids, +): + now = datetime.now(timezone.utc) + desc = export_client.create_job( + ExportJobCreationOptions( + mode=ExportMode.BATCH, + completed_time_from=now - timedelta(hours=1), + completed_time_to=now + timedelta(hours=1), + destination=ExportDestination(container="exports", prefix="run-1"), + format=ExportFormat(kind=ExportFormatKind.JSONL_GZIP), + max_instances_per_batch=2, + ) + ) + final 
= export_client.wait_for_job(desc.job_id, timeout=30, poll_interval=0.1) + + assert final.status == ExportJobStatus.COMPLETED + assert final.exported_instances >= len(seeded_ids) + assert final.failed_instances == 0 + assert final.failures == [] + + written = {b["instance_id"] for b in writer.blobs.values()} + for sid in seeded_ids: + assert sid in written + for name, entry in writer.blobs.items(): + if not name.startswith("run-1/"): + continue + assert name.endswith(".jsonl.gz") + raw = gzip.decompress(entry["payload"]).decode("utf-8") + header = json.loads(raw.strip().split("\n")[0]) + assert header["kind"] == "metadata" + assert header["metadata"]["instance_id"] == entry["instance_id"] + assert header["metadata"]["runtime_status"] == "COMPLETED" + + +def test_orchestrator_exits_when_entity_is_deleted_mid_run( + dt_client, export_client, +): + """Continuous-mode jobs stop when the entity is deleted externally.""" + now = datetime.now(timezone.utc) + desc = export_client.create_job( + ExportJobCreationOptions( + mode=ExportMode.CONTINUOUS, + completed_time_from=now - timedelta(hours=1), + destination=ExportDestination(container="exports"), + format=ExportFormat(kind=ExportFormatKind.JSON), + max_instances_per_batch=10, + ) + ) + + # Wait for the entity to be ACTIVE (orchestrator running its loop). + wait_until( + lambda: (export_client.get_job(desc.job_id) or None) + and export_client.get_job(desc.job_id).status == ExportJobStatus.ACTIVE, + description="job to reach ACTIVE", + timeout=5.0, + ) + + # External delete: the orchestrator's next mid-loop entity get + # observes None and exits gracefully. 
+ export_client.delete_job(desc.job_id) + + run_state = dt_client.wait_for_orchestration_completion( + desc.orchestrator_instance_id, timeout=10, fetch_payloads=True, + ) + assert run_state is not None + assert run_state.runtime_status == client.OrchestrationStatus.COMPLETED + output = json.loads(run_state.serialized_output or "null") + assert output["status"] == "Cancelled" + + +def test_orchestrator_records_failure_when_no_context_bound( + dt_client, export_client, +): + """An orchestrator that cannot reach its activity context fails the job.""" + # The shared module worker has a bound context. Clear it so the + # next orchestrator's activities raise, then restore it afterwards + # so subsequent tests are unaffected. + from durabletask.extensions.history_export.activities import ( + HistoryExportContext, + bind_context, + ) + clear_context() + try: + now = datetime.now(timezone.utc) + desc = export_client.create_job( + ExportJobCreationOptions( + mode=ExportMode.BATCH, + completed_time_from=now - timedelta(hours=1), + completed_time_to=now + timedelta(hours=1), + destination=ExportDestination(container="exports"), + format=ExportFormat(kind=ExportFormatKind.JSON), + max_instances_per_batch=10, + ) + ) + final = export_client.wait_for_job(desc.job_id, timeout=15, poll_interval=0.1) + assert final.status == ExportJobStatus.FAILED + assert final.last_error is not None + finally: + # Re-arm the context for any subsequent tests. + bind_context( + HistoryExportContext( + client=dt_client, writer=export_client.writer, + ) + ) diff --git a/tests/durabletask/extensions/history_export/test_serialization.py b/tests/durabletask/extensions/history_export/test_serialization.py new file mode 100644 index 00000000..6e08be58 --- /dev/null +++ b/tests/durabletask/extensions/history_export/test_serialization.py @@ -0,0 +1,185 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +"""Tests for history-export serialization helpers.""" + +from __future__ import annotations + +import gzip +import json +from datetime import datetime, timezone + +import pytest + +from durabletask import client as client_module +from durabletask import history, task +from durabletask.extensions.history_export import ExportFormat, ExportFormatKind +from durabletask.extensions.history_export.serialization import ( + content_encoding_for, + content_type_for, + event_to_dict, + file_extension_for, + orchestration_state_to_dict, + serialize_history, +) + + +def _sample_events() -> list[history.HistoryEvent]: + ts = datetime(2025, 1, 2, 3, 4, 5, tzinfo=timezone.utc) + return [ + history.OrchestratorStartedEvent(event_id=-1, timestamp=ts), + history.ExecutionStartedEvent( + event_id=-1, + timestamp=ts, + name="MyOrch", + input='"hello"', + ), + history.TaskScheduledEvent( + event_id=1, + timestamp=ts, + name="MyActivity", + input='42', + ), + history.TaskCompletedEvent( + event_id=-1, + timestamp=ts, + task_scheduled_id=1, + result='43', + ), + history.ExecutionCompletedEvent( + event_id=-1, + timestamp=ts, + orchestration_status=1, + result='"done"', + ), + ] + + +class TestEventToDict: + def test_includes_type_discriminator(self) -> None: + e = history.OrchestratorStartedEvent( + event_id=0, + timestamp=datetime(2025, 1, 1, tzinfo=timezone.utc), + ) + d = event_to_dict(e) + assert d["event_type"] == "OrchestratorStartedEvent" + assert d["event_id"] == 0 + assert d["timestamp"].startswith("2025-01-01") + + +class TestSerializeJson: + def test_envelope_fields(self) -> None: + fmt = ExportFormat(kind=ExportFormatKind.JSON) + out = serialize_history(_sample_events(), instance_id="inst-1", fmt=fmt) + doc = json.loads(out) + assert doc["schema_version"] == "1.0" + assert doc["instance_id"] == "inst-1" + assert isinstance(doc["events"], list) + assert len(doc["events"]) == 5 + assert doc["events"][0]["event_type"] == "OrchestratorStartedEvent" + assert 
doc["events"][-1]["event_type"] == "ExecutionCompletedEvent" + + def test_deterministic(self) -> None: + fmt = ExportFormat(kind=ExportFormatKind.JSON) + events = _sample_events() + a = serialize_history(events, instance_id="x", fmt=fmt) + b = serialize_history(events, instance_id="x", fmt=fmt) + assert a == b + + +class TestSerializeJsonlGzip: + def test_decodes_to_metadata_plus_events(self) -> None: + fmt = ExportFormat(kind=ExportFormatKind.JSONL_GZIP) + out = serialize_history(_sample_events(), instance_id="inst-2", fmt=fmt) + raw = gzip.decompress(out).decode("utf-8") + lines = raw.strip().split("\n") + assert len(lines) == 1 + 5 # metadata + events + metadata = json.loads(lines[0]) + assert metadata == { + "instance_id": "inst-2", + "kind": "metadata", + "schema_version": "1.0", + } + first_event = json.loads(lines[1]) + assert first_event["event_type"] == "OrchestratorStartedEvent" + + def test_deterministic(self) -> None: + fmt = ExportFormat(kind=ExportFormatKind.JSONL_GZIP) + events = _sample_events() + a = serialize_history(events, instance_id="x", fmt=fmt) + b = serialize_history(events, instance_id="x", fmt=fmt) + assert a == b + + +class TestHelpers: + def test_content_type(self) -> None: + assert content_type_for(ExportFormat(kind=ExportFormatKind.JSON)) == "application/json" + assert content_type_for(ExportFormat(kind=ExportFormatKind.JSONL_GZIP)) == "application/x-ndjson" + + def test_content_encoding(self) -> None: + assert content_encoding_for(ExportFormat(kind=ExportFormatKind.JSON)) is None + assert content_encoding_for(ExportFormat(kind=ExportFormatKind.JSONL_GZIP)) == "gzip" + + def test_file_extension(self) -> None: + assert file_extension_for(ExportFormat(kind=ExportFormatKind.JSON)) == ".json" + assert file_extension_for(ExportFormat(kind=ExportFormatKind.JSONL_GZIP)) == ".jsonl.gz" + + def test_unknown_kind_rejected(self) -> None: + fmt = ExportFormat(kind=ExportFormatKind.JSON) + fmt.kind = "bogus" # type: ignore[assignment] + with 
pytest.raises(ValueError): + serialize_history([], instance_id="x", fmt=fmt) + + +class TestMetadataEmbedding: + def _state(self) -> client_module.OrchestrationState: + ts = datetime(2025, 1, 2, 3, 4, 5, tzinfo=timezone.utc) + return client_module.OrchestrationState( + instance_id="inst-meta", + name="MyOrch", + runtime_status=client_module.OrchestrationStatus.COMPLETED, + created_at=ts, + last_updated_at=ts, + serialized_input='"hello"', + serialized_output='"done"', + serialized_custom_status=None, + failure_details=None, + ) + + def test_state_to_dict_has_no_python_type_metadata(self) -> None: + d = orchestration_state_to_dict(self._state()) + assert "__class__" not in d and "__type__" not in d + assert d["runtime_status"] == "COMPLETED" + assert d["name"] == "MyOrch" + assert d["serialized_input"] == '"hello"' + assert d["serialized_output"] == '"done"' + assert d["failure_details"] is None + + def test_state_with_failure_details(self) -> None: + st = self._state() + st.failure_details = task.FailureDetails( + message="boom", error_type="RuntimeError", stack_trace="trace", + ) + d = orchestration_state_to_dict(st) + assert d["failure_details"] == { + "message": "boom", + "error_type": "RuntimeError", + "stack_trace": "trace", + } + + def test_metadata_embedded_in_json(self) -> None: + fmt = ExportFormat(kind=ExportFormatKind.JSON) + md = orchestration_state_to_dict(self._state()) + out = serialize_history([], instance_id="inst-meta", fmt=fmt, metadata=md) + doc = json.loads(out) + assert doc["metadata"]["instance_id"] == "inst-meta" + assert doc["metadata"]["runtime_status"] == "COMPLETED" + + def test_metadata_embedded_in_jsonl_header(self) -> None: + fmt = ExportFormat(kind=ExportFormatKind.JSONL_GZIP) + md = orchestration_state_to_dict(self._state()) + out = serialize_history([], instance_id="inst-meta", fmt=fmt, metadata=md) + raw = gzip.decompress(out).decode("utf-8") + header = json.loads(raw.strip().split("\n")[0]) + assert header["kind"] == "metadata" + 
assert header["metadata"]["instance_id"] == "inst-meta" diff --git a/tests/durabletask/extensions/history_export/test_transitions_and_exceptions.py b/tests/durabletask/extensions/history_export/test_transitions_and_exceptions.py new file mode 100644 index 00000000..e51cfe68 --- /dev/null +++ b/tests/durabletask/extensions/history_export/test_transitions_and_exceptions.py @@ -0,0 +1,133 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Tests for the export-job state-transition matrix and custom exceptions.""" + +from __future__ import annotations + +import pytest + +from durabletask.extensions.history_export import ( + ExportJobError, + ExportJobInvalidTransitionError, + ExportJobNotFoundError, + ExportJobStatus, +) +from durabletask.extensions.history_export.transitions import ( + TRANSITIONS, + assert_valid_transition, + is_valid_transition, +) + + +class TestTransitionsMatrix: + def test_create_from_none_is_pending(self) -> None: + assert is_valid_transition("create", None, ExportJobStatus.PENDING) + + def test_create_from_active_is_rejected(self) -> None: + assert not is_valid_transition( + "create", ExportJobStatus.ACTIVE, ExportJobStatus.PENDING + ) + + def test_create_from_terminal_revives_job(self) -> None: + assert is_valid_transition( + "create", ExportJobStatus.COMPLETED, ExportJobStatus.PENDING + ) + assert is_valid_transition( + "create", ExportJobStatus.FAILED, ExportJobStatus.PENDING + ) + + def test_run_is_idempotent_on_active(self) -> None: + assert is_valid_transition( + "run", ExportJobStatus.ACTIVE, ExportJobStatus.ACTIVE + ) + + def test_run_from_pending_activates(self) -> None: + assert is_valid_transition( + "run", ExportJobStatus.PENDING, ExportJobStatus.ACTIVE + ) + + def test_commit_checkpoint_can_fail_active_job(self) -> None: + assert is_valid_transition( + "commit_checkpoint", ExportJobStatus.ACTIVE, ExportJobStatus.FAILED, + ) + + def test_commit_checkpoint_not_allowed_on_terminal(self) -> None: + 
assert not is_valid_transition( + "commit_checkpoint", ExportJobStatus.COMPLETED, ExportJobStatus.ACTIVE, + ) + + def test_mark_completed_requires_active(self) -> None: + assert is_valid_transition( + "mark_completed", ExportJobStatus.ACTIVE, ExportJobStatus.COMPLETED, + ) + assert not is_valid_transition( + "mark_completed", ExportJobStatus.PENDING, ExportJobStatus.COMPLETED, + ) + + def test_mark_failed_allowed_from_pending_or_active(self) -> None: + assert is_valid_transition( + "mark_failed", ExportJobStatus.PENDING, ExportJobStatus.FAILED, + ) + assert is_valid_transition( + "mark_failed", ExportJobStatus.ACTIVE, ExportJobStatus.FAILED, + ) + + def test_unknown_operation_rejected(self) -> None: + assert not is_valid_transition( + "frobnicate", ExportJobStatus.ACTIVE, ExportJobStatus.COMPLETED, + ) + + def test_assert_valid_raises_on_invalid_transition(self) -> None: + with pytest.raises(ExportJobInvalidTransitionError) as excinfo: + assert_valid_transition( + "mark_completed", + ExportJobStatus.PENDING, + ExportJobStatus.COMPLETED, + job_id="job-x", + ) + ex = excinfo.value + assert ex.operation == "mark_completed" + assert ex.from_status == ExportJobStatus.PENDING.value + assert ex.to_status == ExportJobStatus.COMPLETED.value + assert ex.job_id == "job-x" + + def test_assert_valid_no_op_when_allowed(self) -> None: + # Should not raise. 
+ assert_valid_transition( + "run", ExportJobStatus.PENDING, ExportJobStatus.ACTIVE, + ) + + def test_matrix_is_self_consistent(self) -> None: + for (op, frm), targets in TRANSITIONS.items(): + assert isinstance(op, str) and op + for t in targets: + assert isinstance(t, ExportJobStatus) + assert is_valid_transition(op, frm, t) + + +class TestExceptions: + def test_invalid_transition_is_value_error(self) -> None: + err = ExportJobInvalidTransitionError( + operation="run", + from_status="Active", + to_status="Pending", + job_id="j", + ) + assert isinstance(err, ValueError) + assert isinstance(err, ExportJobError) + assert "Active" in str(err) + assert "Pending" in str(err) + assert "run" in str(err) + assert err.job_id == "j" + + def test_not_found_is_lookup_error(self) -> None: + err = ExportJobNotFoundError("j2") + assert isinstance(err, LookupError) + assert isinstance(err, ExportJobError) + assert err.job_id == "j2" + assert "j2" in str(err) + + def test_base_carries_job_id(self) -> None: + err = ExportJobError("boom", job_id="abc") + assert err.job_id == "abc" From a2dc68c4708137e27cda8434a6e545527aa56317 Mon Sep 17 00:00:00 2001 From: Andy Staples Date: Wed, 3 Jun 2026 11:14:49 -0600 Subject: [PATCH 2/9] Strict typing, type correctness --- .../extensions/history_export/_internal.py | 38 ++++ .../extensions/history_export/activities.py | 61 +++--- .../extensions/history_export/azure_blob.py | 19 +- .../extensions/history_export/client.py | 26 ++- .../extensions/history_export/entity.py | 48 ++--- .../extensions/history_export/exceptions.py | 10 +- .../extensions/history_export/models.py | 181 ++++++++---------- .../extensions/history_export/orchestrator.py | 147 ++++++++------ .../history_export/serialization.py | 21 +- .../extensions/history_export/transitions.py | 10 +- .../extensions/history_export/writer.py | 9 +- .../history_export/test_activities.py | 12 +- .../extensions/history_export/test_entity.py | 2 +- .../extensions/history_export/test_models.py | 
22 +-- 14 files changed, 342 insertions(+), 264 deletions(-) create mode 100644 durabletask/extensions/history_export/_internal.py diff --git a/durabletask/extensions/history_export/_internal.py b/durabletask/extensions/history_export/_internal.py new file mode 100644 index 00000000..7b734702 --- /dev/null +++ b/durabletask/extensions/history_export/_internal.py @@ -0,0 +1,38 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Package-internal helpers shared across the history-export modules. + +Names here have no leading underscore so they can be imported by sibling +modules without tripping pyright's ``reportPrivateUsage`` check. They +remain package-private by convention: nothing in this module is exported +from :mod:`durabletask.extensions.history_export.__init__`. +""" + +from __future__ import annotations + +from datetime import datetime, timezone + + +def dt_to_iso(value: datetime | None) -> str | None: + """Normalize *value* to a UTC ISO-8601 string (or ``None``).""" + if value is None: + return None + if value.tzinfo is None: + value = value.replace(tzinfo=timezone.utc) + else: + value = value.astimezone(timezone.utc) + return value.isoformat() + + +def dt_from_iso(value: str | None) -> datetime | None: + """Parse *value* as an ISO-8601 timestamp, defaulting naive values to UTC.""" + if value is None: + return None + parsed = datetime.fromisoformat(value) + if parsed.tzinfo is None: + parsed = parsed.replace(tzinfo=timezone.utc) + return parsed + + +__all__ = ["dt_from_iso", "dt_to_iso"] diff --git a/durabletask/extensions/history_export/activities.py b/durabletask/extensions/history_export/activities.py index f5a390b9..e4c48fab 100644 --- a/durabletask/extensions/history_export/activities.py +++ b/durabletask/extensions/history_export/activities.py @@ -24,17 +24,19 @@ from __future__ import annotations +from collections.abc import Mapping from dataclasses import dataclass from datetime import datetime -from typing import 
Any, List, Mapping, Optional +from typing import Any, cast from durabletask import client as client_module from durabletask import task +from durabletask import worker as worker_module +from durabletask.extensions.history_export._internal import dt_from_iso from durabletask.extensions.history_export.models import ( ExportFormat, ExportFormatKind, - _dt_from_iso, ) from durabletask.extensions.history_export.serialization import ( content_encoding_for, @@ -62,7 +64,7 @@ class HistoryExportContext: writer: HistoryWriter -_context: Optional[HistoryExportContext] = None +_context: HistoryExportContext | None = None def bind_context(context: HistoryExportContext) -> None: @@ -90,20 +92,29 @@ def _require_context() -> HistoryExportContext: # Activity bodies # ---------------------------------------------------------------------- -def list_terminal_instances(_: task.ActivityContext, input: Mapping[str, Any]) -> dict: +def list_terminal_instances( + _: task.ActivityContext, input: Mapping[str, Any], +) -> dict[str, Any]: """Activity: fetch one page of terminal instance IDs.""" ctx = _require_context() - runtime_status_names: Optional[List[str]] = input.get("runtime_status") - completed_time_from = _dt_from_iso(input.get("completed_time_from")) - completed_time_to = _dt_from_iso(input.get("completed_time_to")) - page_size = input.get("page_size") - continuation_token = input.get("continuation_token") + raw_statuses = input.get("runtime_status") + runtime_status_names: list[str] | None = ( + list(raw_statuses) if raw_statuses is not None else None + ) + completed_time_from = dt_from_iso(input.get("completed_time_from")) + completed_time_to = dt_from_iso(input.get("completed_time_to")) + page_size_raw = input.get("page_size") + page_size: int | None = int(page_size_raw) if page_size_raw is not None else None + continuation_token_raw = input.get("continuation_token") + continuation_token: str | None = ( + str(continuation_token_raw) if continuation_token_raw is not None else 
None + ) if completed_time_from is None: raise ValueError("list_terminal_instances requires 'completed_time_from'") - runtime_status: Optional[List[client_module.OrchestrationStatus]] = None + runtime_status: list[client_module.OrchestrationStatus] | None = None if runtime_status_names is not None: runtime_status = [ client_module.OrchestrationStatus[name] for name in runtime_status_names @@ -123,17 +134,23 @@ def list_terminal_instances(_: task.ActivityContext, input: Mapping[str, Any]) - } -def export_instance_history(_: task.ActivityContext, input: Mapping[str, Any]) -> dict: +def export_instance_history( + _: task.ActivityContext, input: Mapping[str, Any], +) -> dict[str, Any]: """Activity: serialize and write one instance's history.""" ctx = _require_context() - instance_id = input["instance_id"] - fmt = ExportFormat._from_dict(input.get("format") or { + instance_id = str(input["instance_id"]) + fmt_input = input.get("format") or { "kind": ExportFormatKind.JSONL_GZIP.value, "schema_version": "1.0", - }) - destination = input.get("destination") or {} - prefix = destination.get("prefix") + } + if not isinstance(fmt_input, Mapping): + raise TypeError("format must be a mapping") + fmt = ExportFormat.from_dict(cast("Mapping[str, Any]", fmt_input)) + destination_raw: Mapping[str, Any] = input.get("destination") or {} + prefix_raw: Any = destination_raw.get("prefix") + prefix: str | None = str(prefix_raw) if prefix_raw is not None else None try: events = ctx.client.get_orchestration_history(instance_id) @@ -171,7 +188,7 @@ def export_instance_history(_: task.ActivityContext, input: Mapping[str, Any]) - # Helpers # ---------------------------------------------------------------------- -def _blob_name_for(*, instance_id: str, prefix: Optional[str], fmt: ExportFormat) -> str: +def _blob_name_for(*, instance_id: str, prefix: str | None, fmt: ExportFormat) -> str: ext = file_extension_for(fmt) safe_id = instance_id.replace("/", "_") if prefix: @@ -179,7 +196,7 @@ def 
_blob_name_for(*, instance_id: str, prefix: Optional[str], fmt: ExportFormat return f"{safe_id}{ext}" -def register(worker_instance) -> None: +def register(worker_instance: worker_module.TaskHubGrpcWorker) -> None: """Convenience helper to register both activities on *worker*.""" worker_instance.add_activity(list_terminal_instances) worker_instance.add_activity(export_instance_history) @@ -189,12 +206,12 @@ def register(worker_instance) -> None: # resolved job configuration without leaking model objects. def build_list_activity_input( *, - runtime_status_names: Optional[List[str]], + runtime_status_names: list[str] | None, completed_time_from: datetime, - completed_time_to: Optional[datetime], + completed_time_to: datetime | None, page_size: int, - continuation_token: Optional[str], -) -> dict: + continuation_token: str | None, +) -> dict[str, Any]: return { "runtime_status": runtime_status_names, "completed_time_from": completed_time_from.isoformat(), diff --git a/durabletask/extensions/history_export/azure_blob.py b/durabletask/extensions/history_export/azure_blob.py index 3a8a7f2c..203fc659 100644 --- a/durabletask/extensions/history_export/azure_blob.py +++ b/durabletask/extensions/history_export/azure_blob.py @@ -18,7 +18,7 @@ from __future__ import annotations from dataclasses import dataclass, field -from typing import Any, Optional +from typing import Any try: from azure.core.exceptions import ResourceExistsError @@ -56,10 +56,10 @@ class AzureBlobHistoryExportWriterOptions: """ container_name: str - connection_string: Optional[str] = None - account_url: Optional[str] = None + connection_string: str | None = None + account_url: str | None = None credential: Any = field(default=None, repr=False) - api_version: Optional[str] = None + api_version: str | None = None create_container_if_not_exists: bool = True def __post_init__(self) -> None: @@ -77,7 +77,7 @@ class AzureBlobHistoryExportWriter: def __init__(self, options: AzureBlobHistoryExportWriterOptions) -> 
None: self._options = options - extra: dict = {} + extra: dict[str, Any] = {} if options.api_version: extra["api_version"] = options.api_version @@ -119,7 +119,7 @@ def write( blob_name: str, payload: bytes, content_type: str, - content_encoding: Optional[str], + content_encoding: str | None, ) -> None: del instance_id # included by the protocol but not needed here self._ensure_container() @@ -146,7 +146,12 @@ def _ensure_container(self) -> None: self._container_ready = True return try: - self._service.create_container(self._options.container_name) + # The azure-storage-blob stubs leave create_container's return + # type partially unknown; we don't use it, so it's safe to + # suppress the strict-mode warning. + self._service.create_container( # pyright: ignore[reportUnknownMemberType] + self._options.container_name, + ) except ResourceExistsError: pass self._container_ready = True diff --git a/durabletask/extensions/history_export/client.py b/durabletask/extensions/history_export/client.py index c4ee75c0..600bf41a 100644 --- a/durabletask/extensions/history_export/client.py +++ b/durabletask/extensions/history_export/client.py @@ -55,11 +55,13 @@ import json import time import uuid +from collections.abc import Iterator from datetime import datetime, timezone -from typing import Iterator, Optional +from typing import Any, cast from durabletask import client as client_module from durabletask import entities +from durabletask import worker as worker_module from durabletask.extensions.history_export._constants import ( ENTITY_NAME, @@ -110,7 +112,7 @@ def __init__( # Worker wiring # ------------------------------------------------------------------ - def register_worker(self, worker_instance) -> None: + def register_worker(self, worker_instance: worker_module.TaskHubGrpcWorker) -> None: """Register the entity, activities, and orchestrator on *worker*. 
Also binds the activity execution context so the activities @@ -130,7 +132,7 @@ def create_job( self, options: ExportJobCreationOptions, *, - job_id: Optional[str] = None, + job_id: str | None = None, ) -> ExportJobDescription: """Create a new export job and start its driving orchestrator. @@ -146,7 +148,7 @@ def create_job( resolved_job_id = job_id or options.job_id or uuid.uuid4().hex entity_id = entities.EntityInstanceId(ENTITY_NAME, resolved_job_id) created_at = datetime.now(timezone.utc) - config_dict = config._to_dict() + config_dict = config.to_dict() # Signal create first; the entity will validate the transition # and persist PENDING. Then signal run; the entity will @@ -181,7 +183,7 @@ def create_job( last_checkpoint_time=None, ) - def get_job(self, job_id: str) -> Optional[ExportJobDescription]: + def get_job(self, job_id: str) -> ExportJobDescription | None: """Look up an export job by ID. Returns ``None`` if not found.""" entity_id = entities.EntityInstanceId(ENTITY_NAME, job_id) meta = self._client.get_entity(entity_id, include_state=True) @@ -196,11 +198,13 @@ def get_job(self, job_id: str) -> Optional[ExportJobDescription]: return None if not isinstance(state, dict): return None - return ExportJobDescription._from_state_dict(job_id, state) + return ExportJobDescription.from_state_dict( + job_id, cast("dict[str, Any]", state), + ) def list_jobs( self, - query: Optional[ExportJobQuery] = None, + query: ExportJobQuery | None = None, ) -> Iterator[ExportJobDescription]: """Enumerate export jobs. 
@@ -236,7 +240,9 @@ def list_jobs( if not isinstance(state, dict): continue try: - desc = ExportJobDescription._from_state_dict(meta.id.key, state) + desc = ExportJobDescription.from_state_dict( + meta.id.key, cast("dict[str, Any]", state), + ) except (KeyError, ValueError): continue if status_filter is not None and desc.status not in status_filter: @@ -263,7 +269,7 @@ def wait_for_job( raise ValueError("poll_interval must be positive") deadline = time.monotonic() + timeout - last: Optional[ExportJobDescription] = None + last: ExportJobDescription | None = None while True: desc = self.get_job(job_id) if desc is not None: @@ -343,7 +349,7 @@ def job_id(self) -> str: def orchestrator_instance_id(self) -> str: return orchestrator_instance_id_for(self._job_id) - def describe(self) -> Optional[ExportJobDescription]: + def describe(self) -> ExportJobDescription | None: """Fetch the latest description, or ``None`` if the job is missing.""" return self._parent.get_job(self._job_id) diff --git a/durabletask/extensions/history_export/entity.py b/durabletask/extensions/history_export/entity.py index cddc4f38..99057bd6 100644 --- a/durabletask/extensions/history_export/entity.py +++ b/durabletask/extensions/history_export/entity.py @@ -7,7 +7,7 @@ To avoid embedding Python type metadata in the persisted payload (a known deserialization-attack vector), the on-disk shape is owned by :class:`~durabletask.extensions.history_export.models.ExportJobState`, -a versioned dataclass whose ``_to_dict`` / ``_from_dict`` methods +a versioned dataclass whose ``to_dict`` / ``from_dict`` methods produce and consume pure JSON primitives keyed by literal field names plus an explicit ``schema_version``. 
@@ -41,10 +41,12 @@ from __future__ import annotations +from collections.abc import Mapping from datetime import datetime, timezone -from typing import Any, List, Mapping, Optional +from typing import Any, cast from durabletask import entities +from durabletask import worker as worker_module from durabletask.extensions.history_export._constants import ( ENTITY_NAME, @@ -52,13 +54,13 @@ ORCHESTRATOR_NAME, orchestrator_instance_id_for, ) +from durabletask.extensions.history_export._internal import dt_from_iso from durabletask.extensions.history_export._logging import logger from durabletask.extensions.history_export.models import ( ExportFailure, ExportJobConfiguration, ExportJobState, ExportJobStatus, - _dt_from_iso, ) from durabletask.extensions.history_export.transitions import ( assert_valid_transition, @@ -78,7 +80,7 @@ def _utcnow() -> datetime: return datetime.now(timezone.utc) -def _summarize_failures(failures: List[ExportFailure], *, limit: int = 10) -> str: +def _summarize_failures(failures: list[ExportFailure], *, limit: int = 10) -> str: if not failures: return "" head = "; ".join(f"{f.instance_id}: {f.reason}" for f in failures[:limit]) @@ -92,7 +94,7 @@ class ExportJobEntity(entities.DurableEntity): # ----- state helpers -------------------------------------------- - def _load(self) -> Optional[ExportJobState]: + def _load(self) -> ExportJobState | None: raw = self.get_state() if raw is None: return None @@ -100,15 +102,15 @@ def _load(self) -> Optional[ExportJobState]: raise TypeError( f"Unexpected entity state type {type(raw).__name__!r}; expected dict" ) - return ExportJobState._from_dict(raw) + return ExportJobState.from_dict(cast("dict[str, Any]", raw)) def _save(self, state: ExportJobState) -> dict[str, Any]: state.last_modified_at = _utcnow() - persisted = state._to_dict() + persisted = state.to_dict() self.set_state(persisted) return persisted - def _current_status(self) -> Optional[ExportJobStatus]: + def _current_status(self) -> 
ExportJobStatus | None: state = self._load() return state.status if state is not None else None @@ -127,10 +129,10 @@ def create(self, payload: Mapping[str, Any]) -> dict[str, Any]: config_dict = payload.get("config") if not config_dict: raise ValueError("create payload requires 'config'") - config = ExportJobConfiguration._from_dict(config_dict) + config = ExportJobConfiguration.from_dict(config_dict) created_at_raw = payload.get("created_at") - created_at = _dt_from_iso(created_at_raw) if created_at_raw else _utcnow() + created_at = dt_from_iso(created_at_raw) if created_at_raw else _utcnow() assert created_at is not None state = ExportJobState( @@ -144,11 +146,11 @@ def create(self, payload: Mapping[str, Any]) -> dict[str, Any]: ) return self._save(state) - def get(self, _: Any = None) -> Optional[dict[str, Any]]: + def get(self, _: Any = None) -> dict[str, Any] | None: state = self._load() - return state._to_dict() if state is not None else None + return state.to_dict() if state is not None else None - def run(self, _: Any = None) -> Optional[dict[str, Any]]: + def run(self, _: Any = None) -> dict[str, Any] | None: state = self._load() if state is None: raise ValueError("Cannot run uninitialized export job") @@ -165,7 +167,7 @@ def run(self, _: Any = None) -> Optional[dict[str, Any]]: try: self.entity_context.schedule_new_orchestration( ORCHESTRATOR_NAME, - input={"job_id": job_id, "config": state.config._to_dict()}, + input={"job_id": job_id, "config": state.config.to_dict()}, instance_id=instance_id, ) state.orchestrator_instance_id = instance_id @@ -188,7 +190,7 @@ def run(self, _: Any = None) -> Optional[dict[str, Any]]: state.last_error = None return self._save(state) - def commit_checkpoint(self, payload: Mapping[str, Any]) -> Optional[dict[str, Any]]: + def commit_checkpoint(self, payload: Mapping[str, Any]) -> dict[str, Any] | None: state = self._load() if state is None: raise ValueError("Cannot commit_checkpoint on uninitialized export job") @@ -203,8 
+205,8 @@ def commit_checkpoint(self, payload: Mapping[str, Any]) -> Optional[dict[str, An if scanned_delta < 0 or exported_delta < 0 or failed_delta < 0: raise ValueError("checkpoint deltas must be non-negative") - failures_data = payload.get("failures") or [] - new_failures = [ExportFailure._from_dict(f) for f in failures_data] + failures_data: list[Mapping[str, Any]] = list(payload.get("failures") or []) + new_failures = [ExportFailure.from_dict(f) for f in failures_data] will_fail = bool(payload.get("mark_failed_on_batch")) and bool(new_failures) target = ExportJobStatus.FAILED if will_fail else ExportJobStatus.ACTIVE assert_valid_transition( @@ -220,7 +222,7 @@ def commit_checkpoint(self, payload: Mapping[str, Any]) -> Optional[dict[str, An checkpoint_time_raw = payload.get("checkpoint_time") checkpoint_time = ( - _dt_from_iso(checkpoint_time_raw) if checkpoint_time_raw else _utcnow() + dt_from_iso(checkpoint_time_raw) if checkpoint_time_raw else _utcnow() ) state.last_checkpoint_time = checkpoint_time @@ -242,7 +244,7 @@ def commit_checkpoint(self, payload: Mapping[str, Any]) -> Optional[dict[str, An return self._save(state) - def mark_completed(self, _: Any = None) -> Optional[dict[str, Any]]: + def mark_completed(self, _: Any = None) -> dict[str, Any] | None: state = self._load() if state is None: raise ValueError("Cannot mark_completed on uninitialized export job") @@ -257,8 +259,8 @@ def mark_completed(self, _: Any = None) -> Optional[dict[str, Any]]: return self._save(state) def mark_failed( - self, payload: Optional[Mapping[str, Any]] = None - ) -> Optional[dict[str, Any]]: + self, payload: Mapping[str, Any] | None = None + ) -> dict[str, Any] | None: state = self._load() if state is None: raise ValueError("Cannot mark_failed on uninitialized export job") @@ -282,6 +284,8 @@ def delete(self, _: Any = None) -> None: # type: ignore[override] super().delete() -def register(worker_instance, *, name: str = ENTITY_NAME) -> None: +def register( + 
worker_instance: worker_module.TaskHubGrpcWorker, *, name: str = ENTITY_NAME, +) -> None: """Convenience helper to register :class:`ExportJobEntity` on *worker*.""" worker_instance.add_entity(ExportJobEntity, name=name) diff --git a/durabletask/extensions/history_export/exceptions.py b/durabletask/extensions/history_export/exceptions.py index aec08d1b..5bf16a95 100644 --- a/durabletask/extensions/history_export/exceptions.py +++ b/durabletask/extensions/history_export/exceptions.py @@ -11,13 +11,11 @@ from __future__ import annotations -from typing import Optional - class ExportJobError(Exception): """Base class for all export-job specific errors.""" - def __init__(self, message: str, *, job_id: Optional[str] = None) -> None: + def __init__(self, message: str, *, job_id: str | None = None) -> None: super().__init__(message) self.job_id = job_id @@ -28,10 +26,10 @@ class ExportJobInvalidTransitionError(ExportJobError, ValueError): def __init__( self, operation: str, - from_status: Optional[str], - to_status: Optional[str], + from_status: str | None, + to_status: str | None, *, - job_id: Optional[str] = None, + job_id: str | None = None, ) -> None: message = ( f"Operation {operation!r} cannot transition export job " diff --git a/durabletask/extensions/history_export/models.py b/durabletask/extensions/history_export/models.py index 36bfde26..3d139861 100644 --- a/durabletask/extensions/history_export/models.py +++ b/durabletask/extensions/history_export/models.py @@ -5,20 +5,26 @@ These dataclasses describe export jobs at the public API surface. All JSON-primitive conversions (for entity state, orchestrator inputs, and -activity inputs) are implemented as ``_to_dict`` / ``_from_dict`` pairs +activity inputs) are implemented as ``to_dict`` / ``from_dict`` pairs in this module so the rest of the extension can stay free of ad-hoc serialization logic. 
""" from __future__ import annotations +from collections.abc import Mapping from dataclasses import dataclass, field -from datetime import datetime, timezone +from datetime import datetime from enum import Enum -from typing import Any, List, Mapping, Optional +from typing import Any from durabletask.client import OrchestrationStatus +from durabletask.extensions.history_export._internal import ( + dt_from_iso, + dt_to_iso, +) + class ExportMode(Enum): """How the export job processes instances.""" @@ -83,36 +89,13 @@ class ExportJobStatus(Enum): # Default set of runtime statuses considered "terminal" for export. -_DEFAULT_TERMINAL_STATUSES: List[OrchestrationStatus] = [ +_DEFAULT_TERMINAL_STATUSES: list[OrchestrationStatus] = [ OrchestrationStatus.COMPLETED, OrchestrationStatus.FAILED, OrchestrationStatus.TERMINATED, ] -# ---------------------------------------------------------------------- -# Datetime helpers -# ---------------------------------------------------------------------- - -def _dt_to_iso(value: Optional[datetime]) -> Optional[str]: - if value is None: - return None - if value.tzinfo is None: - value = value.replace(tzinfo=timezone.utc) - else: - value = value.astimezone(timezone.utc) - return value.isoformat() - - -def _dt_from_iso(value: Optional[str]) -> Optional[datetime]: - if value is None: - return None - parsed = datetime.fromisoformat(value) - if parsed.tzinfo is None: - parsed = parsed.replace(tzinfo=timezone.utc) - return parsed - - # ---------------------------------------------------------------------- # Configuration dataclasses # ---------------------------------------------------------------------- @@ -124,11 +107,11 @@ class ExportFormat: kind: ExportFormatKind = ExportFormatKind.JSONL_GZIP schema_version: str = "1.0" - def _to_dict(self) -> dict[str, Any]: + def to_dict(self) -> dict[str, Any]: return {"kind": self.kind.value, "schema_version": self.schema_version} @classmethod - def _from_dict(cls, data: Mapping[str, Any]) -> 
"ExportFormat": + def from_dict(cls, data: Mapping[str, Any]) -> "ExportFormat": return cls( kind=ExportFormatKind(data["kind"]), schema_version=data.get("schema_version", "1.0"), @@ -151,17 +134,17 @@ class ExportDestination: """ container: str - prefix: Optional[str] = None + prefix: str | None = None def __post_init__(self) -> None: if not self.container: raise ValueError("destination.container must be a non-empty string") - def _to_dict(self) -> dict[str, Any]: + def to_dict(self) -> dict[str, Any]: return {"container": self.container, "prefix": self.prefix} @classmethod - def _from_dict(cls, data: Mapping[str, Any]) -> "ExportDestination": + def from_dict(cls, data: Mapping[str, Any]) -> "ExportDestination": return cls(container=data["container"], prefix=data.get("prefix")) @@ -180,19 +163,19 @@ class ExportFilter: """ completed_time_from: datetime - completed_time_to: Optional[datetime] = None - runtime_status: Optional[List[OrchestrationStatus]] = None + completed_time_to: datetime | None = None + runtime_status: list[OrchestrationStatus] | None = None - def effective_runtime_status(self) -> List[OrchestrationStatus]: + def effective_runtime_status(self) -> list[OrchestrationStatus]: """Return the runtime statuses to use, applying the default.""" if self.runtime_status is None: return list(_DEFAULT_TERMINAL_STATUSES) return list(self.runtime_status) - def _to_dict(self) -> dict[str, Any]: + def to_dict(self) -> dict[str, Any]: return { - "completed_time_from": _dt_to_iso(self.completed_time_from), - "completed_time_to": _dt_to_iso(self.completed_time_to), + "completed_time_from": dt_to_iso(self.completed_time_from), + "completed_time_to": dt_to_iso(self.completed_time_to), "runtime_status": ( [s.name for s in self.runtime_status] if self.runtime_status is not None @@ -201,14 +184,14 @@ def _to_dict(self) -> dict[str, Any]: } @classmethod - def _from_dict(cls, data: Mapping[str, Any]) -> "ExportFilter": + def from_dict(cls, data: Mapping[str, Any]) -> 
"ExportFilter": statuses = data.get("runtime_status") - completed_from = _dt_from_iso(data.get("completed_time_from")) + completed_from = dt_from_iso(data.get("completed_time_from")) if completed_from is None: raise ValueError("completed_time_from is required") return cls( completed_time_from=completed_from, - completed_time_to=_dt_from_iso(data.get("completed_time_to")), + completed_time_to=dt_from_iso(data.get("completed_time_to")), runtime_status=( [OrchestrationStatus[name] for name in statuses] if statuses is not None @@ -227,13 +210,13 @@ class ExportCheckpoint: export has not started or has completed. """ - last_instance_key: Optional[str] = None + last_instance_key: str | None = None - def _to_dict(self) -> dict[str, Any]: + def to_dict(self) -> dict[str, Any]: return {"last_instance_key": self.last_instance_key} @classmethod - def _from_dict(cls, data: Mapping[str, Any]) -> "ExportCheckpoint": + def from_dict(cls, data: Mapping[str, Any]) -> "ExportCheckpoint": return cls(last_instance_key=data.get("last_instance_key")) @@ -246,17 +229,17 @@ class ExportFailure: attempt_count: int last_attempt: datetime - def _to_dict(self) -> dict[str, Any]: + def to_dict(self) -> dict[str, Any]: return { "instance_id": self.instance_id, "reason": self.reason, "attempt_count": self.attempt_count, - "last_attempt": _dt_to_iso(self.last_attempt), + "last_attempt": dt_to_iso(self.last_attempt), } @classmethod - def _from_dict(cls, data: Mapping[str, Any]) -> "ExportFailure": - last_attempt = _dt_from_iso(data["last_attempt"]) + def from_dict(cls, data: Mapping[str, Any]) -> "ExportFailure": + last_attempt = dt_from_iso(data["last_attempt"]) assert last_attempt is not None return cls( instance_id=data["instance_id"], @@ -287,23 +270,23 @@ def __post_init__(self) -> None: "completed_time_to is required for batch mode exports" ) - def _to_dict(self) -> dict[str, Any]: + def to_dict(self) -> dict[str, Any]: return { "mode": self.mode.value, - "filter": self.filter._to_dict(), - 
"destination": self.destination._to_dict(), - "format": self.format._to_dict(), + "filter": self.filter.to_dict(), + "destination": self.destination.to_dict(), + "format": self.format.to_dict(), "max_instances_per_batch": self.max_instances_per_batch, "max_parallel_exports": self.max_parallel_exports, } @classmethod - def _from_dict(cls, data: Mapping[str, Any]) -> "ExportJobConfiguration": + def from_dict(cls, data: Mapping[str, Any]) -> "ExportJobConfiguration": return cls( mode=ExportMode(data["mode"]), - filter=ExportFilter._from_dict(data["filter"]), - destination=ExportDestination._from_dict(data["destination"]), - format=ExportFormat._from_dict(data.get("format") or {"kind": ExportFormatKind.JSONL_GZIP.value}), + filter=ExportFilter.from_dict(data["filter"]), + destination=ExportDestination.from_dict(data["destination"]), + format=ExportFormat.from_dict(data.get("format") or {"kind": ExportFormatKind.JSONL_GZIP.value}), max_instances_per_batch=int(data.get("max_instances_per_batch", 100)), max_parallel_exports=int(data.get("max_parallel_exports", 32)), ) @@ -326,10 +309,10 @@ class ExportJobQuery: to retrieve job IDs and metadata only). 
""" - status: Optional[List["ExportJobStatus"]] = None - last_modified_from: Optional[datetime] = None - last_modified_to: Optional[datetime] = None - page_size: Optional[int] = None + status: list[ExportJobStatus] | None = None + last_modified_from: datetime | None = None + last_modified_to: datetime | None = None + page_size: int | None = None include_state: bool = True @@ -340,10 +323,10 @@ class ExportJobCreationOptions: mode: ExportMode completed_time_from: datetime destination: ExportDestination - completed_time_to: Optional[datetime] = None - runtime_status: Optional[List[OrchestrationStatus]] = None + completed_time_to: datetime | None = None + runtime_status: list[OrchestrationStatus] | None = None format: ExportFormat = field(default_factory=ExportFormat) - job_id: Optional[str] = None + job_id: str | None = None max_instances_per_batch: int = 100 max_parallel_exports: int = 32 @@ -380,10 +363,10 @@ def to_configuration(self) -> ExportJobConfiguration: # changing the on-disk shape. STATE_SCHEMA_VERSION = "1.0" -"""The schema version emitted by :meth:`ExportJobState._to_dict`. +"""The schema version emitted by :meth:`ExportJobState.to_dict`. Increment this when the persisted shape changes in a non-backward-compatible -way and add a new branch in :meth:`ExportJobState._from_dict`. +way and add a new branch in :meth:`ExportJobState.from_dict`. """ @@ -392,9 +375,9 @@ class ExportJobState: """Typed, schema-versioned mirror of the entity's persisted state. This dataclass is the single source of truth for the on-disk schema. - All persistence flows through :meth:`_to_dict` (write) and - :meth:`_from_dict` (read); the dict contains only JSON primitives plus - nested dicts produced by the model ``_to_dict`` methods. No Python + All persistence flows through :meth:`to_dict` (write) and + :meth:`from_dict` (read); the dict contains only JSON primitives plus + nested dicts produced by the model ``to_dict`` methods. 
No Python class names, module paths, or other type metadata appear in the serialized form. """ @@ -403,38 +386,38 @@ class names, module paths, or other type metadata appear in the config: ExportJobConfiguration created_at: datetime last_modified_at: datetime - orchestrator_instance_id: Optional[str] = None + orchestrator_instance_id: str | None = None checkpoint: ExportCheckpoint = field(default_factory=ExportCheckpoint) - last_checkpoint_time: Optional[datetime] = None - last_error: Optional[str] = None + last_checkpoint_time: datetime | None = None + last_error: str | None = None scanned_instances: int = 0 exported_instances: int = 0 failed_instances: int = 0 - failures: List[ExportFailure] = field(default_factory=list) + failures: list[ExportFailure] = field(default_factory=list[ExportFailure]) # ------------------------------------------------------------------ # Serialization # ------------------------------------------------------------------ - def _to_dict(self) -> dict[str, Any]: + def to_dict(self) -> dict[str, Any]: return { "schema_version": STATE_SCHEMA_VERSION, "status": self.status.value, - "config": self.config._to_dict(), - "checkpoint": self.checkpoint._to_dict(), - "created_at": _dt_to_iso(self.created_at), - "last_modified_at": _dt_to_iso(self.last_modified_at), - "last_checkpoint_time": _dt_to_iso(self.last_checkpoint_time), + "config": self.config.to_dict(), + "checkpoint": self.checkpoint.to_dict(), + "created_at": dt_to_iso(self.created_at), + "last_modified_at": dt_to_iso(self.last_modified_at), + "last_checkpoint_time": dt_to_iso(self.last_checkpoint_time), "last_error": self.last_error, "scanned_instances": self.scanned_instances, "exported_instances": self.exported_instances, "failed_instances": self.failed_instances, "orchestrator_instance_id": self.orchestrator_instance_id, - "failures": [f._to_dict() for f in self.failures], + "failures": [f.to_dict() for f in self.failures], } @classmethod - def _from_dict(cls, data: Mapping[str, 
Any]) -> "ExportJobState": + def from_dict(cls, data: Mapping[str, Any]) -> "ExportJobState": version = data.get("schema_version", "1.0") if version != STATE_SCHEMA_VERSION: raise ValueError( @@ -445,32 +428,32 @@ def _from_dict(cls, data: Mapping[str, Any]) -> "ExportJobState": config_data = data.get("config") if not config_data: raise ValueError("persisted state is missing 'config'") - created_at = _dt_from_iso(data.get("created_at")) - last_modified_at = _dt_from_iso(data.get("last_modified_at")) + created_at = dt_from_iso(data.get("created_at")) + last_modified_at = dt_from_iso(data.get("last_modified_at")) if created_at is None or last_modified_at is None: raise ValueError( "persisted state must include 'created_at' and 'last_modified_at'" ) checkpoint_data = data.get("checkpoint") - failures_data = data.get("failures") or [] + failures_data: list[Mapping[str, Any]] = list(data.get("failures") or []) return cls( status=ExportJobStatus(data["status"]), - config=ExportJobConfiguration._from_dict(config_data), + config=ExportJobConfiguration.from_dict(config_data), created_at=created_at, last_modified_at=last_modified_at, orchestrator_instance_id=data.get("orchestrator_instance_id"), checkpoint=( - ExportCheckpoint._from_dict(checkpoint_data) + ExportCheckpoint.from_dict(checkpoint_data) if checkpoint_data is not None else ExportCheckpoint() ), - last_checkpoint_time=_dt_from_iso(data.get("last_checkpoint_time")), + last_checkpoint_time=dt_from_iso(data.get("last_checkpoint_time")), last_error=data.get("last_error"), scanned_instances=int(data.get("scanned_instances", 0)), exported_instances=int(data.get("exported_instances", 0)), failed_instances=int(data.get("failed_instances", 0)), - failures=[ExportFailure._from_dict(f) for f in failures_data], + failures=[ExportFailure.from_dict(f) for f in failures_data], ) # ------------------------------------------------------------------ @@ -483,7 +466,7 @@ def new( config: ExportJobConfiguration, *, created_at: 
datetime, - orchestrator_instance_id: Optional[str] = None, + orchestrator_instance_id: str | None = None, ) -> "ExportJobState": """Construct a fresh state for a newly-created job.""" return cls( @@ -505,20 +488,20 @@ class ExportJobDescription: job_id: str status: ExportJobStatus - created_at: Optional[datetime] - last_modified_at: Optional[datetime] - config: Optional[ExportJobConfiguration] - orchestrator_instance_id: Optional[str] + created_at: datetime | None + last_modified_at: datetime | None + config: ExportJobConfiguration | None + orchestrator_instance_id: str | None scanned_instances: int exported_instances: int failed_instances: int - last_error: Optional[str] - checkpoint: Optional[ExportCheckpoint] - last_checkpoint_time: Optional[datetime] - failures: List[ExportFailure] = field(default_factory=list) + last_error: str | None + checkpoint: ExportCheckpoint | None + last_checkpoint_time: datetime | None + failures: list[ExportFailure] = field(default_factory=list[ExportFailure]) @classmethod - def _from_state(cls, job_id: str, state: "ExportJobState") -> "ExportJobDescription": + def from_state(cls, job_id: str, state: "ExportJobState") -> "ExportJobDescription": return cls( job_id=job_id, status=state.status, @@ -536,8 +519,8 @@ def _from_state(cls, job_id: str, state: "ExportJobState") -> "ExportJobDescript ) @classmethod - def _from_state_dict( + def from_state_dict( cls, job_id: str, state: Mapping[str, Any] ) -> "ExportJobDescription": """Build a description from a persisted entity-state dict.""" - return cls._from_state(job_id, ExportJobState._from_dict(state)) + return cls.from_state(job_id, ExportJobState.from_dict(state)) diff --git a/durabletask/extensions/history_export/orchestrator.py b/durabletask/extensions/history_export/orchestrator.py index 724ca083..eb2846cd 100644 --- a/durabletask/extensions/history_export/orchestrator.py +++ b/durabletask/extensions/history_export/orchestrator.py @@ -5,38 +5,37 @@ Mirrors the .NET 
``ExportJobOrchestrator`` design: -1. Re-fetch the export-job entity state at the top of every loop - iteration via :meth:`OrchestrationContext.call_entity`. If the - job no longer exists (deleted) or is no longer ACTIVE (externally - marked failed/completed), the orchestrator exits cleanly without - issuing any further signals. -2. Ask ``list_terminal_instances`` for one page. -3. Fan out ``export_instance_history`` across the page, respecting - the configured ``max_parallel_exports`` cap, with a per-activity - retry policy. -4. If any individual export still failed after its retries, retry - the *whole page* up to ``MAX_BATCH_RETRY_ATTEMPTS`` times with - exponential backoff. -5. Signal the entity with ``commit_checkpoint`` carrying the page - totals. On persistent batch failure, the signal also carries the - failure list and ``mark_failed_on_batch=True``. -6. In :attr:`ExportMode.BATCH`, break out of the loop when there is - no next page. In :attr:`ExportMode.CONTINUOUS`, sleep for - ``CONTINUOUS_IDLE_DELAY`` on empty pages and continue tailing - forever (until an external stop is observed via step 1). -7. Continue-as-new every ``CONTINUE_AS_NEW_FREQUENCY`` pages to - keep the orchestrator history bounded. -8. In BATCH mode only: on a clean exit, signal ``mark_completed``. - In CONTINUOUS mode, the orchestrator does not mark the job - completed — the job lifecycle is owned by the caller. +1. Resolve the job configuration from the orchestrator input. +2. Loop over pages of terminal instance IDs: + a. Ask ``list_terminal_instances`` for one page. + b. Fan out ``export_instance_history`` across the page, + respecting the configured ``max_parallel_exports`` cap, with + a per-activity retry policy. + c. If any individual export still failed after its retries, + retry the *whole page* once after a backoff timer. + d. Signal the entity with ``commit_checkpoint`` carrying the + page totals. 
On persistent batch failure, the signal also + carries the failure list and ``mark_failed_on_batch=True``. +3. Continue-as-new every ``CONTINUE_AS_NEW_FREQUENCY`` pages to keep + the orchestrator history bounded. +4. Signal ``mark_completed`` (or ``mark_failed`` on any uncaught + exception) and return a summary. + +The orchestrator never reads the entity's state back during normal +operation — except for the lightweight ``call_entity("get")`` at the +top of every loop iteration which lets external delete / mark_failed +signals cancel the orchestrator cleanly. This keeps the orchestrator +history small and avoids round-trip latency. """ from __future__ import annotations +from collections.abc import Generator, Mapping from datetime import timedelta -from typing import Any, List, Mapping, Optional +from typing import Any, TypedDict, cast from durabletask import task +from durabletask import worker as worker_module from durabletask.extensions.history_export._constants import ( ENTITY_NAME, @@ -92,8 +91,16 @@ _DEFAULT_BATCH_RETRY_MAX = timedelta(seconds=300) # Test seams: monkey-patch to small values to keep test runs fast. 
-_BATCH_RETRY_BACKOFF_OVERRIDE: Optional[timedelta] = None -_CONTINUOUS_IDLE_DELAY_OVERRIDE: Optional[timedelta] = None +_BATCH_RETRY_BACKOFF_OVERRIDE: timedelta | None = None +_CONTINUOUS_IDLE_DELAY_OVERRIDE: timedelta | None = None + + +class _ExportActivityResult(TypedDict): + """Shape of the dict returned by ``export_instance_history``.""" + + instance_id: str + success: bool + error: str | None def _batch_retry_delay(attempt: int) -> timedelta: @@ -110,28 +117,34 @@ def _continuous_idle_delay() -> timedelta: return _CONTINUOUS_IDLE_DELAY_OVERRIDE or CONTINUOUS_IDLE_DELAY -def export_job_orchestrator(ctx: task.OrchestrationContext, input: Mapping[str, Any]): +def export_job_orchestrator( + ctx: task.OrchestrationContext, input: Mapping[str, Any], +) -> Generator[Any, Any, Any]: """Drive a single export job through the page → fan-out → checkpoint loop. Input schema:: { "job_id": str, - "config": ExportJobConfiguration._to_dict(), - "checkpoint": ExportCheckpoint._to_dict() (optional), + "config": ExportJobConfiguration.to_dict(), + "checkpoint": ExportCheckpoint.to_dict() (optional), "processed_cycles": int (optional, used for continue-as-new), } """ - job_id = input["job_id"] - config = ExportJobConfiguration._from_dict(input["config"]) + job_id = str(input["job_id"]) + config_input = input["config"] + if not isinstance(config_input, Mapping): + raise TypeError("config input must be a mapping") + config_mapping = cast("Mapping[str, Any]", config_input) + config = ExportJobConfiguration.from_dict(config_mapping) initial_checkpoint = input.get("checkpoint") or {"last_instance_key": None} processed_cycles = int(input.get("processed_cycles", 0)) entity_id = task.EntityInstanceId(ENTITY_NAME, job_id) runtime_status_names = [s.name for s in config.filter.effective_runtime_status()] - continuation_token = initial_checkpoint.get("last_instance_key") + continuation_token: str | None = initial_checkpoint.get("last_instance_key") - totals = {"scanned": 0, "exported": 0, 
"failed": 0} + totals: dict[str, int] = {"scanned": 0, "exported": 0, "failed": 0} try: while True: @@ -139,7 +152,7 @@ def export_job_orchestrator(ctx: task.OrchestrationContext, input: Mapping[str, if processed_cycles > CONTINUE_AS_NEW_FREQUENCY: ctx.continue_as_new({ "job_id": job_id, - "config": input["config"], + "config": dict(config_mapping), "checkpoint": {"last_instance_key": continuation_token}, "processed_cycles": 0, }) @@ -148,21 +161,24 @@ def export_job_orchestrator(ctx: task.OrchestrationContext, input: Mapping[str, # Step 1: re-check the entity's view of the world. This # lets external state changes (delete, mark_failed) cancel # the orchestrator without us having to drain a backlog. - current_state = yield ctx.call_entity(entity_id, "get") + current_state: dict[str, Any] | None = ( + yield ctx.call_entity(entity_id, "get") + ) if current_state is None: logger.info( "Export job %r entity has been deleted; exiting orchestrator", job_id, ) return {"job_id": job_id, "status": "Cancelled", "totals": totals} - if current_state.get("status") != ExportJobStatus.ACTIVE.value: + current_status = current_state.get("status") + if current_status != ExportJobStatus.ACTIVE.value: logger.info( "Export job %r entity status is %s; exiting orchestrator", - job_id, current_state.get("status"), + job_id, current_status, ) return { "job_id": job_id, - "status": current_state.get("status"), + "status": current_status, "totals": totals, } @@ -173,19 +189,20 @@ def export_job_orchestrator(ctx: task.OrchestrationContext, input: Mapping[str, page_size=config.max_instances_per_batch, continuation_token=continuation_token, ) - page = yield ctx.call_activity( + page: dict[str, Any] = yield ctx.call_activity( LIST_TERMINAL_INSTANCES_ACTIVITY, input=list_input ) - instance_ids = page.get("instance_ids") or [] + raw_ids: list[Any] = list(page.get("instance_ids") or []) + instance_ids: list[str] = [str(x) for x in raw_ids] scanned_delta = len(instance_ids) exported_delta = 0 
failed_delta = 0 - batch_failures: List[dict] = [] + batch_failures: list[dict[str, Any]] = [] if instance_ids: batch_succeeded = False - results: List[dict] = [] + results: list[_ExportActivityResult] = [] for attempt in range(1, MAX_BATCH_RETRY_ATTEMPTS + 1): results = yield from _run_page( ctx, @@ -235,7 +252,10 @@ def export_job_orchestrator(ctx: task.OrchestrationContext, input: Mapping[str, f"{failed_delta} instances could not be exported." ) - next_token = page.get("continuation_token") + next_token_raw = page.get("continuation_token") + next_token: str | None = ( + str(next_token_raw) if next_token_raw is not None else None + ) ctx.signal_entity( entity_id, "commit_checkpoint", @@ -276,31 +296,40 @@ def export_job_orchestrator(ctx: task.OrchestrationContext, input: Mapping[str, raise -def _run_page(ctx, *, instance_ids, config, max_parallel): +def _run_page( + ctx: task.OrchestrationContext, + *, + instance_ids: list[str], + config: ExportJobConfiguration, + max_parallel: int, +) -> Generator[Any, Any, list[_ExportActivityResult]]: """Fan out export activities for a single page, bounded by *max_parallel*.""" - destination = config.destination._to_dict() - fmt = config.format._to_dict() + destination = config.destination.to_dict() + fmt = config.format.to_dict() - results: List[dict] = [] + results: list[_ExportActivityResult] = [] for start in range(0, len(instance_ids), max_parallel): chunk = instance_ids[start:start + max_parallel] - chunk_tasks = [ - ctx.call_activity( - EXPORT_INSTANCE_HISTORY_ACTIVITY, - input={ - "instance_id": instance_id, - "format": fmt, - "destination": destination, - }, - retry_policy=EXPORT_ACTIVITY_RETRY_POLICY, + chunk_tasks: list[task.Task[_ExportActivityResult]] = [ + cast( + "task.Task[_ExportActivityResult]", + ctx.call_activity( + EXPORT_INSTANCE_HISTORY_ACTIVITY, + input={ + "instance_id": instance_id, + "format": fmt, + "destination": destination, + }, + retry_policy=EXPORT_ACTIVITY_RETRY_POLICY, + ), ) for instance_id 
in chunk ] - chunk_results = yield task.when_all(chunk_tasks) + chunk_results: list[_ExportActivityResult] = yield task.when_all(chunk_tasks) results.extend(chunk_results) return results -def register(worker_instance) -> None: +def register(worker_instance: worker_module.TaskHubGrpcWorker) -> None: """Convenience helper to register the orchestrator on *worker*.""" worker_instance.add_orchestrator(export_job_orchestrator) diff --git a/durabletask/extensions/history_export/serialization.py b/durabletask/extensions/history_export/serialization.py index 60898e74..2740571b 100644 --- a/durabletask/extensions/history_export/serialization.py +++ b/durabletask/extensions/history_export/serialization.py @@ -25,27 +25,28 @@ import gzip import json -from typing import Any, Iterable, Mapping, Optional, Sequence +from collections.abc import Iterable, Mapping, Sequence +from typing import Any from durabletask import client as client_module from durabletask import history from durabletask import task +from durabletask.extensions.history_export._internal import dt_to_iso from durabletask.extensions.history_export.models import ( ExportFormat, ExportFormatKind, - _dt_to_iso, ) -def event_to_dict(event: history.HistoryEvent) -> dict: +def event_to_dict(event: history.HistoryEvent) -> dict[str, Any]: """Convert a :class:`history.HistoryEvent` into a JSON-safe dict. A discriminator field ``event_type`` is added so downstream consumers can distinguish event subclasses without inspecting their fields. """ - payload = event.to_dict() + payload: dict[str, Any] = event.to_dict() # Insert the discriminator first so the resulting dict orders it # near the front of the JSON object even before any sorting. return {"event_type": type(event).__name__, **payload} @@ -60,7 +61,7 @@ def orchestration_state_to_dict( class names or module paths appear in the resulting dict. 
""" failure = state.failure_details - failure_dict: Optional[dict[str, Any]] = None + failure_dict: dict[str, Any] | None = None if failure is not None: failure_dict = { "message": failure.message, @@ -78,8 +79,8 @@ class names or module paths appear in the resulting dict. "instance_id": state.instance_id, "name": state.name, "runtime_status": state.runtime_status.name, - "created_at": _dt_to_iso(state.created_at), - "last_updated_at": _dt_to_iso(state.last_updated_at), + "created_at": dt_to_iso(state.created_at), + "last_updated_at": dt_to_iso(state.last_updated_at), "serialized_input": state.serialized_input, "serialized_output": state.serialized_output, "serialized_custom_status": state.serialized_custom_status, @@ -87,7 +88,7 @@ class names or module paths appear in the resulting dict. } -def _dump_json(value) -> str: +def _dump_json(value: Any) -> str: return json.dumps( value, sort_keys=True, @@ -101,7 +102,7 @@ def serialize_history( *, instance_id: str, fmt: ExportFormat, - metadata: Optional[Mapping[str, Any]] = None, + metadata: Mapping[str, Any] | None = None, ) -> bytes: """Serialize a list of history events for a single instance. @@ -138,7 +139,7 @@ def _gzip_jsonl( *, instance_id: str, fmt: ExportFormat, - metadata: Optional[Mapping[str, Any]] = None, + metadata: Mapping[str, Any] | None = None, ) -> bytes: # Build the uncompressed JSONL document first so the test surface # can decode the bytes deterministically. 
diff --git a/durabletask/extensions/history_export/transitions.py b/durabletask/extensions/history_export/transitions.py index b44f53eb..268e2093 100644 --- a/durabletask/extensions/history_export/transitions.py +++ b/durabletask/extensions/history_export/transitions.py @@ -17,7 +17,7 @@ from __future__ import annotations -from typing import Mapping, Optional +from collections.abc import Mapping from durabletask.extensions.history_export.exceptions import ( ExportJobInvalidTransitionError, @@ -26,7 +26,7 @@ # Maps (operation_name, current_status_or_None) -> {valid target statuses}. -TRANSITIONS: Mapping[tuple[str, Optional[ExportJobStatus]], frozenset[ExportJobStatus]] = { +TRANSITIONS: Mapping[tuple[str, ExportJobStatus | None], frozenset[ExportJobStatus]] = { # ``create`` initialises a fresh job and revives terminal jobs. ("create", None): frozenset({ExportJobStatus.PENDING}), ("create", ExportJobStatus.FAILED): frozenset({ExportJobStatus.PENDING}), @@ -56,7 +56,7 @@ def is_valid_transition( operation: str, - from_status: Optional[ExportJobStatus], + from_status: ExportJobStatus | None, to_status: ExportJobStatus, ) -> bool: """Return whether *to_status* is reachable from *from_status* via *operation*.""" @@ -66,10 +66,10 @@ def is_valid_transition( def assert_valid_transition( operation: str, - from_status: Optional[ExportJobStatus], + from_status: ExportJobStatus | None, to_status: ExportJobStatus, *, - job_id: Optional[str] = None, + job_id: str | None = None, ) -> None: """Raise :class:`ExportJobInvalidTransitionError` for invalid transitions.""" if not is_valid_transition(operation, from_status, to_status): diff --git a/durabletask/extensions/history_export/writer.py b/durabletask/extensions/history_export/writer.py index e8a0a39d..181895d8 100644 --- a/durabletask/extensions/history_export/writer.py +++ b/durabletask/extensions/history_export/writer.py @@ -18,9 +18,6 @@ Example custom writer:: - from typing import Optional - - class 
LocalFileSystemHistoryWriter: def __init__(self, root_dir: str) -> None: self._root = root_dir @@ -32,7 +29,7 @@ def write( blob_name: str, payload: bytes, content_type: str, - content_encoding: Optional[str], + content_encoding: str | None, ) -> None: import os path = os.path.join(self._root, blob_name) @@ -51,7 +48,7 @@ def write( from __future__ import annotations -from typing import Optional, Protocol, runtime_checkable +from typing import Protocol, runtime_checkable @runtime_checkable @@ -69,7 +66,7 @@ def write( blob_name: str, payload: bytes, content_type: str, - content_encoding: Optional[str], + content_encoding: str | None, ) -> None: """Persist one exported blob. diff --git a/tests/durabletask/extensions/history_export/test_activities.py b/tests/durabletask/extensions/history_export/test_activities.py index d329b659..240e34b7 100644 --- a/tests/durabletask/extensions/history_export/test_activities.py +++ b/tests/durabletask/extensions/history_export/test_activities.py @@ -140,8 +140,8 @@ def test_activities_list_and_export_to_in_memory_writer(c, seeded_ids): "page_size": 50, "continuation_token": None, }, - "format": fmt._to_dict(), - "destination": dest._to_dict(), + "format": fmt.to_dict(), + "destination": dest.to_dict(), } run_id = c.schedule_new_orchestration(_list_then_export, input=orch_input) state = c.wait_for_orchestration_completion( @@ -189,8 +189,8 @@ def write(self, **_): "page_size": 50, "continuation_token": None, }, - "format": fmt._to_dict(), - "destination": dest._to_dict(), + "format": fmt.to_dict(), + "destination": dest.to_dict(), } run_id = c.schedule_new_orchestration(_list_then_export, input=orch_input) state = c.wait_for_orchestration_completion( @@ -218,8 +218,8 @@ def test_activities_require_bound_context(c): "page_size": 50, "continuation_token": None, }, - "format": fmt._to_dict(), - "destination": dest._to_dict(), + "format": fmt.to_dict(), + "destination": dest.to_dict(), } run_id = 
c.schedule_new_orchestration(_list_then_export, input=orch_input) state = c.wait_for_orchestration_completion( diff --git a/tests/durabletask/extensions/history_export/test_entity.py b/tests/durabletask/extensions/history_export/test_entity.py index 4d8399ed..6f28c44c 100644 --- a/tests/durabletask/extensions/history_export/test_entity.py +++ b/tests/durabletask/extensions/history_export/test_entity.py @@ -84,7 +84,7 @@ def _create_payload() -> dict: completed_time_to=_WINDOW_END, destination=ExportDestination(container="exports", prefix="run-1"), ).to_configuration() - return {"config": cfg._to_dict()} + return {"config": cfg.to_dict()} def _state_dict(metadata) -> dict: diff --git a/tests/durabletask/extensions/history_export/test_models.py b/tests/durabletask/extensions/history_export/test_models.py index 078933ca..a05d2d05 100644 --- a/tests/durabletask/extensions/history_export/test_models.py +++ b/tests/durabletask/extensions/history_export/test_models.py @@ -124,12 +124,12 @@ def test_configuration_round_trip(self) -> None: format=ExportFormat(kind=ExportFormatKind.JSON, schema_version="1.0"), max_instances_per_batch=25, ) - restored = ExportJobConfiguration._from_dict(cfg._to_dict()) + restored = ExportJobConfiguration.from_dict(cfg.to_dict()) assert restored == cfg def test_checkpoint_round_trip(self) -> None: cp = ExportCheckpoint(last_instance_key="abc|xyz") - assert ExportCheckpoint._from_dict(cp._to_dict()) == cp + assert ExportCheckpoint.from_dict(cp.to_dict()) == cp def test_failure_round_trip(self) -> None: f = ExportFailure( @@ -138,13 +138,13 @@ def test_failure_round_trip(self) -> None: attempt_count=3, last_attempt=_WINDOW_END, ) - assert ExportFailure._from_dict(f._to_dict()) == f + assert ExportFailure.from_dict(f.to_dict()) == f def test_naive_datetimes_are_treated_as_utc(self) -> None: naive = datetime(2025, 1, 1, 12, 0, 0) f = ExportFilter(completed_time_from=naive, completed_time_to=_WINDOW_END) - d = f._to_dict() - restored = 
ExportFilter._from_dict(d) + d = f.to_dict() + restored = ExportFilter.from_dict(d) assert restored.completed_time_from == naive.replace(tzinfo=timezone.utc) @@ -157,7 +157,7 @@ def test_from_new_state(self) -> None: created_at=created, orchestrator_instance_id="orch-1", ) - desc = ExportJobDescription._from_state_dict("job-1", state._to_dict()) + desc = ExportJobDescription.from_state_dict("job-1", state.to_dict()) assert desc.job_id == "job-1" assert desc.status is ExportJobStatus.ACTIVE @@ -180,20 +180,20 @@ def test_state_round_trip_preserves_schema_version(self) -> None: cfg = _basic_options().to_configuration() created = _WINDOW_END state = ExportJobState.new(cfg, created_at=created) - d = state._to_dict() + d = state.to_dict() assert d["schema_version"] == STATE_SCHEMA_VERSION assert "__class__" not in d # no Python type metadata assert "__type__" not in d - restored = ExportJobState._from_dict(d) + restored = ExportJobState.from_dict(d) assert restored == state def test_unknown_schema_version_is_rejected(self) -> None: cfg = _basic_options().to_configuration() state = ExportJobState.new(cfg, created_at=_WINDOW_END) - bad = state._to_dict() + bad = state.to_dict() bad["schema_version"] = "99.0" with pytest.raises(ValueError, match="schema_version"): - ExportJobState._from_dict(bad) + ExportJobState.from_dict(bad) def test_state_carries_failures(self) -> None: cfg = _basic_options().to_configuration() @@ -205,5 +205,5 @@ def test_state_carries_failures(self) -> None: ) state = ExportJobState.new(cfg, created_at=_WINDOW_END) state.failures.append(f) - restored = ExportJobState._from_dict(state._to_dict()) + restored = ExportJobState.from_dict(state.to_dict()) assert restored.failures == [f] From 50bb4a586a3af4d85953d0c66c84e263d8893af5 Mon Sep 17 00:00:00 2001 From: Andy Staples Date: Wed, 3 Jun 2026 11:22:01 -0600 Subject: [PATCH 3/9] PR feedback --- docs/features.md | 11 +++++--- .../extensions/history_export/activities.py | 5 ++++ 
.../extensions/history_export/azure_blob.py | 28 +++++++++++++++++-- .../extensions/history_export/client.py | 7 +++-- .../extensions/history_export/models.py | 17 ++++------- .../extensions/history_export/writer.py | 17 ++++++++++- examples/history_export/app.py | 7 ++++- .../history_export/test_activities.py | 2 +- .../test_azure_blob_writer_e2e.py | 2 ++ .../extensions/history_export/test_client.py | 2 +- .../history_export/test_orchestrator.py | 2 +- 11 files changed, 75 insertions(+), 25 deletions(-) diff --git a/docs/features.md b/docs/features.md index 6c8f837b..acbba2a9 100644 --- a/docs/features.md +++ b/docs/features.md @@ -529,8 +529,6 @@ inheritance required — it's a `typing.Protocol`) to send exports to any destination (S3, GCS, SFTP, local filesystem, a database, etc.): ```python -from typing import Optional - from durabletask.extensions.history_export import HistoryWriter @@ -542,13 +540,18 @@ class LocalFileSystemHistoryWriter: self, *, instance_id: str, + container: str, blob_name: str, payload: bytes, content_type: str, - content_encoding: Optional[str], + content_encoding: str | None, ) -> None: import os - path = os.path.join(self._root, blob_name) + # ``container`` is the destination's logical container name + # (an ExportDestination.container). Per-job routing writers + # combine it with ``blob_name``; writers that pin to a fixed + # location at construction time may ignore it. 
+ path = os.path.join(self._root, container, blob_name) os.makedirs(os.path.dirname(path), exist_ok=True) with open(path, "wb") as fp: fp.write(payload) diff --git a/durabletask/extensions/history_export/activities.py b/durabletask/extensions/history_export/activities.py index e4c48fab..0e33ab80 100644 --- a/durabletask/extensions/history_export/activities.py +++ b/durabletask/extensions/history_export/activities.py @@ -149,6 +149,10 @@ def export_instance_history( raise TypeError("format must be a mapping") fmt = ExportFormat.from_dict(cast("Mapping[str, Any]", fmt_input)) destination_raw: Mapping[str, Any] = input.get("destination") or {} + container_raw: Any = destination_raw.get("container") + if not container_raw: + raise ValueError("destination.container is required") + container: str = str(container_raw) prefix_raw: Any = destination_raw.get("prefix") prefix: str | None = str(prefix_raw) if prefix_raw is not None else None @@ -169,6 +173,7 @@ def export_instance_history( blob_name = _blob_name_for(instance_id=instance_id, prefix=prefix, fmt=fmt) ctx.writer.write( instance_id=instance_id, + container=container, blob_name=blob_name, payload=payload, content_type=content_type_for(fmt), diff --git a/durabletask/extensions/history_export/azure_blob.py b/durabletask/extensions/history_export/azure_blob.py index 203fc659..992d1b37 100644 --- a/durabletask/extensions/history_export/azure_blob.py +++ b/durabletask/extensions/history_export/azure_blob.py @@ -65,11 +65,19 @@ class AzureBlobHistoryExportWriterOptions: def __post_init__(self) -> None: if not self.container_name: raise ValueError("container_name is required") + if self.connection_string and self.account_url: + raise ValueError( + "'connection_string' and 'account_url' are mutually exclusive" + ) if not self.connection_string and not self.account_url: raise ValueError( "Either 'connection_string' or 'account_url' (with 'credential') " "must be provided" ) + if self.account_url and self.credential is None: 
+ raise ValueError( + "'credential' is required when 'account_url' is provided" + ) class AzureBlobHistoryExportWriter: @@ -116,19 +124,33 @@ def write( self, *, instance_id: str, + container: str, blob_name: str, payload: bytes, content_type: str, content_encoding: str | None, ) -> None: del instance_id # included by the protocol but not needed here + # This writer pins to the container configured at construction + # time and ignores the per-call ``container`` argument; the + # configured value is authoritative for any given writer + # instance. Run a separate writer per destination container + # if you need per-job routing. + del container self._ensure_container() container_client = self._service.get_container_client( self._options.container_name ) - content_settings = ContentSettings( - content_type=content_type, - content_encoding=content_encoding or "", + # Only set Content-Encoding if the format actually compresses + # the payload; an empty header value would be persisted on + # the blob and confuse downstream clients. + content_settings = ( + ContentSettings( + content_type=content_type, + content_encoding=content_encoding, + ) + if content_encoding + else ContentSettings(content_type=content_type) ) container_client.upload_blob( name=blob_name, diff --git a/durabletask/extensions/history_export/client.py b/durabletask/extensions/history_export/client.py index 600bf41a..d41c5590 100644 --- a/durabletask/extensions/history_export/client.py +++ b/durabletask/extensions/history_export/client.py @@ -219,7 +219,10 @@ def list_jobs( instance_id_starts_with=_ENTITY_ID_PREFIX, last_modified_from=query.last_modified_from, last_modified_to=query.last_modified_to, - include_state=query.include_state, + # list_jobs always needs the persisted state to populate + # ExportJobDescription; an entity-only view doesn't carry + # status or progress and would always be filtered out. 
+ include_state=True, page_size=query.page_size, ) status_filter = set(query.status) if query.status else None @@ -230,7 +233,7 @@ def list_jobs( # explicit entity-name check. if meta.id.entity != ENTITY_NAME.lower(): continue - raw = meta.get_state(str) if meta.includes_state else None + raw = meta.get_state(str) if not raw: continue try: diff --git a/durabletask/extensions/history_export/models.py b/durabletask/extensions/history_export/models.py index 3d139861..614a11ff 100644 --- a/durabletask/extensions/history_export/models.py +++ b/durabletask/extensions/history_export/models.py @@ -64,14 +64,12 @@ class ExportJobStatus(Enum): Status meanings --------------- ``PENDING`` - The job has been created but the entity has not yet processed - the ``create`` signal, *or* the entity has accepted the - configuration but has not yet kicked off its driving - orchestrator. The value is reserved for the forthcoming - ``run`` operation (see the .NET ``ExportJob.Run`` pattern); - the current implementation transitions directly from creation - to :attr:`ACTIVE`, so jobs are not persisted in ``Pending`` - today. + The job has been created and persisted but the entity has not + yet kicked off its driving orchestrator. Jobs sit in this + state briefly between the ``create`` and ``run`` signals + (the public client sends both in immediate succession), or + for longer if ``run`` is never invoked or if a caller revives + a previously terminal job via ``create``. ``ACTIVE`` The job is running and the driving orchestrator is making progress through pages of terminal instances. @@ -305,15 +303,12 @@ class ExportJobQuery: this timestamp are returned. page_size: Backend page size used to enumerate the underlying entities. - include_state: Whether to fetch full job state (set ``False`` - to retrieve job IDs and metadata only). 
""" status: list[ExportJobStatus] | None = None last_modified_from: datetime | None = None last_modified_to: datetime | None = None page_size: int | None = None - include_state: bool = True @dataclass diff --git a/durabletask/extensions/history_export/writer.py b/durabletask/extensions/history_export/writer.py index 181895d8..bea008a2 100644 --- a/durabletask/extensions/history_export/writer.py +++ b/durabletask/extensions/history_export/writer.py @@ -26,13 +26,18 @@ def write( self, *, instance_id: str, + container: str, blob_name: str, payload: bytes, content_type: str, content_encoding: str | None, ) -> None: import os - path = os.path.join(self._root, blob_name) + # The ``container`` value comes from the export job's + # ExportDestination.container and is the logical + # bucket / subdirectory the caller asked the job to + # write into. + path = os.path.join(self._root, container, blob_name) os.makedirs(os.path.dirname(path), exist_ok=True) with open(path, "wb") as fp: fp.write(payload) @@ -63,6 +68,7 @@ def write( self, *, instance_id: str, + container: str, blob_name: str, payload: bytes, content_type: str, @@ -74,8 +80,17 @@ def write( instance_id: The orchestration instance whose history this payload represents. Provided so destinations may use it as a key, metadata, or sharding hint. + container: The destination container / bucket name the + job's :class:`ExportDestination` declared. Writers + that want to honour per-job container routing should + use this value; writers that pin to a fixed container + at construction time (such as the bundled Azure Blob + writer) may ignore it. blob_name: Destination-relative path / key, including any configured destination prefix and file extension. + Does NOT include the ``container`` component — a + writer that routes per-container is expected to + combine the two. payload: The serialized history bytes. Already compressed if the configured format calls for it. 
content_type: The HTTP-style content type appropriate for diff --git a/examples/history_export/app.py b/examples/history_export/app.py index 37d6668d..c0d72995 100644 --- a/examples/history_export/app.py +++ b/examples/history_export/app.py @@ -59,7 +59,12 @@ def sample_orchestrator(ctx: task.OrchestrationContext, n: int): def main() -> None: print(f"Using container: {CONTAINER_NAME}") - print(f"Using storage connection: {AZURITE_CONN_STR}") + # Avoid printing the raw connection string — a real Azure Storage + # connection string contains the account key. + if AZURITE_CONN_STR == "UseDevelopmentStorage=true": + print("Using storage connection: Azurite (UseDevelopmentStorage=true)") + else: + print("Using storage connection: (redacted)") backend = create_test_backend(port=50300) try: diff --git a/tests/durabletask/extensions/history_export/test_activities.py b/tests/durabletask/extensions/history_export/test_activities.py index 240e34b7..3ce3788b 100644 --- a/tests/durabletask/extensions/history_export/test_activities.py +++ b/tests/durabletask/extensions/history_export/test_activities.py @@ -46,7 +46,7 @@ def __init__(self) -> None: self._lock = threading.Lock() self.blobs: dict[str, dict] = {} - def write(self, *, instance_id, blob_name, payload, content_type, content_encoding): + def write(self, *, instance_id, container, blob_name, payload, content_type, content_encoding): with self._lock: self.blobs[blob_name] = { "instance_id": instance_id, diff --git a/tests/durabletask/extensions/history_export/test_azure_blob_writer_e2e.py b/tests/durabletask/extensions/history_export/test_azure_blob_writer_e2e.py index 0fb815f0..1934d0ec 100644 --- a/tests/durabletask/extensions/history_export/test_azure_blob_writer_e2e.py +++ b/tests/durabletask/extensions/history_export/test_azure_blob_writer_e2e.py @@ -93,6 +93,7 @@ def test_write_json_blob(writer): blob_name = f"json/{uuid.uuid4().hex}.json" writer.write( instance_id="inst-json", + container=TEST_CONTAINER, 
blob_name=blob_name, payload=payload, content_type=content_type_for(fmt), @@ -118,6 +119,7 @@ def test_write_jsonl_gzip_blob(writer): blob_name = f"gz/{uuid.uuid4().hex}.jsonl.gz" writer.write( instance_id="inst-gz", + container=TEST_CONTAINER, blob_name=blob_name, payload=payload, content_type=content_type_for(fmt), diff --git a/tests/durabletask/extensions/history_export/test_client.py b/tests/durabletask/extensions/history_export/test_client.py index 7076b5eb..ee24e8f8 100644 --- a/tests/durabletask/extensions/history_export/test_client.py +++ b/tests/durabletask/extensions/history_export/test_client.py @@ -46,7 +46,7 @@ def __init__(self) -> None: self._lock = threading.Lock() self.blobs: dict[str, dict] = {} - def write(self, *, instance_id, blob_name, payload, content_type, content_encoding): + def write(self, *, instance_id, container, blob_name, payload, content_type, content_encoding): with self._lock: self.blobs[blob_name] = { "instance_id": instance_id, diff --git a/tests/durabletask/extensions/history_export/test_orchestrator.py b/tests/durabletask/extensions/history_export/test_orchestrator.py index aacebe4f..2c914427 100644 --- a/tests/durabletask/extensions/history_export/test_orchestrator.py +++ b/tests/durabletask/extensions/history_export/test_orchestrator.py @@ -44,7 +44,7 @@ def __init__(self) -> None: self._lock = threading.Lock() self.blobs: dict[str, dict] = {} - def write(self, *, instance_id, blob_name, payload, content_type, content_encoding): + def write(self, *, instance_id, container, blob_name, payload, content_type, content_encoding): with self._lock: self.blobs[blob_name] = { "instance_id": instance_id, From 5b7d9704a761caeb9a2fddfce86797ebf73188ca Mon Sep 17 00:00:00 2001 From: Andy Staples Date: Wed, 3 Jun 2026 11:28:22 -0600 Subject: [PATCH 4/9] Use relative import for _test_helpers (CI fix) The absolute 'from tests.durabletask...' 
import worked locally because pip install -e .[dev] adds the repo root to sys.path, but CI runs pytest without that and pytest's rootdir doesn't include a 'tests' top-level package. Switch to a relative import inside the history_export test package (which already has __init__.py). --- tests/durabletask/extensions/history_export/test_client.py | 2 +- tests/durabletask/extensions/history_export/test_entity.py | 2 +- .../durabletask/extensions/history_export/test_orchestrator.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/durabletask/extensions/history_export/test_client.py b/tests/durabletask/extensions/history_export/test_client.py index ee24e8f8..dd6698b8 100644 --- a/tests/durabletask/extensions/history_export/test_client.py +++ b/tests/durabletask/extensions/history_export/test_client.py @@ -34,7 +34,7 @@ from durabletask.extensions.history_export.activities import clear_context from durabletask.testing import create_test_backend -from tests.durabletask.extensions.history_export._test_helpers import wait_until +from ._test_helpers import wait_until PORT = 50263 diff --git a/tests/durabletask/extensions/history_export/test_entity.py b/tests/durabletask/extensions/history_export/test_entity.py index 6f28c44c..e359ce67 100644 --- a/tests/durabletask/extensions/history_export/test_entity.py +++ b/tests/durabletask/extensions/history_export/test_entity.py @@ -30,7 +30,7 @@ ) from durabletask.testing import create_test_backend -from tests.durabletask.extensions.history_export._test_helpers import wait_until +from ._test_helpers import wait_until PORT = 50260 diff --git a/tests/durabletask/extensions/history_export/test_orchestrator.py b/tests/durabletask/extensions/history_export/test_orchestrator.py index 2c914427..a9c651ed 100644 --- a/tests/durabletask/extensions/history_export/test_orchestrator.py +++ b/tests/durabletask/extensions/history_export/test_orchestrator.py @@ -32,7 +32,7 @@ from durabletask.extensions.history_export import 
orchestrator as orch_mod from durabletask.testing import create_test_backend -from tests.durabletask.extensions.history_export._test_helpers import wait_until +from ._test_helpers import wait_until PORT = 50262 From 5a5f1a3e42341a3efb24d408fe5a5058e576558b Mon Sep 17 00:00:00 2001 From: Andy Staples Date: Wed, 3 Jun 2026 12:15:11 -0600 Subject: [PATCH 5/9] PR Feedback 2 --- .../plan-orchestrationHistoryExport.prompt.md | 438 ------------------ .../extensions/history_export/__init__.py | 4 +- .../extensions/history_export/activities.py | 19 +- .../extensions/history_export/azure_blob.py | 8 +- .../extensions/history_export/client.py | 57 ++- .../extensions/history_export/entity.py | 73 ++- .../extensions/history_export/models.py | 110 ++++- .../extensions/history_export/orchestrator.py | 145 +++--- examples/history_export/README.md | 3 +- examples/history_export/app.py | 90 ++-- pyproject.toml | 2 +- .../extensions/history_export/test_entity.py | 53 ++- .../extensions/history_export/test_models.py | 119 +++++ .../history_export/test_orchestrator.py | 78 ++++ 14 files changed, 622 insertions(+), 577 deletions(-) delete mode 100644 docs/plan-orchestrationHistoryExport.prompt.md diff --git a/docs/plan-orchestrationHistoryExport.prompt.md b/docs/plan-orchestrationHistoryExport.prompt.md deleted file mode 100644 index 971b0d6e..00000000 --- a/docs/plan-orchestrationHistoryExport.prompt.md +++ /dev/null @@ -1,438 +0,0 @@ -# Plan: Python History Export Parity - -Add orchestration history export to durabletask-python in two layers: first expose the existing sidecar capabilities that are already present in protobuf but missing from the Python client and test backend; then build a higher-level export job workflow modeled on durabletask-dotnet ExportHistory, with Azure Blob as the first destination via an optional extension. This keeps the core SDK transport-focused while still achieving full feature parity. 
- -## Target Scope - -**Full parity path**: core retrieval/list/rewind APIs plus a higher-level job-based export workflow modeled on the .NET implementation. - -**Recommended packaging split**: Core SDK owns retrieval/list/rewind and generic history serialization helpers; Azure Blob export workflow lives behind optional dependencies in an extension-style module. - -**Initial destination scope**: Azure Blob only, matching durabletask-dotnet's current export package. Do not generalize destination providers until a second real provider exists. - -**Initial format scope**: JSON and JSONL gzip with explicit schema versioning. Defer CSV, Parquet, and import/replay-from-export features. - -## Phase 1: Core History Foundations - -### 1.1 Add Core Client APIs -**Files**: [durabletask/client.py](durabletask/client.py#L213) and [durabletask/client.py](durabletask/client.py#L428) - -Add sync and async methods to both `TaskHubGrpcClient` and `AsyncTaskHubGrpcClient`: -- `get_orchestration_history(instance_id) -> Iterable[HistoryEvent]` — stream all HistoryEvent messages for an instance. -- `list_instance_ids(runtime_status, completed_time_from, completed_time_to, page_size) -> Page[List[str]]` — paginate terminal instance IDs by completion-time window and status filter. -- `rewind_orchestration(instance_id, reason) -> None` — rewind a failed orchestration (if the backend supports it). - -Implementation should: -- Reuse the existing gRPC stubs already declared in [durabletask/internal/orchestrator_service_pb2_grpc.py](durabletask/internal/orchestrator_service_pb2_grpc.py). -- Handle gRPC error codes (NOT_FOUND, UNIMPLEMENTED, CANCELLED, INTERNAL) and map to Python exceptions. -- For streamed history, aggregate HistoryChunk messages and yield or return individual HistoryEvent objects. -- De-externalize nested payload tokens if a payload_store is configured (reuse logic from [durabletask/payload/helpers.py](durabletask/payload/helpers.py)).
-- Log operations consistently with existing client methods. - -### 1.2 Implement In-Memory Backend History Support -**Files**: [durabletask/testing/in_memory_backend.py](durabletask/testing/in_memory_backend.py#L1086) - -Implement two currently-stubbed gRPC servicer methods: -- `StreamInstanceHistory(request: StreamInstanceHistoryRequest, context)` — yield HistoryChunk messages containing events from the instance's history list, paginating or chunking as needed. -- `ListInstanceIds(request: ListInstanceIdsRequest, context)` — iterate stored instances, filter by terminal status and completion-time window, and return a paginated response with continuation token. - -Optionally decide whether to implement `RewindInstance`: -- **Recommendation**: Mark as explicitly unsupported initially (abort with UNIMPLEMENTED); add later if demand is high or if rewound instances are needed for tests. -- If implemented, reset the instance's history to exclude the failed events and restart with a new execution ID. - -### 1.3 Add History Helper Utilities -**Files**: New module `durabletask/internal/history_helpers.py` or extend [durabletask/payload/helpers.py](durabletask/payload/helpers.py) - -Provide internal helpers for: -- **Payload de-externalization in history**: walk nested HistoryEvent fields and replace payload tokens with original data if a store is configured. -- **Event-to-dict conversion**: convert a HistoryEvent protobuf to a serializable dict for JSON export (used later). -- **Event filtering**: filter a list of HistoryEvent by type, timestamp range, or other criteria (optional; can be deferred to export layer). - -### 1.4 Settle on Public History Return Type -**Decision Point** - -Options: -1. **Raw protobuf** (recommended for Phase 1): Return `Iterable[pb.HistoryEvent]` to callers. Low risk of churn, users who need export can call helper utilities. Matches .NET baseline. -2. 
**Python dataclass wrapper** (higher initial investment): Define a `HistoryEventData` class and convert all HistoryEvent messages to it. Better UX but requires more upfront design. -3. **Both** (post-Phase 1): Start with raw protobuf; add a Python wrapper class in Phase 2 if export code needs the conversion anyway. - -**Recommendation**: Start with raw protobuf. Keep the public API minimal and transport-focused. Add serialization helpers (internal) that the export layer can use. - -### 1.5 Update Tests -**Files**: [tests/durabletask/test_client.py](tests/durabletask/test_client.py), [tests/durabletask/test_orchestration_executor.py](tests/durabletask/test_orchestration_executor.py), new test file for history retrieval - -Add tests for: -- **Client API tests**: Verify `get_orchestration_history()`, `list_instance_ids()` make correct gRPC requests, handle streaming/pagination, de-externalize payloads (reuse FakePayloadStore from large_payload tests), and map errors. -- **Backend tests**: Verify in-memory history streaming returns events in order, ListInstanceIds paginates correctly by status/time, and continuation tokens work. -- **Error handling**: Verify NOT_FOUND, UNIMPLEMENTED, CANCELLED, INTERNAL errors are mapped appropriately. - -### 1.6 Update Core Changelog -**Files**: [CHANGELOG.md](CHANGELOG.md#L7) - -Under `## Unreleased`, add: -``` -ADDED - -- Added `get_orchestration_history(instance_id)` and async variant to both gRPC client classes for streaming instance history events. -- Added `list_instance_ids(runtime_status, completed_time_from, completed_time_to, ...)` to support filtering terminal instances by completion time and status. Supports pagination via continuation tokens. -- Added `rewind_orchestration(instance_id, reason)` and async variant for rewinding failed orchestrations (backend support may vary). -- In-memory backend now implements `StreamInstanceHistory` and `ListInstanceIds` gRPC methods for testing.
-- Added internal history utility functions for payload de-externalization and event serialization. -``` - ---- - -## Phase 2: Export Job Workflow - -### 2.1 Package Structure -**New module**: `durabletask/extensions/history_export/` or a new separate package (if isolated from core) - -**Recommendation**: Follow the existing extension pattern. Place under `durabletask/extensions/` as a submodule with optional (but recommended) Azure Blob dependencies: - -``` -durabletask/ -└── extensions/ - └── history_export/ - ├── __init__.py - ├── client.py # ExportHistoryClient, ExportHistoryJobClient - ├── models/ - │ ├── __init__.py - │ ├── export_job_state.py - │ ├── export_checkpoint.py - │ ├── export_destination.py - │ ├── export_filter.py - │ ├── export_format.py - │ ├── export_failure.py - │ ├── export_job_description.py - │ ├── export_job_configuration.py - │ ├── export_job_status.py # Enum: Active, Completed, Failed - │ ├── export_mode.py # Enum: Batch, Continuous - │ └── export_job_creation_options.py - ├── entity.py # ExportJob durable entity - ├── serialization.py # JSON/JSONL serialization logic - └── orchestrations/ - ├── __init__.py - ├── export_job_orchestrator.py - └── activities/ - ├── __init__.py - ├── list_terminal_instances.py - ├── export_instance_history.py - └── helpers.py - -tests/ -└── durabletask/ - └── extensions/ - └── history_export/ - ├── test_export_client.py - ├── test_export_models.py - ├── test_export_entity.py - ├── test_export_orchestrator.py - ├── test_export_activities.py - └── test_serialization.py -``` - -### 2.2 Models and Data Types -**Files**: `durabletask/extensions/history_export/models/` - -Define the following (inspired by durabletask-dotnet ExportHistory models): - -```python -# export_mode.py -class ExportMode(Enum): - BATCH = 1 # Export a fixed time window, then complete - CONTINUOUS = 2 # Tail terminal instances continuously - -# export_format.py -class ExportFormatKind(Enum): - JSON = 1 # Array of events, uncompressed - JSONL 
= 2 # One event per line, gzip compressed - -@dataclass -class ExportFormat: - kind: ExportFormatKind = ExportFormatKind.JSONL - schema_version: str = "1.0" - -# export_destination.py -@dataclass -class ExportDestination: - container: str # Azure Blob container name - prefix: Optional[str] = None # Optional blob prefix - -# export_filter.py -@dataclass -class ExportFilter: - completed_time_from: datetime # Inclusive lower bound - completed_time_to: Optional[datetime] = None # Inclusive upper bound - runtime_status: Optional[List[OrchestrationStatus]] = None # Filter by status - -# export_checkpoint.py -@dataclass -class ExportCheckpoint: - last_instance_key: Optional[str] = None # Continuation token for ListInstanceIds - -# export_failure.py -@dataclass -class ExportFailure: - instance_id: str - reason: str - attempt_count: int - last_attempt: datetime - -# export_job_status.py -class ExportJobStatus(Enum): - ACTIVE = "Active" - COMPLETED = "Completed" - FAILED = "Failed" - -# export_job_state.py -@dataclass -class ExportJobState: - status: ExportJobStatus - config: Optional['ExportJobConfiguration'] = None - checkpoint: Optional[ExportCheckpoint] = None - created_at: Optional[datetime] = None - last_modified_at: Optional[datetime] = None - last_checkpoint_time: Optional[datetime] = None - last_error: Optional[str] = None - scanned_instances: int = 0 - exported_instances: int = 0 - orchestrator_instance_id: Optional[str] = None - -# export_job_configuration.py -@dataclass -class ExportJobConfiguration: - mode: ExportMode - filter: ExportFilter - destination: ExportDestination - format: ExportFormat - max_parallel_exports: int = 32 - max_instances_per_batch: int = 100 - -# export_job_creation_options.py -@dataclass -class ExportJobCreationOptions: - mode: ExportMode - completed_time_from: datetime - completed_time_to: Optional[datetime] # Required for Batch, None for Continuous - destination: Optional[ExportDestination] - job_id: Optional[str] = None - format: 
ExportFormat = field(default_factory=lambda: ExportFormat()) - runtime_status: Optional[List[OrchestrationStatus]] = None # Defaults to terminal statuses - max_instances_per_batch: int = 100 - -# export_job_description.py -@dataclass -class ExportJobDescription: - job_id: str - status: ExportJobStatus - created_at: Optional[datetime] - last_modified_at: Optional[datetime] - config: Optional[ExportJobConfiguration] - orchestrator_instance_id: Optional[str] - scanned_instances: int - exported_instances: int - last_error: Optional[str] - checkpoint: Optional[ExportCheckpoint] - last_checkpoint_time: Optional[datetime] -``` - -### 2.3 Durable Entity for Job State -**Files**: `durabletask/extensions/history_export/entity.py` - -Implement `ExportJob` as a durable entity with operations: -- `create(context, creation_options: ExportJobCreationOptions)` — initialize job state and validate transitions. -- `get(context, _=None) -> ExportJobState` — fetch current state. -- `run(context, _=None)` — signal to start the export orchestrator. -- `commit_checkpoint(context, request: CommitCheckpointRequest)` — update progress, checkpoint, status. -- `mark_as_completed(context, _=None)` — transition to Completed. -- `mark_as_failed(context, error_message)` — transition to Failed. -- `delete(context, _=None)` — delete the entity. - -Include transition validation: define which operations are valid from which states (similar to ExportJobTransitions in .NET). - -### 2.4 Export Client -**Files**: `durabletask/extensions/history_export/client.py` - -Provide two public classes: - -```python -class ExportHistoryClient: - def __init__(self, durable_task_client: TaskHubGrpcClient, storage_options: ExportHistoryStorageOptions): - ... - - async def create_job_async(self, options: ExportJobCreationOptions) -> ExportHistoryJobClient: - """Create a new export job.""" - ... - - async def get_job_async(self, job_id: str) -> ExportJobDescription: - """Fetch a job by ID.""" - ... 
- - async def list_jobs_async(self, filter: Optional[ExportJobQuery] = None) -> AsyncIterable[ExportJobDescription]: - """List all export jobs, optionally filtered.""" - ... - - def get_job_client(self, job_id: str) -> ExportHistoryJobClient: - """Get a client for a specific job.""" - ... - -class ExportHistoryJobClient: - def __init__(self, job_id: str, ...): - ... - - async def create_async(self, options: ExportJobCreationOptions) -> None: - """Create the export job.""" - ... - - async def describe_async(self) -> ExportJobDescription: - """Get job status.""" - ... - - async def delete_async(self) -> None: - """Delete the job and terminate its orchestrator.""" - ... -``` - -Implementation: -- Use durable entities via `durable_task_client.signal_entity()` and `get_entity()` to manage job state. -- Wrap the entity ID as `ExportJob@{job_id}`. -- Schedule an orchestrator named `ExportJobOrchestrator` with a fixed instance ID pattern (e.g., `ExportJob-{job_id}`) to ensure one orchestrator per job. - -### 2.5 Activities -**Files**: `durabletask/extensions/history_export/orchestrations/activities/` - -Implement two activities: - -#### 2.5.1 ListTerminalInstancesActivity -Input: `ListTerminalInstancesRequest(completed_time_from, completed_time_to, runtime_status, last_instance_key, max_instances_per_batch)` -Output: `InstancePage(instance_ids: List[str], next_checkpoint: ExportCheckpoint)` - -Logic: -- Call the core client's `list_instance_ids()` with the filter parameters. -- Return a page of instance IDs and a checkpoint for the next call. - -#### 2.5.2 ExportInstanceHistoryActivity -Input: `ExportRequest(instance_id, destination, format)` -Output: `ExportResult(instance_id, success, error)` - -Logic: -- Fetch the instance's history using the core client's `get_orchestration_history()`. -- Fetch metadata using `get_orchestration_state()`. -- Serialize the history and metadata to the specified format (JSON or JSONL gzip). -- Upload to Azure Blob Storage. 
-- Return success/failure result. - -### 2.6 Export Orchestrator -**Files**: `durabletask/extensions/history_export/orchestrations/export_job_orchestrator.py` - -Orchestrate the export workflow: - -Input: `ExportJobRunRequest(job_entity_id, processed_cycles)` - -Logic: -1. Fetch job state from entity. If not Active, exit. -2. Call `ListTerminalInstancesActivity` to get a page of instance IDs. -3. If no instances and mode is Continuous, sleep and retry. If Batch, exit. -4. Call `ExportInstanceHistoryActivity` for each instance in parallel (bounded by `max_parallel_exports`). -5. With exponential backoff, retry failed exports up to 3 times. -6. Commit checkpoint if successful; record failures and stay at current checkpoint if batch fails. -7. If processed_cycles > 5, continue-as-new to reset history and prevent bloat. -8. Mark job as Completed or Failed based on final result. - -### 2.7 Serialization -**Files**: `durabletask/extensions/history_export/serialization.py` - -Provide functions: - -```python -def serialize_history( - instance_id: str, - metadata: OrchestrationState, - history: Iterable[pb.HistoryEvent], - format: ExportFormat, -) -> bytes: - """Serialize history and metadata to JSON or JSONL gzip.""" - ... - -def event_to_dict(event: pb.HistoryEvent) -> dict: - """Convert protobuf HistoryEvent to serializable dict.""" - ... -``` - -Implementation: -- For JSON: return an array of event dicts with metadata. -- For JSONL: return one event per line, gzip compressed. -- Preserve all event fields and handle polymorphic event types (use protobuf reflection or explicit converters). -- Skip internal fields (e.g., timestamps for WorkItem processing). - -### 2.8 Azure Blob Storage Upload -**Files**: Same activity file or new `durabletask/extensions/history_export/orchestrations/azure_storage.py` - -Use `azure.storage.blob.BlobClient` to upload serialized data: -- Generate a deterministic blob name (e.g., hash of completed time + instance ID). 
-- Include instance ID as blob metadata. -- Handle connection strings from `ExportHistoryStorageOptions`. - -### 2.9 Tests -**Files**: `tests/durabletask/extensions/history_export/` - -Add tests for: -- Export client creation, job listing, job description. -- Export job entity lifecycle (create, run, checkpoint, complete, fail, delete). -- Activity logic (ListTerminalInstancesActivity, ExportInstanceHistoryActivity). -- Orchestrator flow (paging, retries, continues-as-new, checkpoint commits). -- Serialization (JSON and JSONL formats, large nested payloads, polymorphic events). -- Azure Blob integration (mocked or using Azure Test Containers if the repo has that pattern). -- Batch vs. Continuous modes. -- Error transitions and recovery. - -### 2.10 Update Extension Changelog -**Files**: New `durabletask/extensions/history_export/CHANGELOG.md` or extend core [CHANGELOG.md](CHANGELOG.md) - -Document the new export extension, models, and client APIs. - ---- - -## Verification Checklist - -### Phase 1 -- [ ] Unit tests for `get_orchestration_history()`, `list_instance_ids()`, `rewind_orchestration()` (sync and async). -- [ ] Backend tests for in-memory StreamInstanceHistory and ListInstanceIds. -- [ ] Error mapping tests (NOT_FOUND, UNIMPLEMENTED, CANCELLED, INTERNAL). -- [ ] Payload de-externalization tests (use FakePayloadStore). -- [ ] Pylance diagnostics on modified client and backend files. -- [ ] `flake8` on modified Python files. -- [ ] Targeted pytest for client, backend, and large_payload tests. - -### Phase 2 -- [ ] Unit tests for all model classes (dataclass validation, enum values). -- [ ] Entity tests (create, get, signal operations, state transitions). -- [ ] Activity tests (ListTerminalInstancesActivity, ExportInstanceHistoryActivity). -- [ ] Orchestrator tests (paging, retries, continues-as-new, checkpointing). -- [ ] Serialization tests (JSON, JSONL gzip, nested payloads, polymorphic events). 
-- [ ] Integration tests (end-to-end export with mock or test Azure Blob). -- [ ] Batch and Continuous mode tests. -- [ ] Pylance diagnostics on export extension files. -- [ ] `flake8` on all new files. -- [ ] Targeted pytest for export extension tests. -- [ ] Optional: Azure Test Containers integration (if repo has pattern). - ---- - -## Further Considerations - -1. **Public history return type**: Keeping raw protobuf HistoryEvent in Phase 1 allows Phase 2 serialization logic to reuse the message introspection without forcing a new public model. Consider wrapping in Phase 2 if users ask. - -2. **Export package naming**: Avoid adding many methods directly to the core `TaskHubGrpcClient`. Instead, keep export under a clear namespace (`extensions.history_export` or a separate distribution) to signal that it is optional and not part of the core SDK. - -3. **Continuous export semantics**: Follow the .NET pattern closely: - - Tail terminal instances from a completion-time watermark. - - Persist checkpoint to entity state to survive orchestrator restarts. - - Use periodic continue-as-new or equivalent restart behavior if Python orchestrator history gets too large. - - Sleep between empty pages to avoid busy-waiting. - -4. **Dependency management**: The export extension should declare optional Azure Storage dependencies (e.g., `pip install durabletask[history-export]` pulls in `azure-storage-blob`). - -5. **Rewind support**: Rewind is a lower-priority feature. Consider leaving it unsupported in the in-memory backend for now and adding it only if users need it or if it's required for export testing. - -6. **Future extensions**: Design the destination abstraction (ExportDestination, upload logic) so it can evolve to support other backends (S3, GCS, SFTP, local filesystem) without core changes. For now, ship only Azure Blob. 
diff --git a/durabletask/extensions/history_export/__init__.py b/durabletask/extensions/history_export/__init__.py index 2214e9d7..b7fdf0bf 100644 --- a/durabletask/extensions/history_export/__init__.py +++ b/durabletask/extensions/history_export/__init__.py @@ -27,6 +27,7 @@ ExportHistoryClient, ExportHistoryJobClient, ) +from durabletask.extensions.history_export.entity import ExportJobEntity from durabletask.extensions.history_export.exceptions import ( ExportJobError, ExportJobInvalidTransitionError, @@ -65,6 +66,7 @@ "ExportJobConfiguration", "ExportJobCreationOptions", "ExportJobDescription", + "ExportJobEntity", "ExportJobError", "ExportJobInvalidTransitionError", "ExportJobNotFoundError", @@ -76,5 +78,3 @@ "HistoryWriter", "orchestrator_instance_id_for", ] - -PACKAGE_NAME = "durabletask.extensions.history_export" diff --git a/durabletask/extensions/history_export/activities.py b/durabletask/extensions/history_export/activities.py index 0e33ab80..145f9607 100644 --- a/durabletask/extensions/history_export/activities.py +++ b/durabletask/extensions/history_export/activities.py @@ -68,8 +68,25 @@ class HistoryExportContext: def bind_context(context: HistoryExportContext) -> None: - """Install the runtime dependencies for the history-export activities.""" + """Install the runtime dependencies for the history-export activities. + + The bound context is process-wide. Calling this more than once in + the same process — for example by constructing two + :class:`ExportHistoryClient` instances with different writers — + silently replaces the previously-bound writer for *all* in-flight + activities. Such a rebind emits a logger warning so the + misconfiguration is visible at runtime. 
+ """ global _context + if _context is not None and _context is not context: + from durabletask.extensions.history_export._logging import logger + logger.warning( + "history_export.bind_context() replacing an existing bound " + "context (writer=%r); only one writer can be active per process. " + "Run a separate worker process per writer if you need multiple " + "destinations.", + type(context.writer).__name__, + ) _context = context diff --git a/durabletask/extensions/history_export/azure_blob.py b/durabletask/extensions/history_export/azure_blob.py index 992d1b37..11641641 100644 --- a/durabletask/extensions/history_export/azure_blob.py +++ b/durabletask/extensions/history_export/azure_blob.py @@ -53,6 +53,11 @@ class AzureBlobHistoryExportWriterOptions: (useful for Azurite compatibility). create_container_if_not_exists: When ``True`` (the default), ensure the container exists on the first write. + overwrite: When ``True`` (the default), each blob upload + replaces any existing blob of the same name. Set to + ``False`` for compliance setups that require + write-once / immutable exports; in that mode the writer + raises if a blob already exists at the target path. """ container_name: str @@ -61,6 +66,7 @@ class AzureBlobHistoryExportWriterOptions: credential: Any = field(default=None, repr=False) api_version: str | None = None create_container_if_not_exists: bool = True + overwrite: bool = True def __post_init__(self) -> None: if not self.container_name: @@ -155,7 +161,7 @@ def write( container_client.upload_blob( name=blob_name, data=payload, - overwrite=True, + overwrite=self._options.overwrite, content_settings=content_settings, ) diff --git a/durabletask/extensions/history_export/client.py b/durabletask/extensions/history_export/client.py index d41c5590..56ffb53e 100644 --- a/durabletask/extensions/history_export/client.py +++ b/durabletask/extensions/history_export/client.py @@ -145,7 +145,7 @@ def create_job( safely re-create a previously-terminated job. 
""" config = options.to_configuration() - resolved_job_id = job_id or options.job_id or uuid.uuid4().hex + resolved_job_id = job_id or uuid.uuid4().hex entity_id = entities.EntityInstanceId(ENTITY_NAME, resolved_job_id) created_at = datetime.now(timezone.utc) config_dict = config.to_dict() @@ -156,14 +156,13 @@ def create_job( # signals are processed in FIFO order by the entity dispatcher. self._client.signal_entity( entity_id, - "create", + ExportJobEntity.OP_CREATE, input={ "config": config_dict, "created_at": created_at.isoformat(), }, ) - self._client.signal_entity(entity_id, "run") - + self._client.signal_entity(entity_id, ExportJobEntity.OP_RUN) logger.info( "Submitted export job %r; orchestrator instance ID will be %s", resolved_job_id, orchestrator_instance_id_for(resolved_job_id), @@ -184,7 +183,14 @@ def create_job( ) def get_job(self, job_id: str) -> ExportJobDescription | None: - """Look up an export job by ID. Returns ``None`` if not found.""" + """Look up an export job by ID. Returns ``None`` if not found. + + Note that the lookup-miss contract differs from + :meth:`wait_for_job`: ``get_job`` is a passive read that + returns ``None`` when the entity does not exist, while + ``wait_for_job`` raises :class:`ExportJobNotFoundError` after + its timeout if the entity never appears. 
+ """ entity_id = entities.EntityInstanceId(ENTITY_NAME, job_id) meta = self._client.get_entity(entity_id, include_state=True) if meta is None: @@ -235,18 +241,35 @@ def list_jobs( continue raw = meta.get_state(str) if not raw: + logger.warning( + "list_jobs: skipping export-job entity %r with no " + "persisted state", meta.id.key, + ) continue try: state = json.loads(raw) - except (TypeError, ValueError): + except (TypeError, ValueError) as ex: + logger.warning( + "list_jobs: skipping export-job entity %r; failed to " + "parse state JSON (%s)", meta.id.key, ex, + ) continue if not isinstance(state, dict): + logger.warning( + "list_jobs: skipping export-job entity %r; persisted " + "state is not a JSON object (got %s)", + meta.id.key, type(state).__name__, + ) continue try: desc = ExportJobDescription.from_state_dict( meta.id.key, cast("dict[str, Any]", state), ) - except (KeyError, ValueError): + except (KeyError, ValueError) as ex: + logger.warning( + "list_jobs: skipping export-job entity %r; state did " + "not match the current schema (%s)", meta.id.key, ex, + ) continue if status_filter is not None and desc.status not in status_filter: continue @@ -289,7 +312,13 @@ def wait_for_job( time.sleep(poll_interval) def delete_job(self, job_id: str) -> None: - """Delete the export-job entity, clearing its state. + """Request deletion of the export-job entity, clearing its state. + + This call is **best-effort and fire-and-forget**: it enqueues a + ``delete`` signal on the entity but does not wait for the + entity dispatcher to process it. Callers that need + confirmation should poll :meth:`get_job` and wait for it to + return ``None``. The driving orchestrator will detect the deletion at its next loop iteration (via :meth:`OrchestrationContext.call_entity`) @@ -298,7 +327,17 @@ def delete_job(self, job_id: str) -> None: This does NOT delete blobs already written to the destination. 
""" entity_id = entities.EntityInstanceId(ENTITY_NAME, job_id) - self._client.signal_entity(entity_id, "delete") + self._client.signal_entity(entity_id, ExportJobEntity.OP_DELETE) + + def cancel_job(self, job_id: str) -> None: + """Alias for :meth:`delete_job`. + + ``CONTINUOUS`` mode has no natural completion, so users + looking to stop a tailing export are likely to look for + ``cancel_job`` rather than ``delete_job``. Provided as a thin + alias to make either name discoverable. + """ + self.delete_job(job_id) # ------------------------------------------------------------------ # Convenience diff --git a/durabletask/extensions/history_export/entity.py b/durabletask/extensions/history_export/entity.py index 99057bd6..23e71a7c 100644 --- a/durabletask/extensions/history_export/entity.py +++ b/durabletask/extensions/history_export/entity.py @@ -92,6 +92,21 @@ def _summarize_failures(failures: list[ExportFailure], *, limit: int = 10) -> st class ExportJobEntity(entities.DurableEntity): """Durable entity that owns the lifecycle state of one export job.""" + # ----- operation names ------------------------------------------ + # + # Single source of truth for the wire-level entity operation + # names. Clients, the orchestrator, and the transitions matrix + # all import these so a typo in any one call site is impossible. + # Mirrors the .NET ``nameof(this.Create)`` pattern. 
+ + OP_CREATE = "create" + OP_GET = "get" + OP_RUN = "run" + OP_COMMIT_CHECKPOINT = "commit_checkpoint" + OP_MARK_COMPLETED = "mark_completed" + OP_MARK_FAILED = "mark_failed" + OP_DELETE = "delete" + # ----- state helpers -------------------------------------------- def _load(self) -> ExportJobState | None: @@ -123,18 +138,32 @@ def create(self, payload: Mapping[str, Any]) -> dict[str, Any]: job_id = self._job_id() current = self._current_status() assert_valid_transition( - "create", current, ExportJobStatus.PENDING, job_id=job_id, + self.OP_CREATE, current, ExportJobStatus.PENDING, job_id=job_id, ) config_dict = payload.get("config") - if not config_dict: + if config_dict is None: raise ValueError("create payload requires 'config'") - config = ExportJobConfiguration.from_dict(config_dict) + if not isinstance(config_dict, Mapping) or not config_dict: + raise ValueError( + "create payload 'config' must be a non-empty mapping" + ) + config = ExportJobConfiguration.from_dict( + cast("Mapping[str, Any]", config_dict), + ) created_at_raw = payload.get("created_at") created_at = dt_from_iso(created_at_raw) if created_at_raw else _utcnow() assert created_at is not None + # Reviving a terminal job (COMPLETED / FAILED) constructs a + # *fresh* ExportJobState here. That intentionally resets + # every progress field — ``scanned_instances``, + # ``exported_instances``, ``failed_instances``, + # ``checkpoint.last_instance_key``, ``last_checkpoint_time``, + # ``last_error``, and the accumulated ``failures`` list. + # Matches the .NET ``ExportJob.Create`` revive semantics so a + # re-created job starts from a clean slate. 
state = ExportJobState( status=ExportJobStatus.PENDING, config=config, @@ -143,6 +172,7 @@ def create(self, payload: Mapping[str, Any]) -> dict[str, Any]: ) logger.info( "Created export job %r in status %s", job_id, state.status.value, + extra={"job_id": job_id, "operation": "create"}, ) return self._save(state) @@ -156,7 +186,7 @@ def run(self, _: Any = None) -> dict[str, Any] | None: raise ValueError("Cannot run uninitialized export job") job_id = self._job_id() assert_valid_transition( - "run", state.status, ExportJobStatus.ACTIVE, job_id=job_id, + self.OP_RUN, state.status, ExportJobStatus.ACTIVE, job_id=job_id, ) # The entity itself schedules the driving orchestrator. The @@ -174,17 +204,25 @@ def run(self, _: Any = None) -> dict[str, Any] | None: logger.info( "Scheduled orchestrator %s for job %r with instance ID %s", ORCHESTRATOR_NAME, job_id, instance_id, + extra={"job_id": job_id, "operation": "run"}, ) except Exception as ex: # noqa: BLE001 + # Mirror the .NET ExportJob.StartExportOrchestration pattern: + # record the failure on persisted state and return, rather + # than re-raising. Re-raising inside an entity operation + # can cause some entity backends to discard the in-flight + # state mutations, leaving the job stuck in PENDING with no + # error recorded. Returning ensures FAILED + last_error + # actually persist. 
state.status = ExportJobStatus.FAILED state.last_error = ( f"Failed to schedule orchestrator: {type(ex).__name__}: {ex}" ) logger.exception( "Failed to schedule orchestrator for export job %r", job_id, + extra={"job_id": job_id, "operation": "run"}, ) - self._save(state) - raise + return self._save(state) state.status = ExportJobStatus.ACTIVE state.last_error = None @@ -210,7 +248,7 @@ def commit_checkpoint(self, payload: Mapping[str, Any]) -> dict[str, Any] | None will_fail = bool(payload.get("mark_failed_on_batch")) and bool(new_failures) target = ExportJobStatus.FAILED if will_fail else ExportJobStatus.ACTIVE assert_valid_transition( - "commit_checkpoint", state.status, target, job_id=job_id, + self.OP_COMMIT_CHECKPOINT, state.status, target, job_id=job_id, ) state.scanned_instances += scanned_delta @@ -240,6 +278,7 @@ def commit_checkpoint(self, payload: Mapping[str, Any]) -> dict[str, Any] | None logger.warning( "Export job %r marked FAILED after batch retries (%d failures)", job_id, len(new_failures), + extra={"job_id": job_id, "operation": "commit_checkpoint"}, ) return self._save(state) @@ -250,12 +289,15 @@ def mark_completed(self, _: Any = None) -> dict[str, Any] | None: raise ValueError("Cannot mark_completed on uninitialized export job") job_id = self._job_id() assert_valid_transition( - "mark_completed", state.status, ExportJobStatus.COMPLETED, + self.OP_MARK_COMPLETED, state.status, ExportJobStatus.COMPLETED, job_id=job_id, ) state.status = ExportJobStatus.COMPLETED state.last_error = None - logger.info("Export job %r marked COMPLETED", job_id) + logger.info( + "Export job %r marked COMPLETED", job_id, + extra={"job_id": job_id, "operation": "mark_completed"}, + ) return self._save(state) def mark_failed( @@ -266,21 +308,28 @@ def mark_failed( raise ValueError("Cannot mark_failed on uninitialized export job") job_id = self._job_id() assert_valid_transition( - "mark_failed", state.status, ExportJobStatus.FAILED, job_id=job_id, + self.OP_MARK_FAILED, 
state.status, ExportJobStatus.FAILED, job_id=job_id, ) reason = "" if payload is not None: reason = str(payload.get("reason", "")) state.status = ExportJobStatus.FAILED state.last_error = reason or None - logger.info("Export job %r marked FAILED: %s", job_id, reason or "(no reason)") + logger.info( + "Export job %r marked FAILED: %s", job_id, reason or "(no reason)", + extra={"job_id": job_id, "operation": "mark_failed"}, + ) return self._save(state) def delete(self, _: Any = None) -> None: # type: ignore[override] # The base class's delete() calls set_state(None) which is # exactly what we want for export-job cleanup. ``delete`` is # always valid regardless of current status. - logger.info("Export job %r deleted", self._job_id()) + job_id = self._job_id() + logger.info( + "Export job %r deleted", job_id, + extra={"job_id": job_id, "operation": "delete"}, + ) super().delete() diff --git a/durabletask/extensions/history_export/models.py b/durabletask/extensions/history_export/models.py index 614a11ff..f983585d 100644 --- a/durabletask/extensions/history_export/models.py +++ b/durabletask/extensions/history_export/models.py @@ -94,6 +94,29 @@ class ExportJobStatus(Enum): ] +def _parse_runtime_status(value: Any) -> OrchestrationStatus: + """Parse a runtime status from its persisted representation. + + Accepts both the current wire format (``.value`` — the protobuf + integer) and the legacy schema-1.0 format (``.name`` — the enum + constant name). Renaming an enum constant in the core SDK is + therefore non-breaking for persisted state. + """ + if isinstance(value, OrchestrationStatus): + return value + if isinstance(value, int): + return OrchestrationStatus(value) + if isinstance(value, str): + # Try integer-as-string first, then fall back to enum name. 
+ try: + return OrchestrationStatus(int(value)) + except (TypeError, ValueError): + return OrchestrationStatus[value] + raise TypeError( + f"Cannot parse runtime status from value of type {type(value).__name__!r}" + ) + + # ---------------------------------------------------------------------- # Configuration dataclasses # ---------------------------------------------------------------------- @@ -174,8 +197,11 @@ def to_dict(self) -> dict[str, Any]: return { "completed_time_from": dt_to_iso(self.completed_time_from), "completed_time_to": dt_to_iso(self.completed_time_to), + # Persist by ``.value`` (the protobuf integer) rather than + # ``.name`` so renaming an enum constant in the core SDK + # does not break previously-persisted job state. "runtime_status": ( - [s.name for s in self.runtime_status] + [s.value for s in self.runtime_status] if self.runtime_status is not None else None ), @@ -191,7 +217,7 @@ def from_dict(cls, data: Mapping[str, Any]) -> "ExportFilter": completed_time_from=completed_from, completed_time_to=dt_from_iso(data.get("completed_time_to")), runtime_status=( - [OrchestrationStatus[name] for name in statuses] + [_parse_runtime_status(s) for s in statuses] if statuses is not None else None ), @@ -259,14 +285,51 @@ class ExportJobConfiguration: max_parallel_exports: int = 32 def __post_init__(self) -> None: - if self.max_instances_per_batch <= 0: - raise ValueError("max_instances_per_batch must be positive") + # Bounds on batch sizing. Upper bound matches the .NET + # ``ExportJobCreationOptions`` cap to avoid runaway page sizes. + if not 1 <= self.max_instances_per_batch <= 1000: + raise ValueError( + "max_instances_per_batch must be in [1, 1000]; got " + f"{self.max_instances_per_batch}" + ) if self.max_parallel_exports <= 0: raise ValueError("max_parallel_exports must be positive") - if self.mode == ExportMode.BATCH and self.filter.completed_time_to is None: + + # Mode-specific filter validation. 
+ if self.mode is ExportMode.BATCH and self.filter.completed_time_to is None: raise ValueError( "completed_time_to is required for batch mode exports" ) + if self.mode is ExportMode.CONTINUOUS and self.filter.completed_time_to is not None: + raise ValueError( + "completed_time_to is not allowed for continuous mode " + "exports; the tail has no upper bound" + ) + + # Window must be a strictly-increasing range when both ends + # are set. Catches upside-down windows early. + if ( + self.filter.completed_time_to is not None + and self.filter.completed_time_to <= self.filter.completed_time_from + ): + raise ValueError( + "completed_time_to must be strictly greater than " + "completed_time_from" + ) + + # Only terminal statuses make sense for export. Match the .NET + # validation set. + if self.filter.runtime_status is not None: + disallowed = [ + s for s in self.filter.runtime_status + if s not in _DEFAULT_TERMINAL_STATUSES + ] + if disallowed: + names = ", ".join(sorted(s.name for s in disallowed)) + raise ValueError( + f"runtime_status may only contain terminal statuses " + f"({{COMPLETED, FAILED, TERMINATED}}); got {names}" + ) def to_dict(self) -> dict[str, Any]: return { @@ -280,11 +343,18 @@ def to_dict(self) -> dict[str, Any]: @classmethod def from_dict(cls, data: Mapping[str, Any]) -> "ExportJobConfiguration": + # Use an explicit ``None`` check (rather than ``or``) so that an + # empty ``format`` dict still goes through ``from_dict`` and + # raises a clear KeyError, instead of silently being replaced + # by the default. 
+ format_data = data.get("format") + if format_data is None: + format_data = {"kind": ExportFormatKind.JSONL_GZIP.value} return cls( mode=ExportMode(data["mode"]), filter=ExportFilter.from_dict(data["filter"]), destination=ExportDestination.from_dict(data["destination"]), - format=ExportFormat.from_dict(data.get("format") or {"kind": ExportFormatKind.JSONL_GZIP.value}), + format=ExportFormat.from_dict(format_data), max_instances_per_batch=int(data.get("max_instances_per_batch", 100)), max_parallel_exports=int(data.get("max_parallel_exports", 32)), ) @@ -313,7 +383,15 @@ class ExportJobQuery: @dataclass class ExportJobCreationOptions: - """User-supplied options for creating a new export job.""" + """User-supplied options for creating a new export job. + + The job ID is **not** an attribute here; pass it explicitly to + :meth:`ExportHistoryClient.create_job` via the ``job_id`` kwarg, + or let the client auto-generate one. Keeping the ID separate from + the configuration avoids the .NET API's awkward duplication where + both ``options.JobId`` and a constructor argument could specify + the same field. + """ mode: ExportMode completed_time_from: datetime @@ -321,7 +399,6 @@ class ExportJobCreationOptions: completed_time_to: datetime | None = None runtime_status: list[OrchestrationStatus] | None = None format: ExportFormat = field(default_factory=ExportFormat) - job_id: str | None = None max_instances_per_batch: int = 100 max_parallel_exports: int = 32 @@ -357,11 +434,22 @@ def to_configuration(self) -> ExportJobConfiguration: # replaced with a registry keyed by ``(entity_name, schema_version)`` without # changing the on-disk shape. -STATE_SCHEMA_VERSION = "1.0" +STATE_SCHEMA_VERSION = "1.1" """The schema version emitted by :meth:`ExportJobState.to_dict`. Increment this when the persisted shape changes in a non-backward-compatible way and add a new branch in :meth:`ExportJobState.from_dict`. + +Version history: + +``"1.0"`` + Initial shape. 
``runtime_status`` filter values were persisted as + enum *names* (e.g. ``"COMPLETED"``), which broke if the core SDK + renamed an enum constant. Read support retained. +``"1.1"`` + ``runtime_status`` filter values are persisted as the protobuf + enum *integer* (e.g. ``2`` for ``COMPLETED``). Reads still accept + the legacy 1.0 string form for backward compatibility. """ @@ -414,10 +502,10 @@ def to_dict(self) -> dict[str, Any]: @classmethod def from_dict(cls, data: Mapping[str, Any]) -> "ExportJobState": version = data.get("schema_version", "1.0") - if version != STATE_SCHEMA_VERSION: + if version not in {"1.0", "1.1"}: raise ValueError( f"Unsupported export job state schema_version={version!r}; " - f"expected {STATE_SCHEMA_VERSION!r}" + f"expected one of: '1.0', '1.1' (current: {STATE_SCHEMA_VERSION!r})" ) config_data = data.get("config") diff --git a/durabletask/extensions/history_export/orchestrator.py b/durabletask/extensions/history_export/orchestrator.py index eb2846cd..2c876b76 100644 --- a/durabletask/extensions/history_export/orchestrator.py +++ b/durabletask/extensions/history_export/orchestrator.py @@ -47,6 +47,7 @@ LIST_TERMINAL_INSTANCES_ACTIVITY, build_list_activity_input, ) +from durabletask.extensions.history_export.entity import ExportJobEntity from durabletask.extensions.history_export.models import ( ExportJobConfiguration, ExportJobStatus, @@ -162,7 +163,7 @@ def export_job_orchestrator( # lets external state changes (delete, mark_failed) cancel # the orchestrator without us having to drain a backlog. 
current_state: dict[str, Any] | None = ( - yield ctx.call_entity(entity_id, "get") + yield ctx.call_entity(entity_id, ExportJobEntity.OP_GET) ) if current_state is None: logger.info( @@ -200,57 +201,95 @@ def export_job_orchestrator( failed_delta = 0 batch_failures: list[dict[str, Any]] = [] - if instance_ids: - batch_succeeded = False - results: list[_ExportActivityResult] = [] - for attempt in range(1, MAX_BATCH_RETRY_ATTEMPTS + 1): - results = yield from _run_page( - ctx, - instance_ids=instance_ids, - config=config, - max_parallel=config.max_parallel_exports, + # Empty page handling matches the .NET ExportJobOrchestrator: + # CONTINUOUS sleeps and re-polls, BATCH exits cleanly even + # if the backend returned a non-null continuation token. + # This guards against backends that legally return an empty + # page with a token (the orchestrator would otherwise spin + # forever in BATCH mode emitting no-op commit_checkpoints). + if not instance_ids: + if config.mode is ExportMode.CONTINUOUS: + yield ctx.create_timer( + ctx.current_utc_datetime + _continuous_idle_delay() ) - failed_results = [r for r in results if not r.get("success")] - if not failed_results: - batch_succeeded = True - break - if attempt < MAX_BATCH_RETRY_ATTEMPTS: - delay = _batch_retry_delay(attempt) - yield ctx.create_timer(ctx.current_utc_datetime + delay) - - exported_delta = sum(1 for r in results if r.get("success")) - failed_delta = sum(1 for r in results if not r.get("success")) - batch_failures = [ + continuation_token = None + continue + ctx.signal_entity( + entity_id, + ExportJobEntity.OP_COMMIT_CHECKPOINT, { - "instance_id": r["instance_id"], - "reason": r.get("error") or "Unknown error", - "attempt_count": MAX_BATCH_RETRY_ATTEMPTS, - "last_attempt": ctx.current_utc_datetime.isoformat(), - } - for r in results - if not r.get("success") - ] - - if not batch_succeeded: - ctx.signal_entity( - entity_id, - "commit_checkpoint", - { - "scanned_delta": 0, - "exported_delta": 0, - 
"failed_delta": failed_delta, - "failures": batch_failures, - "mark_failed_on_batch": True, - }, - ) - totals["scanned"] += scanned_delta - totals["exported"] += exported_delta - totals["failed"] += failed_delta - raise RuntimeError( - f"Export job '{job_id}' batch failed after " - f"{MAX_BATCH_RETRY_ATTEMPTS} attempts; " - f"{failed_delta} instances could not be exported." - ) + "scanned_delta": 0, + "exported_delta": 0, + "failed_delta": 0, + "last_instance_key": None, + }, + ) + break + + # The page has at least one instance: fan out exports. + batch_succeeded = False + results: list[_ExportActivityResult] = [] + for attempt in range(1, MAX_BATCH_RETRY_ATTEMPTS + 1): + results = yield from _run_page( + ctx, + instance_ids=instance_ids, + config=config, + max_parallel=config.max_parallel_exports, + ) + failed_results = [r for r in results if not r.get("success")] + if not failed_results: + batch_succeeded = True + break + if attempt < MAX_BATCH_RETRY_ATTEMPTS: + delay = _batch_retry_delay(attempt) + yield ctx.create_timer(ctx.current_utc_datetime + delay) + + exported_delta = sum(1 for r in results if r.get("success")) + failed_delta = sum(1 for r in results if not r.get("success")) + batch_failures = [ + { + "instance_id": r["instance_id"], + "reason": r.get("error") or "Unknown error", + "attempt_count": MAX_BATCH_RETRY_ATTEMPTS, + "last_attempt": ctx.current_utc_datetime.isoformat(), + } + for r in results + if not r.get("success") + ] + + if not batch_succeeded: + ctx.signal_entity( + entity_id, + ExportJobEntity.OP_COMMIT_CHECKPOINT, + { + "scanned_delta": 0, + "exported_delta": 0, + "failed_delta": failed_delta, + "failures": batch_failures, + "mark_failed_on_batch": True, + }, + ) + totals["scanned"] += scanned_delta + totals["exported"] += exported_delta + totals["failed"] += failed_delta + # The entity already transitioned to FAILED via + # the commit_checkpoint signal above; returning + # cleanly avoids the outer ``except`` issuing a + # second 
mark_failed signal which the transitions + # matrix would reject (the entity is no longer + # ACTIVE). Surfacing the cause is the caller's + # responsibility via :meth:`ExportHistoryClient.get_job`, + # whose ``last_error`` carries the failure summary. + logger.warning( + "Export job %r marked FAILED after %d batch attempts; " + "%d instances failed to export", + job_id, MAX_BATCH_RETRY_ATTEMPTS, failed_delta, + ) + return { + "job_id": job_id, + "status": ExportJobStatus.FAILED.value, + "totals": totals, + } next_token_raw = page.get("continuation_token") next_token: str | None = ( @@ -258,7 +297,7 @@ def export_job_orchestrator( ) ctx.signal_entity( entity_id, - "commit_checkpoint", + ExportJobEntity.OP_COMMIT_CHECKPOINT, { "scanned_delta": scanned_delta, "exported_delta": exported_delta, @@ -284,13 +323,13 @@ def export_job_orchestrator( continuation_token = next_token # Reaching here means BATCH mode finished its window cleanly. - ctx.signal_entity(entity_id, "mark_completed") + ctx.signal_entity(entity_id, ExportJobEntity.OP_MARK_COMPLETED) return {"job_id": job_id, "status": "Completed", "totals": totals} except Exception as ex: # noqa: BLE001 - reported back via mark_failed ctx.signal_entity( entity_id, - "mark_failed", + ExportJobEntity.OP_MARK_FAILED, {"reason": f"{type(ex).__name__}: {ex}"}, ) raise diff --git a/examples/history_export/README.md b/examples/history_export/README.md index e250bc8d..d6d99f5b 100644 --- a/examples/history_export/README.md +++ b/examples/history_export/README.md @@ -35,4 +35,5 @@ The script: > [!TIP] > Set `STORAGE_CONNECTION_STRING` to point at a real Azure Storage -> account instead of Azurite. +> account instead of Azurite. Set `EXPORT_CONTAINER` to override the +> default destination container name (`history-export-sample`). 
diff --git a/examples/history_export/app.py b/examples/history_export/app.py index c0d72995..fd6c365e 100644 --- a/examples/history_export/app.py +++ b/examples/history_export/app.py @@ -75,52 +75,52 @@ def main() -> None: api_version="2024-08-04", ) ) - - dt_client = client.TaskHubGrpcClient(host_address=HOST) - export_client = ExportHistoryClient(dt_client, writer) - - with worker.TaskHubGrpcWorker(host_address=HOST) as w: - # Register the workload orchestrator and activity. - w.add_orchestrator(sample_orchestrator) - w.add_activity(square) - - # Register the export-job entity, activities, and orchestrator. - export_client.register_worker(w) - w.start() - - # Seed some terminal instances to export. - print("\nSeeding sample orchestrations...") - for n in range(1, 6): - sid = dt_client.schedule_new_orchestration(sample_orchestrator, input=n) - state = dt_client.wait_for_orchestration_completion(sid, timeout=30) - assert state and state.runtime_status == client.OrchestrationStatus.COMPLETED - time.sleep(0.5) - - # Create an export job for the seeded window. - now = datetime.now(timezone.utc) - print("\nCreating export job...") - desc = export_client.create_job( - ExportJobCreationOptions( - mode=ExportMode.BATCH, - completed_time_from=now - timedelta(hours=1), - completed_time_to=now + timedelta(hours=1), - destination=ExportDestination(container=CONTAINER_NAME, prefix="sample-run"), - format=ExportFormat(kind=ExportFormatKind.JSONL_GZIP), - max_instances_per_batch=10, + try: + dt_client = client.TaskHubGrpcClient(host_address=HOST) + export_client = ExportHistoryClient(dt_client, writer) + + with worker.TaskHubGrpcWorker(host_address=HOST) as w: + # Register the workload orchestrator and activity. + w.add_orchestrator(sample_orchestrator) + w.add_activity(square) + + # Register the export-job entity, activities, and orchestrator. + export_client.register_worker(w) + w.start() + + # Seed some terminal instances to export. 
+ print("\nSeeding sample orchestrations...") + for n in range(1, 6): + sid = dt_client.schedule_new_orchestration(sample_orchestrator, input=n) + state = dt_client.wait_for_orchestration_completion(sid, timeout=30) + assert state and state.runtime_status == client.OrchestrationStatus.COMPLETED + time.sleep(0.5) + + # Create an export job for the seeded window. + now = datetime.now(timezone.utc) + print("\nCreating export job...") + desc = export_client.create_job( + ExportJobCreationOptions( + mode=ExportMode.BATCH, + completed_time_from=now - timedelta(hours=1), + completed_time_to=now + timedelta(hours=1), + destination=ExportDestination(container=CONTAINER_NAME, prefix="sample-run"), + format=ExportFormat(kind=ExportFormatKind.JSONL_GZIP), + max_instances_per_batch=10, + ) ) - ) - print(f" job_id: {desc.job_id}") - print(f" orchestrator_instance_id: {desc.orchestrator_instance_id}") - - final = export_client.wait_for_job(desc.job_id, timeout=120, poll_interval=0.5) - print("\nFinal job status:") - print(f" status: {final.status.value}") - print(f" scanned_instances: {final.scanned_instances}") - print(f" exported_instances: {final.exported_instances}") - print(f" failed_instances: {final.failed_instances}") - if final.last_error: - print(f" last_error: {final.last_error}") - + print(f" job_id: {desc.job_id}") + print(f" orchestrator_instance_id: {desc.orchestrator_instance_id}") + + final = export_client.wait_for_job(desc.job_id, timeout=120, poll_interval=0.5) + print("\nFinal job status:") + print(f" status: {final.status.value}") + print(f" scanned_instances: {final.scanned_instances}") + print(f" exported_instances: {final.exported_instances}") + print(f" failed_instances: {final.failed_instances}") + if final.last_error: + print(f" last_error: {final.last_error}") + finally: writer.close() finally: backend.stop() diff --git a/pyproject.toml b/pyproject.toml index 98b581f1..c235792c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ 
azure-blob-payloads = [ "azure-storage-blob[aio]>=12.0.0" ] history-export-azure = [ - "azure-storage-blob>=12.0.0" + "azure-storage-blob[aio]>=12.0.0" ] [project.urls] diff --git a/tests/durabletask/extensions/history_export/test_entity.py b/tests/durabletask/extensions/history_export/test_entity.py index e359ce67..fb9991ae 100644 --- a/tests/durabletask/extensions/history_export/test_entity.py +++ b/tests/durabletask/extensions/history_export/test_entity.py @@ -144,7 +144,7 @@ def test_create_persists_pending_status(c) -> None: c.signal_entity(entity_id, "create", input=_create_payload()) state = _wait_for_status(c, entity_id, ExportJobStatus.PENDING) - assert state["schema_version"] == "1.0" + assert state["schema_version"] == "1.1" assert state["status"] == ExportJobStatus.PENDING.value assert state["orchestrator_instance_id"] is None assert state["config"]["destination"]["container"] == "exports" @@ -189,16 +189,63 @@ def test_create_on_active_job_is_rejected_and_state_unchanged(c) -> None: def test_create_after_failure_resets_to_pending(c) -> None: + """Reviving a terminal job rewinds every progress field. + + Matches the .NET ``ExportJob.Create`` revive semantics: counters, + checkpoint, ``last_checkpoint_time``, ``last_error``, and the + accumulated ``failures`` list are all reset to a clean slate. + """ entity_id = entities.EntityInstanceId(ENTITY_NAME, "job-1d") c.signal_entity(entity_id, "create", input=_create_payload()) c.signal_entity(entity_id, "run") _wait_for_status(c, entity_id, ExportJobStatus.ACTIVE) + # Apply enough progress and a failure so revival has something to + # actually reset. 
+ c.signal_entity( + entity_id, + "commit_checkpoint", + input={ + "scanned_delta": 12, + "exported_delta": 9, + "failed_delta": 3, + "last_instance_key": "ts|inst-12", + "failures": [ + { + "instance_id": "inst-z", + "reason": "timeout", + "attempt_count": 3, + "last_attempt": "2026-01-01T00:00:00+00:00", + }, + ], + }, + ) + pre_revive = _wait_for_state( + c, entity_id, + lambda s: s.get("scanned_instances") == 12, + description="progress to land before revival", + ) + assert pre_revive["checkpoint"]["last_instance_key"] == "ts|inst-12" + assert pre_revive["last_checkpoint_time"] is not None + assert len(pre_revive["failures"]) == 1 + c.signal_entity(entity_id, "mark_failed", input={"reason": "boom"}) - _wait_for_status(c, entity_id, ExportJobStatus.FAILED) + failed = _wait_for_status(c, entity_id, ExportJobStatus.FAILED) + assert failed["last_error"] == "boom" + # Revive: every progress field should reset. c.signal_entity(entity_id, "create", input=_create_payload()) - _wait_for_status(c, entity_id, ExportJobStatus.PENDING) + revived = _wait_for_status(c, entity_id, ExportJobStatus.PENDING) + assert revived["scanned_instances"] == 0 + assert revived["exported_instances"] == 0 + assert revived["failed_instances"] == 0 + assert revived["checkpoint"]["last_instance_key"] is None + assert revived["last_checkpoint_time"] is None + assert revived["last_error"] is None + assert revived["failures"] == [] + # The orchestrator instance ID is also re-derived by ``run``; + # ``create`` itself does not pre-populate it. 
+ assert revived["orchestrator_instance_id"] is None def test_commit_checkpoint_requires_active_status(c) -> None: diff --git a/tests/durabletask/extensions/history_export/test_models.py b/tests/durabletask/extensions/history_export/test_models.py index a05d2d05..ae9e57f0 100644 --- a/tests/durabletask/extensions/history_export/test_models.py +++ b/tests/durabletask/extensions/history_export/test_models.py @@ -92,6 +92,74 @@ def test_max_parallel_exports_must_be_positive(self) -> None: max_parallel_exports=0, ) + def test_batch_size_must_be_within_bounds(self) -> None: + # Lower bound: zero is rejected. + with pytest.raises(ValueError, match="max_instances_per_batch"): + ExportJobConfiguration( + mode=ExportMode.BATCH, + filter=ExportFilter( + completed_time_from=_WINDOW_START, + completed_time_to=_WINDOW_END, + ), + destination=_basic_destination(), + max_instances_per_batch=0, + ) + # Upper bound: 1001 is rejected (matches .NET cap). + with pytest.raises(ValueError, match="max_instances_per_batch"): + ExportJobConfiguration( + mode=ExportMode.BATCH, + filter=ExportFilter( + completed_time_from=_WINDOW_START, + completed_time_to=_WINDOW_END, + ), + destination=_basic_destination(), + max_instances_per_batch=1001, + ) + + def test_continuous_mode_rejects_completed_time_to(self) -> None: + with pytest.raises(ValueError, match="continuous mode"): + ExportJobConfiguration( + mode=ExportMode.CONTINUOUS, + filter=ExportFilter( + completed_time_from=_WINDOW_START, + completed_time_to=_WINDOW_END, + ), + destination=_basic_destination(), + ) + + def test_window_must_be_strictly_increasing(self) -> None: + with pytest.raises(ValueError, match="strictly greater"): + ExportJobConfiguration( + mode=ExportMode.BATCH, + filter=ExportFilter( + completed_time_from=_WINDOW_END, + completed_time_to=_WINDOW_START, + ), + destination=_basic_destination(), + ) + # Equal times are also rejected. 
+ with pytest.raises(ValueError, match="strictly greater"): + ExportJobConfiguration( + mode=ExportMode.BATCH, + filter=ExportFilter( + completed_time_from=_WINDOW_START, + completed_time_to=_WINDOW_START, + ), + destination=_basic_destination(), + ) + + def test_runtime_status_rejects_non_terminal(self) -> None: + with pytest.raises(ValueError, match="terminal statuses"): + ExportJobConfiguration( + mode=ExportMode.BATCH, + filter=ExportFilter( + completed_time_from=_WINDOW_START, + completed_time_to=_WINDOW_END, + runtime_status=[OrchestrationStatus.RUNNING], + ), + destination=_basic_destination(), + ) + class TestFilterDefaults: def test_default_runtime_statuses(self) -> None: @@ -207,3 +275,54 @@ def test_state_carries_failures(self) -> None: state.failures.append(f) restored = ExportJobState.from_dict(state.to_dict()) assert restored.failures == [f] + + def test_runtime_status_persisted_as_protobuf_int(self) -> None: + cfg = ExportJobConfiguration( + mode=ExportMode.BATCH, + filter=ExportFilter( + completed_time_from=_WINDOW_START, + completed_time_to=_WINDOW_END, + runtime_status=[ + OrchestrationStatus.COMPLETED, + OrchestrationStatus.FAILED, + ], + ), + destination=_basic_destination(), + ) + d = cfg.to_dict() + # The on-disk shape carries the wire-stable enum integers + # (.value), not the SDK-internal enum names (.name). + assert d["filter"]["runtime_status"] == [ + OrchestrationStatus.COMPLETED.value, + OrchestrationStatus.FAILED.value, + ] + restored = ExportJobConfiguration.from_dict(d) + assert restored.filter.runtime_status == [ + OrchestrationStatus.COMPLETED, + OrchestrationStatus.FAILED, + ] + + def test_legacy_1_0_runtime_status_names_still_load(self) -> None: + # A persisted state created by schema 1.0 carries enum names + # in ``runtime_status``. The current loader must accept both + # the legacy string form and the current int form. 
+ cfg = ExportJobConfiguration( + mode=ExportMode.BATCH, + filter=ExportFilter( + completed_time_from=_WINDOW_START, + completed_time_to=_WINDOW_END, + runtime_status=[OrchestrationStatus.COMPLETED], + ), + destination=_basic_destination(), + ) + state = ExportJobState.new(cfg, created_at=_WINDOW_END) + legacy = state.to_dict() + legacy["schema_version"] = "1.0" + # Simulate the 1.0 wire shape for runtime_status. + legacy["config"]["filter"]["runtime_status"] = [ + OrchestrationStatus.COMPLETED.name + ] + restored = ExportJobState.from_dict(legacy) + assert restored.config.filter.runtime_status == [ + OrchestrationStatus.COMPLETED + ] diff --git a/tests/durabletask/extensions/history_export/test_orchestrator.py b/tests/durabletask/extensions/history_export/test_orchestrator.py index a9c651ed..57bd27ef 100644 --- a/tests/durabletask/extensions/history_export/test_orchestrator.py +++ b/tests/durabletask/extensions/history_export/test_orchestrator.py @@ -224,3 +224,81 @@ def test_orchestrator_records_failure_when_no_context_bound( client=dt_client, writer=export_client.writer, ) ) + + +class _AlwaysFailingWriter: + """Writer that raises on every call — used to force batch retries to exhaust.""" + + def write(self, *, instance_id, container, blob_name, payload, content_type, content_encoding): + raise RuntimeError("simulated permanent write failure") + + +def test_batch_failure_marks_job_failed_without_invalid_transition( + dt_client, export_client, seeded_ids, caplog, +): + """Exhausting batch retries marks the job FAILED via commit_checkpoint. + + Regression guard for the bug where the orchestrator used to issue + a second ``mark_failed`` signal after ``commit_checkpoint`` had + already driven the entity to FAILED, which the transitions matrix + would reject and log as an invalid-transition error. 
+ """ + from durabletask.extensions.history_export.activities import ( + HistoryExportContext, + bind_context, + ) + # Swap in a permanently-failing writer for this test only; restore + # the original writer in the finally block so the shared module + # fixtures stay consistent. + original_writer = export_client.writer + bind_context(HistoryExportContext(client=dt_client, writer=_AlwaysFailingWriter())) + try: + with caplog.at_level("WARNING", logger="durabletask.extensions.history_export"): + now = datetime.now(timezone.utc) + desc = export_client.create_job( + ExportJobCreationOptions( + mode=ExportMode.BATCH, + completed_time_from=now - timedelta(hours=1), + completed_time_to=now + timedelta(hours=1), + destination=ExportDestination( + container="exports", prefix="batch-fail-test", + ), + format=ExportFormat(kind=ExportFormatKind.JSON), + max_instances_per_batch=10, + ) + ) + # Generous timeout because the orchestrator does 3 batch x + # 3 activity retries against the (overridden, fast) backoff. + final = export_client.wait_for_job(desc.job_id, timeout=60, poll_interval=0.1) + + assert final.status == ExportJobStatus.FAILED + assert final.last_error is not None + # last_error summary mentions the writer's failure reason. + assert "simulated permanent write failure" in (final.last_error or "") + # The failures list is populated and at least one entry + # carries the reason text propagated up from the writer. + # (Each failure's ``instance_id`` is whatever terminal + # orchestration was in the export window — which may include + # prior tests' export orchestrators, not just the seeded + # sample workload. Reasons can also vary if prior writers + # left blobs behind, etc.) 
+ assert len(final.failures) >= 1 + assert any( + "simulated permanent write failure" in f.reason + for f in final.failures + ) + + # Critical regression check: the orchestrator must not have + # issued a second ``mark_failed`` signal after + # ``commit_checkpoint`` already transitioned the entity to + # FAILED. If it had, the entity would have raised + # ExportJobInvalidTransitionError; the SDK logs that into + # caplog at WARNING/ERROR severity. + for record in caplog.records: + assert "ExportJobInvalidTransitionError" not in record.getMessage(), ( + f"Found invalid-transition log: {record.getMessage()!r}" + ) + finally: + bind_context( + HistoryExportContext(client=dt_client, writer=original_writer) + ) From dff8536ff288d0f11e5f6966ff88e39e31d707f5 Mon Sep 17 00:00:00 2001 From: Andy Staples Date: Wed, 3 Jun 2026 12:32:05 -0600 Subject: [PATCH 6/9] Schema back to 1.0, drop PENDING state --- CHANGELOG.md | 2 +- .../extensions/history_export/client.py | 19 ++-- .../extensions/history_export/entity.py | 106 +++++++----------- .../extensions/history_export/models.py | 33 ++---- .../extensions/history_export/transitions.py | 17 +-- .../extensions/history_export/test_client.py | 2 +- .../extensions/history_export/test_entity.py | 54 +++++---- .../extensions/history_export/test_models.py | 20 ++-- .../test_transitions_and_exceptions.py | 42 +++---- 9 files changed, 121 insertions(+), 174 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1991cbf4..797d8b6d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,7 +27,7 @@ ADDED history, and re-fetches entity state at the top of every page loop so external delete or mark-failed signals stop the orchestrator cleanly. Job state lives in a durable entity with an explicit state-transition - matrix (PENDING / ACTIVE / COMPLETED / FAILED); invalid transitions raise + matrix (ACTIVE / COMPLETED / FAILED); invalid transitions raise `ExportJobInvalidTransitionError`. 
Persisted entity state uses a versioned, schema-stable JSON shape (`STATE_SCHEMA_VERSION`) with no embedded Python type metadata. Each export job's driving orchestrator diff --git a/durabletask/extensions/history_export/client.py b/durabletask/extensions/history_export/client.py index 56ffb53e..a248d857 100644 --- a/durabletask/extensions/history_export/client.py +++ b/durabletask/extensions/history_export/client.py @@ -136,11 +136,11 @@ def create_job( ) -> ExportJobDescription: """Create a new export job and start its driving orchestrator. - The entity is created in :attr:`ExportJobStatus.PENDING` and - immediately signalled with ``run``, which schedules the + The entity processes ``create`` by validating the transition, + persisting :attr:`ExportJobStatus.ACTIVE`, and scheduling the driving orchestrator from inside the entity using a deterministic instance ID (``export-job-{job_id}``). This - matches the .NET ``ExportJob.Run`` pattern: callers can + matches the .NET ``ExportJob.Create`` pattern: callers can correlate a job with its orchestrator by ID alone and may safely re-create a previously-terminated job. """ @@ -150,10 +150,10 @@ def create_job( created_at = datetime.now(timezone.utc) config_dict = config.to_dict() - # Signal create first; the entity will validate the transition - # and persist PENDING. Then signal run; the entity will - # schedule the orchestrator and transition to ACTIVE. Both - # signals are processed in FIFO order by the entity dispatcher. + # A single ``create`` signal is enough: the entity validates + # the transition, persists ACTIVE, and schedules the + # orchestrator inline. Mirrors the .NET ``ExportJob.Create`` + # flow. 
self._client.signal_entity( entity_id, ExportJobEntity.OP_CREATE, @@ -162,14 +162,13 @@ def create_job( "created_at": created_at.isoformat(), }, ) - self._client.signal_entity(entity_id, ExportJobEntity.OP_RUN) logger.info( "Submitted export job %r; orchestrator instance ID will be %s", resolved_job_id, orchestrator_instance_id_for(resolved_job_id), ) return ExportJobDescription( job_id=resolved_job_id, - status=ExportJobStatus.PENDING, + status=ExportJobStatus.ACTIVE, created_at=created_at, last_modified_at=created_at, config=config, @@ -285,7 +284,7 @@ def wait_for_job( """Poll until the job reaches a terminal status or *timeout* elapses. Raises: - TimeoutError: If the job is still pending/active after + TimeoutError: If the job is still active after *timeout* seconds. ExportJobNotFoundError: If the job cannot be found at all. """ diff --git a/durabletask/extensions/history_export/entity.py b/durabletask/extensions/history_export/entity.py index 23e71a7c..b5e063c7 100644 --- a/durabletask/extensions/history_export/entity.py +++ b/durabletask/extensions/history_export/entity.py @@ -14,17 +14,16 @@ Operations ---------- ``create`` - Initialise a fresh export job, or reset a terminal job back to - :attr:`ExportJobStatus.PENDING`. Refuses to overwrite an active - job (raises :class:`ExportJobInvalidTransitionError`). + Initialise a fresh export job, or revive a terminal job, by + persisting :attr:`ExportJobStatus.ACTIVE` and scheduling the + driving orchestrator inline (with a deterministic instance ID + derived from the job ID). Refuses to overwrite an active job + (raises :class:`ExportJobInvalidTransitionError`). Mirrors the + .NET ``ExportJob.Create`` flow, so a single signal is enough to + launch a job. ``get`` Returns the persisted state dict, or ``None`` if the entity has not been created (or has been deleted). 
-``run`` - Schedules the driving orchestrator (with a deterministic instance - ID derived from the job ID) and transitions the job to - :attr:`ExportJobStatus.ACTIVE`. Idempotent so the client may - safely signal it more than once. ``commit_checkpoint`` Applies an incremental update after a single export page. When ``mark_failed_on_batch`` is true *and* ``failures`` is non-empty, @@ -101,7 +100,6 @@ class ExportJobEntity(entities.DurableEntity): OP_CREATE = "create" OP_GET = "get" - OP_RUN = "run" OP_COMMIT_CHECKPOINT = "commit_checkpoint" OP_MARK_COMPLETED = "mark_completed" OP_MARK_FAILED = "mark_failed" @@ -138,7 +136,7 @@ def create(self, payload: Mapping[str, Any]) -> dict[str, Any]: job_id = self._job_id() current = self._current_status() assert_valid_transition( - self.OP_CREATE, current, ExportJobStatus.PENDING, job_id=job_id, + self.OP_CREATE, current, ExportJobStatus.ACTIVE, job_id=job_id, ) config_dict = payload.get("config") @@ -165,69 +163,51 @@ def create(self, payload: Mapping[str, Any]) -> dict[str, Any]: # Matches the .NET ``ExportJob.Create`` revive semantics so a # re-created job starts from a clean slate. state = ExportJobState( - status=ExportJobStatus.PENDING, + status=ExportJobStatus.ACTIVE, config=config, created_at=created_at, last_modified_at=created_at, ) - logger.info( - "Created export job %r in status %s", job_id, state.status.value, - extra={"job_id": job_id, "operation": "create"}, - ) + + # The entity itself schedules the driving orchestrator inline, + # so a single ``create`` signal is enough to launch a job. + # Mirrors the .NET ``ExportJob.Create`` -> ``StartExportOrchestration`` + # flow and avoids the client having to send a second ``run`` + # signal (and the failure modes that come with it). 
+ instance_id = orchestrator_instance_id_for(job_id) + try: + self.entity_context.schedule_new_orchestration( + ORCHESTRATOR_NAME, + input={"job_id": job_id, "config": state.config.to_dict()}, + instance_id=instance_id, + ) + state.orchestrator_instance_id = instance_id + logger.info( + "Created export job %r and scheduled orchestrator %s with " + "instance ID %s", + job_id, ORCHESTRATOR_NAME, instance_id, + extra={"job_id": job_id, "operation": "create"}, + ) + except Exception as ex: # noqa: BLE001 + # Mirror the .NET pattern: record the failure on persisted + # state and return, rather than re-raising. Re-raising + # inside an entity operation can cause some entity + # backends to discard the in-flight state mutations, + # leaving the job with no error recorded. + state.status = ExportJobStatus.FAILED + state.last_error = ( + f"Failed to schedule orchestrator: {type(ex).__name__}: {ex}" + ) + logger.exception( + "Failed to schedule orchestrator for export job %r", job_id, + extra={"job_id": job_id, "operation": "create"}, + ) return self._save(state) def get(self, _: Any = None) -> dict[str, Any] | None: state = self._load() return state.to_dict() if state is not None else None - def run(self, _: Any = None) -> dict[str, Any] | None: - state = self._load() - if state is None: - raise ValueError("Cannot run uninitialized export job") - job_id = self._job_id() - assert_valid_transition( - self.OP_RUN, state.status, ExportJobStatus.ACTIVE, job_id=job_id, - ) - - # The entity itself schedules the driving orchestrator. The - # client is therefore decoupled from the orchestrator's name - # and input shape. 
- if state.status is ExportJobStatus.PENDING: - instance_id = orchestrator_instance_id_for(job_id) - try: - self.entity_context.schedule_new_orchestration( - ORCHESTRATOR_NAME, - input={"job_id": job_id, "config": state.config.to_dict()}, - instance_id=instance_id, - ) - state.orchestrator_instance_id = instance_id - logger.info( - "Scheduled orchestrator %s for job %r with instance ID %s", - ORCHESTRATOR_NAME, job_id, instance_id, - extra={"job_id": job_id, "operation": "run"}, - ) - except Exception as ex: # noqa: BLE001 - # Mirror the .NET ExportJob.StartExportOrchestration pattern: - # record the failure on persisted state and return, rather - # than re-raising. Re-raising inside an entity operation - # can cause some entity backends to discard the in-flight - # state mutations, leaving the job stuck in PENDING with no - # error recorded. Returning ensures FAILED + last_error - # actually persist. - state.status = ExportJobStatus.FAILED - state.last_error = ( - f"Failed to schedule orchestrator: {type(ex).__name__}: {ex}" - ) - logger.exception( - "Failed to schedule orchestrator for export job %r", job_id, - extra={"job_id": job_id, "operation": "run"}, - ) - return self._save(state) - - state.status = ExportJobStatus.ACTIVE - state.last_error = None - return self._save(state) - def commit_checkpoint(self, payload: Mapping[str, Any]) -> dict[str, Any] | None: state = self._load() if state is None: diff --git a/durabletask/extensions/history_export/models.py b/durabletask/extensions/history_export/models.py index f983585d..0fa977aa 100644 --- a/durabletask/extensions/history_export/models.py +++ b/durabletask/extensions/history_export/models.py @@ -63,16 +63,13 @@ class ExportJobStatus(Enum): Status meanings --------------- - ``PENDING`` - The job has been created and persisted but the entity has not - yet kicked off its driving orchestrator. 
Jobs sit in this - state briefly between the ``create`` and ``run`` signals - (the public client sends both in immediate succession), or - for longer if ``run`` is never invoked or if a caller revives - a previously terminal job via ``create``. ``ACTIVE`` - The job is running and the driving orchestrator is making - progress through pages of terminal instances. + The job has been created and the driving orchestrator is + making progress through pages of terminal instances. This is + the initial status after :meth:`ExportHistoryClient.create_job` + because the entity schedules the orchestrator inline as part + of its ``create`` operation (mirroring the .NET + ``ExportJob.Create`` flow). ``COMPLETED`` The orchestrator finished a batch successfully. ``FAILED`` @@ -80,7 +77,6 @@ class ExportJobStatus(Enum): retries. """ - PENDING = "Pending" ACTIVE = "Active" COMPLETED = "Completed" FAILED = "Failed" @@ -434,22 +430,11 @@ def to_configuration(self) -> ExportJobConfiguration: # replaced with a registry keyed by ``(entity_name, schema_version)`` without # changing the on-disk shape. -STATE_SCHEMA_VERSION = "1.1" +STATE_SCHEMA_VERSION = "1.0" """The schema version emitted by :meth:`ExportJobState.to_dict`. Increment this when the persisted shape changes in a non-backward-compatible way and add a new branch in :meth:`ExportJobState.from_dict`. - -Version history: - -``"1.0"`` - Initial shape. ``runtime_status`` filter values were persisted as - enum *names* (e.g. ``"COMPLETED"``), which broke if the core SDK - renamed an enum constant. Read support retained. -``"1.1"`` - ``runtime_status`` filter values are persisted as the protobuf - enum *integer* (e.g. ``2`` for ``COMPLETED``). Reads still accept - the legacy 1.0 string form for backward compatibility. 
""" @@ -502,10 +487,10 @@ def to_dict(self) -> dict[str, Any]: @classmethod def from_dict(cls, data: Mapping[str, Any]) -> "ExportJobState": version = data.get("schema_version", "1.0") - if version not in {"1.0", "1.1"}: + if version != STATE_SCHEMA_VERSION: raise ValueError( f"Unsupported export job state schema_version={version!r}; " - f"expected one of: '1.0', '1.1' (current: {STATE_SCHEMA_VERSION!r})" + f"expected {STATE_SCHEMA_VERSION!r}" ) config_data = data.get("config") diff --git a/durabletask/extensions/history_export/transitions.py b/durabletask/extensions/history_export/transitions.py index 268e2093..480d5f59 100644 --- a/durabletask/extensions/history_export/transitions.py +++ b/durabletask/extensions/history_export/transitions.py @@ -28,14 +28,12 @@ # Maps (operation_name, current_status_or_None) -> {valid target statuses}. TRANSITIONS: Mapping[tuple[str, ExportJobStatus | None], frozenset[ExportJobStatus]] = { # ``create`` initialises a fresh job and revives terminal jobs. - ("create", None): frozenset({ExportJobStatus.PENDING}), - ("create", ExportJobStatus.FAILED): frozenset({ExportJobStatus.PENDING}), - ("create", ExportJobStatus.COMPLETED): frozenset({ExportJobStatus.PENDING}), - - # ``run`` flips the job from PENDING to ACTIVE. Idempotent so the - # client may signal it more than once without crashing the entity. - ("run", ExportJobStatus.PENDING): frozenset({ExportJobStatus.ACTIVE}), - ("run", ExportJobStatus.ACTIVE): frozenset({ExportJobStatus.ACTIVE}), + # The entity schedules the driving orchestrator inline, so the job + # goes straight to ACTIVE without a separate ``run`` signal. + # Matches the .NET ``ExportJob.Create`` flow. + ("create", None): frozenset({ExportJobStatus.ACTIVE}), + ("create", ExportJobStatus.FAILED): frozenset({ExportJobStatus.ACTIVE}), + ("create", ExportJobStatus.COMPLETED): frozenset({ExportJobStatus.ACTIVE}), # ``commit_checkpoint`` is a no-op transition during normal runs. 
# When the orchestrator signals ``mark_failed_on_batch`` the entity @@ -47,9 +45,6 @@ ("mark_completed", ExportJobStatus.ACTIVE): frozenset({ExportJobStatus.COMPLETED}), - # ``mark_failed`` from PENDING covers the rare case of a failure - # happening between create and run. - ("mark_failed", ExportJobStatus.PENDING): frozenset({ExportJobStatus.FAILED}), ("mark_failed", ExportJobStatus.ACTIVE): frozenset({ExportJobStatus.FAILED}), } diff --git a/tests/durabletask/extensions/history_export/test_client.py b/tests/durabletask/extensions/history_export/test_client.py index dd6698b8..87490769 100644 --- a/tests/durabletask/extensions/history_export/test_client.py +++ b/tests/durabletask/extensions/history_export/test_client.py @@ -127,7 +127,7 @@ def test_create_get_and_wait_for_job_end_to_end( ) assert desc.job_id - assert desc.status == ExportJobStatus.PENDING + assert desc.status == ExportJobStatus.ACTIVE assert desc.config is not None assert desc.orchestrator_instance_id == f"export-job-{desc.job_id}" diff --git a/tests/durabletask/extensions/history_export/test_entity.py b/tests/durabletask/extensions/history_export/test_entity.py index fb9991ae..14c5aa68 100644 --- a/tests/durabletask/extensions/history_export/test_entity.py +++ b/tests/durabletask/extensions/history_export/test_entity.py @@ -41,7 +41,7 @@ def _no_op_orchestrator(ctx: task.OrchestrationContext, _input): - # The entity's run() op schedules an orchestrator named + # The entity's ``create`` op schedules an orchestrator named # ``export_job_orchestrator``. These tests focus on entity # behaviour, so register a no-op stub under that canonical name. 
return None @@ -139,31 +139,23 @@ def _wait_for_status( # --------------------------------------------------------------------- -def test_create_persists_pending_status(c) -> None: +def test_create_persists_active_status_and_schedules_orchestrator(c) -> None: entity_id = entities.EntityInstanceId(ENTITY_NAME, "job-1") c.signal_entity(entity_id, "create", input=_create_payload()) - state = _wait_for_status(c, entity_id, ExportJobStatus.PENDING) - assert state["schema_version"] == "1.1" - assert state["status"] == ExportJobStatus.PENDING.value - assert state["orchestrator_instance_id"] is None + state = _wait_for_status(c, entity_id, ExportJobStatus.ACTIVE) + assert state["schema_version"] == "1.0" + assert state["status"] == ExportJobStatus.ACTIVE.value + # ``create`` schedules the driving orchestrator inline and records + # its deterministic instance ID. + assert state["orchestrator_instance_id"] == orchestrator_instance_id_for("job-1") assert state["config"]["destination"]["container"] == "exports" assert state["failures"] == [] -def test_run_transitions_to_active_and_records_orchestrator_instance_id(c) -> None: - entity_id = entities.EntityInstanceId(ENTITY_NAME, "job-1b") - c.signal_entity(entity_id, "create", input=_create_payload()) - c.signal_entity(entity_id, "run") - - state = _wait_for_status(c, entity_id, ExportJobStatus.ACTIVE) - assert state["orchestrator_instance_id"] == orchestrator_instance_id_for("job-1b") - - def test_create_on_active_job_is_rejected_and_state_unchanged(c) -> None: entity_id = entities.EntityInstanceId(ENTITY_NAME, "job-1c") c.signal_entity(entity_id, "create", input=_create_payload()) - c.signal_entity(entity_id, "run") _wait_for_status(c, entity_id, ExportJobStatus.ACTIVE) c.signal_entity(entity_id, "commit_checkpoint", input={"scanned_delta": 7}) @@ -188,16 +180,16 @@ def test_create_on_active_job_is_rejected_and_state_unchanged(c) -> None: assert state["scanned_instances"] == 7 -def 
test_create_after_failure_resets_to_pending(c) -> None: +def test_create_after_failure_revives_to_active(c) -> None: """Reviving a terminal job rewinds every progress field. Matches the .NET ``ExportJob.Create`` revive semantics: counters, checkpoint, ``last_checkpoint_time``, ``last_error``, and the - accumulated ``failures`` list are all reset to a clean slate. + accumulated ``failures`` list are all reset to a clean slate, and + the orchestrator is re-scheduled inline. """ entity_id = entities.EntityInstanceId(ENTITY_NAME, "job-1d") c.signal_entity(entity_id, "create", input=_create_payload()) - c.signal_entity(entity_id, "run") _wait_for_status(c, entity_id, ExportJobStatus.ACTIVE) # Apply enough progress and a failure so revival has something to @@ -233,9 +225,17 @@ def test_create_after_failure_resets_to_pending(c) -> None: failed = _wait_for_status(c, entity_id, ExportJobStatus.FAILED) assert failed["last_error"] == "boom" - # Revive: every progress field should reset. + # Revive: every progress field should reset and orchestrator + # instance ID should be re-derived. c.signal_entity(entity_id, "create", input=_create_payload()) - revived = _wait_for_status(c, entity_id, ExportJobStatus.PENDING) + revived = _wait_for_state( + c, entity_id, + lambda s: ( + s.get("status") == ExportJobStatus.ACTIVE.value + and s.get("scanned_instances") == 0 + ), + description="revived state to land", + ) assert revived["scanned_instances"] == 0 assert revived["exported_instances"] == 0 assert revived["failed_instances"] == 0 @@ -243,15 +243,14 @@ def test_create_after_failure_resets_to_pending(c) -> None: assert revived["last_checkpoint_time"] is None assert revived["last_error"] is None assert revived["failures"] == [] - # The orchestrator instance ID is also re-derived by ``run``; - # ``create`` itself does not pre-populate it. 
- assert revived["orchestrator_instance_id"] is None + assert revived["orchestrator_instance_id"] == orchestrator_instance_id_for( + "job-1d" + ) def test_commit_checkpoint_requires_active_status(c) -> None: entity_id = entities.EntityInstanceId(ENTITY_NAME, "job-2") c.signal_entity(entity_id, "create", input=_create_payload()) - c.signal_entity(entity_id, "run") _wait_for_status(c, entity_id, ExportJobStatus.ACTIVE) c.signal_entity( @@ -285,7 +284,6 @@ def test_commit_checkpoint_requires_active_status(c) -> None: def test_commit_checkpoint_records_failures_and_marks_failed(c) -> None: entity_id = entities.EntityInstanceId(ENTITY_NAME, "job-2b") c.signal_entity(entity_id, "create", input=_create_payload()) - c.signal_entity(entity_id, "run") _wait_for_status(c, entity_id, ExportJobStatus.ACTIVE) c.signal_entity( @@ -322,15 +320,15 @@ def test_commit_checkpoint_records_failures_and_marks_failed(c) -> None: def test_mark_completed_sets_status(c) -> None: entity_id = entities.EntityInstanceId(ENTITY_NAME, "job-3") c.signal_entity(entity_id, "create", input=_create_payload()) - c.signal_entity(entity_id, "run") c.signal_entity(entity_id, "mark_completed") state = _wait_for_status(c, entity_id, ExportJobStatus.COMPLETED) assert state["last_error"] is None -def test_mark_failed_records_reason_from_pending(c) -> None: +def test_mark_failed_records_reason(c) -> None: entity_id = entities.EntityInstanceId(ENTITY_NAME, "job-4") c.signal_entity(entity_id, "create", input=_create_payload()) + _wait_for_status(c, entity_id, ExportJobStatus.ACTIVE) c.signal_entity(entity_id, "mark_failed", input={"reason": "boom"}) state = _wait_for_status(c, entity_id, ExportJobStatus.FAILED) assert state["last_error"] == "boom" diff --git a/tests/durabletask/extensions/history_export/test_models.py b/tests/durabletask/extensions/history_export/test_models.py index ae9e57f0..8144a13a 100644 --- a/tests/durabletask/extensions/history_export/test_models.py +++ 
b/tests/durabletask/extensions/history_export/test_models.py @@ -302,10 +302,12 @@ def test_runtime_status_persisted_as_protobuf_int(self) -> None: OrchestrationStatus.FAILED, ] - def test_legacy_1_0_runtime_status_names_still_load(self) -> None: - # A persisted state created by schema 1.0 carries enum names - # in ``runtime_status``. The current loader must accept both - # the legacy string form and the current int form. + def test_runtime_status_parser_accepts_name_form_defensively(self) -> None: + # The persisted shape is the protobuf integer (.value), but the + # internal parser also accepts the enum name as a defensive + # measure against hand-edited / older / mis-shaped state. This + # is a property of the parser, not a documented compatibility + # promise across schema versions. cfg = ExportJobConfiguration( mode=ExportMode.BATCH, filter=ExportFilter( @@ -316,13 +318,13 @@ def test_legacy_1_0_runtime_status_names_still_load(self) -> None: destination=_basic_destination(), ) state = ExportJobState.new(cfg, created_at=_WINDOW_END) - legacy = state.to_dict() - legacy["schema_version"] = "1.0" - # Simulate the 1.0 wire shape for runtime_status. - legacy["config"]["filter"]["runtime_status"] = [ + d = state.to_dict() + # Substitute the enum-name string form into an otherwise-current + # 1.0 payload. 
+ d["config"]["filter"]["runtime_status"] = [ OrchestrationStatus.COMPLETED.name ] - restored = ExportJobState.from_dict(legacy) + restored = ExportJobState.from_dict(d) assert restored.config.filter.runtime_status == [ OrchestrationStatus.COMPLETED ] diff --git a/tests/durabletask/extensions/history_export/test_transitions_and_exceptions.py b/tests/durabletask/extensions/history_export/test_transitions_and_exceptions.py index e51cfe68..f2af23ea 100644 --- a/tests/durabletask/extensions/history_export/test_transitions_and_exceptions.py +++ b/tests/durabletask/extensions/history_export/test_transitions_and_exceptions.py @@ -21,30 +21,22 @@ class TestTransitionsMatrix: - def test_create_from_none_is_pending(self) -> None: - assert is_valid_transition("create", None, ExportJobStatus.PENDING) + def test_create_from_none_is_active(self) -> None: + # ``create`` schedules the orchestrator inline, so the job + # goes straight to ACTIVE. Mirrors the .NET flow. + assert is_valid_transition("create", None, ExportJobStatus.ACTIVE) def test_create_from_active_is_rejected(self) -> None: assert not is_valid_transition( - "create", ExportJobStatus.ACTIVE, ExportJobStatus.PENDING + "create", ExportJobStatus.ACTIVE, ExportJobStatus.ACTIVE ) def test_create_from_terminal_revives_job(self) -> None: assert is_valid_transition( - "create", ExportJobStatus.COMPLETED, ExportJobStatus.PENDING + "create", ExportJobStatus.COMPLETED, ExportJobStatus.ACTIVE ) assert is_valid_transition( - "create", ExportJobStatus.FAILED, ExportJobStatus.PENDING - ) - - def test_run_is_idempotent_on_active(self) -> None: - assert is_valid_transition( - "run", ExportJobStatus.ACTIVE, ExportJobStatus.ACTIVE - ) - - def test_run_from_pending_activates(self) -> None: - assert is_valid_transition( - "run", ExportJobStatus.PENDING, ExportJobStatus.ACTIVE + "create", ExportJobStatus.FAILED, ExportJobStatus.ACTIVE ) def test_commit_checkpoint_can_fail_active_job(self) -> None: @@ -62,13 +54,10 @@ def 
test_mark_completed_requires_active(self) -> None: "mark_completed", ExportJobStatus.ACTIVE, ExportJobStatus.COMPLETED, ) assert not is_valid_transition( - "mark_completed", ExportJobStatus.PENDING, ExportJobStatus.COMPLETED, + "mark_completed", ExportJobStatus.FAILED, ExportJobStatus.COMPLETED, ) - def test_mark_failed_allowed_from_pending_or_active(self) -> None: - assert is_valid_transition( - "mark_failed", ExportJobStatus.PENDING, ExportJobStatus.FAILED, - ) + def test_mark_failed_allowed_from_active(self) -> None: assert is_valid_transition( "mark_failed", ExportJobStatus.ACTIVE, ExportJobStatus.FAILED, ) @@ -82,20 +71,20 @@ def test_assert_valid_raises_on_invalid_transition(self) -> None: with pytest.raises(ExportJobInvalidTransitionError) as excinfo: assert_valid_transition( "mark_completed", - ExportJobStatus.PENDING, + ExportJobStatus.FAILED, ExportJobStatus.COMPLETED, job_id="job-x", ) ex = excinfo.value assert ex.operation == "mark_completed" - assert ex.from_status == ExportJobStatus.PENDING.value + assert ex.from_status == ExportJobStatus.FAILED.value assert ex.to_status == ExportJobStatus.COMPLETED.value assert ex.job_id == "job-x" def test_assert_valid_no_op_when_allowed(self) -> None: # Should not raise. 
assert_valid_transition( - "run", ExportJobStatus.PENDING, ExportJobStatus.ACTIVE, + "create", None, ExportJobStatus.ACTIVE, ) def test_matrix_is_self_consistent(self) -> None: @@ -109,16 +98,15 @@ def test_matrix_is_self_consistent(self) -> None: class TestExceptions: def test_invalid_transition_is_value_error(self) -> None: err = ExportJobInvalidTransitionError( - operation="run", + operation="create", from_status="Active", - to_status="Pending", + to_status="Active", job_id="j", ) assert isinstance(err, ValueError) assert isinstance(err, ExportJobError) assert "Active" in str(err) - assert "Pending" in str(err) - assert "run" in str(err) + assert "create" in str(err) assert err.job_id == "j" def test_not_found_is_lookup_error(self) -> None: From 513a49a24914632f89c92bbefc96375cd50baca5 Mon Sep 17 00:00:00 2001 From: Andy Staples Date: Thu, 4 Jun 2026 11:21:59 -0600 Subject: [PATCH 7/9] PR Feedback 3 --- CHANGELOG.md | 11 ++- .../extensions/history_export/activities.py | 47 ++++++++- .../extensions/history_export/azure_blob.py | 10 ++ .../extensions/history_export/client.py | 98 ++++++++++++++++--- .../extensions/history_export/entity.py | 21 +++- .../extensions/history_export/orchestrator.py | 60 +++++++++--- .../extensions/history_export/writer.py | 15 +++ .../history_export/test_activities.py | 92 ++++++++++++++++- .../extensions/history_export/test_client.py | 2 +- .../history_export/test_orchestrator.py | 47 ++++++--- 10 files changed, 355 insertions(+), 48 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 797d8b6d..0ee56549 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,12 +20,21 @@ ADDED (tail terminal instances indefinitely until stopped via `delete_job`). Exported blobs are self-describing: each blob carries an explicit `schema_version`, the orchestration's `OrchestrationState` metadata, and - the full ordered event list. The export workflow retries each instance up + the full ordered event list. 
Each exported blob also carries + `{"instance_id": }` as destination-side metadata (the Azure writer + persists this as Azure Blob metadata) so consumers can scan a container + without parsing each blob body. The export workflow retries each instance up to 3 times with exponential backoff (15s/30s/60s), retries failed batches up to 3 times, caps in-flight exports via `max_parallel_exports` (default 32), continues-as-new every 5 page cycles to bound orchestrator history, and re-fetches entity state at the top of every page loop so external delete or mark-failed signals stop the orchestrator cleanly. + `delete_job` actively tears the job down: it clears the entity state, + terminates the driving orchestrator, waits briefly for it to settle, and + purges its orchestration history so a re-created job with the same ID + starts from a clean slate. Per-instance exports refuse to write a blob + when the target instance has been purged or has re-entered a non-terminal + state, surfacing the skipped instance as a per-batch failure. Job state lives in a durable entity with an explicit state-transition matrix (ACTIVE / COMPLETED / FAILED); invalid transitions raise `ExportJobInvalidTransitionError`. Persisted entity state uses a diff --git a/durabletask/extensions/history_export/activities.py b/durabletask/extensions/history_export/activities.py index 145f9607..3605ffa6 100644 --- a/durabletask/extensions/history_export/activities.py +++ b/durabletask/extensions/history_export/activities.py @@ -48,6 +48,15 @@ from durabletask.extensions.history_export.writer import HistoryWriter +# The set of runtime statuses considered "terminal" by the export +# activity's safety guard. Matches the .NET ``IsCompleted`` helper. 
+_TERMINAL_RUNTIME_STATUSES: frozenset[client_module.OrchestrationStatus] = frozenset({ + client_module.OrchestrationStatus.COMPLETED, + client_module.OrchestrationStatus.FAILED, + client_module.OrchestrationStatus.TERMINATED, +}) + + # The activity name registered with the worker is simply ``fn.__name__`` # (see :func:`durabletask.task.get_name`). These constants exist so # downstream code (the orchestrator, tests) can refer to the names @@ -174,13 +183,39 @@ def export_instance_history( prefix: str | None = str(prefix_raw) if prefix_raw is not None else None try: - events = ctx.client.get_orchestration_history(instance_id) - # Fetch the orchestration's terminal metadata too so the - # exported blob is self-describing (matches the .NET behavior). + # Resolve the instance's terminal metadata first. If the + # instance was purged, deleted, or has somehow re-entered a + # non-terminal state between ``list_terminal_instances`` and + # now, we refuse to write a partial/empty blob and surface a + # specific failure to the orchestrator. Matches the .NET + # ``ExportInstanceHistoryActivity`` guard. state = ctx.client.get_orchestration_state( instance_id, fetch_payloads=True, ) - metadata = orchestration_state_to_dict(state) if state is not None else None + if state is None: + return { + "instance_id": instance_id, + "success": False, + "error": ( + f"instance {instance_id!r} no longer exists or has been " + "purged" + ), + } + if state.runtime_status not in _TERMINAL_RUNTIME_STATUSES: + return { + "instance_id": instance_id, + "success": False, + "error": ( + f"instance {instance_id!r} is no longer terminal " + f"(runtime_status={state.runtime_status.name})" + ), + } + + events = ctx.client.get_orchestration_history(instance_id) + # The exported blob is self-describing: it carries the + # serialized ``OrchestrationState`` metadata alongside the + # event list. Matches the .NET behavior. 
+ metadata = orchestration_state_to_dict(state) payload = serialize_history( events, instance_id=instance_id, @@ -195,6 +230,10 @@ def export_instance_history( payload=payload, content_type=content_type_for(fmt), content_encoding=content_encoding_for(fmt), + # Standard hook downstream consumers use to scan a + # container without parsing each blob body. Matches the + # .NET writer's ``Metadata["instanceId"]`` convention. + metadata={"instance_id": instance_id}, ) except Exception as ex: # noqa: BLE001 - reported back via return value return { diff --git a/durabletask/extensions/history_export/azure_blob.py b/durabletask/extensions/history_export/azure_blob.py index 11641641..5919e6fc 100644 --- a/durabletask/extensions/history_export/azure_blob.py +++ b/durabletask/extensions/history_export/azure_blob.py @@ -17,6 +17,7 @@ from __future__ import annotations +from collections.abc import Mapping from dataclasses import dataclass, field from typing import Any @@ -135,6 +136,7 @@ def write( payload: bytes, content_type: str, content_encoding: str | None, + metadata: Mapping[str, str] | None = None, ) -> None: del instance_id # included by the protocol but not needed here # This writer pins to the container configured at construction @@ -158,11 +160,19 @@ def write( if content_encoding else ContentSettings(content_type=content_type) ) + # Azure Blob Storage requires the metadata dict to be a plain + # ``dict[str, str]`` (the SDK does its own validation). Copy + # whatever the activity passed into the shape the underlying + # SDK expects, and pass ``None`` through unchanged so blobs + # written via :meth:`write` without metadata behave exactly + # the same as they did before this kwarg existed. 
+ blob_metadata = dict(metadata) if metadata else None container_client.upload_blob( name=blob_name, data=payload, overwrite=self._options.overwrite, content_settings=content_settings, + metadata=blob_metadata, ) # ------------------------------------------------------------------ diff --git a/durabletask/extensions/history_export/client.py b/durabletask/extensions/history_export/client.py index a248d857..cc9f87c1 100644 --- a/durabletask/extensions/history_export/client.py +++ b/durabletask/extensions/history_export/client.py @@ -59,6 +59,8 @@ from datetime import datetime, timezone from typing import Any, cast +import grpc + from durabletask import client as client_module from durabletask import entities from durabletask import worker as worker_module @@ -93,6 +95,31 @@ _TERMINAL_STATUSES = frozenset({ExportJobStatus.COMPLETED, ExportJobStatus.FAILED}) _ENTITY_ID_PREFIX = f"@{ENTITY_NAME.lower()}@" +# Max seconds :meth:`ExportHistoryClient.delete_job` waits for the +# driving orchestrator to terminate before continuing on to purge. +# Sized to be longer than a single ``commit_checkpoint`` round-trip +# but short enough that a stuck orchestrator cannot block the caller +# indefinitely. +_DELETE_WAIT_TIMEOUT_SECONDS = 30.0 + + +def _grpc_status(ex: grpc.RpcError) -> grpc.StatusCode | None: + """Return the gRPC status code of *ex*, or ``None`` if it is not set. + + The ``code()`` method is declared on the runtime ``grpc.Call`` + mixin rather than on :class:`grpc.RpcError` itself, so we go + through ``getattr`` to keep both pyright and runtime happy when a + test backend raises a bare ``RpcError``. 
+ """ + code = getattr(ex, "code", None) + if not callable(code): + return None + try: + result = code() + except Exception: # noqa: BLE001 - defensive, never re-raise here + return None + return result if isinstance(result, grpc.StatusCode) else None + __all__ = ["ExportHistoryClient", "ExportHistoryJobClient"] @@ -311,23 +338,72 @@ def wait_for_job( time.sleep(poll_interval) def delete_job(self, job_id: str) -> None: - """Request deletion of the export-job entity, clearing its state. - - This call is **best-effort and fire-and-forget**: it enqueues a - ``delete`` signal on the entity but does not wait for the - entity dispatcher to process it. Callers that need - confirmation should poll :meth:`get_job` and wait for it to - return ``None``. - - The driving orchestrator will detect the deletion at its next - loop iteration (via :meth:`OrchestrationContext.call_entity`) - and exit cleanly without issuing further signals. + """Stop and delete an export job. + + The call performs the full teardown sequence (matching the + .NET ``DefaultExportHistoryJobClient.DeleteAsync`` flow): + + 1. Signal the entity to clear its persisted state + (``ExportJobEntity.OP_DELETE``). + 2. Terminate the driving orchestrator so it stops issuing + further activity calls and entity signals. + 3. Wait briefly for the orchestrator to actually reach a + terminal state. + 4. Purge the orchestration history so a re-created job with + the same ID can start from a clean slate. + + Steps 2–4 are best-effort: each tolerates a missing + orchestrator (the job may never have run, or already been + purged) by swallowing gRPC ``NOT_FOUND`` errors. Step 3 + tolerates a slow termination by logging and continuing rather + than blocking the caller indefinitely. This does NOT delete blobs already written to the destination. """ entity_id = entities.EntityInstanceId(ENTITY_NAME, job_id) + orch_instance_id = orchestrator_instance_id_for(job_id) + + # Step 1: clear the persisted entity state. 
self._client.signal_entity(entity_id, ExportJobEntity.OP_DELETE) + # Step 2: terminate the driving orchestrator so it stops + # issuing activity calls and entity signals. + try: + self._client.terminate_orchestration( + orch_instance_id, recursive=True, + ) + except grpc.RpcError as ex: + if _grpc_status(ex) != grpc.StatusCode.NOT_FOUND: + raise + + # Step 3: wait briefly for the orchestration to settle so the + # subsequent purge actually removes its history. Capped by + # ``_DELETE_WAIT_TIMEOUT_SECONDS`` so a stuck orchestrator + # cannot block the caller indefinitely; a slow termination is + # logged rather than re-raised. + try: + self._client.wait_for_orchestration_completion( + orch_instance_id, timeout=_DELETE_WAIT_TIMEOUT_SECONDS, + ) + except TimeoutError: + logger.warning( + "Export job %r orchestrator %r did not terminate within %ss; " + "continuing with purge anyway", + job_id, orch_instance_id, _DELETE_WAIT_TIMEOUT_SECONDS, + ) + except grpc.RpcError as ex: + if _grpc_status(ex) != grpc.StatusCode.NOT_FOUND: + raise + + # Step 4: purge the orchestration history. + try: + self._client.purge_orchestration( + orch_instance_id, recursive=True, + ) + except grpc.RpcError as ex: + if _grpc_status(ex) != grpc.StatusCode.NOT_FOUND: + raise + def cancel_job(self, job_id: str) -> None: """Alias for :meth:`delete_job`. 
diff --git a/durabletask/extensions/history_export/entity.py b/durabletask/extensions/history_export/entity.py index b5e063c7..91719662 100644 --- a/durabletask/extensions/history_export/entity.py +++ b/durabletask/extensions/history_export/entity.py @@ -211,7 +211,15 @@ def get(self, _: Any = None) -> dict[str, Any] | None: def commit_checkpoint(self, payload: Mapping[str, Any]) -> dict[str, Any] | None: state = self._load() if state is None: - raise ValueError("Cannot commit_checkpoint on uninitialized export job") + # The entity was deleted between the orchestrator's + # mid-loop ``get`` call and this checkpoint (the race + # window is small but real for CONTINUOUS jobs cancelled + # via :meth:`ExportHistoryClient.delete_job`). Return + # ``None`` so the orchestrator's ``call_entity`` resolves + # cleanly and the loop exits via its normal + # "entity gone" path rather than raising and triggering + # an outer ``mark_failed`` on an already-deleted entity. + return None job_id = self._job_id() # commit_checkpoint may transition ACTIVE -> ACTIVE (no-op) or @@ -266,7 +274,10 @@ def commit_checkpoint(self, payload: Mapping[str, Any]) -> dict[str, Any] | None def mark_completed(self, _: Any = None) -> dict[str, Any] | None: state = self._load() if state is None: - raise ValueError("Cannot mark_completed on uninitialized export job") + # Entity vanished mid-flight (see ``commit_checkpoint`` + # for the race description); silently succeed so the + # orchestrator's final ``call_entity`` does not raise. 
+ return None job_id = self._job_id() assert_valid_transition( self.OP_MARK_COMPLETED, state.status, ExportJobStatus.COMPLETED, @@ -285,7 +296,11 @@ def mark_failed( ) -> dict[str, Any] | None: state = self._load() if state is None: - raise ValueError("Cannot mark_failed on uninitialized export job") + # Entity vanished mid-flight (see ``commit_checkpoint`` + # for the race description); silently succeed so the + # orchestrator's best-effort failure report does not + # raise on an already-deleted entity. + return None job_id = self._job_id() assert_valid_transition( self.OP_MARK_FAILED, state.status, ExportJobStatus.FAILED, job_id=job_id, diff --git a/durabletask/extensions/history_export/orchestrator.py b/durabletask/extensions/history_export/orchestrator.py index 2c876b76..0511c376 100644 --- a/durabletask/extensions/history_export/orchestrator.py +++ b/durabletask/extensions/history_export/orchestrator.py @@ -133,6 +133,11 @@ def export_job_orchestrator( } """ job_id = str(input["job_id"]) + # All log records emitted from the orchestrator body go through + # the SDK's replay-safe logger so a single log line is not + # re-emitted on every replay of the orchestrator's history. + # Matches the .NET ``CreateReplaySafeLogger`` pattern. 
+ safe_logger = ctx.create_replay_safe_logger(logger) config_input = input["config"] if not isinstance(config_input, Mapping): raise TypeError("config input must be a mapping") @@ -166,14 +171,14 @@ def export_job_orchestrator( yield ctx.call_entity(entity_id, ExportJobEntity.OP_GET) ) if current_state is None: - logger.info( + safe_logger.info( "Export job %r entity has been deleted; exiting orchestrator", job_id, ) return {"job_id": job_id, "status": "Cancelled", "totals": totals} current_status = current_state.get("status") if current_status != ExportJobStatus.ACTIVE.value: - logger.info( + safe_logger.info( "Export job %r entity status is %s; exiting orchestrator", job_id, current_status, ) @@ -202,19 +207,20 @@ def export_job_orchestrator( batch_failures: list[dict[str, Any]] = [] # Empty page handling matches the .NET ExportJobOrchestrator: - # CONTINUOUS sleeps and re-polls, BATCH exits cleanly even - # if the backend returned a non-null continuation token. - # This guards against backends that legally return an empty - # page with a token (the orchestrator would otherwise spin - # forever in BATCH mode emitting no-op commit_checkpoints). + # CONTINUOUS sleeps and re-polls (preserving the in-memory + # ``continuation_token`` so the next list call resumes + # from the last bookmark rather than rescanning the whole + # window), BATCH exits cleanly even if the backend + # returned a non-null continuation token. The BATCH guard + # prevents backends that legally return an empty page with + # a token from spinning the orchestrator forever. 
if not instance_ids: if config.mode is ExportMode.CONTINUOUS: yield ctx.create_timer( ctx.current_utc_datetime + _continuous_idle_delay() ) - continuation_token = None continue - ctx.signal_entity( + yield ctx.call_entity( entity_id, ExportJobEntity.OP_COMMIT_CHECKPOINT, { @@ -258,7 +264,7 @@ def export_job_orchestrator( ] if not batch_succeeded: - ctx.signal_entity( + yield ctx.call_entity( entity_id, ExportJobEntity.OP_COMMIT_CHECKPOINT, { @@ -273,14 +279,14 @@ def export_job_orchestrator( totals["exported"] += exported_delta totals["failed"] += failed_delta # The entity already transitioned to FAILED via - # the commit_checkpoint signal above; returning + # the commit_checkpoint call above; returning # cleanly avoids the outer ``except`` issuing a # second mark_failed signal which the transitions # matrix would reject (the entity is no longer # ACTIVE). Surfacing the cause is the caller's # responsibility via :meth:`ExportHistoryClient.get_job`, # whose ``last_error`` carries the failure summary. - logger.warning( + safe_logger.warning( "Export job %r marked FAILED after %d batch attempts; " "%d instances failed to export", job_id, MAX_BATCH_RETRY_ATTEMPTS, failed_delta, @@ -295,14 +301,21 @@ def export_job_orchestrator( next_token: str | None = ( str(next_token_raw) if next_token_raw is not None else None ) - ctx.signal_entity( + # The persisted ``last_instance_key`` always reflects the + # orchestrator's *resume cursor*: if the backend returned a + # fresh page token, use that; otherwise stick with the + # cursor that produced this page so CONTINUOUS recovery via + # continue-as-new resumes from the right bookmark rather + # than re-scanning the whole window. 
+ resume_cursor = next_token if next_token else continuation_token + yield ctx.call_entity( entity_id, ExportJobEntity.OP_COMMIT_CHECKPOINT, { "scanned_delta": scanned_delta, "exported_delta": exported_delta, "failed_delta": failed_delta, - "last_instance_key": next_token, + "last_instance_key": resume_cursor, }, ) @@ -313,20 +326,35 @@ def export_job_orchestrator( if not next_token: if config.mode is ExportMode.CONTINUOUS: # Tail mode: sleep, then loop back and re-check. + # Preserve ``continuation_token`` (= resume_cursor) + # across the sleep so the next list call resumes + # from the last bookmark rather than rescanning + # the whole window on every quiet cycle. yield ctx.create_timer( ctx.current_utc_datetime + _continuous_idle_delay() ) - continuation_token = None continue break continuation_token = next_token # Reaching here means BATCH mode finished its window cleanly. - ctx.signal_entity(entity_id, ExportJobEntity.OP_MARK_COMPLETED) + # Use ``call_entity`` for the final transition so the orchestrator + # only resolves after the entity has durably recorded COMPLETED; + # callers polling :meth:`ExportHistoryClient.wait_for_job` then see + # the terminal status immediately rather than racing a backlog of + # in-flight entity signals. + yield ctx.call_entity(entity_id, ExportJobEntity.OP_MARK_COMPLETED) return {"job_id": job_id, "status": "Completed", "totals": totals} except Exception as ex: # noqa: BLE001 - reported back via mark_failed + # Best-effort terminal error report. We deliberately use + # ``signal_entity`` (fire-and-forget) rather than + # ``call_entity`` here: if the entity backend is the thing that + # raised, awaiting an entity call would just raise again and + # discard our cause. Signalling at least enqueues the + # ``mark_failed`` so the user has a chance of seeing it in + # :meth:`ExportHistoryClient.get_job`. 
ctx.signal_entity( entity_id, ExportJobEntity.OP_MARK_FAILED, diff --git a/durabletask/extensions/history_export/writer.py b/durabletask/extensions/history_export/writer.py index bea008a2..c4c3a3ee 100644 --- a/durabletask/extensions/history_export/writer.py +++ b/durabletask/extensions/history_export/writer.py @@ -31,6 +31,7 @@ def write( payload: bytes, content_type: str, content_encoding: str | None, + metadata: Mapping[str, str] | None = None, ) -> None: import os # The ``container`` value comes from the export job's @@ -41,6 +42,10 @@ def write( os.makedirs(os.path.dirname(path), exist_ok=True) with open(path, "wb") as fp: fp.write(payload) + # ``metadata`` is an optional dict of small string-valued + # key/value pairs the destination may persist alongside + # the blob (Azure Blob Storage and S3 both support this). + # Destinations that cannot represent it may ignore it. writer = LocalFileSystemHistoryWriter("/var/exports") export_client = ExportHistoryClient(dt_client, writer) @@ -53,6 +58,7 @@ def write( from __future__ import annotations +from collections.abc import Mapping from typing import Protocol, runtime_checkable @@ -73,6 +79,7 @@ def write( payload: bytes, content_type: str, content_encoding: str | None, + metadata: Mapping[str, str] | None = None, ) -> None: """Persist one exported blob. @@ -100,6 +107,14 @@ def write( model HTTP-style headers (such as Azure Blob Storage) should persist this on the blob; destinations that cannot represent it may ignore it. + metadata: Optional small string-valued key/value pairs the + activity asks the destination to persist alongside the + blob. Azure Blob Storage and S3 expose this natively + as blob metadata / object tags; destinations that + cannot represent it may ignore it. The activity + currently populates ``{"instance_id": instance_id}`` + so downstream consumers can scan a container without + parsing each blob body. """ ... 
diff --git a/tests/durabletask/extensions/history_export/test_activities.py b/tests/durabletask/extensions/history_export/test_activities.py index 3ce3788b..e52b997f 100644 --- a/tests/durabletask/extensions/history_export/test_activities.py +++ b/tests/durabletask/extensions/history_export/test_activities.py @@ -46,13 +46,14 @@ def __init__(self) -> None: self._lock = threading.Lock() self.blobs: dict[str, dict] = {} - def write(self, *, instance_id, container, blob_name, payload, content_type, content_encoding): + def write(self, *, instance_id, container, blob_name, payload, content_type, content_encoding, metadata=None): with self._lock: self.blobs[blob_name] = { "instance_id": instance_id, "payload": payload, "content_type": content_type, "content_encoding": content_encoding, + "metadata": dict(metadata) if metadata else None, } @@ -164,6 +165,10 @@ def test_activities_list_and_export_to_in_memory_writer(c, seeded_ids): entry = writer.blobs[name] assert entry["content_type"] == "application/x-ndjson" assert entry["content_encoding"] == "gzip" + # N-9: activity must pass {"instance_id": ...} blob metadata + # to the writer so downstream consumers can scan a container + # without parsing each blob body. + assert entry["metadata"] == {"instance_id": sid} raw = gzip.decompress(entry["payload"]).decode("utf-8") lines = raw.strip().split("\n") assert len(lines) >= 2 # metadata + at least one event @@ -230,3 +235,88 @@ def test_activities_require_bound_context(c): assert state.runtime_status == client.OrchestrationStatus.FAILED assert state.failure_details is not None assert "without a bound context" in (state.failure_details.message or "") + + +# --------------------------------------------------------------------- +# Unit tests for the N-2 guard +# --------------------------------------------------------------------- +# +# The activity body refuses to write a blob when the target instance +# either no longer exists (e.g. 
purged between list and export) or has +# re-entered a non-terminal state. Exercising this via the full +# orchestrator would require fabricating a race against the in-memory +# backend; calling the activity body directly with a stub client lets +# us cover the guard deterministically. + + +class _StubGetStateClient: + """Minimal stand-in for ``TaskHubGrpcClient`` covering N-2 paths.""" + + def __init__(self, *, state): + self._state = state + self.history_calls = 0 + + def get_orchestration_state(self, instance_id, *, fetch_payloads=False): + del instance_id, fetch_payloads + return self._state + + def get_orchestration_history(self, instance_id): + del instance_id + self.history_calls += 1 + return [] + + +class _CountingWriter: + def __init__(self) -> None: + self.calls: list[dict] = [] + + def write(self, **kwargs): + self.calls.append(kwargs) + + +def _basic_activity_input() -> dict: + return { + "instance_id": "inst-x", + "format": ExportFormat(kind=ExportFormatKind.JSON).to_dict(), + "destination": ExportDestination(container="exports").to_dict(), + } + + +def test_export_activity_skips_when_instance_no_longer_exists(): + """N-2: instance purged between list and export -> failure without write.""" + from durabletask.extensions.history_export.activities import ( + export_instance_history, + ) + + stub_client = _StubGetStateClient(state=None) + writer = _CountingWriter() + bind_context(HistoryExportContext(client=stub_client, writer=writer)) + + result = export_instance_history(None, _basic_activity_input()) + + assert result["success"] is False + assert "no longer exists" in result["error"] + assert writer.calls == [] + assert stub_client.history_calls == 0 + + +def test_export_activity_skips_when_instance_is_not_terminal(): + """N-2: instance has re-entered a running state -> failure without write.""" + from durabletask.extensions.history_export.activities import ( + export_instance_history, + ) + + class _State: + runtime_status = 
client.OrchestrationStatus.RUNNING + + stub_client = _StubGetStateClient(state=_State()) + writer = _CountingWriter() + bind_context(HistoryExportContext(client=stub_client, writer=writer)) + + result = export_instance_history(None, _basic_activity_input()) + + assert result["success"] is False + assert "no longer terminal" in result["error"] + assert "RUNNING" in result["error"] + assert writer.calls == [] + assert stub_client.history_calls == 0 diff --git a/tests/durabletask/extensions/history_export/test_client.py b/tests/durabletask/extensions/history_export/test_client.py index 87490769..eca23afa 100644 --- a/tests/durabletask/extensions/history_export/test_client.py +++ b/tests/durabletask/extensions/history_export/test_client.py @@ -46,7 +46,7 @@ def __init__(self) -> None: self._lock = threading.Lock() self.blobs: dict[str, dict] = {} - def write(self, *, instance_id, container, blob_name, payload, content_type, content_encoding): + def write(self, *, instance_id, container, blob_name, payload, content_type, content_encoding, metadata=None): with self._lock: self.blobs[blob_name] = { "instance_id": instance_id, diff --git a/tests/durabletask/extensions/history_export/test_orchestrator.py b/tests/durabletask/extensions/history_export/test_orchestrator.py index 57bd27ef..9c9452a1 100644 --- a/tests/durabletask/extensions/history_export/test_orchestrator.py +++ b/tests/durabletask/extensions/history_export/test_orchestrator.py @@ -44,7 +44,7 @@ def __init__(self) -> None: self._lock = threading.Lock() self.blobs: dict[str, dict] = {} - def write(self, *, instance_id, container, blob_name, payload, content_type, content_encoding): + def write(self, *, instance_id, container, blob_name, payload, content_type, content_encoding, metadata=None): with self._lock: self.blobs[blob_name] = { "instance_id": instance_id, @@ -157,7 +157,16 @@ def test_orchestrator_exports_all_terminal_instances_and_marks_completed( def 
test_orchestrator_exits_when_entity_is_deleted_mid_run( dt_client, export_client, ): - """Continuous-mode jobs stop when the entity is deleted externally.""" + """Continuous-mode jobs stop when the entity is deleted externally. + + With the .NET-aligned :meth:`ExportHistoryClient.delete_job` flow, + deletion actively terminates and purges the driving orchestrator + (rather than relying on the orchestrator's next mid-loop entity + poll to self-exit). The post-condition tested here is therefore + "the entity state is gone and the orchestration is no longer + running" rather than the old "orchestration completes with status + Cancelled". + """ now = datetime.now(timezone.utc) desc = export_client.create_job( ExportJobCreationOptions( @@ -177,17 +186,33 @@ def test_orchestrator_exits_when_entity_is_deleted_mid_run( timeout=5.0, ) - # External delete: the orchestrator's next mid-loop entity get - # observes None and exits gracefully. + # External delete: terminates + waits + purges the driving + # orchestrator and clears the entity's persisted state. export_client.delete_job(desc.job_id) - run_state = dt_client.wait_for_orchestration_completion( - desc.orchestrator_instance_id, timeout=10, fetch_payloads=True, + # Entity state should be gone. + assert export_client.get_job(desc.job_id) is None + + # The orchestration should no longer be running (purged or in a + # terminal state). We poll briefly since ``delete_job`` is + # synchronous on termination but the in-memory backend can take a + # moment to settle the post-purge state. 
+ def _orchestration_is_done() -> bool: + state = dt_client.get_orchestration_state( + desc.orchestrator_instance_id, fetch_payloads=False, + ) + if state is None: + return True # purged + return state.runtime_status in { + client.OrchestrationStatus.TERMINATED, + client.OrchestrationStatus.COMPLETED, + client.OrchestrationStatus.FAILED, + } + wait_until( + _orchestration_is_done, + description="orchestration to be terminated or purged", + timeout=10.0, ) - assert run_state is not None - assert run_state.runtime_status == client.OrchestrationStatus.COMPLETED - output = json.loads(run_state.serialized_output or "null") - assert output["status"] == "Cancelled" def test_orchestrator_records_failure_when_no_context_bound( @@ -229,7 +254,7 @@ def test_orchestrator_records_failure_when_no_context_bound( class _AlwaysFailingWriter: """Writer that raises on every call — used to force batch retries to exhaust.""" - def write(self, *, instance_id, container, blob_name, payload, content_type, content_encoding): + def write(self, *, instance_id, container, blob_name, payload, content_type, content_encoding, metadata=None): raise RuntimeError("simulated permanent write failure") From 853f169e6884a6136504c381f199255b519a1f2c Mon Sep 17 00:00:00 2001 From: Andy Staples Date: Thu, 4 Jun 2026 11:39:08 -0600 Subject: [PATCH 8/9] Hashed blob names --- CHANGELOG.md | 13 ++- .../extensions/history_export/activities.py | 71 +++++++++++++-- .../history_export/test_activities.py | 88 ++++++++++++++++++- 3 files changed, 160 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0ee56549..c3fef0bf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,10 +20,15 @@ ADDED (tail terminal instances indefinitely until stopped via `delete_job`). Exported blobs are self-describing: each blob carries an explicit `schema_version`, the orchestration's `OrchestrationState` metadata, and - the full ordered event list. 
Each exported blob also carries - `{"instance_id": }` as destination-side metadata (the Azure writer - persists this as Azure Blob metadata) so consumers can scan a container - without parsing each blob body. The export workflow retries each instance up + the full ordered event list. Blob names are a lowercase-hex SHA-256 of + ``{last_updated_at}|{instance_id}`` with the format extension appended + (matches the .NET `ExportInstanceHistoryActivity` naming scheme), so + re-exporting an instance after a later terminal update lands at a new + blob path rather than overwriting the previous one, and instance IDs + that differ only by `/` no longer collide. Each exported blob also + carries `{"instance_id": <instance_id>}` as destination-side metadata (the Azure + writer persists this as Azure Blob metadata) so consumers can scan a + container without parsing each blob body. The export workflow retries each instance up to 3 times with exponential backoff (15s/30s/60s), retries failed batches up to 3 times, caps in-flight exports via `max_parallel_exports` (default 32), continues-as-new every 5 page cycles to bound orchestrator diff --git a/durabletask/extensions/history_export/activities.py b/durabletask/extensions/history_export/activities.py index 3605ffa6..5a08c334 100644 --- a/durabletask/extensions/history_export/activities.py +++ b/durabletask/extensions/history_export/activities.py @@ -24,9 +24,10 @@ from __future__ import annotations +import hashlib from collections.abc import Mapping from dataclasses import dataclass -from datetime import datetime +from datetime import datetime, timezone from typing import Any, cast from durabletask import client as client_module @@ -222,7 +223,21 @@ def export_instance_history( fmt=fmt, metadata=metadata, ) - blob_name = _blob_name_for(instance_id=instance_id, prefix=prefix, fmt=fmt) + # Blob name is a SHA-256 hash of the instance's terminal + # timestamp + instance ID (matches the .NET + # ``ExportInstanceHistoryActivity`` scheme).
This means: + # • Two exports of the *same* completion produce the same + # blob name (idempotent under retry when ``overwrite=True``). + # • An instance re-exported after a later completion lands + # at a new path rather than overwriting the previous one. + # • Instance IDs that differ only by ``/`` no longer collide + # under the old ``.replace("/", "_")`` transform. + blob_name = _blob_name_for( + instance_id=instance_id, + last_updated_at=state.last_updated_at, + prefix=prefix, + fmt=fmt, + ) ctx.writer.write( instance_id=instance_id, container=container, @@ -249,12 +264,56 @@ def export_instance_history( # Helpers # ---------------------------------------------------------------------- -def _blob_name_for(*, instance_id: str, prefix: str | None, fmt: ExportFormat) -> str: +def _blob_name_for( + *, + instance_id: str, + last_updated_at: datetime, + prefix: str | None, + fmt: ExportFormat, +) -> str: + """Return the destination blob name for one exported instance. + + Matches the .NET ``ExportInstanceHistoryActivity.GenerateBlobFileName`` + scheme: lowercase-hex SHA-256 of + ``f"{last_updated_at:O}|{instance_id}"`` with the format-appropriate + extension appended, optionally namespaced under the configured + destination prefix. Hash byte-equivalence with .NET output + requires matching the .NET ``DateTimeOffset.ToString("O")`` format + exactly (see :func:`_dotnet_o_format`). + """ + timestamp_str = _dotnet_o_format(last_updated_at) + hash_input = f"{timestamp_str}|{instance_id}" + digest = hashlib.sha256(hash_input.encode("utf-8")).hexdigest() ext = file_extension_for(fmt) - safe_id = instance_id.replace("/", "_") + blob_name = f"{digest}{ext}" if prefix: - return f"{prefix.rstrip('/')}/{safe_id}{ext}" - return f"{safe_id}{ext}" + return f"{prefix.rstrip('/')}/{blob_name}" + return blob_name + + +def _dotnet_o_format(dt: datetime) -> str: + """Format *dt* to match .NET ``DateTimeOffset.ToString("O")``. 
+ + .NET's round-trip format is ``yyyy-MM-ddTHH:mm:ss.fffffffK`` for + ``DateTimeOffset``, where ``K`` resolves to ``+HH:MM`` / ``-HH:MM`` + and fractional seconds always render with seven digits (100-ns + ticks resolution). Python :class:`datetime.datetime` only carries + microsecond precision (six digits), so the seventh digit is always + a trailing zero. Naive datetimes are assumed UTC. + """ + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + base = dt.strftime("%Y-%m-%dT%H:%M:%S") + fractional = f"{dt.microsecond:06d}0" + offset = dt.utcoffset() + if offset is None: + offset_str = "+00:00" + else: + total_minutes = int(offset.total_seconds() // 60) + sign = "+" if total_minutes >= 0 else "-" + total_minutes = abs(total_minutes) + offset_str = f"{sign}{total_minutes // 60:02d}:{total_minutes % 60:02d}" + return f"{base}.{fractional}{offset_str}" def register(worker_instance: worker_module.TaskHubGrpcWorker) -> None: diff --git a/tests/durabletask/extensions/history_export/test_activities.py b/tests/durabletask/extensions/history_export/test_activities.py index e52b997f..f1d196f2 100644 --- a/tests/durabletask/extensions/history_export/test_activities.py +++ b/tests/durabletask/extensions/history_export/test_activities.py @@ -318,5 +318,89 @@ class _State: assert result["success"] is False assert "no longer terminal" in result["error"] assert "RUNNING" in result["error"] - assert writer.calls == [] - assert stub_client.history_calls == 0 + + +# --------------------------------------------------------------------- +# Unit tests for the N-8 blob-naming scheme +# --------------------------------------------------------------------- + + +def test_blob_name_matches_dotnet_hash_scheme(): + """N-8: blob name is lowercase-hex sha256 of '{:O}|{instance_id}'. + + Pins the exact .NET-aligned scheme so any future drift (timestamp + format, hash function, casing) breaks loudly. The expected hash + is computed by hand from the same inputs the activity would use. 
+ """ + import hashlib + + from durabletask.extensions.history_export.activities import ( + _blob_name_for, + _dotnet_o_format, + ) + + last_updated = datetime(2026, 6, 4, 17, 9, 9, 420990, tzinfo=timezone.utc) + instance_id = "inst-1" + fmt = ExportFormat(kind=ExportFormatKind.JSON) + + # The seven-digit fractional-seconds format is what .NET emits for + # the same instant. + assert _dotnet_o_format(last_updated) == "2026-06-04T17:09:09.4209900+00:00" + + expected_hash = hashlib.sha256( + f"{_dotnet_o_format(last_updated)}|{instance_id}".encode("utf-8") + ).hexdigest() + + assert _blob_name_for( + instance_id=instance_id, + last_updated_at=last_updated, + prefix=None, + fmt=fmt, + ) == f"{expected_hash}.json" + + assert _blob_name_for( + instance_id=instance_id, + last_updated_at=last_updated, + prefix="exports/run-1/", + fmt=ExportFormat(kind=ExportFormatKind.JSONL_GZIP), + ) == f"exports/run-1/{expected_hash}.jsonl.gz" + + +def test_blob_name_isolates_instance_ids_that_differ_only_by_slash(): + """N-8: instance IDs containing '/' no longer collide. + + The old scheme used ``instance_id.replace(\"/\", \"_\")`` which + collapsed ``v1/x`` and ``v1_x`` to the same blob name. Hashing + isolates them. 
+ """ + from durabletask.extensions.history_export.activities import _blob_name_for + + last_updated = datetime(2026, 6, 4, 17, 9, 9, 420990, tzinfo=timezone.utc) + fmt = ExportFormat(kind=ExportFormatKind.JSON) + + name_a = _blob_name_for( + instance_id="v1/x", last_updated_at=last_updated, prefix=None, fmt=fmt, + ) + name_b = _blob_name_for( + instance_id="v1_x", last_updated_at=last_updated, prefix=None, fmt=fmt, + ) + assert name_a != name_b + + +def test_blob_name_changes_when_instance_terminal_timestamp_changes(): + """N-8: re-export at a different terminal time lands at a new blob.""" + from durabletask.extensions.history_export.activities import _blob_name_for + + fmt = ExportFormat(kind=ExportFormatKind.JSON) + instance_id = "inst-x" + + earlier = datetime(2026, 6, 4, 17, 0, 0, 0, tzinfo=timezone.utc) + later = datetime(2026, 6, 4, 18, 0, 0, 0, tzinfo=timezone.utc) + + name_earlier = _blob_name_for( + instance_id=instance_id, last_updated_at=earlier, prefix=None, fmt=fmt, + ) + name_later = _blob_name_for( + instance_id=instance_id, last_updated_at=later, prefix=None, fmt=fmt, + ) + assert name_earlier != name_later From 535e4584ac5c9df79e4a12ffbd8e714f17c4720a Mon Sep 17 00:00:00 2001 From: Andy Staples Date: Tue, 16 Jun 2026 11:07:52 -0600 Subject: [PATCH 9/9] Address latest PR review comments from berndverst --- CHANGELOG.md | 6 +++++- durabletask/extensions/history_export/entity.py | 6 ++++++ .../extensions/history_export/orchestrator.py | 14 ++++++++++++-- .../extensions/history_export/transitions.py | 1 + .../test_transitions_and_exceptions.py | 5 ++++- 5 files changed, 28 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c3fef0bf..5715efe3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,8 +32,12 @@ ADDED to 3 times with exponential backoff (15s/30s/60s), retries failed batches up to 3 times, caps in-flight exports via `max_parallel_exports` (default 32), continues-as-new every 5 page cycles to bound orchestrator - 
history, and re-fetches entity state at the top of every page loop so + history while preserving cumulative totals across continue-as-new segments, + and re-fetches entity state at the top of every page loop so external delete or mark-failed signals stop the orchestrator cleanly. + Empty-page BATCH checkpoints no longer reset the persisted resume cursor, + and duplicate `mark_failed` signals are now idempotent no-ops when a job + is already failed, which reduces transition-noise logs. `delete_job` actively tears the job down: it clears the entity state, terminates the driving orchestrator, waits briefly for it to settle, and purges its orchestration history so a re-created job with the same ID diff --git a/durabletask/extensions/history_export/entity.py b/durabletask/extensions/history_export/entity.py index 91719662..6afab6cd 100644 --- a/durabletask/extensions/history_export/entity.py +++ b/durabletask/extensions/history_export/entity.py @@ -305,6 +305,12 @@ def mark_failed( assert_valid_transition( self.OP_MARK_FAILED, state.status, ExportJobStatus.FAILED, job_id=job_id, ) + if state.status is ExportJobStatus.FAILED: + # Idempotent no-op: if the job is already FAILED, do not + # mutate persisted state (especially ``last_error``) and + # avoid noisy invalid-transition logs from duplicate + # best-effort mark_failed signals.
+ return state.to_dict() reason = "" if payload is not None: reason = str(payload.get("reason", "")) diff --git a/durabletask/extensions/history_export/orchestrator.py b/durabletask/extensions/history_export/orchestrator.py index 0511c376..ea8c6ca1 100644 --- a/durabletask/extensions/history_export/orchestrator.py +++ b/durabletask/extensions/history_export/orchestrator.py @@ -145,12 +145,22 @@ def export_job_orchestrator( config = ExportJobConfiguration.from_dict(config_mapping) initial_checkpoint = input.get("checkpoint") or {"last_instance_key": None} processed_cycles = int(input.get("processed_cycles", 0)) + input_totals_raw = input.get("totals") + input_totals: Mapping[str, Any] = ( + cast("Mapping[str, Any]", input_totals_raw) + if isinstance(input_totals_raw, Mapping) + else {} + ) entity_id = task.EntityInstanceId(ENTITY_NAME, job_id) runtime_status_names = [s.name for s in config.filter.effective_runtime_status()] continuation_token: str | None = initial_checkpoint.get("last_instance_key") - totals: dict[str, int] = {"scanned": 0, "exported": 0, "failed": 0} + totals: dict[str, int] = { + "scanned": int(input_totals.get("scanned", 0)), + "exported": int(input_totals.get("exported", 0)), + "failed": int(input_totals.get("failed", 0)), + } try: while True: @@ -160,6 +170,7 @@ def export_job_orchestrator( "job_id": job_id, "config": dict(config_mapping), "checkpoint": {"last_instance_key": continuation_token}, + "totals": dict(totals), "processed_cycles": 0, }) return None @@ -227,7 +238,6 @@ def export_job_orchestrator( "scanned_delta": 0, "exported_delta": 0, "failed_delta": 0, - "last_instance_key": None, }, ) break diff --git a/durabletask/extensions/history_export/transitions.py b/durabletask/extensions/history_export/transitions.py index 480d5f59..cd2e033a 100644 --- a/durabletask/extensions/history_export/transitions.py +++ b/durabletask/extensions/history_export/transitions.py @@ -46,6 +46,7 @@ ("mark_completed", ExportJobStatus.ACTIVE): 
frozenset({ExportJobStatus.COMPLETED}), ("mark_failed", ExportJobStatus.ACTIVE): frozenset({ExportJobStatus.FAILED}), + ("mark_failed", ExportJobStatus.FAILED): frozenset({ExportJobStatus.FAILED}), } diff --git a/tests/durabletask/extensions/history_export/test_transitions_and_exceptions.py b/tests/durabletask/extensions/history_export/test_transitions_and_exceptions.py index f2af23ea..8cd36978 100644 --- a/tests/durabletask/extensions/history_export/test_transitions_and_exceptions.py +++ b/tests/durabletask/extensions/history_export/test_transitions_and_exceptions.py @@ -57,10 +57,13 @@ def test_mark_completed_requires_active(self) -> None: "mark_completed", ExportJobStatus.FAILED, ExportJobStatus.COMPLETED, ) - def test_mark_failed_allowed_from_active(self) -> None: + def test_mark_failed_allowed_from_active_or_failed(self) -> None: assert is_valid_transition( "mark_failed", ExportJobStatus.ACTIVE, ExportJobStatus.FAILED, ) + assert is_valid_transition( + "mark_failed", ExportJobStatus.FAILED, ExportJobStatus.FAILED, + ) def test_unknown_operation_rejected(self) -> None: assert not is_valid_transition(