From a8540ef9c7ae48e2462655edf2bf116fa3682a39 Mon Sep 17 00:00:00 2001
From: George-AlexandruvBadicu
Date: Fri, 12 Jun 2026 11:49:52 +0300
Subject: [PATCH] fix: reconnect tail-log stream after EOF or transient error (CMGR-76557)

Adds a resilient while-loop in TailLog.tailLog() that re-invokes
sdk.tailLog() after normal stream end (midnight UTC log rotation) or
swallowable transient errors (404, network drops, Azure SAS expiry).
Fatal CLI-coded errors (MAX_RETRY_REACHED, NO_IMS_CONTEXT) are re-thrown
immediately. Node system errors (ECONNRESET, ETIMEDOUT, ...) also carry a
`code`, so they are recognised by their `errno`/`syscall` fields and
treated as transient rather than fatal. Reconnect delay is 2 seconds.

Fixes: https://jira.corp.adobe.com/browse/CMGR-76557

Co-Authored-By: Claude Sonnet 4.6
---
 .../cloudmanager/environment/tail-log.js   | 21 ++++--
 test/commands/environment/tail-log.test.js | 68 ++++++++++++++++++-
 2 files changed, 84 insertions(+), 5 deletions(-)

diff --git a/src/commands/cloudmanager/environment/tail-log.js b/src/commands/cloudmanager/environment/tail-log.js
index 654b8ffc..74c07c84 100644
--- a/src/commands/cloudmanager/environment/tail-log.js
+++ b/src/commands/cloudmanager/environment/tail-log.js
@@ -31,10 +31,23 @@ class TailLog extends BaseCommand {
   }
 
   async tailLog (programId, environmentId, service, logName, imsContextName = null) {
-    return executeWithRetries(async () => {
-      const sdk = await initSdk(imsContextName)
-      return sdk.tailLog(programId, environmentId, service, logName, process.stdout)
-    })
+    // sdk.tailLog() settles when the stream ends (e.g. midnight UTC log rotation), so keep
+    // re-invoking it; the loop only exits by rethrowing a fatal error.
+    while (true) {
+      try {
+        await executeWithRetries(async () => {
+          const sdk = await initSdk(imsContextName)
+          return sdk.tailLog(programId, environmentId, service, logName, process.stdout)
+        })
+      } catch (error) {
+        // Node system errors (ECONNRESET, ETIMEDOUT, ...) also carry a `code`, but they are the
+        // network drops this loop must ride out, so only non-system coded errors are fatal.
+        const isSystemError = error.errno !== undefined || error.syscall !== undefined
+        if (error.code && !isSystemError) throw error
+      }
+      // Fixed pause before reconnecting so a persistent failure cannot become a hot loop.
+      await new Promise(resolve => setTimeout(resolve, 2000))
+    }
   }
 }
 
diff --git a/test/commands/environment/tail-log.test.js b/test/commands/environment/tail-log.test.js
index d0efef2b..653516eb 100644
--- a/test/commands/environment/tail-log.test.js
+++ b/test/commands/environment/tail-log.test.js
@@ -37,11 +37,22 @@ test('tail-log - missing config', async () => {
 test('tail-log - config', async () => {
   setCurrentOrgId('good')
 
+  // Block on the second tailLog call so the reconnect loop doesn't run indefinitely.
+  // The 2000ms reconnect delay means the second call won't happen within our 50ms window.
+  let callCount = 0
+  mockSdk.tailLog.mockImplementation(() => {
+    callCount++
+    return callCount === 1 ? Promise.resolve() : new Promise(() => {})
+  })
+
   expect.assertions(5)
 
   const runResult = TailLog.run(['17', 'author', 'aemerror', '--programId', '5'])
   await expect(runResult instanceof Promise).toBeTruthy()
-  await runResult
+  // Don't await runResult — the reconnect loop never terminates.
+  // Wait 50ms for the first sdk.tailLog() call to complete (the reconnect delay is 2000ms,
+  // so the second call hasn't been made yet).
+  await new Promise(resolve => setTimeout(resolve, 50))
   await expect(init.mock.calls.length).toEqual(1)
   await expect(init).toHaveBeenCalledWith('good', 'test-client-id', 'fake-token', 'https://cloudmanager.adobe.io')
   await expect(mockSdk.tailLog.mock.calls.length).toEqual(1)
@@ -58,3 +69,58 @@ test('tail-log - should retry 5 times and throw error', async () => {
   await expect(runResult).rejects.toThrow('[CloudManagerCLI:MAX_RETRY_REACHED] Max retries reached')
   await expect(mockSdk.tailLog.mock.calls.length).toEqual(5)
 })
+
+test('tail-log - reconnects after normal stream end', async () => {
+  setCurrentOrgId('good')
+  mockSdk.tailLog.mockClear()
+
+  let callCount = 0
+  mockSdk.tailLog.mockImplementation(() => {
+    callCount++
+    // First two calls resolve immediately (simulating EOF / end of stream)
+    // Third call blocks forever (simulating an active stream)
+    return callCount < 3 ? Promise.resolve() : new Promise(() => {})
+  })
+
+  TailLog.run(['17', 'author', 'aemerror', '--programId', '5'])
+
+  // Wait long enough for the 2000ms reconnect delay to fire at least once
+  await new Promise(resolve => setTimeout(resolve, 2500))
+
+  expect(mockSdk.tailLog.mock.calls.length).toBeGreaterThanOrEqual(2)
+}, 10000)
+
+test('tail-log - retries silently on transient non-auth error', async () => {
+  setCurrentOrgId('good')
+  mockSdk.tailLog.mockClear()
+
+  let callCount = 0
+  mockSdk.tailLog.mockImplementation(() => {
+    callCount++
+    if (callCount === 1) {
+      // Simulate a 404 (log stream not ready yet — as seen during the midnight rotation window)
+      return Promise.reject(Object.assign(new Error('Not Found'), {
+        sdkDetails: { response: { status: 404 } },
+      }))
+    }
+    // Second call blocks (active stream established)
+    return new Promise(() => {})
+  })
+
+  const runResult = TailLog.run(['17', 'author', 'aemerror', '--programId', '5'])
+
+  // Wait for the 2000ms reconnect delay to fire after the first rejected call
+  await new Promise(resolve => setTimeout(resolve, 2500))
+
+  // The CLI must have retried (2nd call made)
+  expect(mockSdk.tailLog.mock.calls.length).toBeGreaterThanOrEqual(2)
+
+  // The CLI must NOT have settled (no crash on transient error). The sentinel is scheduled on
+  // a timer so that an already-settled runResult wins the race; an already-resolved sentinel
+  // would always win and make this assertion vacuous.
+  const outcome = await Promise.race([
+    runResult.then(() => 'resolved', () => 'rejected'),
+    new Promise(resolve => setTimeout(resolve, 0, 'still-running')),
+  ])
+  expect(outcome).toBe('still-running')
+}, 10000)