From 97b5e25c2eb7e4d994e98adc0007bf159be878ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Wed, 24 Jun 2026 21:17:15 +0300 Subject: [PATCH 01/11] Harden SQLite for single-node production workloads. Enable WAL, busy_timeout, and NORMAL synchronous pragmas on connect. Add withDbBusyRetry for task claims and improve SQLITE_BUSY detection in the transaction decorator and NATS reconcile claim path. --- .../managers/nats-reconcile-task-manager.js | 5 + src/data/providers/sqlite.js | 5 + src/decorators/transaction-decorator.js | 5 +- src/helpers/db-busy-retry.js | 39 ++++++ src/helpers/sqlite-pragmas.js | 58 +++++++++ test/src/helpers/db-busy-retry.test.js | 117 ++++++++++++++++++ test/src/helpers/sqlite-pragmas.test.js | 53 ++++++++ 7 files changed, 280 insertions(+), 2 deletions(-) create mode 100644 src/helpers/db-busy-retry.js create mode 100644 src/helpers/sqlite-pragmas.js create mode 100644 test/src/helpers/db-busy-retry.test.js create mode 100644 test/src/helpers/sqlite-pragmas.test.js diff --git a/src/data/managers/nats-reconcile-task-manager.js b/src/data/managers/nats-reconcile-task-manager.js index 5e23d2aa..89569e1b 100644 --- a/src/data/managers/nats-reconcile-task-manager.js +++ b/src/data/managers/nats-reconcile-task-manager.js @@ -3,6 +3,7 @@ const models = require('../models') const config = require('../../config') const databaseProvider = require('../providers/database-factory') const { Op } = require('sequelize') +const { withDbBusyRetry } = require('../../helpers/db-busy-retry') class NatsReconcileTaskManager extends BaseManager { getEntity () { @@ -10,6 +11,10 @@ class NatsReconcileTaskManager extends BaseManager { } async claimNext (controllerUuid, stalenessSeconds) { + return withDbBusyRetry(() => this._claimNextInternal(controllerUuid, stalenessSeconds)) + } + + async _claimNextInternal (controllerUuid, stalenessSeconds) { const sequelize = databaseProvider.sequelize const T = stalenessSeconds != null ? 
stalenessSeconds : config.get('settings.natsReconcileTaskStalenessSeconds', 900) const staleThreshold = new Date(Date.now() - T * 1000) diff --git a/src/data/providers/sqlite.js b/src/data/providers/sqlite.js index 10a21d62..53693e1c 100644 --- a/src/data/providers/sqlite.js +++ b/src/data/providers/sqlite.js @@ -4,6 +4,7 @@ const Sequelize = require('sequelize') const config = require('../../config') const DatabaseProvider = require('./database-provider') +const { registerSqlitePragmas, applySqlitePragmas } = require('../../helpers/sqlite-pragmas') class SqliteDatabaseProvider extends DatabaseProvider { constructor () { @@ -26,9 +27,13 @@ class SqliteDatabaseProvider extends DatabaseProvider { } else { this.sequelize = new Sequelize(sqliteConfig) } + + registerSqlitePragmas(this.sequelize, sqliteConfig.pragmas || {}) } async initDB () { + const pragmaConfig = config.get('database.sqlite.pragmas', {}) + await applySqlitePragmas(this.sequelize, pragmaConfig) } } diff --git a/src/decorators/transaction-decorator.js b/src/decorators/transaction-decorator.js index c49b8dc2..3c96a2f6 100644 --- a/src/decorators/transaction-decorator.js +++ b/src/decorators/transaction-decorator.js @@ -2,6 +2,7 @@ const cq = require('concurrent-queue') const Transaction = require('sequelize/lib/transaction') const { isTest } = require('../helpers/app-helper') +const { isSqliteBusyError } = require('../helpers/db-busy-retry') const transactionsQueue = cq() .limit({ concurrency: 1 }) @@ -42,7 +43,7 @@ function queueTransaction (resolve, reject, transaction, that, retries, ...args) return resolve(success) } - if (retries < 1 || (error.message || '').indexOf('SQLITE_BUSY') === -1) { + if (retries < 1 || !isSqliteBusyError(error)) { return reject(error) } @@ -54,7 +55,7 @@ function applyTransaction (resolve, reject, transaction, that, ...args) { transaction.apply(that, args) .then(resolve) .catch((error) => { - if ((error.message || '').indexOf('SQLITE_BUSY') === -1) { + if 
(!isSqliteBusyError(error)) { return reject(error) } diff --git a/src/helpers/db-busy-retry.js b/src/helpers/db-busy-retry.js new file mode 100644 index 00000000..7ce06dd4 --- /dev/null +++ b/src/helpers/db-busy-retry.js @@ -0,0 +1,39 @@ +const DEFAULT_MAX_RETRIES = 5 + +function isSqliteBusyError (error) { + if (!error) { + return false + } + const messages = [ + error.message, + error.parent && error.parent.message, + error.original && error.original.message + ] + return messages.some((message) => message && message.indexOf('SQLITE_BUSY') !== -1) +} + +/** + * Retry an async DB operation when SQLite reports SQLITE_BUSY (same semantics as TransactionDecorator queue retries). + * No-op for non-SQLITE_BUSY errors. Safe on mysql/postgres — busy errors never match. + */ +async function withDbBusyRetry (fn, options = {}) { + const maxRetries = options.maxRetries != null ? options.maxRetries : DEFAULT_MAX_RETRIES + let attempt = 0 + + while (true) { + try { + return await fn() + } catch (error) { + if (attempt >= maxRetries || !isSqliteBusyError(error)) { + throw error + } + attempt++ + } + } +} + +module.exports = { + DEFAULT_MAX_RETRIES, + isSqliteBusyError, + withDbBusyRetry +} diff --git a/src/helpers/sqlite-pragmas.js b/src/helpers/sqlite-pragmas.js new file mode 100644 index 00000000..b185b65f --- /dev/null +++ b/src/helpers/sqlite-pragmas.js @@ -0,0 +1,58 @@ +/** + * Apply production SQLite pragmas (WAL, busy_timeout, synchronous). + * @param {import('sequelize').Sequelize} sequelize + * @param {{ journalMode?: string, busyTimeoutMs?: number, synchronous?: string }} [pragmaConfig] + */ +async function applySqlitePragmas (sequelize, pragmaConfig = {}) { + const journalMode = pragmaConfig.journalMode != null ? pragmaConfig.journalMode : 'WAL' + const busyTimeoutMs = pragmaConfig.busyTimeoutMs != null ? pragmaConfig.busyTimeoutMs : 10000 + const synchronous = pragmaConfig.synchronous != null ? 
pragmaConfig.synchronous : 'NORMAL' + + if (journalMode) { + await sequelize.query(`PRAGMA journal_mode = ${journalMode}`) + } + if (busyTimeoutMs > 0) { + await sequelize.query(`PRAGMA busy_timeout = ${busyTimeoutMs}`) + } + if (synchronous) { + await sequelize.query(`PRAGMA synchronous = ${synchronous}`) + } +} + +function runOnConnection (connection, sql) { + return new Promise((resolve, reject) => { + if (typeof connection.run !== 'function') { + reject(new Error('Unsupported SQLite connection for pragma setup')) + return + } + connection.run(sql, (err) => (err ? reject(err) : resolve())) + }) +} + +/** + * Register afterConnect hook so pragmas apply when the pool opens a new connection. + * @param {import('sequelize').Sequelize} sequelize + * @param {{ journalMode?: string, busyTimeoutMs?: number, synchronous?: string }} [pragmaConfig] + */ +function registerSqlitePragmas (sequelize, pragmaConfig = {}) { + const journalMode = pragmaConfig.journalMode != null ? pragmaConfig.journalMode : 'WAL' + const busyTimeoutMs = pragmaConfig.busyTimeoutMs != null ? pragmaConfig.busyTimeoutMs : 10000 + const synchronous = pragmaConfig.synchronous != null ? 
pragmaConfig.synchronous : 'NORMAL' + + sequelize.addHook('afterConnect', async (connection) => { + if (journalMode) { + await runOnConnection(connection, `PRAGMA journal_mode = ${journalMode}`) + } + if (busyTimeoutMs > 0) { + await runOnConnection(connection, `PRAGMA busy_timeout = ${busyTimeoutMs}`) + } + if (synchronous) { + await runOnConnection(connection, `PRAGMA synchronous = ${synchronous}`) + } + }) +} + +module.exports = { + applySqlitePragmas, + registerSqlitePragmas +} diff --git a/test/src/helpers/db-busy-retry.test.js b/test/src/helpers/db-busy-retry.test.js new file mode 100644 index 00000000..d4dc16b9 --- /dev/null +++ b/test/src/helpers/db-busy-retry.test.js @@ -0,0 +1,117 @@ +const { expect } = require('chai') +const fs = require('fs') +const os = require('os') +const path = require('path') +const Sequelize = require('sequelize') + +const { isSqliteBusyError, withDbBusyRetry } = require('../../../src/helpers/db-busy-retry') +const { registerSqlitePragmas, applySqlitePragmas } = require('../../../src/helpers/sqlite-pragmas') + +describe('db-busy-retry', () => { + it('detects SQLITE_BUSY on nested Sequelize errors', () => { + const error = { + message: 'SQLITE_BUSY: database is locked', + parent: { message: 'SQLITE_BUSY: database is locked', code: 'SQLITE_BUSY' } + } + expect(isSqliteBusyError(error)).to.equal(true) + }) + + it('does not treat unrelated errors as busy', () => { + expect(isSqliteBusyError(new Error('connection refused'))).to.equal(false) + }) + + it('retries until the operation succeeds', async () => { + let attempts = 0 + const result = await withDbBusyRetry(async () => { + attempts++ + if (attempts < 3) { + throw new Error('SQLITE_BUSY: database is locked') + } + return 'ok' + }) + + expect(result).to.equal('ok') + expect(attempts).to.equal(3) + }) + + it('rethrows non-busy errors immediately', async () => { + let attempts = 0 + try { + await withDbBusyRetry(async () => { + attempts++ + throw new Error('constraint violation') + }) 
+ throw new Error('expected throw') + } catch (error) { + expect(error.message).to.equal('constraint violation') + expect(attempts).to.equal(1) + } + }) + + it('exhausts retries and rethrows the last busy error', async () => { + let attempts = 0 + try { + await withDbBusyRetry(async () => { + attempts++ + throw new Error('SQLITE_BUSY: database is locked') + }, { maxRetries: 2 }) + throw new Error('expected throw') + } catch (error) { + expect(error.message).to.contain('SQLITE_BUSY') + expect(attempts).to.equal(3) + } + }) +}) + +describe('sqlite lock contention regression', () => { + let sequelize + let dbPath + + beforeEach(async () => { + dbPath = path.join(os.tmpdir(), `controller-lock-test-${Date.now()}-${Math.random()}.sqlite`) + sequelize = new Sequelize({ + dialect: 'sqlite', + storage: dbPath, + logging: false, + pool: { max: 1, min: 0, idle: 10000 } + }) + registerSqlitePragmas(sequelize, { + journalMode: 'WAL', + busyTimeoutMs: 10000, + synchronous: 'NORMAL' + }) + await sequelize.authenticate() + await applySqlitePragmas(sequelize, { + journalMode: 'WAL', + busyTimeoutMs: 10000, + synchronous: 'NORMAL' + }) + await sequelize.query('CREATE TABLE lock_test (id INTEGER PRIMARY KEY AUTOINCREMENT, val INTEGER NOT NULL)') + }) + + afterEach(async () => { + if (sequelize) { + await sequelize.close() + } + for (const suffix of ['', '-wal', '-shm']) { + try { + fs.unlinkSync(dbPath + suffix) + } catch (_) { /* ignore missing sidecar files */ } + } + }) + + it('completes concurrent writes with withDbBusyRetry under single-connection SQLite', async function () { + this.timeout(15000) + + const write = () => withDbBusyRetry(() => + sequelize.transaction(async (transaction) => { + await sequelize.query('INSERT INTO lock_test (val) VALUES (1)', { transaction }) + }) + ) + + await Promise.all(Array.from({ length: 12 }, () => write())) + + const [rows] = await sequelize.query('SELECT COUNT(*) AS count FROM lock_test') + expect(Number(rows[0].count)).to.equal(12) + }) 
+}) diff --git a/test/src/helpers/sqlite-pragmas.test.js b/test/src/helpers/sqlite-pragmas.test.js new file mode 100644 index 00000000..cde556ce --- /dev/null +++ b/test/src/helpers/sqlite-pragmas.test.js @@ -0,0 +1,53 @@ +const { expect } = require('chai') +const fs = require('fs') +const os = require('os') +const path = require('path') +const Sequelize = require('sequelize') + +const { registerSqlitePragmas, applySqlitePragmas } = require('../../../src/helpers/sqlite-pragmas') + +describe('sqlite-pragmas', () => { + let sequelize + let dbPath + + afterEach(async () => { + if (sequelize) { + await sequelize.close() + sequelize = null + } + for (const suffix of ['', '-wal', '-shm']) { + try { + fs.unlinkSync(dbPath + suffix) + } catch (_) { /* ignore */ } + } + }) + + it('applies WAL and busy_timeout on connect', async () => { + dbPath = path.join(os.tmpdir(), `controller-pragma-test-${Date.now()}.sqlite`) + sequelize = new Sequelize({ + dialect: 'sqlite', + storage: dbPath, + logging: false + }) + registerSqlitePragmas(sequelize, { + journalMode: 'WAL', + busyTimeoutMs: 7500, + synchronous: 'NORMAL' + }) + + await sequelize.authenticate() + await applySqlitePragmas(sequelize, { + journalMode: 'WAL', + busyTimeoutMs: 7500, + synchronous: 'NORMAL' + }) + + const [journalRows] = await sequelize.query('PRAGMA journal_mode') + const journalMode = Object.values(journalRows[0] || {})[0] + expect(String(journalMode).toLowerCase()).to.equal('wal') + + const [busyRows] = await sequelize.query('PRAGMA busy_timeout') + const busyTimeout = Object.values(busyRows[0] || {})[0] + expect(Number(busyTimeout)).to.equal(7500) + }) +}) From f300f21b9e3bb9190124c8993cca2be6c6662c4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Wed, 24 Jun 2026 21:17:20 +0300 Subject: [PATCH 02/11] Add fog and service platform reconcile persistence layer. 
Introduce FogPlatformSpecs, FogPlatformStatuses, FogPlatformReconcileTasks, ServicePlatformReconcileTasks, and HubRouterConfigLocks with Sequelize models, managers, deduplicated enqueue, and stale task reclaim. --- .../fog-platform-reconcile-task-manager.js | 144 +++++ .../managers/fog-platform-spec-manager.js | 42 ++ .../managers/fog-platform-status-manager.js | 83 +++ .../hub-router-config-lock-manager.js | 93 +++ ...service-platform-reconcile-task-manager.js | 152 +++++ .../mysql/db_migration_mysql_v3.8.0.sql | 68 +++ .../postgres/db_migration_pg_v3.8.0.sql | 68 +++ .../sqlite/db_migration_sqlite_v3.8.0.sql | 68 +++ src/data/models/fog.js | 10 + src/data/models/fogPlatformReconcileTask.js | 73 +++ src/data/models/fogPlatformSpec.js | 36 ++ src/data/models/fogPlatformStatus.js | 52 ++ src/data/models/hubRouterConfigLock.js | 30 + .../models/servicePlatformReconcileTask.js | 66 +++ src/schemas/fog-platform-spec.js | 237 ++++++++ test/src/data/fog-platform-managers.test.js | 539 ++++++++++++++++++ 16 files changed, 1761 insertions(+) create mode 100644 src/data/managers/fog-platform-reconcile-task-manager.js create mode 100644 src/data/managers/fog-platform-spec-manager.js create mode 100644 src/data/managers/fog-platform-status-manager.js create mode 100644 src/data/managers/hub-router-config-lock-manager.js create mode 100644 src/data/managers/service-platform-reconcile-task-manager.js create mode 100644 src/data/models/fogPlatformReconcileTask.js create mode 100644 src/data/models/fogPlatformSpec.js create mode 100644 src/data/models/fogPlatformStatus.js create mode 100644 src/data/models/hubRouterConfigLock.js create mode 100644 src/data/models/servicePlatformReconcileTask.js create mode 100644 src/schemas/fog-platform-spec.js create mode 100644 test/src/data/fog-platform-managers.test.js diff --git a/src/data/managers/fog-platform-reconcile-task-manager.js b/src/data/managers/fog-platform-reconcile-task-manager.js new file mode 100644 index 00000000..820b655c 
--- /dev/null +++ b/src/data/managers/fog-platform-reconcile-task-manager.js @@ -0,0 +1,144 @@ +const BaseManager = require('./base-manager') +const models = require('../models') +const config = require('../../config') +const databaseProvider = require('../providers/database-factory') +const { Op } = require('sequelize') +const { FOG_PLATFORM_REASONS } = require('../../schemas/fog-platform-spec') +const { withDbBusyRetry } = require('../../helpers/db-busy-retry') + +const ACTIVE_STATUSES = ['pending', 'in_progress'] + +class FogPlatformReconcileTaskManager extends BaseManager { + getEntity () { + return models.FogPlatformReconcileTask + } + + _normalizeReason (reason) { + return FOG_PLATFORM_REASONS.includes(reason) ? reason : 'spec-changed' + } + + async enqueueFogPlatformReconcileTask (options = {}, transaction) { + if (transaction.fakeTransaction) { + return databaseProvider.sequelize.transaction((t) => + this.enqueueFogPlatformReconcileTask(options, t) + ) + } + + const fogUuid = options.fogUuid + if (!fogUuid) { + throw new Error('fogUuid is required to enqueue fog platform reconcile task') + } + + const reason = this._normalizeReason(options.reason) + const specGeneration = options.specGeneration != null ? 
options.specGeneration : null + const Entity = this.getEntity() + + const existing = await Entity.findOne({ + where: { + fogUuid, + status: { [Op.in]: ACTIVE_STATUSES } + }, + transaction + }) + + if (existing) { + const update = { specGeneration } + if (reason === 'delete' || existing.reason !== 'delete') { + update.reason = reason + } + if (reason === 'manual-retry') { + update.nextAttemptAt = null + update.attempts = 0 + update.lastError = null + } + await Entity.update(update, { where: { id: existing.id }, transaction }) + return this.findOne({ id: existing.id }, transaction) + } + + return this.create({ + fogUuid, + reason, + specGeneration, + status: 'pending' + }, transaction) + } + + async claimNextFogTask (controllerUuid, stalenessSeconds) { + return withDbBusyRetry(() => this._claimNextFogTaskInternal(controllerUuid, stalenessSeconds)) + } + + async _claimNextFogTaskInternal (controllerUuid, stalenessSeconds) { + const sequelize = databaseProvider.sequelize + const T = stalenessSeconds != null + ? 
stalenessSeconds + : config.get('settings.fogPlatformReconcileTaskStalenessSeconds', 300) + const staleThreshold = new Date(Date.now() - T * 1000) + const Entity = this.getEntity() + const now = new Date() + + return sequelize.transaction(async (transaction) => { + const task = await Entity.findOne({ + where: { + status: { [Op.in]: ACTIVE_STATUSES }, + [Op.or]: [ + { nextAttemptAt: null }, + { nextAttemptAt: { [Op.lte]: now } } + ], + [Op.and]: [{ + [Op.or]: [ + { leaderUuid: null }, + { claimedAt: { [Op.lt]: staleThreshold } } + ] + }] + }, + order: [['id', 'ASC']], + limit: 1, + transaction + }) + if (!task) return null + + const [affected] = await Entity.update( + { leaderUuid: controllerUuid, claimedAt: new Date(), status: 'in_progress' }, + { + where: { + id: task.id, + [Op.or]: [ + { leaderUuid: null }, + { claimedAt: { [Op.lt]: staleThreshold } } + ] + }, + transaction + } + ) + if (affected === 0) return null + return this.findOne({ id: task.id }, transaction) + }) + } + + async recordFogTaskFailure (taskId, errorMessage, options = {}, transaction) { + const maxAttempts = config.get('settings.fogPlatformReconcileMaxAttempts', 10) + const backoffBaseSeconds = config.get('settings.fogPlatformReconcileBackoffBaseSeconds', 5) + const attempts = (options.attempts != null ? options.attempts : 0) + 1 + const isPermanent = attempts >= maxAttempts + const nextAttemptAt = isPermanent + ? null + : new Date(Date.now() + backoffBaseSeconds * Math.pow(2, attempts - 1) * 1000) + + const Entity = this.getEntity() + await Entity.update({ + attempts, + lastError: errorMessage, + nextAttemptAt, + status: isPermanent ? 
'failed' : 'pending', + leaderUuid: null, + claimedAt: null + }, { + where: { id: taskId }, + transaction + }) + + return this.findOne({ id: taskId }, transaction) + } +} + +module.exports = new FogPlatformReconcileTaskManager() diff --git a/src/data/managers/fog-platform-spec-manager.js b/src/data/managers/fog-platform-spec-manager.js new file mode 100644 index 00000000..492fe1c4 --- /dev/null +++ b/src/data/managers/fog-platform-spec-manager.js @@ -0,0 +1,42 @@ +const BaseManager = require('./base-manager') +const models = require('../models') +const { + validateFogPlatformSpec, + parseSpecJson, + serializeSpecJson +} = require('../../schemas/fog-platform-spec') + +class FogPlatformSpecManager extends BaseManager { + getEntity () { + return models.FogPlatformSpec + } + + async getParsedSpec (fogUuid, transaction) { + const row = await this.findOne({ fogUuid }, transaction) + if (!row) { + return null + } + return { + fogUuid: row.fogUuid, + generation: row.generation, + spec: parseSpecJson(row.specJson) + } + } + + async upsertSpec (fogUuid, specObject, transaction) { + await validateFogPlatformSpec(specObject) + const specJson = serializeSpecJson(specObject) + + const existing = await this.findOne({ fogUuid }, transaction) + if (existing) { + const generation = existing.generation + 1 + await this.update({ fogUuid }, { specJson, generation }, transaction) + return { fogUuid, generation, specJson } + } + + await this.create({ fogUuid, specJson, generation: 1 }, transaction) + return { fogUuid, generation: 1, specJson } + } +} + +module.exports = new FogPlatformSpecManager() diff --git a/src/data/managers/fog-platform-status-manager.js b/src/data/managers/fog-platform-status-manager.js new file mode 100644 index 00000000..88307163 --- /dev/null +++ b/src/data/managers/fog-platform-status-manager.js @@ -0,0 +1,83 @@ +const BaseManager = require('./base-manager') +const models = require('../models') +const { + FOG_PLATFORM_PHASES, + parseConditionsJson, + 
serializeConditionsJson +} = require('../../schemas/fog-platform-spec') + +class FogPlatformStatusManager extends BaseManager { + getEntity () { + return models.FogPlatformStatus + } + + async getParsedStatus (fogUuid, transaction) { + const row = await this.findOne({ fogUuid }, transaction) + if (!row) { + return null + } + return { + fogUuid: row.fogUuid, + observedGeneration: row.observedGeneration, + phase: row.phase, + lastError: row.lastError, + lastTransitionAt: row.lastTransitionAt, + conditions: row.conditionsJson ? parseConditionsJson(row.conditionsJson) : [] + } + } + + async ensurePending (fogUuid, transaction) { + const existing = await this.findOne({ fogUuid }, transaction) + const now = new Date() + if (existing) { + await this.update({ + fogUuid + }, { + phase: existing.phase === 'Deleting' ? 'Deleting' : 'Pending', + lastTransitionAt: now + }, transaction) + return this.findOne({ fogUuid }, transaction) + } + + return this.create({ + fogUuid, + observedGeneration: 0, + phase: 'Pending', + lastTransitionAt: now + }, transaction) + } + + async setPhase (fogUuid, phase, options = {}, transaction) { + if (!FOG_PLATFORM_PHASES.includes(phase)) { + throw new Error(`Invalid fog platform phase '${phase}'`) + } + + const update = { + phase, + lastTransitionAt: new Date() + } + if (Object.hasOwn(options, 'lastError')) { + update.lastError = options.lastError + } + if (Object.hasOwn(options, 'observedGeneration')) { + update.observedGeneration = options.observedGeneration + } + if (Object.hasOwn(options, 'conditions')) { + update.conditionsJson = serializeConditionsJson(options.conditions) + } + + const existing = await this.findOne({ fogUuid }, transaction) + if (existing) { + await this.update({ fogUuid }, update, transaction) + return this.findOne({ fogUuid }, transaction) + } + + return this.create({ + fogUuid, + observedGeneration: update.observedGeneration || 0, + ...update + }, transaction) + } +} + +module.exports = new FogPlatformStatusManager() diff 
--git a/src/data/managers/hub-router-config-lock-manager.js b/src/data/managers/hub-router-config-lock-manager.js new file mode 100644 index 00000000..6b7d64ea --- /dev/null +++ b/src/data/managers/hub-router-config-lock-manager.js @@ -0,0 +1,93 @@ +const BaseManager = require('./base-manager') +const models = require('../models') +const databaseProvider = require('../providers/database-factory') +const config = require('../../config') + +const LOCK_ROW_ID = 1 + +class HubRouterConfigLockManager extends BaseManager { + getEntity () { + return models.HubRouterConfigLock + } + + _getModelOptions (transaction) { + return transaction && transaction.fakeTransaction + ? {} + : { transaction } + } + + _isUniqueConstraintError (error) { + return error && error.name === 'SequelizeUniqueConstraintError' + } + + async initializeLock (transaction) { + const lock = await this.findOne({ id: LOCK_ROW_ID }, transaction) + if (!lock) { + try { + await this.create({ id: LOCK_ROW_ID, leaderUuid: null, claimedAt: null }, transaction) + } catch (error) { + if (!this._isUniqueConstraintError(error)) { + throw error + } + } + } + } + + _getStalenessSeconds (timeoutSeconds) { + if (timeoutSeconds != null) { + return timeoutSeconds + } + return config.get('settings.hubRouterConfigLockTimeoutSeconds', 120) + } + + async tryAcquire (controllerUuid, timeoutSeconds, transaction) { + if (transaction.fakeTransaction) { + return databaseProvider.sequelize.transaction((t) => + this.tryAcquire(controllerUuid, timeoutSeconds, t) + ) + } + + await this.initializeLock(transaction) + + const stalenessSeconds = this._getStalenessSeconds(timeoutSeconds) + const staleThreshold = new Date(Date.now() - stalenessSeconds * 1000) + const lock = await this.findOne({ id: LOCK_ROW_ID }, transaction) + + if (!lock.leaderUuid || !lock.claimedAt) { + await this.update({ id: LOCK_ROW_ID }, { + leaderUuid: controllerUuid, + claimedAt: new Date() + }, transaction) + return true + } + + if (lock.leaderUuid === 
controllerUuid) { + return true + } + + if (lock.claimedAt < staleThreshold) { + await this.update({ id: LOCK_ROW_ID }, { + leaderUuid: controllerUuid, + claimedAt: new Date() + }, transaction) + return true + } + + return false + } + + async release (controllerUuid, transaction) { + const lock = await this.findOne({ id: LOCK_ROW_ID }, transaction) + if (!lock || lock.leaderUuid !== controllerUuid) { + return false + } + + await this.update({ id: LOCK_ROW_ID }, { + leaderUuid: null, + claimedAt: null + }, transaction) + return true + } +} + +module.exports = new HubRouterConfigLockManager() diff --git a/src/data/managers/service-platform-reconcile-task-manager.js b/src/data/managers/service-platform-reconcile-task-manager.js new file mode 100644 index 00000000..7bd91105 --- /dev/null +++ b/src/data/managers/service-platform-reconcile-task-manager.js @@ -0,0 +1,152 @@ +const BaseManager = require('./base-manager') +const models = require('../models') +const config = require('../../config') +const databaseProvider = require('../providers/database-factory') +const { Op } = require('sequelize') +const { + SERVICE_PLATFORM_REASONS, + serializeSpecSnapshot, + parseSpecSnapshot +} = require('../../schemas/fog-platform-spec') +const { withDbBusyRetry } = require('../../helpers/db-busy-retry') + +const ACTIVE_STATUSES = ['pending', 'in_progress'] + +class ServicePlatformReconcileTaskManager extends BaseManager { + getEntity () { + return models.ServicePlatformReconcileTask + } + + _normalizeReason (reason) { + return SERVICE_PLATFORM_REASONS.includes(reason) ? 
reason : 'spec-changed' + } + + getParsedSpecSnapshot (task) { + if (!task || task.specSnapshot == null) { + return null + } + return parseSpecSnapshot(task.specSnapshot) + } + + async enqueueServicePlatformReconcileTask (options = {}, transaction) { + if (transaction.fakeTransaction) { + return databaseProvider.sequelize.transaction((t) => + this.enqueueServicePlatformReconcileTask(options, t) + ) + } + + const serviceName = options.serviceName + if (!serviceName) { + throw new Error('serviceName is required to enqueue service platform reconcile task') + } + + const reason = this._normalizeReason(options.reason) + const specSnapshot = serializeSpecSnapshot(options.specSnapshot) + const Entity = this.getEntity() + + const existing = await Entity.findOne({ + where: { + serviceName, + status: { [Op.in]: ACTIVE_STATUSES } + }, + transaction + }) + + if (existing) { + const update = { reason, specSnapshot } + if (reason === 'manual-retry') { + update.nextAttemptAt = null + update.attempts = 0 + update.lastError = null + } + await Entity.update(update, { where: { id: existing.id }, transaction }) + return this.findOne({ id: existing.id }, transaction) + } + + return this.create({ + serviceName, + reason, + specSnapshot, + status: 'pending' + }, transaction) + } + + async claimNextServiceTask (controllerUuid, stalenessSeconds) { + return withDbBusyRetry(() => this._claimNextServiceTaskInternal(controllerUuid, stalenessSeconds)) + } + + async _claimNextServiceTaskInternal (controllerUuid, stalenessSeconds) { + const sequelize = databaseProvider.sequelize + const T = stalenessSeconds != null + ? 
stalenessSeconds + : config.get('settings.fogPlatformReconcileTaskStalenessSeconds', 300) + const staleThreshold = new Date(Date.now() - T * 1000) + const Entity = this.getEntity() + const now = new Date() + + return sequelize.transaction(async (transaction) => { + const task = await Entity.findOne({ + where: { + status: { [Op.in]: ACTIVE_STATUSES }, + [Op.or]: [ + { nextAttemptAt: null }, + { nextAttemptAt: { [Op.lte]: now } } + ], + [Op.and]: [{ + [Op.or]: [ + { leaderUuid: null }, + { claimedAt: { [Op.lt]: staleThreshold } } + ] + }] + }, + order: [['id', 'ASC']], + limit: 1, + transaction + }) + if (!task) return null + + const [affected] = await Entity.update( + { leaderUuid: controllerUuid, claimedAt: new Date(), status: 'in_progress' }, + { + where: { + id: task.id, + [Op.or]: [ + { leaderUuid: null }, + { claimedAt: { [Op.lt]: staleThreshold } } + ] + }, + transaction + } + ) + if (affected === 0) return null + return this.findOne({ id: task.id }, transaction) + }) + } + + async recordServiceTaskFailure (taskId, errorMessage, options = {}, transaction) { + const maxAttempts = config.get('settings.servicePlatformReconcileMaxAttempts', 10) + const backoffBaseSeconds = config.get('settings.fogPlatformReconcileBackoffBaseSeconds', 5) + const attempts = (options.attempts != null ? options.attempts : 0) + 1 + const isPermanent = attempts >= maxAttempts + const nextAttemptAt = isPermanent + ? null + : new Date(Date.now() + backoffBaseSeconds * Math.pow(2, attempts - 1) * 1000) + + const Entity = this.getEntity() + await Entity.update({ + attempts, + lastError: errorMessage, + nextAttemptAt, + status: isPermanent ? 
'failed' : 'pending', + leaderUuid: null, + claimedAt: null + }, { + where: { id: taskId }, + transaction + }) + + return this.findOne({ id: taskId }, transaction) + } +} + +module.exports = new ServicePlatformReconcileTaskManager() diff --git a/src/data/migrations/mysql/db_migration_mysql_v3.8.0.sql b/src/data/migrations/mysql/db_migration_mysql_v3.8.0.sql index 9eea616f..5884bdbf 100644 --- a/src/data/migrations/mysql/db_migration_mysql_v3.8.0.sql +++ b/src/data/migrations/mysql/db_migration_mysql_v3.8.0.sql @@ -1140,4 +1140,72 @@ CREATE TABLE IF NOT EXISTS AuthPolicy ( updated_at DATETIME ); +CREATE TABLE IF NOT EXISTS FogPlatformSpecs ( + fog_uuid VARCHAR(36) PRIMARY KEY NOT NULL, + spec_json TEXT NOT NULL, + generation INT NOT NULL DEFAULT 1, + created_at DATETIME, + updated_at DATETIME, + FOREIGN KEY (fog_uuid) REFERENCES Fogs (uuid) ON DELETE CASCADE +); + +CREATE TABLE IF NOT EXISTS FogPlatformStatuses ( + fog_uuid VARCHAR(36) PRIMARY KEY NOT NULL, + observed_generation INT NOT NULL DEFAULT 0, + phase VARCHAR(32) NOT NULL DEFAULT 'Pending', + last_error TEXT, + last_transition_at DATETIME, + conditions_json TEXT, + created_at DATETIME, + updated_at DATETIME, + FOREIGN KEY (fog_uuid) REFERENCES Fogs (uuid) ON DELETE CASCADE +); + +CREATE TABLE IF NOT EXISTS FogPlatformReconcileTasks ( + id INT AUTO_INCREMENT PRIMARY KEY, + fog_uuid VARCHAR(36) NOT NULL, + reason VARCHAR(64) NOT NULL, + spec_generation INT, + status VARCHAR(32) NOT NULL DEFAULT 'pending', + leader_uuid VARCHAR(36), + claimed_at DATETIME, + next_attempt_at DATETIME, + attempts INT NOT NULL DEFAULT 0, + last_error TEXT, + created_at DATETIME, + updated_at DATETIME, + FOREIGN KEY (fog_uuid) REFERENCES Fogs (uuid) ON DELETE CASCADE +); + +CREATE INDEX idx_fog_platform_reconcile_tasks_fog_status ON FogPlatformReconcileTasks (fog_uuid, status); +CREATE INDEX idx_fog_platform_reconcile_tasks_status_claimed ON FogPlatformReconcileTasks (status, claimed_at); +CREATE INDEX 
idx_fog_platform_reconcile_tasks_next_attempt ON FogPlatformReconcileTasks (next_attempt_at); + +CREATE TABLE IF NOT EXISTS ServicePlatformReconcileTasks ( + id INT AUTO_INCREMENT PRIMARY KEY, + service_name TEXT NOT NULL, + reason VARCHAR(64) NOT NULL, + spec_snapshot TEXT, + status VARCHAR(32) NOT NULL DEFAULT 'pending', + leader_uuid VARCHAR(36), + claimed_at DATETIME, + next_attempt_at DATETIME, + attempts INT NOT NULL DEFAULT 0, + last_error TEXT, + created_at DATETIME, + updated_at DATETIME +); + +CREATE INDEX idx_service_platform_reconcile_tasks_name_status ON ServicePlatformReconcileTasks (service_name(255), status); +CREATE INDEX idx_service_platform_reconcile_tasks_status_claimed ON ServicePlatformReconcileTasks (status, claimed_at); +CREATE INDEX idx_service_platform_reconcile_tasks_next_attempt ON ServicePlatformReconcileTasks (next_attempt_at); + +CREATE TABLE IF NOT EXISTS HubRouterConfigLocks ( + id INT PRIMARY KEY, + leader_uuid VARCHAR(36), + claimed_at DATETIME, + created_at DATETIME, + updated_at DATETIME +); + COMMIT; diff --git a/src/data/migrations/postgres/db_migration_pg_v3.8.0.sql b/src/data/migrations/postgres/db_migration_pg_v3.8.0.sql index 1c569600..17bb8664 100644 --- a/src/data/migrations/postgres/db_migration_pg_v3.8.0.sql +++ b/src/data/migrations/postgres/db_migration_pg_v3.8.0.sql @@ -1131,3 +1131,71 @@ CREATE TABLE IF NOT EXISTS "AuthPolicy" ( created_at TIMESTAMP(0), updated_at TIMESTAMP(0) ); + +CREATE TABLE IF NOT EXISTS "FogPlatformSpecs" ( + fog_uuid VARCHAR(36) PRIMARY KEY NOT NULL, + spec_json TEXT NOT NULL, + generation INT NOT NULL DEFAULT 1, + created_at TIMESTAMP(0), + updated_at TIMESTAMP(0), + FOREIGN KEY (fog_uuid) REFERENCES "Fogs" (uuid) ON DELETE CASCADE +); + +CREATE TABLE IF NOT EXISTS "FogPlatformStatuses" ( + fog_uuid VARCHAR(36) PRIMARY KEY NOT NULL, + observed_generation INT NOT NULL DEFAULT 0, + phase VARCHAR(32) NOT NULL DEFAULT 'Pending', + last_error TEXT, + last_transition_at TIMESTAMP(0), + 
conditions_json TEXT, + created_at TIMESTAMP(0), + updated_at TIMESTAMP(0), + FOREIGN KEY (fog_uuid) REFERENCES "Fogs" (uuid) ON DELETE CASCADE +); + +CREATE TABLE IF NOT EXISTS "FogPlatformReconcileTasks" ( + id INT GENERATED ALWAYS AS IDENTITY PRIMARY KEY NOT NULL, + fog_uuid VARCHAR(36) NOT NULL, + reason VARCHAR(64) NOT NULL, + spec_generation INT, + status VARCHAR(32) NOT NULL DEFAULT 'pending', + leader_uuid VARCHAR(36), + claimed_at TIMESTAMP(0), + next_attempt_at TIMESTAMP(0), + attempts INT NOT NULL DEFAULT 0, + last_error TEXT, + created_at TIMESTAMP(0), + updated_at TIMESTAMP(0), + FOREIGN KEY (fog_uuid) REFERENCES "Fogs" (uuid) ON DELETE CASCADE +); + +CREATE UNIQUE INDEX idx_fog_platform_reconcile_tasks_active_fog_uuid ON "FogPlatformReconcileTasks" (fog_uuid) WHERE status IN ('pending', 'in_progress'); +CREATE INDEX idx_fog_platform_reconcile_tasks_status_claimed ON "FogPlatformReconcileTasks" (status, claimed_at); +CREATE INDEX idx_fog_platform_reconcile_tasks_next_attempt ON "FogPlatformReconcileTasks" (next_attempt_at); + +CREATE TABLE IF NOT EXISTS "ServicePlatformReconcileTasks" ( + id INT GENERATED ALWAYS AS IDENTITY PRIMARY KEY NOT NULL, + service_name TEXT NOT NULL, + reason VARCHAR(64) NOT NULL, + spec_snapshot TEXT, + status VARCHAR(32) NOT NULL DEFAULT 'pending', + leader_uuid VARCHAR(36), + claimed_at TIMESTAMP(0), + next_attempt_at TIMESTAMP(0), + attempts INT NOT NULL DEFAULT 0, + last_error TEXT, + created_at TIMESTAMP(0), + updated_at TIMESTAMP(0) +); + +CREATE UNIQUE INDEX idx_service_platform_reconcile_tasks_active_service_name ON "ServicePlatformReconcileTasks" (service_name) WHERE status IN ('pending', 'in_progress'); +CREATE INDEX idx_service_platform_reconcile_tasks_status_claimed ON "ServicePlatformReconcileTasks" (status, claimed_at); +CREATE INDEX idx_service_platform_reconcile_tasks_next_attempt ON "ServicePlatformReconcileTasks" (next_attempt_at); + +CREATE TABLE IF NOT EXISTS "HubRouterConfigLocks" ( + id INT PRIMARY KEY 
NOT NULL CHECK (id = 1), + leader_uuid VARCHAR(36), + claimed_at TIMESTAMP(0), + created_at TIMESTAMP(0), + updated_at TIMESTAMP(0) +); diff --git a/src/data/migrations/sqlite/db_migration_sqlite_v3.8.0.sql b/src/data/migrations/sqlite/db_migration_sqlite_v3.8.0.sql index 5947f5c4..b85c1c3d 100644 --- a/src/data/migrations/sqlite/db_migration_sqlite_v3.8.0.sql +++ b/src/data/migrations/sqlite/db_migration_sqlite_v3.8.0.sql @@ -1130,3 +1130,71 @@ CREATE TABLE IF NOT EXISTS AuthPolicy ( created_at DATETIME, updated_at DATETIME ); + +CREATE TABLE IF NOT EXISTS FogPlatformSpecs ( + fog_uuid VARCHAR(36) PRIMARY KEY NOT NULL, + spec_json TEXT NOT NULL, + generation INTEGER NOT NULL DEFAULT 1, + created_at DATETIME, + updated_at DATETIME, + FOREIGN KEY (fog_uuid) REFERENCES Fogs (uuid) ON DELETE CASCADE +); + +CREATE TABLE IF NOT EXISTS FogPlatformStatuses ( + fog_uuid VARCHAR(36) PRIMARY KEY NOT NULL, + observed_generation INTEGER NOT NULL DEFAULT 0, + phase VARCHAR(32) NOT NULL DEFAULT 'Pending', + last_error TEXT, + last_transition_at DATETIME, + conditions_json TEXT, + created_at DATETIME, + updated_at DATETIME, + FOREIGN KEY (fog_uuid) REFERENCES Fogs (uuid) ON DELETE CASCADE +); + +CREATE TABLE IF NOT EXISTS FogPlatformReconcileTasks ( + id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, + fog_uuid VARCHAR(36) NOT NULL, + reason VARCHAR(64) NOT NULL, + spec_generation INTEGER, + status VARCHAR(32) NOT NULL DEFAULT 'pending', + leader_uuid VARCHAR(36), + claimed_at DATETIME, + next_attempt_at DATETIME, + attempts INTEGER NOT NULL DEFAULT 0, + last_error TEXT, + created_at DATETIME, + updated_at DATETIME, + FOREIGN KEY (fog_uuid) REFERENCES Fogs (uuid) ON DELETE CASCADE +); + +CREATE UNIQUE INDEX idx_fog_platform_reconcile_tasks_active_fog_uuid ON FogPlatformReconcileTasks (fog_uuid) WHERE status IN ('pending', 'in_progress'); +CREATE INDEX idx_fog_platform_reconcile_tasks_status_claimed ON FogPlatformReconcileTasks (status, claimed_at); +CREATE INDEX 
idx_fog_platform_reconcile_tasks_next_attempt ON FogPlatformReconcileTasks (next_attempt_at); + +CREATE TABLE IF NOT EXISTS ServicePlatformReconcileTasks ( + id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, + service_name TEXT NOT NULL, + reason VARCHAR(64) NOT NULL, + spec_snapshot TEXT, + status VARCHAR(32) NOT NULL DEFAULT 'pending', + leader_uuid VARCHAR(36), + claimed_at DATETIME, + next_attempt_at DATETIME, + attempts INTEGER NOT NULL DEFAULT 0, + last_error TEXT, + created_at DATETIME, + updated_at DATETIME +); + +CREATE UNIQUE INDEX idx_service_platform_reconcile_tasks_active_service_name ON ServicePlatformReconcileTasks (service_name) WHERE status IN ('pending', 'in_progress'); +CREATE INDEX idx_service_platform_reconcile_tasks_status_claimed ON ServicePlatformReconcileTasks (status, claimed_at); +CREATE INDEX idx_service_platform_reconcile_tasks_next_attempt ON ServicePlatformReconcileTasks (next_attempt_at); + +CREATE TABLE IF NOT EXISTS HubRouterConfigLocks ( + id INTEGER PRIMARY KEY NOT NULL CHECK (id = 1), + leader_uuid VARCHAR(36), + claimed_at DATETIME, + created_at DATETIME, + updated_at DATETIME +); diff --git a/src/data/models/fog.js b/src/data/models/fog.js index 3096b03e..303d790c 100644 --- a/src/data/models/fog.js +++ b/src/data/models/fog.js @@ -397,6 +397,16 @@ module.exports = (sequelize, DataTypes) => { as: 'nats' }) + Fog.hasOne(models.FogPlatformSpec, { + foreignKey: 'fog_uuid', + as: 'platformSpec' + }) + + Fog.hasOne(models.FogPlatformStatus, { + foreignKey: 'fog_uuid', + as: 'platformStatus' + }) + Fog.belongsToMany(models.Tags, { through: 'IofogTags', as: 'tags' }) Fog.belongsToMany(models.VolumeMount, { through: 'FogVolumeMounts', as: 'volumeMounts' }) } diff --git a/src/data/models/fogPlatformReconcileTask.js b/src/data/models/fogPlatformReconcileTask.js new file mode 100644 index 00000000..9b393a20 --- /dev/null +++ b/src/data/models/fogPlatformReconcileTask.js @@ -0,0 +1,73 @@ +'use strict' + +module.exports = (sequelize, 
DataTypes) => { + const FogPlatformReconcileTask = sequelize.define('FogPlatformReconcileTask', { + id: { + type: DataTypes.INTEGER, + primaryKey: true, + autoIncrement: true, + allowNull: false, + field: 'id' + }, + fogUuid: { + type: DataTypes.STRING(36), + allowNull: false, + field: 'fog_uuid' + }, + reason: { + type: DataTypes.STRING(64), + allowNull: false, + field: 'reason' + }, + specGeneration: { + type: DataTypes.INTEGER, + allowNull: true, + field: 'spec_generation' + }, + status: { + type: DataTypes.STRING(32), + allowNull: false, + defaultValue: 'pending', + field: 'status' + }, + leaderUuid: { + type: DataTypes.STRING(36), + allowNull: true, + field: 'leader_uuid' + }, + claimedAt: { + type: DataTypes.DATE, + allowNull: true, + field: 'claimed_at' + }, + nextAttemptAt: { + type: DataTypes.DATE, + allowNull: true, + field: 'next_attempt_at' + }, + attempts: { + type: DataTypes.INTEGER, + allowNull: false, + defaultValue: 0, + field: 'attempts' + }, + lastError: { + type: DataTypes.TEXT, + allowNull: true, + field: 'last_error' + } + }, { + tableName: 'FogPlatformReconcileTasks', + timestamps: true, + underscored: true + }) + + FogPlatformReconcileTask.associate = (models) => { + FogPlatformReconcileTask.belongsTo(models.Fog, { + foreignKey: 'fog_uuid', + as: 'fog' + }) + } + + return FogPlatformReconcileTask +} diff --git a/src/data/models/fogPlatformSpec.js b/src/data/models/fogPlatformSpec.js new file mode 100644 index 00000000..8d032960 --- /dev/null +++ b/src/data/models/fogPlatformSpec.js @@ -0,0 +1,36 @@ +'use strict' + +module.exports = (sequelize, DataTypes) => { + const FogPlatformSpec = sequelize.define('FogPlatformSpec', { + fogUuid: { + type: DataTypes.STRING(36), + primaryKey: true, + allowNull: false, + field: 'fog_uuid' + }, + specJson: { + type: DataTypes.TEXT, + allowNull: false, + field: 'spec_json' + }, + generation: { + type: DataTypes.INTEGER, + allowNull: false, + defaultValue: 1, + field: 'generation' + } + }, { + tableName: 
'FogPlatformSpecs', + timestamps: true, + underscored: true + }) + + FogPlatformSpec.associate = (models) => { + FogPlatformSpec.belongsTo(models.Fog, { + foreignKey: 'fog_uuid', + as: 'fog' + }) + } + + return FogPlatformSpec +} diff --git a/src/data/models/fogPlatformStatus.js b/src/data/models/fogPlatformStatus.js new file mode 100644 index 00000000..bc24e153 --- /dev/null +++ b/src/data/models/fogPlatformStatus.js @@ -0,0 +1,52 @@ +'use strict' + +module.exports = (sequelize, DataTypes) => { + const FogPlatformStatus = sequelize.define('FogPlatformStatus', { + fogUuid: { + type: DataTypes.STRING(36), + primaryKey: true, + allowNull: false, + field: 'fog_uuid' + }, + observedGeneration: { + type: DataTypes.INTEGER, + allowNull: false, + defaultValue: 0, + field: 'observed_generation' + }, + phase: { + type: DataTypes.STRING(32), + allowNull: false, + defaultValue: 'Pending', + field: 'phase' + }, + lastError: { + type: DataTypes.TEXT, + allowNull: true, + field: 'last_error' + }, + lastTransitionAt: { + type: DataTypes.DATE, + allowNull: true, + field: 'last_transition_at' + }, + conditionsJson: { + type: DataTypes.TEXT, + allowNull: true, + field: 'conditions_json' + } + }, { + tableName: 'FogPlatformStatuses', + timestamps: true, + underscored: true + }) + + FogPlatformStatus.associate = (models) => { + FogPlatformStatus.belongsTo(models.Fog, { + foreignKey: 'fog_uuid', + as: 'fog' + }) + } + + return FogPlatformStatus +} diff --git a/src/data/models/hubRouterConfigLock.js b/src/data/models/hubRouterConfigLock.js new file mode 100644 index 00000000..64b41a8f --- /dev/null +++ b/src/data/models/hubRouterConfigLock.js @@ -0,0 +1,30 @@ +'use strict' + +module.exports = (sequelize, DataTypes) => { + const HubRouterConfigLock = sequelize.define('HubRouterConfigLock', { + id: { + type: DataTypes.INTEGER, + primaryKey: true, + allowNull: false, + defaultValue: 1, + field: 'id' + }, + leaderUuid: { + type: DataTypes.STRING(36), + allowNull: true, + field: 
'leader_uuid' + }, + claimedAt: { + type: DataTypes.DATE, + allowNull: true, + field: 'claimed_at' + } + }, { + tableName: 'HubRouterConfigLocks', + timestamps: true, + underscored: true, + freezeTableName: true + }) + + return HubRouterConfigLock +} diff --git a/src/data/models/servicePlatformReconcileTask.js b/src/data/models/servicePlatformReconcileTask.js new file mode 100644 index 00000000..cef58b69 --- /dev/null +++ b/src/data/models/servicePlatformReconcileTask.js @@ -0,0 +1,66 @@ +'use strict' + +module.exports = (sequelize, DataTypes) => { + const ServicePlatformReconcileTask = sequelize.define('ServicePlatformReconcileTask', { + id: { + type: DataTypes.INTEGER, + primaryKey: true, + autoIncrement: true, + allowNull: false, + field: 'id' + }, + serviceName: { + type: DataTypes.TEXT, + allowNull: false, + field: 'service_name' + }, + reason: { + type: DataTypes.STRING(64), + allowNull: false, + field: 'reason' + }, + specSnapshot: { + type: DataTypes.TEXT, + allowNull: true, + field: 'spec_snapshot' + }, + status: { + type: DataTypes.STRING(32), + allowNull: false, + defaultValue: 'pending', + field: 'status' + }, + leaderUuid: { + type: DataTypes.STRING(36), + allowNull: true, + field: 'leader_uuid' + }, + claimedAt: { + type: DataTypes.DATE, + allowNull: true, + field: 'claimed_at' + }, + nextAttemptAt: { + type: DataTypes.DATE, + allowNull: true, + field: 'next_attempt_at' + }, + attempts: { + type: DataTypes.INTEGER, + allowNull: false, + defaultValue: 0, + field: 'attempts' + }, + lastError: { + type: DataTypes.TEXT, + allowNull: true, + field: 'last_error' + } + }, { + tableName: 'ServicePlatformReconcileTasks', + timestamps: true, + underscored: true + }) + + return ServicePlatformReconcileTask +} diff --git a/src/schemas/fog-platform-spec.js b/src/schemas/fog-platform-spec.js new file mode 100644 index 00000000..84c09c39 --- /dev/null +++ b/src/schemas/fog-platform-spec.js @@ -0,0 +1,237 @@ +const { Validator } = require('jsonschema') +const Errors 
= require('../helpers/errors') + +const FOG_PLATFORM_SPEC_MAX_BYTES = 16 * 1024 + +const FOG_PLATFORM_REASONS = [ + 'spec-changed', + 'delete', + 'periodic-sweep', + 'manual-retry', + 'service-changed' +] + +const SERVICE_PLATFORM_REASONS = [ + 'spec-changed', + 'delete', + 'periodic-sweep', + 'manual-retry' +] + +const FOG_PLATFORM_PHASES = [ + 'Pending', + 'Progressing', + 'Ready', + 'Failed', + 'Deleting' +] + +const fogPlatformSpec = { + id: '/fogPlatformSpec', + type: 'object', + additionalProperties: false, + properties: { + routerMode: { type: 'string', enum: ['none', 'edge', 'interior'] }, + natsMode: { type: 'string', enum: ['none', 'leaf', 'server'] }, + host: { type: 'string' }, + messagingPort: { type: 'integer', minimum: 1, maximum: 65535 }, + interRouterPort: { type: 'integer', minimum: 1, maximum: 65535 }, + edgeRouterPort: { type: 'integer', minimum: 1, maximum: 65535 }, + upstreamRouters: { + type: 'array', + items: { type: 'string', minLength: 1 } + }, + upstreamNatsServers: { + type: 'array', + items: { type: 'string', minLength: 1 } + }, + natsServerPort: { type: 'integer', minimum: 1, maximum: 65535 }, + natsLeafPort: { type: 'integer', minimum: 1, maximum: 65535 }, + natsClusterPort: { type: 'integer', minimum: 1, maximum: 65535 }, + natsMqttPort: { type: 'integer', minimum: 1, maximum: 65535 }, + natsHttpPort: { type: 'integer', minimum: 1, maximum: 65535 }, + jsStorageSize: { type: 'string', maxLength: 32 }, + jsMemoryStoreSize: { type: 'string', maxLength: 32 }, + networkRouter: { type: ['string', 'null'] }, + containerEngine: { type: 'string', enum: ['edgelet', 'docker', 'podman'] }, + bluetoothEnabled: { type: 'boolean' }, + abstractedHardwareEnabled: { type: 'boolean' }, + tags: { + type: 'array', + items: { + type: 'object', + properties: { + value: { type: 'string' } + }, + required: ['value'] + } + } + } +} + +const validator = new Validator() +validator.addSchema(fogPlatformSpec, fogPlatformSpec.id) + +function assertMaxBytes (json, 
label) { + if (Buffer.byteLength(json, 'utf8') > FOG_PLATFORM_SPEC_MAX_BYTES) { + throw new Errors.ValidationError(`${label} exceeds maximum size of ${FOG_PLATFORM_SPEC_MAX_BYTES} bytes`) + } +} + +function parseJsonText (text, label) { + if (text == null || text === '') { + return null + } + try { + return JSON.parse(text) + } catch (error) { + throw new Errors.ValidationError(`Invalid ${label} JSON`) + } +} + +function serializeJson (object, label) { + const json = JSON.stringify(object) + assertMaxBytes(json, label) + return json +} + +async function validateFogPlatformSpec (object) { + const response = validator.validate(object || {}, fogPlatformSpec) + if (!response.valid) { + const error = response.errors[0] + const property = (error.property || '').replace('instance.', '') + throw new Errors.ValidationError( + property ? `Invalid fog platform spec field '${property}'` : 'Invalid fog platform spec' + ) + } + return object +} + +function parseSpecJson (specJson) { + return parseJsonText(specJson, 'fog platform spec') +} + +function serializeSpecJson (object) { + return serializeJson(object, 'Fog platform spec') +} + +function parseSpecSnapshot (specSnapshot) { + return parseJsonText(specSnapshot, 'service platform spec snapshot') +} + +function serializeSpecSnapshot (object) { + if (object == null) { + return null + } + return serializeJson(object, 'Service platform spec snapshot') +} + +function parseConditionsJson (conditionsJson) { + const parsed = parseJsonText(conditionsJson, 'fog platform conditions') + if (parsed == null) { + return [] + } + if (!Array.isArray(parsed)) { + throw new Errors.ValidationError('Invalid fog platform conditions JSON') + } + return parsed +} + +function serializeConditionsJson (conditions) { + if (conditions == null) { + return null + } + return serializeJson(conditions, 'Fog platform conditions') +} + +const FOG_PLATFORM_SPEC_SCALAR_FIELDS = [ + 'routerMode', + 'natsMode', + 'host', + 'messagingPort', + 'interRouterPort', + 
'edgeRouterPort', + 'natsServerPort', + 'natsLeafPort', + 'natsClusterPort', + 'natsMqttPort', + 'natsHttpPort', + 'jsStorageSize', + 'jsMemoryStoreSize', + 'networkRouter', + 'containerEngine', + 'bluetoothEnabled', + 'abstractedHardwareEnabled' +] + +const FOG_PLATFORM_SPEC_ARRAY_FIELDS = [ + 'upstreamRouters', + 'upstreamNatsServers', + 'tags' +] + +function normalizeSpecTags (tags) { + if (!Array.isArray(tags)) { + return tags + } + return tags.map((tag) => (typeof tag === 'string' ? { value: tag } : tag)) +} + +function buildPlatformSpecFromFogData (fogData, options = {}) { + const spec = {} + for (const field of FOG_PLATFORM_SPEC_SCALAR_FIELDS) { + if (fogData[field] !== undefined) { + spec[field] = fogData[field] + } + } + for (const field of FOG_PLATFORM_SPEC_ARRAY_FIELDS) { + if (fogData[field] !== undefined) { + spec[field] = field === 'tags' ? normalizeSpecTags(fogData[field]) : fogData[field] + } + } + if (options.applyCreateDefaults) { + if (spec.routerMode === undefined) { + spec.routerMode = 'edge' + } + if (spec.natsMode === undefined) { + spec.natsMode = 'leaf' + } + } + return spec +} + +function mergePlatformSpecPatch (existingSpec, patchFogData) { + const merged = { ...(existingSpec || {}) } + for (const field of FOG_PLATFORM_SPEC_SCALAR_FIELDS) { + if (patchFogData[field] !== undefined) { + merged[field] = patchFogData[field] + } + } + for (const field of FOG_PLATFORM_SPEC_ARRAY_FIELDS) { + if (patchFogData[field] !== undefined) { + merged[field] = field === 'tags' ? 
normalizeSpecTags(patchFogData[field]) : patchFogData[field] + } + } + return merged +} + +module.exports = { + mainSchemas: [], + innerSchemas: [fogPlatformSpec], + FOG_PLATFORM_SPEC_MAX_BYTES, + FOG_PLATFORM_REASONS, + SERVICE_PLATFORM_REASONS, + FOG_PLATFORM_PHASES, + fogPlatformSpec, + validateFogPlatformSpec, + parseSpecJson, + serializeSpecJson, + parseSpecSnapshot, + serializeSpecSnapshot, + parseConditionsJson, + serializeConditionsJson, + FOG_PLATFORM_SPEC_SCALAR_FIELDS, + FOG_PLATFORM_SPEC_ARRAY_FIELDS, + buildPlatformSpecFromFogData, + mergePlatformSpecPatch +} diff --git a/test/src/data/fog-platform-managers.test.js b/test/src/data/fog-platform-managers.test.js new file mode 100644 index 00000000..d0a221d2 --- /dev/null +++ b/test/src/data/fog-platform-managers.test.js @@ -0,0 +1,539 @@ +const { expect } = require('chai') +const sinon = require('sinon') + +const { + FOG_PLATFORM_SPEC_MAX_BYTES, + validateFogPlatformSpec, + parseSpecJson, + serializeSpecJson, + parseSpecSnapshot, + serializeSpecSnapshot +} = require('../../../src/schemas/fog-platform-spec') +const FogPlatformReconcileTaskManager = require('../../../src/data/managers/fog-platform-reconcile-task-manager') +const ServicePlatformReconcileTaskManager = require('../../../src/data/managers/service-platform-reconcile-task-manager') +const databaseProvider = require('../../../src/data/providers/database-factory') + +describe('Fog platform spec schema', () => { + it('validates a minimal platform spec subset', async () => { + await validateFogPlatformSpec({ + routerMode: 'edge', + natsMode: 'leaf', + host: '10.0.0.1' + }) + }) + + it('rejects unknown platform spec fields', async () => { + try { + await validateFogPlatformSpec({ routerMode: 'edge', unknownField: true }) + throw new Error('expected validation to fail') + } catch (error) { + expect(error.name).to.equal('ValidationError') + } + }) + + it('round-trips spec JSON through parse and serialize', () => { + const spec = { + routerMode: 
'interior', + natsMode: 'server', + host: 'controlplane', + upstreamRouters: ['edge-1'], + tags: [{ value: 'site-a' }] + } + const serialized = serializeSpecJson(spec) + expect(parseSpecJson(serialized)).to.eql(spec) + }) + + it('rejects specs larger than 16 KB', () => { + const huge = { host: 'x'.repeat(FOG_PLATFORM_SPEC_MAX_BYTES) } + expect(() => serializeSpecJson(huge)).to.throw('exceeds maximum size') + }) + + it('round-trips service spec snapshots', () => { + const snapshot = { + name: 'my-service', + type: 'microservice', + resource: 'app.ms', + targetPort: 8080, + tags: [{ value: 'site-a' }, { value: 'site-b' }] + } + const serialized = serializeSpecSnapshot(snapshot) + expect(parseSpecSnapshot(serialized)).to.eql(snapshot) + }) +}) + +describe('Fog platform reconcile task enqueue', () => { + def('sandbox', () => sinon.createSandbox()) + const transaction = {} + + afterEach(() => $sandbox.restore()) + + it('coalesces duplicate pending tasks for the same fog', async () => { + const existing = { id: 7, fogUuid: 'fog-1', reason: 'spec-changed', status: 'pending' } + const entity = { + findOne: $sandbox.stub().resolves(existing), + update: $sandbox.stub().resolves([1]) + } + + $sandbox.stub(FogPlatformReconcileTaskManager, 'getEntity').returns(entity) + $sandbox.stub(FogPlatformReconcileTaskManager, 'findOne').resolves({ ...existing, reason: 'manual-retry' }) + $sandbox.stub(FogPlatformReconcileTaskManager, 'create') + + const task = await FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask({ + fogUuid: 'fog-1', + reason: 'manual-retry', + specGeneration: 3 + }, transaction) + + expect(entity.update).to.have.been.calledOnce + expect(FogPlatformReconcileTaskManager.create).to.not.have.been.called + expect(task.reason).to.equal('manual-retry') + }) + + it('creates a new task when no active task exists', async () => { + const entity = { + findOne: $sandbox.stub().resolves(null) + } + const created = { id: 1, fogUuid: 'fog-2', reason: 'spec-changed', 
status: 'pending' } + + $sandbox.stub(FogPlatformReconcileTaskManager, 'getEntity').returns(entity) + $sandbox.stub(FogPlatformReconcileTaskManager, 'create').resolves(created) + + const task = await FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask({ + fogUuid: 'fog-2', + reason: 'spec-changed', + specGeneration: 1 + }, transaction) + + expect(FogPlatformReconcileTaskManager.create).to.have.been.calledOnce + expect(task).to.eql(created) + }) + + it('supersedes pending work with delete reason', async () => { + const existing = { id: 9, fogUuid: 'fog-3', reason: 'spec-changed', status: 'in_progress' } + const entity = { + findOne: $sandbox.stub().resolves(existing), + update: $sandbox.stub().resolves([1]) + } + + $sandbox.stub(FogPlatformReconcileTaskManager, 'getEntity').returns(entity) + $sandbox.stub(FogPlatformReconcileTaskManager, 'findOne').resolves({ ...existing, reason: 'delete' }) + + await FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask({ + fogUuid: 'fog-3', + reason: 'delete' + }, transaction) + + expect(entity.update).to.have.been.calledWithMatch( + { reason: 'delete' }, + sinon.match.has('where', { id: 9 }) + ) + }) +}) + +describe('Fog platform reconcile task claim', () => { + def('sandbox', () => sinon.createSandbox()) + const transaction = {} + + afterEach(() => $sandbox.restore()) + + it('claims an available task atomically', async () => { + const task = { id: 1, fogUuid: 'fog-1', status: 'pending' } + const entity = { + findOne: $sandbox.stub().resolves(task), + update: $sandbox.stub().resolves([1]) + } + + $sandbox.stub(databaseProvider.sequelize, 'transaction').callsFake(async (fn) => fn(transaction)) + $sandbox.stub(FogPlatformReconcileTaskManager, 'getEntity').returns(entity) + $sandbox.stub(FogPlatformReconcileTaskManager, 'findOne').resolves(task) + + const result = await FogPlatformReconcileTaskManager.claimNextFogTask('controller-1', 300) + + expect(result).to.eql(task) + 
expect(entity.update).to.have.been.calledOnceWith( + { leaderUuid: 'controller-1', claimedAt: sinon.match.date, status: 'in_progress' }, + sinon.match.has('where', sinon.match.has('id', 1)) + ) + }) + + it('returns null when a concurrent claim wins the update', async () => { + const task = { id: 2, fogUuid: 'fog-2', status: 'pending' } + const entity = { + findOne: $sandbox.stub().resolves(task), + update: $sandbox.stub().resolves([0]) + } + + $sandbox.stub(databaseProvider.sequelize, 'transaction').callsFake(async (fn) => fn(transaction)) + $sandbox.stub(FogPlatformReconcileTaskManager, 'getEntity').returns(entity) + $sandbox.stub(FogPlatformReconcileTaskManager, 'findOne') + + const result = await FogPlatformReconcileTaskManager.claimNextFogTask('controller-1', 300) + + expect(result).to.be.null + expect(FogPlatformReconcileTaskManager.findOne).to.not.have.been.called + }) + + it('returns null when no task is eligible', async () => { + const entity = { + findOne: $sandbox.stub().resolves(null), + update: $sandbox.stub() + } + + $sandbox.stub(databaseProvider.sequelize, 'transaction').callsFake(async (fn) => fn(transaction)) + $sandbox.stub(FogPlatformReconcileTaskManager, 'getEntity').returns(entity) + + const result = await FogPlatformReconcileTaskManager.claimNextFogTask('controller-1', 300) + + expect(result).to.be.null + expect(entity.update).to.not.have.been.called + }) + + it('retries claim on SQLITE_BUSY before succeeding', async () => { + const task = { id: 3, fogUuid: 'fog-3', status: 'pending' } + const entity = { + findOne: $sandbox.stub().resolves(task), + update: $sandbox.stub().resolves([1]) + } + let txAttempts = 0 + + $sandbox.stub(databaseProvider.sequelize, 'transaction').callsFake(async (fn) => { + txAttempts++ + if (txAttempts < 2) { + throw new Error('SQLITE_BUSY: database is locked') + } + return fn(transaction) + }) + $sandbox.stub(FogPlatformReconcileTaskManager, 'getEntity').returns(entity) + $sandbox.stub(FogPlatformReconcileTaskManager, 
'findOne').resolves(task) + + const result = await FogPlatformReconcileTaskManager.claimNextFogTask('controller-1', 300) + + expect(result).to.eql(task) + expect(txAttempts).to.equal(2) + }) + + it('reclaims stale in_progress tasks for another controller', async () => { + const staleClaimedAt = new Date(Date.now() - 400 * 1000) + const task = { + id: 10, + fogUuid: 'fog-stale', + status: 'in_progress', + leaderUuid: 'controller-a', + claimedAt: staleClaimedAt + } + const reclaimedTask = { + ...task, + leaderUuid: 'controller-b', + claimedAt: new Date(), + status: 'in_progress' + } + const entity = { + findOne: $sandbox.stub().resolves(task), + update: $sandbox.stub().resolves([1]) + } + + $sandbox.stub(databaseProvider.sequelize, 'transaction').callsFake(async (fn) => fn(transaction)) + $sandbox.stub(FogPlatformReconcileTaskManager, 'getEntity').returns(entity) + $sandbox.stub(FogPlatformReconcileTaskManager, 'findOne').resolves(reclaimedTask) + + const result = await FogPlatformReconcileTaskManager.claimNextFogTask('controller-b', 300) + + expect(result).to.eql(reclaimedTask) + expect(entity.update).to.have.been.calledOnceWith( + { leaderUuid: 'controller-b', claimedAt: sinon.match.date, status: 'in_progress' }, + sinon.match.has('where', sinon.match({ id: 10 })) + ) + }) + + it('records retryable failure with backoff', async () => { + const entity = { + update: $sandbox.stub().resolves([1]) + } + const updatedTask = { + id: 5, + attempts: 1, + status: 'pending', + lastError: 'router failed' + } + + $sandbox.stub(FogPlatformReconcileTaskManager, 'getEntity').returns(entity) + $sandbox.stub(FogPlatformReconcileTaskManager, 'findOne').resolves(updatedTask) + + const result = await FogPlatformReconcileTaskManager.recordFogTaskFailure( + 5, + 'router failed', + { attempts: 0 }, + transaction + ) + + expect(entity.update).to.have.been.calledOnceWith( + sinon.match({ + attempts: 1, + lastError: 'router failed', + status: 'pending', + leaderUuid: null, + claimedAt: null, 
+ nextAttemptAt: sinon.match.date + }), + sinon.match.has('where', { id: 5 }) + ) + expect(result).to.eql(updatedTask) + }) + + it('marks task failed permanently after max attempts', async () => { + const entity = { + update: $sandbox.stub().resolves([1]) + } + + $sandbox.stub(FogPlatformReconcileTaskManager, 'getEntity').returns(entity) + $sandbox.stub(FogPlatformReconcileTaskManager, 'findOne').resolves({ + id: 6, + attempts: 10, + status: 'failed' + }) + + await FogPlatformReconcileTaskManager.recordFogTaskFailure( + 6, + 'still failing', + { attempts: 9 }, + transaction + ) + + expect(entity.update).to.have.been.calledWith( + sinon.match({ + attempts: 10, + status: 'failed', + nextAttemptAt: null + }), + sinon.match.any + ) + }) +}) + +describe('Service platform reconcile task enqueue', () => { + def('sandbox', () => sinon.createSandbox()) + const transaction = {} + + afterEach(() => $sandbox.restore()) + + it('stores spec_snapshot JSON at enqueue time', async () => { + const snapshot = { + name: 'api', + type: 'k8s', + resource: 'default.api', + targetPort: 8080, + tags: [{ value: 'hub' }] + } + const entity = { + findOne: $sandbox.stub().resolves(null) + } + const created = { + id: 3, + serviceName: 'api', + reason: 'spec-changed', + specSnapshot: serializeSpecSnapshot(snapshot), + status: 'pending' + } + + $sandbox.stub(ServicePlatformReconcileTaskManager, 'getEntity').returns(entity) + $sandbox.stub(ServicePlatformReconcileTaskManager, 'create').resolves(created) + + const task = await ServicePlatformReconcileTaskManager.enqueueServicePlatformReconcileTask({ + serviceName: 'api', + reason: 'spec-changed', + specSnapshot: snapshot + }, transaction) + + expect(ServicePlatformReconcileTaskManager.create).to.have.been.calledWithMatch({ + serviceName: 'api', + specSnapshot: serializeSpecSnapshot(snapshot) + }) + expect(ServicePlatformReconcileTaskManager.getParsedSpecSnapshot(task)).to.eql(snapshot) + }) + + it('coalesces duplicate pending tasks for the same 
service', async () => { + const snapshot = { name: 'api', type: 'k8s', resource: 'default.api', targetPort: 8080 } + const existing = { + id: 4, + serviceName: 'api', + reason: 'spec-changed', + specSnapshot: serializeSpecSnapshot({ name: 'api', tags: [] }), + status: 'pending' + } + const entity = { + findOne: $sandbox.stub().resolves(existing), + update: $sandbox.stub().resolves([1]) + } + + $sandbox.stub(ServicePlatformReconcileTaskManager, 'getEntity').returns(entity) + $sandbox.stub(ServicePlatformReconcileTaskManager, 'findOne').resolves({ + ...existing, + specSnapshot: serializeSpecSnapshot(snapshot) + }) + $sandbox.stub(ServicePlatformReconcileTaskManager, 'create') + + await ServicePlatformReconcileTaskManager.enqueueServicePlatformReconcileTask({ + serviceName: 'api', + reason: 'spec-changed', + specSnapshot: snapshot + }, transaction) + + expect(entity.update).to.have.been.calledOnce + expect(ServicePlatformReconcileTaskManager.create).to.not.have.been.called + }) +}) + +describe('Service platform reconcile task claim', () => { + def('sandbox', () => sinon.createSandbox()) + const transaction = {} + + afterEach(() => $sandbox.restore()) + + it('claims an available task atomically', async () => { + const task = { id: 1, serviceName: 'api', status: 'pending' } + const entity = { + findOne: $sandbox.stub().resolves(task), + update: $sandbox.stub().resolves([1]) + } + + $sandbox.stub(databaseProvider.sequelize, 'transaction').callsFake(async (fn) => fn(transaction)) + $sandbox.stub(ServicePlatformReconcileTaskManager, 'getEntity').returns(entity) + $sandbox.stub(ServicePlatformReconcileTaskManager, 'findOne').resolves(task) + + const result = await ServicePlatformReconcileTaskManager.claimNextServiceTask('controller-1', 300) + + expect(result).to.eql(task) + expect(entity.update).to.have.been.calledOnceWith( + { leaderUuid: 'controller-1', claimedAt: sinon.match.date, status: 'in_progress' }, + sinon.match.has('where', sinon.match.has('id', 1)) + ) + }) + + 
it('returns null when a concurrent claim wins the update', async () => { + const task = { id: 2, serviceName: 'api-2', status: 'pending' } + const entity = { + findOne: $sandbox.stub().resolves(task), + update: $sandbox.stub().resolves([0]) + } + + $sandbox.stub(databaseProvider.sequelize, 'transaction').callsFake(async (fn) => fn(transaction)) + $sandbox.stub(ServicePlatformReconcileTaskManager, 'getEntity').returns(entity) + $sandbox.stub(ServicePlatformReconcileTaskManager, 'findOne') + + const result = await ServicePlatformReconcileTaskManager.claimNextServiceTask('controller-1', 300) + + expect(result).to.be.null + expect(ServicePlatformReconcileTaskManager.findOne).to.not.have.been.called + }) + + it('records retryable failure with backoff', async () => { + const entity = { + update: $sandbox.stub().resolves([1]) + } + const updatedTask = { + id: 8, + attempts: 1, + status: 'pending', + lastError: 'hub patch failed' + } + + $sandbox.stub(ServicePlatformReconcileTaskManager, 'getEntity').returns(entity) + $sandbox.stub(ServicePlatformReconcileTaskManager, 'findOne').resolves(updatedTask) + + const result = await ServicePlatformReconcileTaskManager.recordServiceTaskFailure( + 8, + 'hub patch failed', + { attempts: 0 }, + transaction + ) + + expect(entity.update).to.have.been.calledOnceWith( + sinon.match({ + attempts: 1, + lastError: 'hub patch failed', + status: 'pending', + leaderUuid: null, + claimedAt: null, + nextAttemptAt: sinon.match.date + }), + sinon.match.has('where', { id: 8 }) + ) + expect(result).to.eql(updatedTask) + }) + + it('marks task failed permanently after max attempts', async () => { + const entity = { + update: $sandbox.stub().resolves([1]) + } + + $sandbox.stub(ServicePlatformReconcileTaskManager, 'getEntity').returns(entity) + $sandbox.stub(ServicePlatformReconcileTaskManager, 'findOne').resolves({ + id: 9, + attempts: 10, + status: 'failed' + }) + + await ServicePlatformReconcileTaskManager.recordServiceTaskFailure( + 9, + 'still 
failing', + { attempts: 9 }, + transaction + ) + + expect(entity.update).to.have.been.calledWith( + sinon.match({ + attempts: 10, + status: 'failed', + nextAttemptAt: null + }), + sinon.match.any + ) + }) +}) + +describe('Hub router ConfigMap lock', () => { + def('sandbox', () => sinon.createSandbox()) + const transaction = {} + + afterEach(() => $sandbox.restore()) + + it('denies acquire when another controller holds a fresh lock', async () => { + const HubRouterConfigLockManager = require('../../../src/data/managers/hub-router-config-lock-manager') + const lock = { + id: 1, + leaderUuid: 'controller-a', + claimedAt: new Date() + } + + $sandbox.stub(HubRouterConfigLockManager, 'initializeLock').resolves() + $sandbox.stub(HubRouterConfigLockManager, 'findOne').resolves(lock) + $sandbox.stub(HubRouterConfigLockManager, 'update') + + const acquired = await HubRouterConfigLockManager.tryAcquire('controller-b', 120, transaction) + + expect(acquired).to.equal(false) + expect(HubRouterConfigLockManager.update).to.not.have.been.called + }) + + it('allows stale lock reclaim by another controller', async () => { + const HubRouterConfigLockManager = require('../../../src/data/managers/hub-router-config-lock-manager') + const lock = { + id: 1, + leaderUuid: 'controller-a', + claimedAt: new Date(Date.now() - 200 * 1000) + } + + $sandbox.stub(HubRouterConfigLockManager, 'initializeLock').resolves() + $sandbox.stub(HubRouterConfigLockManager, 'findOne').resolves(lock) + $sandbox.stub(HubRouterConfigLockManager, 'update').resolves() + + const acquired = await HubRouterConfigLockManager.tryAcquire('controller-b', 120, transaction) + + expect(acquired).to.equal(true) + expect(HubRouterConfigLockManager.update).to.have.been.calledOnceWith( + { id: 1 }, + { leaderUuid: 'controller-b', claimedAt: sinon.match.date }, + transaction + ) + }) +}) From ac3956c4b2e69f174afb35b72b35ad0e3638f100 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Wed, 24 Jun 2026 21:17:25 
+0300 Subject: [PATCH 03/11] Implement fog and service platform reconcile services. Extract router/NATS lifecycle into FogPlatformService with full recompute of service-derived TCP bridges. Add ServicePlatformService for hub connector/listener, K8s Service lifecycle, ConfigMap lock, and fog fan-out. --- src/services/fog-platform-service.js | 362 ++++++++++++++ src/services/service-bridge-config.js | 104 ++++ src/services/service-platform-service.js | 465 ++++++++++++++++++ .../src/services/fog-platform-service.test.js | 236 +++++++++ .../services/service-bridge-config.test.js | 110 +++++ .../services/service-platform-service.test.js | 374 ++++++++++++++ 6 files changed, 1651 insertions(+) create mode 100644 src/services/fog-platform-service.js create mode 100644 src/services/service-bridge-config.js create mode 100644 src/services/service-platform-service.js create mode 100644 test/src/services/fog-platform-service.test.js create mode 100644 test/src/services/service-bridge-config.test.js create mode 100644 test/src/services/service-platform-service.test.js diff --git a/src/services/fog-platform-service.js b/src/services/fog-platform-service.js new file mode 100644 index 00000000..337c647f --- /dev/null +++ b/src/services/fog-platform-service.js @@ -0,0 +1,362 @@ +const TransactionDecorator = require('../decorators/transaction-decorator') +const AppHelper = require('../helpers/app-helper') +const Constants = require('../helpers/constants') +const Errors = require('../helpers/errors') +const ErrorMessages = require('../helpers/error-messages') +const FogManager = require('../data/managers/iofog-manager') +const FogPlatformSpecManager = require('../data/managers/fog-platform-spec-manager') +const FogPlatformStatusManager = require('../data/managers/fog-platform-status-manager') +const RouterManager = require('../data/managers/router-manager') +const RouterConnectionManager = require('../data/managers/router-connection-manager') +const NatsInstanceManager = 
require('../data/managers/nats-instance-manager') +const NatsConnectionManager = require('../data/managers/nats-connection-manager') +const ChangeTrackingService = require('./change-tracking-service') +const IofogService = require('./iofog-service') +const NatsService = require('./nats-service') +const RouterService = require('./router-service') +const ServiceBridgeConfig = require('./service-bridge-config') +const logger = require('../logger') + +function buildFogDataFromSpecAndFog (fog, spec) { + const specTags = Array.isArray(spec.tags) ? spec.tags.map((tag) => tag.value) : [] + const fogTags = fog.tags ? fog.tags.map((tag) => tag.value) : [] + + return { + uuid: fog.uuid, + name: fog.name, + isSystem: fog.isSystem, + host: spec.host != null ? spec.host : fog.host, + routerMode: spec.routerMode, + natsMode: spec.natsMode, + messagingPort: spec.messagingPort, + interRouterPort: spec.interRouterPort, + edgeRouterPort: spec.edgeRouterPort, + upstreamRouters: spec.upstreamRouters, + upstreamNatsServers: spec.upstreamNatsServers, + natsServerPort: spec.natsServerPort, + natsLeafPort: spec.natsLeafPort, + natsClusterPort: spec.natsClusterPort, + natsMqttPort: spec.natsMqttPort, + natsHttpPort: spec.natsHttpPort, + jsStorageSize: spec.jsStorageSize, + jsMemoryStoreSize: spec.jsMemoryStoreSize, + networkRouter: spec.networkRouter, + containerEngine: spec.containerEngine || fog.containerEngine, + bluetoothEnabled: spec.bluetoothEnabled != null ? spec.bluetoothEnabled : fog.bluetoothEnabled, + abstractedHardwareEnabled: spec.abstractedHardwareEnabled != null + ? spec.abstractedHardwareEnabled + : fog.abstractedHardwareEnabled, + tags: fogTags.length > 0 ? 
/**
 * Flattens the desired platform spec and the persisted fog row into the
 * single descriptor object passed to the router / NATS helper services.
 *
 * Topology fields are copied straight from the spec. `host`,
 * `bluetoothEnabled` and `abstractedHardwareEnabled` fall back to the fog row
 * only when the spec value is null/undefined; `containerEngine` falls back
 * whenever the spec value is falsy. Tags attached to the fog win over tags
 * listed in the spec.
 *
 * @param {object} fog - fog row, optionally carrying a `tags` collection
 * @param {object} spec - parsed platform spec
 * @returns {object} merged fog descriptor
 */
function buildFogDataFromSpecAndFog (fog, spec) {
  const toValue = (tag) => tag.value
  const tagsFromSpec = Array.isArray(spec.tags) ? spec.tags.map(toValue) : []
  const tagsFromFog = fog.tags ? fog.tags.map(toValue) : []
  // Spec value wins unless it is null/undefined.
  const specOrFog = (key) => (spec[key] != null ? spec[key] : fog[key])
  const {
    routerMode,
    natsMode,
    messagingPort,
    interRouterPort,
    edgeRouterPort,
    upstreamRouters,
    upstreamNatsServers,
    natsServerPort,
    natsLeafPort,
    natsClusterPort,
    natsMqttPort,
    natsHttpPort,
    jsStorageSize,
    jsMemoryStoreSize,
    networkRouter
  } = spec

  return {
    uuid: fog.uuid,
    name: fog.name,
    isSystem: fog.isSystem,
    host: specOrFog('host'),
    routerMode,
    natsMode,
    messagingPort,
    interRouterPort,
    edgeRouterPort,
    upstreamRouters,
    upstreamNatsServers,
    natsServerPort,
    natsLeafPort,
    natsClusterPort,
    natsMqttPort,
    natsHttpPort,
    jsStorageSize,
    jsMemoryStoreSize,
    networkRouter,
    containerEngine: spec.containerEngine || fog.containerEngine,
    bluetoothEnabled: specOrFog('bluetoothEnabled'),
    abstractedHardwareEnabled: specOrFog('abstractedHardwareEnabled'),
    tags: tagsFromFog.length > 0 ? tagsFromFog : tagsFromSpec
  }
}

/**
 * Rejects specs that would take a system fog out of its required roles.
 * A system fog must run an `interior` router and a NATS `server`; non-system
 * fogs are not checked.
 *
 * @param {object} fog - fog row (`isSystem` is read)
 * @param {object} spec - parsed platform spec
 * @throws {Errors.ValidationError} when a system fog has another router or NATS mode
 */
function validateSystemFogInvariants (fog, spec) {
  if (fog.isSystem) {
    const { routerMode, natsMode } = spec
    if (routerMode !== 'interior') {
      throw new Errors.ValidationError(AppHelper.formatMessage(ErrorMessages.INVALID_ROUTER_MODE, routerMode))
    }
    if (natsMode !== 'server') {
      throw new Errors.ValidationError(AppHelper.formatMessage(ErrorMessages.INVALID_NATS_MODE, natsMode))
    }
  }
}

/**
 * Projects the NATS-related spec fields into the config object handed to
 * NatsService, substituting the service defaults for falsy JetStream sizes.
 *
 * @param {object} spec - parsed platform spec
 * @returns {object} NATS configuration
 */
function buildNatsConfig (spec) {
  const storageSize = spec.jsStorageSize || NatsService.DEFAULT_JS_STORAGE_SIZE
  const memoryStoreSize = spec.jsMemoryStoreSize || NatsService.DEFAULT_JS_MEMORY_STORE_SIZE

  return {
    mode: spec.natsMode,
    serverPort: spec.natsServerPort,
    leafPort: spec.natsLeafPort,
    clusterPort: spec.natsClusterPort,
    mqttPort: spec.natsMqttPort,
    httpPort: spec.natsHttpPort,
    upstreamNatsServers: spec.upstreamNatsServers,
    jsStorageSize: storageSize,
    jsMemoryStoreSize: memoryStoreSize
  }
}

/**
 * Returns the identifier used for a router in topology snapshots: the
 * well-known default-router name when it is the default router, otherwise
 * the UUID of the fog that owns it.
 */
function _getRouterUuid (router, defaultRouter) {
  if (defaultRouter && router.id === defaultRouter.id) {
    return Constants.DEFAULT_ROUTER_NAME
  }
  return router.iofogUuid
}

/**
 * Returns the identifier used for a NATS instance in topology snapshots: the
 * well-known hub name when it is the default hub, otherwise the UUID of the
 * fog that owns it.
 */
function _getNatsUuid (nats, defaultHub) {
  if (defaultHub && nats.id === defaultHub.id) {
    return Constants.DEFAULT_NATS_HUB_NAME
  }
  return nats.iofogUuid
}
/**
 * Captures the router/NATS topology of one fog in a form that is cheap to
 * compare: the two modes plus the sorted, comma-joined upstream identifiers.
 *
 * Lookups run one after another on the supplied transaction.
 *
 * @param {string} fogUuid - fog whose topology is captured
 * @param {object} transaction - transaction passed to every manager call
 * @returns {Promise<{routerMode: string, natsMode: string, upstreamRouters: string, upstreamNatsServers: string}>}
 */
async function captureTopologySnapshot (fogUuid, transaction) {
  const defaultRouter = await RouterManager.findOne({ isDefault: true }, transaction)
  const defaultHub = await NatsInstanceManager.findOne({ isHub: true }, transaction)
  const router = await RouterManager.findOne({ iofogUuid: fogUuid }, transaction)
  const nats = await NatsInstanceManager.findByFog(fogUuid, transaction)

  let routerUpstreams = []
  if (router) {
    const links = await RouterConnectionManager.findAllWithRouters({ sourceRouter: router.id }, transaction)
    routerUpstreams = (links || []).map((link) => _getRouterUuid(link.dest, defaultRouter))
    // Sorted so the snapshot does not depend on query ordering.
    routerUpstreams.sort()
  }

  let natsUpstreams = []
  if (nats) {
    const links = await NatsConnectionManager.findAllWithNats({ sourceNats: nats.id }, transaction)
    natsUpstreams = (links || []).map((link) => _getNatsUuid(link.dest, defaultHub))
    natsUpstreams.sort()
  }

  let routerMode = 'none'
  if (router) {
    routerMode = router.isEdge ? 'edge' : 'interior'
  }
  let natsMode = 'none'
  if (nats) {
    natsMode = nats.isLeaf ? 'leaf' : 'server'
  }

  return {
    routerMode,
    natsMode,
    upstreamRouters: routerUpstreams.join(','),
    upstreamNatsServers: natsUpstreams.join(',')
  }
}

/**
 * Tells whether two topology snapshots differ in any compared field.
 *
 * @param {object} before - snapshot taken before reconciling
 * @param {object} after - snapshot taken after reconciling
 * @returns {boolean} true when a mode or an upstream list changed
 */
function topologyChanged (before, after) {
  const comparedFields = ['routerMode', 'natsMode', 'upstreamRouters', 'upstreamNatsServers']
  return comparedFields.some((field) => before[field] !== after[field])
}

/**
 * Cuts an error message down to at most `maxLength` characters.
 *
 * @param {string} errorMessage - message to shorten
 * @param {number} [maxLength=200] - maximum length kept
 * @returns {string} the message, truncated when it was longer than `maxLength`
 */
function truncateErrorMessage (errorMessage, maxLength = 200) {
  if (errorMessage.length > maxLength) {
    return errorMessage.slice(0, maxLength)
  }
  return errorMessage
}
errorMessage.slice(0, maxLength) : errorMessage +} + +async function markReconcileFailed (fogUuid, error, transaction) { + const errorMessage = error.message || String(error) + const shortError = truncateErrorMessage(errorMessage) + + await FogPlatformStatusManager.setPhase(fogUuid, 'Failed', { + lastError: errorMessage + }, transaction) + await FogManager.update({ uuid: fogUuid }, { + warningMessage: `Platform reconcile: ${shortError}` + }, transaction) +} + +function buildReadyConditions (spec, router, nats) { + const routerReady = spec.routerMode === 'none' || !!router + const natsReady = spec.natsMode === 'none' || !!nats + return [ + { type: 'RouterReady', status: routerReady ? 'True' : 'False', reason: 'ReconcileComplete' }, + { type: 'NatsReady', status: natsReady ? 'True' : 'False', reason: 'ReconcileComplete' } + ] +} + +async function reconcileFog (fogUuid, transaction) { + const startedAt = Date.now() + let generation = null + let phase = 'Progressing' + + try { + const fog = await FogManager.findOneWithTags({ uuid: fogUuid }, transaction) + if (!fog) { + throw new Errors.NotFoundError(AppHelper.formatMessage(ErrorMessages.INVALID_IOFOG_UUID, fogUuid)) + } + + const parsedSpec = await FogPlatformSpecManager.getParsedSpec(fogUuid, transaction) + if (!parsedSpec) { + throw new Errors.NotFoundError(`Fog platform spec not found for fog ${fogUuid}`) + } + + const status = await FogPlatformStatusManager.getParsedStatus(fogUuid, transaction) + if (status && status.phase === 'Deleting') { + logger.info('fogPlatformReconcile skipped delete-owned fog', { + fogUuid, + generation: parsedSpec.generation, + phase: status.phase, + durationMs: Date.now() - startedAt + }) + return { skipped: true, reason: 'deleting' } + } + + generation = parsedSpec.generation + const spec = parsedSpec.spec + const fogData = buildFogDataFromSpecAndFog(fog, spec) + const topologyBefore = await captureTopologySnapshot(fogUuid, transaction) + + await 
FogPlatformStatusManager.setPhase(fogUuid, 'Progressing', {}, transaction) + validateSystemFogInvariants(fog, spec) + + const router = await RouterManager.findOne({ iofogUuid: fogUuid }, transaction) + const oldRouterMode = router ? (router.isEdge ? 'edge' : 'interior') : 'none' + const isRouterModeChanged = spec.routerMode !== oldRouterMode && + (spec.routerMode === 'none' || oldRouterMode === 'none') + const isHostChanged = spec.host != null && spec.host !== fog.host + const shouldRecreateCerts = isRouterModeChanged || isHostChanged + + await IofogService._handleRouterCertificates(fogData, fogUuid, shouldRecreateCerts, transaction) + if (shouldRecreateCerts) { + await ChangeTrackingService.update(fogUuid, ChangeTrackingService.events.volumeMounts, transaction) + } + + const natsConfig = buildNatsConfig(spec) + if (spec.natsMode === 'none') { + await NatsService.cleanupNatsForFog(fog, transaction) + await IofogService._deleteNatsMicroserviceByFog(fogData, transaction) + await ChangeTrackingService.update(fogUuid, ChangeTrackingService.events.microserviceList, transaction) + } else { + if (isHostChanged) { + await IofogService._reconcileNatsCertificatesOnHostChange(fog, transaction) + } + await NatsService.ensureNatsForFog(fog, natsConfig, transaction) + } + + let networkRouter = null + if (spec.routerMode === 'none') { + networkRouter = await RouterService.getNetworkRouter(spec.networkRouter, transaction) + if (!networkRouter) { + throw new Errors.NotFoundError(AppHelper.formatMessage( + ErrorMessages.INVALID_ROUTER, + spec.networkRouter || Constants.DEFAULT_ROUTER_NAME + )) + } + if (router) { + await IofogService._deleteFogRouter(fogData, transaction) + } + await FogManager.update({ uuid: fogUuid }, { routerId: networkRouter.id }, transaction) + } else { + const defaultRouter = await RouterManager.findOne({ isDefault: true }, transaction) + const upstreamConnections = router + ? 
await RouterConnectionManager.findAllWithRouters({ sourceRouter: router.id }, transaction) + : [] + const upstreamRoutersIofogUuid = spec.upstreamRouters || (upstreamConnections || []) + .map((connection) => connection.dest.iofogUuid) + const upstreamRouters = await RouterService.validateAndReturnUpstreamRouters( + upstreamRoutersIofogUuid, + fog.isSystem, + defaultRouter, + transaction + ) + + const host = spec.host || (router ? router.host : null) + if (!router) { + networkRouter = await RouterService.createRouterForFog(fogData, fogUuid, upstreamRouters, transaction) + } else { + networkRouter = await RouterService.updateRouter(router, { + messagingPort: spec.messagingPort || router.messagingPort, + interRouterPort: spec.interRouterPort || router.interRouterPort, + edgeRouterPort: spec.edgeRouterPort || router.edgeRouterPort, + isEdge: spec.routerMode === 'edge', + host + }, upstreamRouters, spec.containerEngine || fog.containerEngine, transaction) + } + + const baseRouterConfig = await IofogService._getRouterMicroserviceConfig(fogUuid, transaction) + await ServiceBridgeConfig.recomputeServiceBridgeConfig(fogUuid, baseRouterConfig, transaction) + } + + if (spec.host && spec.host !== fog.host) { + await IofogService._updateMicroserviceExtraHosts(fogUuid, spec.host, transaction) + } + + if (fog.abstractedHardwareEnabled === true && spec.abstractedHardwareEnabled === false) { + await IofogService._deleteHalMicroserviceByFog(fogData, transaction) + await ChangeTrackingService.update(fogUuid, ChangeTrackingService.events.microserviceCommon, transaction) + } else if (fog.abstractedHardwareEnabled === false && spec.abstractedHardwareEnabled === true) { + await IofogService._createHalMicroserviceForFog(fogData, fog, transaction) + await ChangeTrackingService.update(fogUuid, ChangeTrackingService.events.microserviceCommon, transaction) + } + + if (fog.bluetoothEnabled === true && spec.bluetoothEnabled === false) { + await 
IofogService._deleteBluetoothMicroserviceByFog(fogData, transaction) + await ChangeTrackingService.update(fogUuid, ChangeTrackingService.events.microserviceCommon, transaction) + } else if (fog.bluetoothEnabled === false && spec.bluetoothEnabled === true) { + await IofogService._createBluetoothMicroserviceForFog(fogData, fog, transaction) + await ChangeTrackingService.update(fogUuid, ChangeTrackingService.events.microserviceCommon, transaction) + } + + const isFirstReconcile = !status || status.observedGeneration === 0 + if (isFirstReconcile) { + await ChangeTrackingService.create(fogUuid, transaction) + } + await ChangeTrackingService.update(fogUuid, ChangeTrackingService.events.microserviceCommon, transaction) + + const routerAfter = await RouterManager.findOne({ iofogUuid: fogUuid }, transaction) + const natsAfter = await NatsInstanceManager.findByFog(fogUuid, transaction) + const topologyAfter = await captureTopologySnapshot(fogUuid, transaction) + + if (topologyChanged(topologyBefore, topologyAfter)) { + await NatsService.enqueueReconcileTask({ + reason: 'cluster-routes-changed', + fogUuids: [fogUuid] + }, transaction) + } + + phase = 'Ready' + await FogPlatformStatusManager.setPhase(fogUuid, 'Ready', { + observedGeneration: generation, + lastError: null, + conditions: buildReadyConditions(spec, routerAfter, natsAfter) + }, transaction) + + await FogManager.update({ uuid: fogUuid }, { warningMessage: 'HEALTHY' }, transaction) + + logger.info('fogPlatformReconcile completed', { + fogUuid, + generation, + phase, + durationMs: Date.now() - startedAt + }) + + return { + fogUuid, + generation, + phase, + networkRouterId: networkRouter ? 
networkRouter.id : null + } + } catch (error) { + logger.error('fogPlatformReconcile failed', { + fogUuid, + generation, + phase, + durationMs: Date.now() - startedAt, + error: error.message + }) + throw error + } +} + +async function reconcileFogDelete (fogUuid, transaction) { + const startedAt = Date.now() + + const fog = await FogManager.findOne({ uuid: fogUuid }, transaction) + if (!fog) { + logger.info('fogPlatformReconcile delete skipped missing fog', { + fogUuid, + durationMs: Date.now() - startedAt + }) + return { skipped: true, reason: 'not-found' } + } + + const parsedSpec = await FogPlatformSpecManager.getParsedSpec(fogUuid, transaction) + const fogData = parsedSpec + ? buildFogDataFromSpecAndFog(fog, parsedSpec.spec) + : { uuid: fogUuid, name: fog.name, containerEngine: fog.containerEngine } + + await IofogService._deleteFogRouter(fogData, transaction) + await IofogService._processDeleteCommand(fog, transaction) + + logger.info('fogPlatformReconcile delete completed', { + fogUuid, + phase: 'Deleting', + durationMs: Date.now() - startedAt + }) + + return { fogUuid, deleted: true } +} + +const bypassOptions = { bypassQueue: true } + +module.exports = { + buildFogDataFromSpecAndFog, + validateSystemFogInvariants, + captureTopologySnapshot, + topologyChanged, + markReconcileFailed, + reconcileFog: TransactionDecorator.generateTransaction(reconcileFog, bypassOptions), + reconcileFogDelete: TransactionDecorator.generateTransaction(reconcileFogDelete, bypassOptions) +} diff --git a/src/services/service-bridge-config.js b/src/services/service-bridge-config.js new file mode 100644 index 00000000..e9c50e3d --- /dev/null +++ b/src/services/service-bridge-config.js @@ -0,0 +1,104 @@ +const FogManager = require('../data/managers/iofog-manager') +const MicroserviceManager = require('../data/managers/microservice-manager') +const ChangeTrackingService = require('./change-tracking-service') +const IofogService = require('./iofog-service') +const { + 
/**
 * Tells whether a bridge key follows the naming used for bridges generated
 * from services (`<service>-listener` / `<service>-connector`).
 *
 * @param {*} name - bridge key
 * @returns {boolean} true only for strings ending in `-listener` or `-connector`
 */
function isServiceDerivedBridgeKey (name) {
  return typeof name === 'string' && (name.endsWith('-listener') || name.endsWith('-connector'))
}

/**
 * Returns a deep copy of a router config with every service-derived TCP
 * bridge removed, so the bridges can be rebuilt from scratch.
 *
 * The result always has `bridges.tcpConnectors` and `bridges.tcpListeners`
 * as objects. Previously that held only when `bridges` was missing; a config
 * with a partial `bridges` object came back with a different shape.
 * The input is never mutated.
 *
 * @param {object} [config] - router config (JSON-serialisable); falsy means empty
 * @returns {object} stripped copy
 */
function stripServiceDerivedBridges (config) {
  // JSON round trip: the config is persisted as JSON, so this is a faithful deep copy.
  const result = JSON.parse(JSON.stringify(config || {}))
  const isPlainObject = (value) => value !== null && typeof value === 'object' && !Array.isArray(value)
  const bridges = isPlainObject(result.bridges) ? result.bridges : {}

  for (const section of ['tcpConnectors', 'tcpListeners']) {
    const kept = {}
    if (isPlainObject(bridges[section])) {
      for (const [key, bridge] of Object.entries(bridges[section])) {
        if (!isServiceDerivedBridgeKey(key)) {
          kept[key] = bridge
        }
      }
    }
    bridges[section] = kept
  }

  result.bridges = bridges
  return result
}

/**
 * Builds the TCP listener entry for a service by delegating to
 * IofogService._buildTcpListenerForFog.
 */
function buildTcpListenerForService (service) {
  return IofogService._buildTcpListenerForFog(service)
}

/**
 * Loads a fog together with its tags and returns the tag values.
 *
 * @param {string} fogUuid - fog to look up
 * @param {object} transaction - transaction passed to the manager
 * @returns {Promise<string[]>} tag values, [] when the fog has no tags
 * @throws {Errors.NotFoundError} when the fog does not exist
 */
async function _resolveFogTagValues (fogUuid, transaction) {
  const fog = await FogManager.findOneWithTags({ uuid: fogUuid }, transaction)
  if (!fog) {
    throw new Errors.NotFoundError(AppHelper.formatMessage(ErrorMessages.INVALID_IOFOG_UUID, fogUuid))
  }
  if (fog.tags && fog.tags.length > 0) {
    return fog.tags.map((tag) => tag.value)
  }
  return []
}

/**
 * Recomputes the service-derived TCP listeners of a fog's router config:
 * strips the old ones from `baseConfig`, adds one listener per service
 * matching the fog's service tags, persists the result and returns it.
 *
 * @param {string} fogUuid - fog whose router config is rebuilt
 * @param {object} baseConfig - current router config
 * @param {object} transaction - transaction passed to every lookup
 * @returns {Promise<object>} the recomputed config
 * @throws {Errors.NotFoundError} when the fog does not exist
 */
async function recomputeServiceBridgeConfig (fogUuid, baseConfig, transaction) {
  let config = stripServiceDerivedBridges(baseConfig)

  const tagValues = await _resolveFogTagValues(fogUuid, transaction)
  const serviceTags = await IofogService._extractServiceTags(tagValues)

  // Without service tags nothing can match, so the service lookup is skipped.
  if (serviceTags.length > 0) {
    const services = await IofogService._findMatchingServices(serviceTags, transaction)
    for (const service of services) {
      config = IofogService._mergeTcpListener(config, buildTcpListenerForService(service))
    }
  }

  await _persistRouterConfigIfPresent(fogUuid, config, transaction)
  return config
}
service of services) { + const listenerConfig = buildTcpListenerForService(service) + config = IofogService._mergeTcpListener(config, listenerConfig) + } + + await _persistRouterConfigIfPresent(fogUuid, config, transaction) + return config +} + +async function _persistRouterConfigIfPresent (fogUuid, config, transaction) { + const fog = await FogManager.findOne({ uuid: fogUuid }, transaction) + if (!fog) { + return + } + + const application = await ensureSystemApplication(fog, transaction) + const routerName = getSystemMicroserviceName('router') + const routerMicroservice = await MicroserviceManager.findOne({ + name: routerName, + applicationId: application.id + }, transaction) + if (!routerMicroservice) { + return + } + + await MicroserviceManager.update( + { uuid: routerMicroservice.uuid }, + { config: JSON.stringify(config) }, + transaction + ) + await ChangeTrackingService.update(fogUuid, ChangeTrackingService.events.microserviceConfig, transaction) +} + +module.exports = { + isServiceDerivedBridgeKey, + stripServiceDerivedBridges, + buildTcpListenerForService, + recomputeServiceBridgeConfig +} diff --git a/src/services/service-platform-service.js b/src/services/service-platform-service.js new file mode 100644 index 00000000..14d97bfe --- /dev/null +++ b/src/services/service-platform-service.js @@ -0,0 +1,465 @@ +const TransactionDecorator = require('../decorators/transaction-decorator') +const config = require('../config') +const Errors = require('../helpers/errors') +const ErrorMessages = require('../helpers/error-messages') +const AppHelper = require('../helpers/app-helper') +const ServiceManager = require('../data/managers/service-manager') +const FogPlatformReconcileTaskManager = require('../data/managers/fog-platform-reconcile-task-manager') +const ServicePlatformReconcileTaskManager = require('../data/managers/service-platform-reconcile-task-manager') +const HubRouterConfigLockManager = require('../data/managers/hub-router-config-lock-manager') +const 
// Name of the Kubernetes ConfigMap that holds the hub router's skrouterd.json.
const K8S_ROUTER_CONFIG_MAP = 'iofog-router'
// Delay between two attempts to take the hub router config lock.
const HUB_LOCK_POLL_MS = 500

/**
 * Returns this controller's UUID as configured under `app.uuid`.
 */
function getControllerUuid () {
  return config.get('app.uuid')
}

/**
 * Normalises a tag collection to a list of tag values.
 *
 * Accepts plain strings and `{ value }` objects in the same list. Anything
 * that is not an array yields [], and null/undefined entries are skipped;
 * both previously raised a TypeError, which matters because tags can come
 * from a deserialised spec snapshot.
 *
 * @param {Array<string|{value: string}>|null|undefined} tags - tags to normalise
 * @returns {string[]} tag values in input order
 */
function normalizeTags (tags) {
  if (!Array.isArray(tags)) {
    return []
  }
  return tags
    .filter((tag) => tag != null)
    .map((tag) => (typeof tag === 'string' ? tag : tag.value))
}

/**
 * Returns the de-duplicated union of two tag collections, keeping
 * first-seen order (all of `tagsA`, then the new values of `tagsB`).
 */
function unionTags (tagsA, tagsB) {
  return [...new Set([...normalizeTags(tagsA), ...normalizeTags(tagsB)])]
}

/**
 * Projects a service row (or a stored spec snapshot) onto the plain service
 * config object used by the hub reconcile helpers.
 *
 * @param {object} service - service row or snapshot
 * @returns {object} service config with `tags` normalised to values
 */
function buildServiceConfigFromRow (service) {
  return {
    name: service.name,
    type: service.type,
    resource: service.resource,
    defaultBridge: service.defaultBridge,
    bridgePort: service.bridgePort,
    targetPort: service.targetPort,
    servicePort: service.servicePort,
    k8sType: service.k8sType,
    serviceEndpoint: service.serviceEndpoint,
    tags: normalizeTags(service.tags)
  }
}

/**
 * Finds the system router microservice that runs on a fog.
 *
 * @param {string} fogNodeUuid - fog hosting the router
 * @param {object} transaction - transaction passed to every lookup
 * @returns {Promise<object>} router microservice row
 * @throws {Errors.NotFoundError} when the fog or its router microservice is missing
 */
async function _getRouterMicroservice (fogNodeUuid, transaction) {
  const fog = await FogManager.findOne({ uuid: fogNodeUuid }, transaction)
  if (!fog) {
    throw new Errors.NotFoundError(AppHelper.formatMessage(ErrorMessages.INVALID_IOFOG_UUID, fogNodeUuid))
  }
  const application = await ensureSystemApplication(fog, transaction)
  const routerName = getSystemMicroserviceName('router')
  const routerMicroservice = await MicroserviceManager.findOne({
    name: routerName,
    applicationId: application.id
  }, transaction)
  if (!routerMicroservice) {
    throw new Errors.NotFoundError(`Router microservice not found: ${routerName}`)
  }
  return routerMicroservice
}
/**
 * Stores a new config on a fog's router microservice and flags the fog's
 * microservice config as changed.
 *
 * @param {string} fogNodeUuid - fog hosting the router microservice
 * @param {object} routerConfig - config to persist (stored as JSON)
 * @param {object} transaction - transaction passed to every call
 */
async function _updateRouterMicroserviceConfig (fogNodeUuid, routerConfig, transaction) {
  const routerMicroservice = await _getRouterMicroservice(fogNodeUuid, transaction)
  await MicroserviceManager.update(
    { uuid: routerMicroservice.uuid },
    { config: JSON.stringify(routerConfig) },
    transaction
  )
  await ChangeTrackingService.update(fogNodeUuid, ChangeTrackingService.events.microserviceConfig, transaction)
}

/**
 * Writes a router config into the `skrouterd.json` key of the hub router
 * ConfigMap.
 *
 * @param {Array} routerConfig - full skrouterd entity list
 * @throws {Errors.NotFoundError} when the ConfigMap does not exist
 */
async function _patchK8sRouterConfig (routerConfig) {
  const configMap = await K8sClient.getConfigMap(K8S_ROUTER_CONFIG_MAP)
  if (!configMap) {
    throw new Errors.NotFoundError(`ConfigMap not found: ${K8S_ROUTER_CONFIG_MAP}`)
  }
  await K8sClient.patchConfigMap(K8S_ROUTER_CONFIG_MAP, {
    data: {
      'skrouterd.json': JSON.stringify(routerConfig)
    }
  })
}

/**
 * Reads and parses `skrouterd.json` from the hub router ConfigMap.
 *
 * A missing ConfigMap, a missing `skrouterd.json` entry or a document that is
 * not an entity list is reported with an explicit error instead of the
 * TypeError / SyntaxError the inline `JSON.parse` used to raise.
 *
 * @returns {Promise<Array>} skrouterd entity list (`[type, attributes]` pairs)
 * @throws {Errors.NotFoundError} when the ConfigMap or its entry is missing
 * @throws {Errors.ValidationError} when the entry is not a JSON array
 */
async function _readK8sRouterConfig () {
  const configMap = await K8sClient.getConfigMap(K8S_ROUTER_CONFIG_MAP)
  if (!configMap) {
    throw new Errors.NotFoundError(`ConfigMap not found: ${K8S_ROUTER_CONFIG_MAP}`)
  }
  const raw = configMap.data ? configMap.data['skrouterd.json'] : undefined
  if (typeof raw !== 'string') {
    throw new Errors.NotFoundError(`ConfigMap ${K8S_ROUTER_CONFIG_MAP} has no skrouterd.json entry`)
  }
  const routerConfig = JSON.parse(raw)
  if (!Array.isArray(routerConfig)) {
    throw new Errors.ValidationError(`ConfigMap ${K8S_ROUTER_CONFIG_MAP} skrouterd.json is not an entity list`)
  }
  return routerConfig
}

/** Inserts or replaces (matched by name) one entity in the ConfigMap router config. */
async function _upsertK8sRouterEntity (entityType, entity) {
  const routerConfig = await _readK8sRouterConfig()
  const index = routerConfig.findIndex((item) => item[0] === entityType && item[1].name === entity.name)
  if (index !== -1) {
    routerConfig[index] = [entityType, entity]
  } else {
    routerConfig.push([entityType, entity])
  }
  await _patchK8sRouterConfig(routerConfig)
}

/** Removes every entity of the given type and name from the ConfigMap router config. */
async function _removeK8sRouterEntity (entityType, entityName) {
  const routerConfig = await _readK8sRouterConfig()
  const remaining = routerConfig.filter((item) => !(item[0] === entityType && item[1].name === entityName))
  await _patchK8sRouterConfig(remaining)
}

/** Returns the UUID of the fog hosting the default router, or throws NotFoundError. */
async function _getDefaultRouterFogUuid (transaction) {
  const defaultRouter = await RouterManager.findOne({ isDefault: true }, transaction)
  if (!defaultRouter) {
    throw new Errors.NotFoundError('Default router not found')
  }
  return defaultRouter.iofogUuid
}

/** Adds or replaces one bridge under `bridges[section]` of a fog's router microservice config. */
async function _setRouterMicroserviceBridge (fogNodeUuid, section, bridge, transaction) {
  const routerMicroservice = await _getRouterMicroservice(fogNodeUuid, transaction)
  const currentConfig = JSON.parse(routerMicroservice.config || '{}')
  if (!currentConfig.bridges) {
    currentConfig.bridges = {}
  }
  if (!currentConfig.bridges[section]) {
    currentConfig.bridges[section] = {}
  }
  currentConfig.bridges[section][bridge.name] = bridge
  await _updateRouterMicroserviceConfig(fogNodeUuid, currentConfig, transaction)
}

/** Deletes one bridge from `bridges[section]`; the config is re-persisted even when nothing matched. */
async function _unsetRouterMicroserviceBridge (fogNodeUuid, section, bridgeName, transaction) {
  const routerMicroservice = await _getRouterMicroservice(fogNodeUuid, transaction)
  const currentConfig = JSON.parse(routerMicroservice.config || '{}')
  if (currentConfig.bridges && currentConfig.bridges[section]) {
    delete currentConfig.bridges[section][bridgeName]
  }
  await _updateRouterMicroserviceConfig(fogNodeUuid, currentConfig, transaction)
}

/**
 * Resolves the fog whose router carries the hub listener of a service:
 * the default router's fog for `default-router`, otherwise the fog named by
 * `defaultBridge`.
 *
 * @throws {Errors.NotFoundError} when the default router is required but missing
 */
async function _resolveHubListenerFogUuid (serviceConfig, transaction) {
  if (serviceConfig.defaultBridge === 'default-router') {
    return _getDefaultRouterFogUuid(transaction)
  }
  return serviceConfig.defaultBridge
}

/**
 * Creates or replaces the hub TCP listener of a service.
 * On Kubernetes the listener lives in the router ConfigMap, otherwise in the
 * config of the router microservice on the listener fog.
 *
 * @param {object} serviceConfig - service config
 * @param {object} transaction - transaction passed to database calls
 */
async function upsertHubTcpListener (serviceConfig, transaction) {
  const isK8s = await ServicesService.checkKubernetesEnvironment()
  const listener = ServicesService._buildTcpListener(serviceConfig)

  if (isK8s) {
    await _upsertK8sRouterEntity('tcpListener', listener)
    return
  }

  const fogNodeUuid = await _resolveHubListenerFogUuid(serviceConfig, transaction)
  await _setRouterMicroserviceBridge(fogNodeUuid, 'tcpListeners', listener, transaction)
}

/**
 * Creates or replaces the TCP connector of a service on its target router.
 * Only a connector on the default router of a Kubernetes deployment goes to
 * the ConfigMap; every other target is a router microservice config.
 *
 * @param {object} serviceConfig - service config
 * @param {object} transaction - transaction passed to database calls
 */
async function upsertHubTcpConnector (serviceConfig, transaction) {
  const isK8s = await ServicesService.checkKubernetesEnvironment()
  const targetRouterNode = await ServicesService._determineConnectorSiteId(serviceConfig, transaction)
  const connector = await ServicesService._buildTcpConnector(serviceConfig, transaction)
  const onDefaultRouter = targetRouterNode === 'default-router'

  if (onDefaultRouter && isK8s) {
    await _upsertK8sRouterEntity('tcpConnector', connector)
    return
  }

  const fogNodeUuid = onDefaultRouter ? await _getDefaultRouterFogUuid(transaction) : targetRouterNode
  await _setRouterMicroserviceBridge(fogNodeUuid, 'tcpConnectors', connector, transaction)
}

/**
 * Removes the TCP connector (`<name>-connector`) of a service from its target
 * router, using the same target resolution as upsertHubTcpConnector.
 *
 * @param {object} serviceConfig - service config
 * @param {object} transaction - transaction passed to database calls
 */
async function deleteHubTcpConnector (serviceConfig, transaction) {
  const isK8s = await ServicesService.checkKubernetesEnvironment()
  const connectorName = `${serviceConfig.name}-connector`
  const targetRouterNode = await ServicesService._determineConnectorSiteId(serviceConfig, transaction)
  const onDefaultRouter = targetRouterNode === 'default-router'

  if (onDefaultRouter && isK8s) {
    await _removeK8sRouterEntity('tcpConnector', connectorName)
    return
  }

  const fogNodeUuid = onDefaultRouter ? await _getDefaultRouterFogUuid(transaction) : targetRouterNode
  await _unsetRouterMicroserviceBridge(fogNodeUuid, 'tcpConnectors', connectorName, transaction)
}

/**
 * Removes the hub TCP listener (`<name>-listener`) of a service, from the
 * ConfigMap on Kubernetes or from the listener fog's router microservice.
 *
 * @param {object} serviceConfig - service config
 * @param {object} transaction - transaction passed to database calls
 */
async function deleteHubTcpListener (serviceConfig, transaction) {
  const isK8s = await ServicesService.checkKubernetesEnvironment()
  const listenerName = `${serviceConfig.name}-listener`

  if (isK8s) {
    await _removeK8sRouterEntity('tcpListener', listenerName)
    return
  }

  const fogNodeUuid = await _resolveHubListenerFogUuid(serviceConfig, transaction)
  await _unsetRouterMicroserviceBridge(fogNodeUuid, 'tcpListeners', listenerName, transaction)
}
/**
 * Reads a numeric setting defensively: null/undefined, blank strings and
 * values that do not convert to a finite number yield `fallback`.
 */
function _numericSetting (value, fallback) {
  if (value == null || (typeof value === 'string' && value.trim() === '')) {
    return fallback
  }
  const parsed = Number(value)
  return Number.isFinite(parsed) ? parsed : fallback
}

/**
 * Polls for the hub router config lock until it is acquired or the
 * configured timeout (`settings.hubRouterConfigLockTimeoutSeconds`,
 * default 120) elapses. The same value is passed to `tryAcquire`.
 *
 * A non-numeric setting now falls back to the default; it used to produce a
 * NaN deadline, so the function gave up without a single attempt.
 *
 * @param {string} controllerUuid - controller requesting the lock
 * @param {object} transaction - transaction passed to the lock manager
 * @returns {Promise<boolean>} true once the lock is held
 * @throws {Error} when the lock could not be acquired in time
 */
async function acquireHubLockWithTimeout (controllerUuid, transaction) {
  const timeoutSeconds = _numericSetting(config.get('settings.hubRouterConfigLockTimeoutSeconds', 120), 120)
  const deadline = Date.now() + timeoutSeconds * 1000

  while (Date.now() < deadline) {
    const acquired = await HubRouterConfigLockManager.tryAcquire(
      controllerUuid,
      timeoutSeconds,
      transaction
    )
    if (acquired) {
      return true
    }
    await new Promise((resolve) => setTimeout(resolve, HUB_LOCK_POLL_MS))
  }

  throw new Error(`Timed out waiting for hub router ConfigMap lock after ${timeoutSeconds}s`)
}

/**
 * Waits for Kubernetes to assign a LoadBalancer IP to a service, polling
 * every 2 seconds for at most
 * `settings.serviceLoadBalancerWatchTimeoutSeconds` (default 300).
 *
 * A non-numeric setting now falls back to the default; it used to turn the
 * retry count into NaN.
 *
 * @param {string} serviceName - Kubernetes service name
 * @returns {Promise<string>} the assigned IP
 * @throws {Error} when no IP was assigned in time
 */
async function watchLoadBalancerWithTimeout (serviceName) {
  const timeoutSeconds = _numericSetting(config.get('settings.serviceLoadBalancerWatchTimeoutSeconds', 300), 300)
  const retryInterval = 2000
  // Always poll at least once, even for a zero or negative timeout.
  const maxRetries = Math.max(1, Math.ceil((timeoutSeconds * 1000) / retryInterval))
  const loadBalancerIP = await K8sClient.watchLoadBalancerIP(serviceName, maxRetries, retryInterval)
  if (!loadBalancerIP) {
    throw new Error(
      `LoadBalancer IP not assigned for service ${serviceName} within ${timeoutSeconds}s`
    )
  }
  return loadBalancerIP
}

/**
 * Tells whether a service needs its own Kubernetes Service object: only on
 * Kubernetes, and only for `microservice`, `agent` and `external` types
 * (compared case-insensitively).
 *
 * @param {object} serviceConfig - service config (`type` is read)
 * @param {boolean} isK8s - whether the controller runs on Kubernetes
 * @returns {boolean}
 */
function needsK8sService (serviceConfig, isK8s) {
  if (!isK8s) {
    return false
  }
  const serviceType = (serviceConfig.type || '').toLowerCase()
  return ['microservice', 'agent', 'external'].includes(serviceType)
}

/**
 * Creates or updates the Kubernetes Service backing a service and, for
 * `LoadBalancer` services, stores the assigned IP as the service endpoint.
 * Does nothing when the service needs no Kubernetes Service.
 *
 * @param {object} serviceConfig - service config
 * @param {boolean} isK8s - whether the controller runs on Kubernetes
 * @param {object} transaction - transaction passed to database calls
 */
async function reconcileK8sService (serviceConfig, isK8s, transaction) {
  if (!needsK8sService(serviceConfig, isK8s)) {
    return
  }

  await ServicesService._updateK8sService(serviceConfig, transaction)

  if ((serviceConfig.k8sType || '').toLowerCase() === 'loadbalancer') {
    const loadBalancerIP = await watchLoadBalancerWithTimeout(serviceConfig.name)
    await ServiceManager.update(
      { name: serviceConfig.name },
      { serviceEndpoint: loadBalancerIP },
      transaction
    )
  }
}

/**
 * Enqueues a `service-changed` platform reconcile task for every fog that
 * ServicesService.handleServiceDistribution returns for the given tags.
 *
 * @param {string[]} serviceTags - tag values of the changed service
 * @param {object} transaction - transaction passed to every call
 * @returns {Promise<string[]>} UUIDs of the fogs that were enqueued
 */
async function fanOutFogReconcile (serviceTags, transaction) {
  const fogUuids = await ServicesService.handleServiceDistribution(serviceTags, transaction)
  for (const fogUuid of fogUuids) {
    await FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask({
      fogUuid,
      reason: 'service-changed'
    }, transaction)
  }
  return fogUuids
}

/**
 * Brings the hub connector and listener of a service up to date.
 * When the snapshot shows the service previously pointed at another
 * resource, the connector built from the snapshot is removed first.
 *
 * @param {object} serviceConfig - current service config
 * @param {object|null} snapshot - spec snapshot taken when the task was enqueued
 * @param {object} transaction - transaction passed to every call
 */
async function reconcileServiceHub (serviceConfig, snapshot, transaction) {
  const resourceChanged = snapshot &&
    snapshot.resource != null &&
    serviceConfig.resource != null &&
    snapshot.resource !== serviceConfig.resource
  if (resourceChanged) {
    await deleteHubTcpConnector(buildServiceConfigFromRow(snapshot), transaction)
  }

  await upsertHubTcpConnector(serviceConfig, transaction)
  await upsertHubTcpListener(serviceConfig, transaction)
}
null && + serviceConfig.resource != null && + snapshot.resource !== serviceConfig.resource) { + await deleteHubTcpConnector(buildServiceConfigFromRow(snapshot), transaction) + } + + await upsertHubTcpConnector(serviceConfig, transaction) + await upsertHubTcpListener(serviceConfig, transaction) +} + +async function reconcileServiceDeleteHub (serviceConfig, isK8s, transaction) { + await deleteHubTcpConnector(serviceConfig, transaction) + await deleteHubTcpListener(serviceConfig, transaction) + + if (isK8s && (serviceConfig.type || '').toLowerCase() !== 'k8s') { + await ServicesService._deleteK8sService(serviceConfig.name) + } +} + +async function reconcileService (serviceName, task, transaction) { + const startedAt = Date.now() + const isDelete = task && task.reason === 'delete' + const snapshot = task ? ServicePlatformReconcileTaskManager.getParsedSpecSnapshot(task) : null + const controllerUuid = getControllerUuid() + let hubLockHeld = false + + try { + let serviceConfig = null + let fanOutTags = [] + + if (isDelete) { + if (!snapshot) { + throw new Errors.ValidationError(`Service delete reconcile requires spec_snapshot for ${serviceName}`) + } + serviceConfig = buildServiceConfigFromRow(snapshot) + fanOutTags = normalizeTags(snapshot.tags) + } else { + const service = await ServiceManager.findOneWithTags({ name: serviceName }, transaction) + if (!service) { + throw new Errors.NotFoundError(`Service with name ${serviceName} not found`) + } + serviceConfig = buildServiceConfigFromRow(service) + fanOutTags = unionTags(snapshot && snapshot.tags, serviceConfig.tags) + + await ServiceManager.update( + { name: serviceName }, + { provisioningStatus: 'pending', provisioningError: null }, + transaction + ) + } + + const isK8s = await ServicesService.checkKubernetesEnvironment() + + if (isK8s) { + await acquireHubLockWithTimeout(controllerUuid, transaction) + hubLockHeld = true + } + + if (isDelete) { + await reconcileServiceDeleteHub(serviceConfig, isK8s, transaction) + } 
else { + await reconcileServiceHub(serviceConfig, snapshot, transaction) + await reconcileK8sService(serviceConfig, isK8s, transaction) + } + + if (hubLockHeld) { + await HubRouterConfigLockManager.release(controllerUuid, transaction) + hubLockHeld = false + } + + await fanOutFogReconcile(fanOutTags, transaction) + + if (!isDelete) { + await ServiceManager.update( + { name: serviceName }, + { provisioningStatus: 'ready', provisioningError: null }, + transaction + ) + } else if (task && task.id != null) { + await ServicePlatformReconcileTaskManager.delete({ id: task.id }, transaction) + } + + logger.info('servicePlatformReconcile completed', { + serviceName, + reason: task ? task.reason : null, + isDelete, + durationMs: Date.now() - startedAt + }) + + return { + serviceName, + isDelete, + provisioningStatus: isDelete ? null : 'ready' + } + } catch (error) { + if (hubLockHeld) { + try { + await HubRouterConfigLockManager.release(controllerUuid, transaction) + } catch (releaseError) { + logger.warn('servicePlatformReconcile failed to release hub lock', { + serviceName, + error: releaseError.message + }) + } + } + + logger.error('servicePlatformReconcile failed', { + serviceName, + reason: task ? 
task.reason : null, + durationMs: Date.now() - startedAt, + error: error.message + }) + throw error + } +} + +const bypassOptions = { bypassQueue: true } + +module.exports = { + normalizeTags, + unionTags, + buildServiceConfigFromRow, + upsertHubTcpListener, + upsertHubTcpConnector, + deleteHubTcpConnector, + deleteHubTcpListener, + acquireHubLockWithTimeout, + watchLoadBalancerWithTimeout, + fanOutFogReconcile, + reconcileService: TransactionDecorator.generateTransaction(reconcileService, bypassOptions) +} diff --git a/test/src/services/fog-platform-service.test.js b/test/src/services/fog-platform-service.test.js new file mode 100644 index 00000000..5e87642e --- /dev/null +++ b/test/src/services/fog-platform-service.test.js @@ -0,0 +1,236 @@ +const { expect } = require('chai') +const sinon = require('sinon') + +const FogPlatformService = require('../../../src/services/fog-platform-service') +const FogManager = require('../../../src/data/managers/iofog-manager') +const FogPlatformSpecManager = require('../../../src/data/managers/fog-platform-spec-manager') +const FogPlatformStatusManager = require('../../../src/data/managers/fog-platform-status-manager') +const RouterManager = require('../../../src/data/managers/router-manager') +const RouterConnectionManager = require('../../../src/data/managers/router-connection-manager') +const NatsInstanceManager = require('../../../src/data/managers/nats-instance-manager') +const NatsConnectionManager = require('../../../src/data/managers/nats-connection-manager') +const IofogService = require('../../../src/services/iofog-service') +const NatsService = require('../../../src/services/nats-service') +const RouterService = require('../../../src/services/router-service') +const ServiceBridgeConfig = require('../../../src/services/service-bridge-config') +const ChangeTrackingService = require('../../../src/services/change-tracking-service') +const MicroserviceManager = require('../../../src/data/managers/microservice-manager') 
+const MicroserviceService = require('../../../src/services/microservices-service') +const ApplicationManager = require('../../../src/data/managers/application-manager') +const SecretManager = require('../../../src/data/managers/secret-manager') +const FogPublicKeyManager = require('../../../src/data/managers/iofog-public-key-manager') + +describe('Fog platform service', () => { + def('sandbox', () => sinon.createSandbox()) + const transaction = {} + const fogUuid = 'fog-abc' + + afterEach(() => $sandbox.restore()) + + describe('.validateSystemFogInvariants()', () => { + it('rejects non-interior router mode for system fog', () => { + try { + FogPlatformService.validateSystemFogInvariants( + { isSystem: true }, + { routerMode: 'edge', natsMode: 'server' } + ) + throw new Error('expected validation to fail') + } catch (error) { + expect(error.name).to.equal('ValidationError') + } + }) + }) + + describe('.reconcileFog()', () => { + const fog = { + uuid: fogUuid, + name: 'edge-a', + isSystem: false, + host: '10.0.0.5', + bluetoothEnabled: false, + abstractedHardwareEnabled: false, + containerEngine: 'edgelet', + tags: [] + } + const spec = { + routerMode: 'edge', + natsMode: 'leaf', + host: '10.0.0.5', + messagingPort: 5671, + containerEngine: 'edgelet', + bluetoothEnabled: false, + abstractedHardwareEnabled: false + } + const parsedSpec = { fogUuid, generation: 2, spec } + const router = { + id: 11, + iofogUuid: fogUuid, + isEdge: true, + host: '10.0.0.5', + messagingPort: 5671, + interRouterPort: null, + edgeRouterPort: null + } + + beforeEach(() => { + $sandbox.stub(FogManager, 'findOneWithTags').resolves({ ...fog }) + $sandbox.stub(FogManager, 'findOne').resolves({ ...fog }) + $sandbox.stub(FogManager, 'update').resolves() + $sandbox.stub(FogPlatformSpecManager, 'getParsedSpec').resolves(parsedSpec) + $sandbox.stub(FogPlatformStatusManager, 'getParsedStatus').resolves({ + fogUuid, + observedGeneration: 1, + phase: 'Pending' + }) + 
$sandbox.stub(FogPlatformStatusManager, 'setPhase').resolves() + $sandbox.stub(RouterManager, 'findOne').callsFake((query) => { + if (query && query.isDefault) { + return Promise.resolve({ id: 1, iofogUuid: 'default', isDefault: true }) + } + return Promise.resolve({ ...router }) + }) + $sandbox.stub(RouterConnectionManager, 'findAllWithRouters').resolves([]) + $sandbox.stub(NatsInstanceManager, 'findOne').resolves(null) + $sandbox.stub(NatsInstanceManager, 'findByFog').resolves({ id: 5, isLeaf: true }) + $sandbox.stub(NatsConnectionManager, 'findAllWithNats').resolves([]) + $sandbox.stub(IofogService, '_handleRouterCertificates').resolves() + $sandbox.stub(NatsService, 'ensureNatsForFog').resolves() + $sandbox.stub(NatsService, 'cleanupNatsForFog').resolves() + $sandbox.stub(NatsService, 'enqueueReconcileTask').resolves() + $sandbox.stub(RouterService, 'validateAndReturnUpstreamRouters').resolves([]) + $sandbox.stub(RouterService, 'updateRouter').resolves(router) + $sandbox.stub(IofogService, '_getRouterMicroserviceConfig').resolves({ bridges: { tcpListeners: {}, tcpConnectors: {} } }) + $sandbox.stub(ServiceBridgeConfig, 'recomputeServiceBridgeConfig').resolves({ bridges: { tcpListeners: {}, tcpConnectors: {} } }) + $sandbox.stub(ChangeTrackingService, 'create').resolves() + $sandbox.stub(ChangeTrackingService, 'update').resolves() + }) + + it('skips reconcile when platform phase is Deleting', async () => { + FogPlatformStatusManager.getParsedStatus.resolves({ fogUuid, phase: 'Deleting', observedGeneration: 1 }) + + const result = await FogPlatformService.reconcileFog(fogUuid, transaction) + + expect(result).to.eql({ skipped: true, reason: 'deleting' }) + expect(FogPlatformStatusManager.setPhase).to.not.have.been.called + expect(IofogService._handleRouterCertificates).to.not.have.been.called + }) + + it('runs ordered reconcile steps and marks platform Ready', async () => { + const result = await FogPlatformService.reconcileFog(fogUuid, transaction) + + 
expect(IofogService._handleRouterCertificates).to.have.been.calledOnce + expect(NatsService.ensureNatsForFog).to.have.been.calledOnce + expect(RouterService.updateRouter).to.have.been.calledOnce + expect(ServiceBridgeConfig.recomputeServiceBridgeConfig).to.have.been.calledOnce + expect(FogPlatformStatusManager.setPhase).to.have.been.calledWith( + fogUuid, + 'Ready', + sinon.match.has('observedGeneration', 2), + transaction + ) + expect(FogManager.update).to.have.been.calledWith( + { uuid: fogUuid }, + { warningMessage: 'HEALTHY' }, + transaction + ) + expect(result.phase).to.equal('Ready') + }) + + it('is safe to reconcile the same generation twice', async () => { + await FogPlatformService.reconcileFog(fogUuid, transaction) + await FogPlatformService.reconcileFog(fogUuid, transaction) + + expect(RouterService.updateRouter).to.have.been.calledTwice + expect(ServiceBridgeConfig.recomputeServiceBridgeConfig).to.have.been.calledTwice + }) + + it('accepts worker call shape (fogUuid only) with decorator fakeTransaction outside test mode', async () => { + const appHelperPath = require.resolve('../../../src/helpers/app-helper') + const decoratorPath = require.resolve('../../../src/decorators/transaction-decorator') + const fogPlatformServicePath = require.resolve('../../../src/services/fog-platform-service') + + $sandbox.stub(require(appHelperPath), 'isTest').returns(false) + delete require.cache[decoratorPath] + delete require.cache[fogPlatformServicePath] + const FreshFogPlatformService = require('../../../src/services/fog-platform-service') + + FogPlatformStatusManager.getParsedStatus.resolves({ fogUuid, phase: 'Deleting', observedGeneration: 1 }) + + const result = await FreshFogPlatformService.reconcileFog(fogUuid) + + expect(result).to.eql({ skipped: true, reason: 'deleting' }) + expect(FogManager.findOneWithTags).to.have.been.calledWith( + { uuid: fogUuid }, + sinon.match({ fakeTransaction: true }) + ) + }) + + it('enqueues NATS resolver work when topology 
changes', async () => { + NatsInstanceManager.findByFog + .onCall(0).resolves(null) + .onCall(1).resolves({ id: 5, isLeaf: true }) + + await FogPlatformService.reconcileFog(fogUuid, transaction) + + expect(NatsService.enqueueReconcileTask).to.have.been.calledWithMatch({ + reason: 'cluster-routes-changed', + fogUuids: [fogUuid] + }, transaction) + }) + }) + + describe('.markReconcileFailed()', () => { + beforeEach(() => { + $sandbox.stub(FogPlatformStatusManager, 'setPhase').resolves() + $sandbox.stub(FogManager, 'update').resolves() + }) + + it('marks platform Failed and mirrors warningMessage on fog row', async () => { + const error = new Error('router create failed') + + await FogPlatformService.markReconcileFailed(fogUuid, error, transaction) + + expect(FogPlatformStatusManager.setPhase).to.have.been.calledOnceWith( + fogUuid, + 'Failed', + { lastError: 'router create failed' }, + transaction + ) + expect(FogManager.update).to.have.been.calledOnceWith( + { uuid: fogUuid }, + { warningMessage: 'Platform reconcile: router create failed' }, + transaction + ) + }) + }) + + describe('.reconcileFogDelete()', () => { + const fog = { uuid: fogUuid, name: 'edge-a', containerEngine: 'edgelet' } + + beforeEach(() => { + $sandbox.stub(FogManager, 'findOne').resolves(fog) + $sandbox.stub(FogPlatformSpecManager, 'getParsedSpec').resolves({ + fogUuid, + generation: 1, + spec: { routerMode: 'edge', natsMode: 'leaf', containerEngine: 'edgelet' } + }) + $sandbox.stub(IofogService, '_deleteFogRouter').resolves() + $sandbox.stub(MicroserviceManager, 'findAll').resolves([]) + $sandbox.stub(ApplicationManager, 'delete').resolves() + $sandbox.stub(ChangeTrackingService, 'update').resolves() + $sandbox.stub(SecretManager, 'findOne').resolves(null) + $sandbox.stub(NatsService, 'cleanupNatsForFog').resolves() + $sandbox.stub(FogPublicKeyManager, 'findByFogUuid').resolves(null) + $sandbox.stub(FogManager, 'delete').resolves() + }) + + it('tears down router runtime before deleting fog 
resources', async () => { + const result = await FogPlatformService.reconcileFogDelete(fogUuid, transaction) + + expect(IofogService._deleteFogRouter).to.have.been.calledBefore(FogManager.delete) + expect(FogManager.delete).to.have.been.calledOnceWith({ uuid: fogUuid }, transaction) + expect(NatsService.cleanupNatsForFog).to.have.been.calledOnceWith(fog, transaction) + expect(result).to.eql({ fogUuid, deleted: true }) + }) + }) +}) diff --git a/test/src/services/service-bridge-config.test.js b/test/src/services/service-bridge-config.test.js new file mode 100644 index 00000000..949ebdfe --- /dev/null +++ b/test/src/services/service-bridge-config.test.js @@ -0,0 +1,110 @@ +const { expect } = require('chai') +const sinon = require('sinon') + +const ServiceBridgeConfig = require('../../../src/services/service-bridge-config') +const IofogService = require('../../../src/services/iofog-service') +const FogManager = require('../../../src/data/managers/iofog-manager') +const MicroserviceManager = require('../../../src/data/managers/microservice-manager') +const ChangeTrackingService = require('../../../src/services/change-tracking-service') +const ApplicationManager = require('../../../src/data/managers/application-manager') + +describe('Service bridge config', () => { + def('sandbox', () => sinon.createSandbox()) + const transaction = {} + + afterEach(() => $sandbox.restore()) + + describe('.stripServiceDerivedBridges()', () => { + it('removes service-derived listeners and connectors while preserving router bridges', () => { + const baseConfig = { + bridges: { + tcpListeners: { + 'api-listener': { name: 'api-listener', port: '9000', address: 'api' }, + 'fog-amqp': { name: 'fog-amqp', port: '5672', address: 'amqp' } + }, + tcpConnectors: { + 'api-connector': { name: 'api-connector', host: 'hub', port: '8080' }, + 'upstream-router': { name: 'upstream-router', host: '10.0.0.2', port: '55671' } + } + } + } + + const stripped = 
ServiceBridgeConfig.stripServiceDerivedBridges(baseConfig) + + expect(stripped.bridges.tcpListeners).to.eql({ + 'fog-amqp': { name: 'fog-amqp', port: '5672', address: 'amqp' } + }) + expect(stripped.bridges.tcpConnectors).to.eql({ + 'upstream-router': { name: 'upstream-router', host: '10.0.0.2', port: '55671' } + }) + }) + }) + + describe('.recomputeServiceBridgeConfig()', () => { + const fogUuid = 'fog-1' + const fog = { + uuid: fogUuid, + name: 'edge-a', + tags: [{ value: 'service:site-a' }] + } + const services = [ + { name: 'api', bridgePort: 9001 }, + { name: 'mqtt', bridgePort: 9002 } + ] + const routerMicroservice = { uuid: 'router-ms-1' } + + beforeEach(() => { + $sandbox.stub(FogManager, 'findOneWithTags').resolves(fog) + $sandbox.stub(FogManager, 'findOne').resolves(fog) + $sandbox.stub(IofogService, '_extractServiceTags').resolves(['site-a']) + $sandbox.stub(IofogService, '_findMatchingServices').resolves(services) + $sandbox.stub(ApplicationManager, 'findOne').resolves({ id: 10, name: 'system-edge-a', isSystem: true }) + $sandbox.stub(MicroserviceManager, 'findOne').resolves(routerMicroservice) + $sandbox.stub(MicroserviceManager, 'update').resolves() + $sandbox.stub(MicroserviceManager, 'delete').resolves() + $sandbox.stub(MicroserviceManager, 'findAll').resolves([]) + $sandbox.stub(ChangeTrackingService, 'update').resolves() + }) + + it('rebuilds service listeners from the catalog and persists router config', async () => { + const baseConfig = { + bridges: { + tcpListeners: { + 'stale-listener': { name: 'stale-listener', port: '8000', address: 'stale' } + }, + tcpConnectors: {} + } + } + + const result = await ServiceBridgeConfig.recomputeServiceBridgeConfig(fogUuid, baseConfig, transaction) + + expect(result.bridges.tcpListeners).to.eql({ + 'api-listener': { name: 'api-listener', port: '9001', address: 'api' }, + 'mqtt-listener': { name: 'mqtt-listener', port: '9002', address: 'mqtt' } + }) + expect(MicroserviceManager.update).to.have.been.calledOnce 
+ expect(ChangeTrackingService.update).to.have.been.calledWith( + fogUuid, + ChangeTrackingService.events.microserviceConfig, + transaction + ) + }) + + it('clears stale service listeners when no services match', async () => { + IofogService._extractServiceTags.resolves([]) + const baseConfig = { + bridges: { + tcpListeners: { + 'api-listener': { name: 'api-listener', port: '9001', address: 'api' } + }, + tcpConnectors: {} + } + } + + const result = await ServiceBridgeConfig.recomputeServiceBridgeConfig(fogUuid, baseConfig, transaction) + + expect(result.bridges.tcpListeners).to.eql({}) + expect(MicroserviceManager.update).to.have.been.calledOnce + }) + }) +}) diff --git a/test/src/services/service-platform-service.test.js b/test/src/services/service-platform-service.test.js new file mode 100644 index 00000000..2ab1c874 --- /dev/null +++ b/test/src/services/service-platform-service.test.js @@ -0,0 +1,374 @@ +const { expect } = require('chai') +const sinon = require('sinon') + +const ServicePlatformService = require('../../../src/services/service-platform-service') +const ServiceManager = require('../../../src/data/managers/service-manager') +const ServicePlatformReconcileTaskManager = require('../../../src/data/managers/service-platform-reconcile-task-manager') +const FogPlatformReconcileTaskManager = require('../../../src/data/managers/fog-platform-reconcile-task-manager') +const HubRouterConfigLockManager = require('../../../src/data/managers/hub-router-config-lock-manager') +const RouterManager = require('../../../src/data/managers/router-manager') +const ServicesService = require('../../../src/services/services-service') +const K8sClient = require('../../../src/utils/k8s-client') +const config = require('../../../src/config') + +describe('Service platform service', () => { + def('sandbox', () => sinon.createSandbox()) + const transaction = {} + const serviceName = 'api-gateway' + + afterEach(() => $sandbox.restore()) + + describe('.unionTags()', () => { + 
it('merges snapshot and current tag sets without duplicates', () => { + const merged = ServicePlatformService.unionTags( + [{ value: 'site-a' }, { value: 'site-b' }], + ['site-b', 'site-c'] + ) + expect(merged).to.have.members(['site-a', 'site-b', 'site-c']) + }) + }) + + describe('.reconcileService()', () => { + const service = { + name: serviceName, + type: 'external', + resource: '10.0.0.8', + defaultBridge: 'default-router', + bridgePort: 9100, + targetPort: 8080, + servicePort: 9100, + k8sType: 'LoadBalancer', + tags: [{ value: 'site-a' }] + } + const task = { + id: 42, + serviceName, + reason: 'spec-changed', + specSnapshot: JSON.stringify({ + name: serviceName, + type: 'external', + resource: '10.0.0.8', + defaultBridge: 'default-router', + bridgePort: 9100, + targetPort: 8080, + servicePort: 9100, + k8sType: 'LoadBalancer', + tags: [{ value: 'site-a' }] + }) + } + + beforeEach(() => { + $sandbox.stub(config, 'get').callsFake((key, defaultValue) => { + if (key === 'app.uuid') { + return 'controller-uuid-1' + } + if (key === 'settings.hubRouterConfigLockTimeoutSeconds') { + return 120 + } + if (key === 'settings.serviceLoadBalancerWatchTimeoutSeconds') { + return 300 + } + return defaultValue + }) + $sandbox.stub(ServicesService, 'checkKubernetesEnvironment').resolves(true) + $sandbox.stub(HubRouterConfigLockManager, 'tryAcquire').resolves(true) + $sandbox.stub(HubRouterConfigLockManager, 'release').resolves(true) + $sandbox.stub(ServicesService, '_determineConnectorSiteId').resolves('default-router') + $sandbox.stub(ServicesService, '_buildTcpConnector').resolves({ + name: `${serviceName}-connector`, + host: '10.0.0.8', + port: '8080', + address: serviceName, + processId: `${serviceName}-external-8080` + }) + $sandbox.stub(ServicesService, '_buildTcpListener').returns({ + name: `${serviceName}-listener`, + port: '9100', + address: serviceName + }) + $sandbox.stub(K8sClient, 'getConfigMap').resolves({ + data: { + 'skrouterd.json': JSON.stringify([]) + } + }) 
+ $sandbox.stub(K8sClient, 'patchConfigMap').resolves() + $sandbox.stub(ServicesService, '_updateK8sService').resolves() + $sandbox.stub(K8sClient, 'watchLoadBalancerIP').resolves('203.0.113.10') + $sandbox.stub(ServicesService, 'handleServiceDistribution').resolves(['fog-a']) + $sandbox.stub(FogPlatformReconcileTaskManager, 'enqueueFogPlatformReconcileTask').resolves({ id: 1 }) + $sandbox.stub(ServiceManager, 'findOneWithTags').resolves({ ...service, tags: [...service.tags] }) + $sandbox.stub(ServiceManager, 'update').resolves() + $sandbox.stub(ServicePlatformReconcileTaskManager, 'delete').resolves() + }) + + it('runs hub reconcile, fan-out, and marks provisioning ready', async () => { + const result = await ServicePlatformService.reconcileService(serviceName, task, transaction) + + expect(HubRouterConfigLockManager.tryAcquire).to.have.been.calledOnce + expect(K8sClient.patchConfigMap).to.have.been.called + expect(ServicesService._updateK8sService).to.have.been.calledOnce + expect(K8sClient.watchLoadBalancerIP).to.have.been.calledOnce + expect(HubRouterConfigLockManager.release).to.have.been.calledOnce + expect(FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask).to.have.been.calledWith({ + fogUuid: 'fog-a', + reason: 'service-changed' + }, transaction) + expect(ServiceManager.update).to.have.been.calledWith( + { name: serviceName }, + { provisioningStatus: 'ready', provisioningError: null }, + transaction + ) + expect(result.provisioningStatus).to.equal('ready') + }) + + it('fans out fog reconcile to old and new tagged fogs from snapshot union', async () => { + const tagChangeTask = { + ...task, + specSnapshot: JSON.stringify({ + ...JSON.parse(task.specSnapshot), + tags: [{ value: 'site-a' }] + }) + } + + ServiceManager.findOneWithTags.resolves({ + ...service, + tags: [{ value: 'site-b' }] + }) + ServicesService.handleServiceDistribution.resolves(['fog-a', 'fog-b', 'fog-c']) + + await ServicePlatformService.reconcileService(serviceName, 
tagChangeTask, transaction) + + expect(ServicesService.handleServiceDistribution).to.have.been.calledWith( + ['site-a', 'site-b'], + transaction + ) + expect(FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask).to.have.callCount(3) + }) + + it('is safe to reconcile the same service twice', async () => { + await ServicePlatformService.reconcileService(serviceName, task, transaction) + await ServicePlatformService.reconcileService(serviceName, task, transaction) + + expect(K8sClient.patchConfigMap.callCount).to.be.at.least(4) + }) + + it('throws when LoadBalancer IP watch times out', async () => { + K8sClient.watchLoadBalancerIP.resolves(null) + + try { + await ServicePlatformService.reconcileService(serviceName, task, transaction) + throw new Error('expected reconcile to fail') + } catch (error) { + expect(error.message).to.include('LoadBalancer IP not assigned') + } + + expect(HubRouterConfigLockManager.release).to.have.been.calledOnce + expect(ServiceManager.update).to.not.have.been.calledWith( + { name: serviceName }, + { provisioningStatus: 'ready', provisioningError: null }, + transaction + ) + }) + }) + + describe('.reconcileService() delete path', () => { + const snapshot = { + name: serviceName, + type: 'external', + resource: '10.0.0.8', + defaultBridge: 'default-router', + bridgePort: 9100, + targetPort: 8080, + servicePort: 9100, + k8sType: 'LoadBalancer', + tags: [{ value: 'site-a' }, { value: 'site-b' }] + } + const deleteTask = { + id: 99, + serviceName, + reason: 'delete', + specSnapshot: JSON.stringify(snapshot) + } + + beforeEach(() => { + $sandbox.stub(config, 'get').callsFake((key, defaultValue) => { + if (key === 'app.uuid') { + return 'controller-uuid-1' + } + return defaultValue + }) + $sandbox.stub(ServicesService, 'checkKubernetesEnvironment').resolves(true) + $sandbox.stub(HubRouterConfigLockManager, 'tryAcquire').resolves(true) + $sandbox.stub(HubRouterConfigLockManager, 'release').resolves(true) + $sandbox.stub(ServicesService, 
'_determineConnectorSiteId').resolves('default-router') + $sandbox.stub(K8sClient, 'getConfigMap').resolves({ + data: { + 'skrouterd.json': JSON.stringify([ + ['tcpConnector', { name: `${serviceName}-connector` }], + ['tcpListener', { name: `${serviceName}-listener` }] + ]) + } + }) + $sandbox.stub(K8sClient, 'patchConfigMap').resolves() + $sandbox.stub(ServicesService, '_deleteK8sService').resolves() + $sandbox.stub(ServicesService, 'handleServiceDistribution').resolves(['fog-a', 'fog-b']) + $sandbox.stub(FogPlatformReconcileTaskManager, 'enqueueFogPlatformReconcileTask').resolves({ id: 1 }) + $sandbox.stub(ServicePlatformReconcileTaskManager, 'delete').resolves() + $sandbox.stub(ServiceManager, 'findOneWithTags') + $sandbox.stub(ServiceManager, 'update') + }) + + it('uses spec_snapshot for hub teardown, fan-out, and destroys the task', async () => { + const result = await ServicePlatformService.reconcileService(serviceName, deleteTask, transaction) + + expect(ServiceManager.findOneWithTags).to.not.have.been.called + expect(K8sClient.patchConfigMap).to.have.been.calledTwice + expect(ServicesService._deleteK8sService).to.have.been.calledWith(serviceName) + expect(ServicesService.handleServiceDistribution).to.have.been.calledWith( + ['site-a', 'site-b'], + transaction + ) + expect(ServicePlatformReconcileTaskManager.delete).to.have.been.calledWith({ id: 99 }, transaction) + expect(ServiceManager.update).to.not.have.been.called + expect(result.isDelete).to.equal(true) + }) + }) + + describe('.acquireHubLockWithTimeout()', () => { + let clock + + beforeEach(() => { + clock = sinon.useFakeTimers() + $sandbox.stub(config, 'get').callsFake((key, defaultValue) => { + if (key === 'settings.hubRouterConfigLockTimeoutSeconds') { + return 1 + } + return defaultValue + }) + $sandbox.stub(HubRouterConfigLockManager, 'tryAcquire').resolves(false) + }) + + afterEach(() => { + clock.restore() + }) + + it('times out when hub lock is held by another controller', async () => { + 
const acquirePromise = ServicePlatformService.acquireHubLockWithTimeout('controller-uuid-1', transaction) + await clock.runAllAsync() + + try { + await acquirePromise + throw new Error('expected lock acquire to fail') + } catch (error) { + expect(error.message).to.include('Timed out waiting for hub router ConfigMap lock') + } + + expect(HubRouterConfigLockManager.tryAcquire).to.have.been.called + }) + }) + + describe('.fanOutFogReconcile()', () => { + beforeEach(() => { + $sandbox.stub(ServicesService, 'handleServiceDistribution').resolves(['fog-a', 'fog-b']) + $sandbox.stub(FogPlatformReconcileTaskManager, 'enqueueFogPlatformReconcileTask').resolves({ id: 1 }) + }) + + it('enqueues fog platform reconcile tasks for distributed fogs', async () => { + const fogUuids = await ServicePlatformService.fanOutFogReconcile(['site-a'], transaction) + + expect(fogUuids).to.eql(['fog-a', 'fog-b']) + expect(FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask).to.have.been.calledTwice + expect(FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask).to.have.been.calledWith({ + fogUuid: 'fog-a', + reason: 'service-changed' + }, transaction) + }) + }) + + describe('.upsertHubTcpListener()', () => { + const serviceConfig = { + name: serviceName, + bridgePort: 9100 + } + + beforeEach(() => { + $sandbox.stub(ServicesService, 'checkKubernetesEnvironment').resolves(true) + $sandbox.stub(ServicesService, '_buildTcpListener').returns({ + name: `${serviceName}-listener`, + port: '9100', + address: serviceName + }) + $sandbox.stub(K8sClient, 'getConfigMap').resolves({ + data: { + 'skrouterd.json': JSON.stringify([ + ['tcpListener', { name: 'other-listener', port: '8000', address: 'other' }] + ]) + } + }) + $sandbox.stub(K8sClient, 'patchConfigMap').resolves() + }) + + it('upserts hub listener entries in the K8s router ConfigMap', async () => { + await ServicePlatformService.upsertHubTcpListener(serviceConfig, transaction) + + 
expect(K8sClient.patchConfigMap).to.have.been.calledOnce + const patchData = K8sClient.patchConfigMap.firstCall.args[1] + const routerConfig = JSON.parse(patchData.data['skrouterd.json']) + expect(routerConfig).to.have.length(2) + expect(routerConfig[1]).to.eql([ + 'tcpListener', + { name: `${serviceName}-listener`, port: '9100', address: serviceName } + ]) + }) + }) + + describe('.upsertHubTcpConnector()', () => { + const serviceConfig = { + name: serviceName, + type: 'external', + resource: '10.0.0.8', + targetPort: 8080 + } + + beforeEach(() => { + $sandbox.stub(ServicesService, 'checkKubernetesEnvironment').resolves(false) + $sandbox.stub(ServicesService, '_determineConnectorSiteId').resolves('default-router') + $sandbox.stub(ServicesService, '_buildTcpConnector').resolves({ + name: `${serviceName}-connector`, + host: '10.0.0.8', + port: '8080', + address: serviceName, + processId: `${serviceName}-external-8080` + }) + $sandbox.stub(RouterManager, 'findOne').resolves({ iofogUuid: 'default-fog' }) + }) + + it('upserts connector on the default router microservice when not on K8s CP', async () => { + const FogManager = require('../../../src/data/managers/iofog-manager') + const MicroserviceManager = require('../../../src/data/managers/microservice-manager') + const ApplicationManager = require('../../../src/data/managers/application-manager') + const ChangeTrackingService = require('../../../src/services/change-tracking-service') + + $sandbox.stub(FogManager, 'findOne').resolves({ uuid: 'default-fog', name: 'controlplane' }) + $sandbox.stub(ApplicationManager, 'findOne').resolves({ id: 10, name: 'system-controlplane', isSystem: true }) + $sandbox.stub(MicroserviceManager, 'findAll').resolves([]) + $sandbox.stub(MicroserviceManager, 'findOne').resolves({ + uuid: 'router-ms-1', + config: JSON.stringify({ bridges: { tcpConnectors: {} } }) + }) + $sandbox.stub(MicroserviceManager, 'update').resolves() + $sandbox.stub(MicroserviceManager, 'delete').resolves() + 
$sandbox.stub(ChangeTrackingService, 'update').resolves() + + await ServicePlatformService.upsertHubTcpConnector(serviceConfig, transaction) + + expect(MicroserviceManager.update).to.have.been.calledOnce + const updatePayload = MicroserviceManager.update.firstCall.args[1] + const parsedConfig = JSON.parse(updatePayload.config) + expect(parsedConfig.bridges.tcpConnectors[`${serviceName}-connector`]).to.include({ + name: `${serviceName}-connector`, + host: '10.0.0.8' + }) + }) + }) +}) From 61ee1316097485a65b5e8e50951f048254d969bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Wed, 24 Jun 2026 21:17:29 +0300 Subject: [PATCH 04/11] Add platform reconcile worker, sweep job, and staggered job startup. Run fog and service reconcile claims in one worker with backoff and max attempts. Add periodic drift sweep and delay reconcile-heavy jobs on boot to reduce SQLite lock contention on single-controller deployments. --- src/config/config.yaml | 15 +- src/config/env-mapping.js | 9 + src/helpers/job-startup.js | 39 +++ src/jobs/fog-platform-sweep-job.js | 311 ++++++++++++++++++ src/jobs/nats-reconcile-worker-job.js | 21 +- src/jobs/platform-reconcile-worker-job.js | 191 +++++++++++ src/server.js | 29 +- test/src/helpers/job-startup.test.js | 71 ++++ test/src/jobs/fog-platform-sweep-job.test.js | 206 ++++++++++++ .../platform-reconcile-worker-job.test.js | 227 +++++++++++++ 10 files changed, 1104 insertions(+), 15 deletions(-) create mode 100644 src/helpers/job-startup.js create mode 100644 src/jobs/fog-platform-sweep-job.js create mode 100644 src/jobs/platform-reconcile-worker-job.js create mode 100644 test/src/helpers/job-startup.test.js create mode 100644 test/src/jobs/fog-platform-sweep-job.test.js create mode 100644 test/src/jobs/platform-reconcile-worker-job.test.js diff --git a/src/config/config.yaml b/src/config/config.yaml index 0f1e5072..12c9b1e4 100644 --- a/src/config/config.yaml +++ b/src/config/config.yaml @@ -65,6 +65,15 @@ settings: 
natsReconcileChunkSize: 1 # NATS reconcile task chunk size in fogs per task (default: 1) natsReconcileTaskStalenessSeconds: 900 # NATS reconcile task staleness for reclaim (default: 900 = 15 minutes) natsReconcileWorkerIntervalSeconds: 3 # NATS reconcile worker poll interval in seconds (default: 3) + fogPlatformReconcileWorkerIntervalSeconds: 3 # Platform reconcile worker poll interval in seconds (default: 3) + fogPlatformReconcileTaskStalenessSeconds: 300 # Stale fog platform task reclaim in seconds (default: 300 = 5 minutes) + fogPlatformReconcileMaxAttempts: 10 # Permanent fail threshold for fog platform tasks (default: 10) + fogPlatformReconcileBackoffBaseSeconds: 5 # Exponential backoff base for fog platform tasks (default: 5) + fogPlatformSweepIntervalSeconds: 900 # Platform drift sweep interval in seconds (default: 900 = 15 minutes) + servicePlatformReconcileMaxAttempts: 10 # Permanent fail threshold for service platform tasks (default: 10) + hubRouterConfigLockTimeoutSeconds: 120 # Hub router ConfigMap lock wait in seconds (default: 120) + serviceLoadBalancerWatchTimeoutSeconds: 300 # LoadBalancer IP watch timeout in service reconcile (default: 300) + jobStartupDelaySeconds: 3 # Delay before reconcile-heavy background jobs start (default: 3) # Database Configuration database: @@ -89,6 +98,10 @@ database: databaseName: controller_db.sqlite # SQLite database file name logging: false # Enable SQLite query logging transactionType: IMMEDIATE # SQLite transaction type + pragmas: + journalMode: WAL # WAL allows concurrent readers during writes (production single-node) + busyTimeoutMs: 10000 # Wait up to 10s for lock before SQLITE_BUSY + synchronous: NORMAL # Safe with WAL; better write throughput than FULL pool: maxActive: 1 # Maximum active connections max: 1 # Maximum total connections @@ -125,7 +138,7 @@ auth: # sessionTtlSeconds: # AUTH_OIDC_SESSION_TTL_SECONDS — defaults to AuthPolicy refresh_token_ttl_seconds # idTokenTtlSeconds: # 
AUTH_OIDC_ID_TOKEN_TTL_SECONDS — defaults to AuthPolicy access_token_ttl_seconds -# Build flavor (Plan 12 — override via env or Docker build-arg) +# Build flavor (override via env or Docker build-arg) flavor: distribution: datasance rbacApiVersion: datasance.com/v3 diff --git a/src/config/env-mapping.js b/src/config/env-mapping.js index 228843b6..0e1bf1c2 100644 --- a/src/config/env-mapping.js +++ b/src/config/env-mapping.js @@ -54,6 +54,15 @@ module.exports = { CONTROLLER_HEARTBEAT_INTERVAL: 'settings.controllerHeartbeatInterval', CONTROLLER_INACTIVE_THRESHOLD: 'settings.controllerInactiveThreshold', CONTROLLER_CLEANUP_INTERVAL: 'settings.controllerCleanupInterval', + FOG_PLATFORM_RECONCILE_WORKER_INTERVAL_SECONDS: 'settings.fogPlatformReconcileWorkerIntervalSeconds', + FOG_PLATFORM_RECONCILE_TASK_STALENESS_SECONDS: 'settings.fogPlatformReconcileTaskStalenessSeconds', + FOG_PLATFORM_RECONCILE_MAX_ATTEMPTS: 'settings.fogPlatformReconcileMaxAttempts', + FOG_PLATFORM_RECONCILE_BACKOFF_BASE_SECONDS: 'settings.fogPlatformReconcileBackoffBaseSeconds', + FOG_PLATFORM_SWEEP_INTERVAL_SECONDS: 'settings.fogPlatformSweepIntervalSeconds', + SERVICE_PLATFORM_RECONCILE_MAX_ATTEMPTS: 'settings.servicePlatformReconcileMaxAttempts', + HUB_ROUTER_CONFIG_LOCK_TIMEOUT_SECONDS: 'settings.hubRouterConfigLockTimeoutSeconds', + SERVICE_LOAD_BALANCER_WATCH_TIMEOUT_SECONDS: 'settings.serviceLoadBalancerWatchTimeoutSeconds', + JOB_STARTUP_DELAY_SECONDS: 'settings.jobStartupDelaySeconds', // Database Configuration DB_PROVIDER: 'database.provider', diff --git a/src/helpers/job-startup.js b/src/helpers/job-startup.js new file mode 100644 index 00000000..a764b869 --- /dev/null +++ b/src/helpers/job-startup.js @@ -0,0 +1,39 @@ +const config = require('../config') +const logger = require('../logger') + +const RECONCILE_HEAVY_JOBS = new Set([ + 'platform-reconcile-worker-job.js', + 'nats-reconcile-worker-job.js', + 'fog-platform-sweep-job.js', + 'fog-status-job.js' +]) + +const JOB_STAGGER_MS = 
500 + +/** + * Start background jobs after API listen. Reconcile-heavy jobs are delayed and staggered + * to avoid a startup thundering herd against SQLite (and reduce contention on all DB types). + * @param {{ module: { run: Function }, file: string }[]} jobEntries + */ +function startBackgroundJobs (jobEntries) { + const baseDelayMs = config.get('settings.jobStartupDelaySeconds', 3) * 1000 + let staggerIndex = 0 + + for (const { module: job, file } of jobEntries) { + if (RECONCILE_HEAVY_JOBS.has(file)) { + const delayMs = baseDelayMs + staggerIndex * JOB_STAGGER_MS + staggerIndex++ + setTimeout(() => { + logger.debug(`Starting background job ${file} after ${delayMs}ms startup delay`) + job.run() + }, delayMs) + } else { + job.run() + } + } +} + +module.exports = { + RECONCILE_HEAVY_JOBS, + startBackgroundJobs +} diff --git a/src/jobs/fog-platform-sweep-job.js b/src/jobs/fog-platform-sweep-job.js new file mode 100644 index 00000000..b0a37ad8 --- /dev/null +++ b/src/jobs/fog-platform-sweep-job.js @@ -0,0 +1,311 @@ +const { Op } = require('sequelize') +const ClusterControllerService = require('../services/cluster-controller-service') +const FogPlatformSpecManager = require('../data/managers/fog-platform-spec-manager') +const FogPlatformStatusManager = require('../data/managers/fog-platform-status-manager') +const FogPlatformReconcileTaskManager = require('../data/managers/fog-platform-reconcile-task-manager') +const ServicePlatformReconcileTaskManager = require('../data/managers/service-platform-reconcile-task-manager') +const FogManager = require('../data/managers/iofog-manager') +const ServiceManager = require('../data/managers/service-manager') +const RouterManager = require('../data/managers/router-manager') +const NatsInstanceManager = require('../data/managers/nats-instance-manager') +const TransactionDecorator = require('../decorators/transaction-decorator') +const IofogService = require('../services/iofog-service') +const ServicesService = 
require('../services/services-service') +const K8sClient = require('../utils/k8s-client') +const databaseProvider = require('../data/providers/database-factory') +const Config = require('../config') +const logger = require('../logger') + +const ACTIVE_STATUSES = ['pending', 'in_progress'] +const K8S_ROUTER_CONFIG_MAP = 'iofog-router' + +const scheduleTime = Config.get('settings.fogPlatformSweepIntervalSeconds', 900) * 1000 + +async function run () { + try { + await runSweepInternal() + } catch (error) { + logger.error('Fog platform sweep error:', error) + } finally { + setTimeout(run, scheduleTime) + } +} + +async function runSweepInternal (transaction) { + const uuid = ClusterControllerService.getCurrentControllerUuid() + if (!uuid) { + return { fogEnqueued: 0, serviceEnqueued: 0 } + } + + const execute = async (t) => { + let fogEnqueued = 0 + let serviceEnqueued = 0 + + const specs = await FogPlatformSpecManager.findAll({}, t) + for (const specRow of specs) { + const shouldEnqueue = await shouldEnqueueFogSweepInternal(specRow.fogUuid, t) + if (!shouldEnqueue) { + continue + } + + const parsedSpec = await FogPlatformSpecManager.getParsedSpec(specRow.fogUuid, t) + await FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask({ + fogUuid: specRow.fogUuid, + reason: 'periodic-sweep', + specGeneration: parsedSpec ? 
parsedSpec.generation : specRow.generation + }, t) + fogEnqueued += 1 + } + + const services = await ServiceManager.findAllWithTags({}, t) + for (const service of services) { + const shouldEnqueue = await shouldEnqueueServiceSweepInternal(service, t) + if (!shouldEnqueue) { + continue + } + + const specSnapshot = { + name: service.name, + type: service.type, + resource: service.resource, + defaultBridge: service.defaultBridge, + bridgePort: service.bridgePort, + targetPort: service.targetPort, + servicePort: service.servicePort, + k8sType: service.k8sType, + serviceEndpoint: service.serviceEndpoint, + tags: (service.tags || []).map((tag) => (typeof tag === 'string' ? tag : tag.value)) + } + await ServicePlatformReconcileTaskManager.enqueueServicePlatformReconcileTask({ + serviceName: service.name, + reason: 'periodic-sweep', + specSnapshot + }, t) + serviceEnqueued += 1 + } + + if (fogEnqueued > 0 || serviceEnqueued > 0) { + logger.info('Fog platform sweep enqueued reconcile tasks', { fogEnqueued, serviceEnqueued }) + } + + return { fogEnqueued, serviceEnqueued } + } + + if (transaction) { + return execute(transaction) + } + + return databaseProvider.sequelize.transaction((t) => execute(t)) +} + +async function hasActiveFogTask (fogUuid, transaction) { + const task = await FogPlatformReconcileTaskManager.getEntity().findOne({ + where: { + fogUuid, + status: { [Op.in]: ACTIVE_STATUSES } + }, + transaction + }) + return !!task +} + +async function hasActiveServiceTask (serviceName, transaction) { + const task = await ServicePlatformReconcileTaskManager.getEntity().findOne({ + where: { + serviceName, + status: { [Op.in]: ACTIVE_STATUSES } + }, + transaction + }) + return !!task +} + +function isBackoffElapsed (nextAttemptAt) { + if (!nextAttemptAt) { + return true + } + return new Date(nextAttemptAt).getTime() <= Date.now() +} + +async function hasRuntimeMissing (fogUuid, parsedSpec, transaction) { + const spec = parsedSpec.spec + if (spec.routerMode !== 'none') { + 
const router = await RouterManager.findOne({ iofogUuid: fogUuid }, transaction) + if (!router) { + return true + } + } + if (spec.natsMode !== 'none') { + const nats = await NatsInstanceManager.findByFog(fogUuid, transaction) + if (!nats) { + return true + } + } + return false +} + +async function hasModeMismatch (fogUuid, parsedSpec, transaction) { + const spec = parsedSpec.spec + const router = await RouterManager.findOne({ iofogUuid: fogUuid }, transaction) + + if (spec.routerMode !== 'none') { + if (!router) { + return false + } + const runtimeMode = router.isEdge ? 'edge' : 'interior' + if (spec.routerMode !== runtimeMode) { + return true + } + } else if (router) { + return true + } + + const nats = await NatsInstanceManager.findByFog(fogUuid, transaction) + if (spec.natsMode !== 'none') { + if (!nats) { + return false + } + const runtimeNatsMode = nats.isLeaf ? 'leaf' : 'server' + if (spec.natsMode !== runtimeNatsMode) { + return true + } + } else if (nats) { + return true + } + + return false +} + +async function hasMissingServiceBridges (fogUuid, parsedSpec, transaction) { + if (parsedSpec.spec.routerMode === 'none') { + return false + } + + const fog = await FogManager.findOneWithTags({ uuid: fogUuid }, transaction) + if (!fog) { + return false + } + + const tagValues = fog.tags ? 
fog.tags.map((tag) => tag.value) : [] + const serviceTags = await IofogService._extractServiceTags(tagValues) + if (serviceTags.length === 0) { + return false + } + + const services = await IofogService._findMatchingServices(serviceTags, transaction) + if (services.length === 0) { + return false + } + + let routerConfig + try { + routerConfig = await IofogService._getRouterMicroserviceConfig(fogUuid, transaction) + } catch (error) { + return false + } + + const listeners = routerConfig?.bridges?.tcpListeners || {} + for (const service of services) { + const listener = IofogService._buildTcpListenerForFog(service) + if (!listeners[listener.name]) { + return true + } + } + + return false +} + +async function hasServiceHubDrift (service, transaction) { + if (service.provisioningStatus !== 'ready') { + return false + } + + const listenerName = `${service.name}-listener` + const isK8s = await ServicesService.checkKubernetesEnvironment() + + if (isK8s) { + const configMap = await K8sClient.getConfigMap(K8S_ROUTER_CONFIG_MAP) + if (!configMap || !configMap.data || !configMap.data['skrouterd.json']) { + return true + } + const routerConfig = JSON.parse(configMap.data['skrouterd.json']) + return !routerConfig.some((entry) => + entry[0] === 'tcpListener' && entry[1] && entry[1].name === listenerName + ) + } + + try { + const defaultRouter = await RouterManager.findOne({ isDefault: true }, transaction) + if (!defaultRouter) { + return false + } + const routerConfig = await IofogService._getRouterMicroserviceConfig(defaultRouter.iofogUuid, transaction) + const listeners = routerConfig?.bridges?.tcpListeners || {} + return !listeners[listenerName] + } catch (error) { + return false + } +} + +async function shouldEnqueueFogSweepInternal (fogUuid, transaction) { + const parsedSpec = await FogPlatformSpecManager.getParsedSpec(fogUuid, transaction) + if (!parsedSpec) { + return false + } + + const status = await FogPlatformStatusManager.getParsedStatus(fogUuid, transaction) + if 
(status && status.phase === 'Deleting') { + return false + } + + if (!status || status.observedGeneration < parsedSpec.generation) { + return !(await hasActiveFogTask(fogUuid, transaction)) + } + + if (status.phase === 'Failed' && !(await hasActiveFogTask(fogUuid, transaction))) { + return true + } + + if (await hasRuntimeMissing(fogUuid, parsedSpec, transaction)) { + return !(await hasActiveFogTask(fogUuid, transaction)) + } + + if (await hasModeMismatch(fogUuid, parsedSpec, transaction)) { + return !(await hasActiveFogTask(fogUuid, transaction)) + } + + if (await hasMissingServiceBridges(fogUuid, parsedSpec, transaction)) { + return !(await hasActiveFogTask(fogUuid, transaction)) + } + + return false +} + +async function shouldEnqueueServiceSweepInternal (service, transaction) { + if (service.provisioningStatus === 'failed') { + if (await hasActiveServiceTask(service.name, transaction)) { + const task = await ServicePlatformReconcileTaskManager.getEntity().findOne({ + where: { + serviceName: service.name, + status: { [Op.in]: ACTIVE_STATUSES } + }, + transaction + }) + return task ? 
isBackoffElapsed(task.nextAttemptAt) : false + } + return true + } + + if (await hasServiceHubDrift(service, transaction)) { + return !(await hasActiveServiceTask(service.name, transaction)) + } + + return false +} + +module.exports = { + run, + runSweep: TransactionDecorator.generateTransaction(runSweepInternal), + shouldEnqueueFogSweep: TransactionDecorator.generateTransaction(shouldEnqueueFogSweepInternal), + shouldEnqueueServiceSweep: TransactionDecorator.generateTransaction(shouldEnqueueServiceSweepInternal) +} diff --git a/src/jobs/nats-reconcile-worker-job.js b/src/jobs/nats-reconcile-worker-job.js index cad203d2..f785a45f 100644 --- a/src/jobs/nats-reconcile-worker-job.js +++ b/src/jobs/nats-reconcile-worker-job.js @@ -11,7 +11,7 @@ async function run () { try { await processNextTask() } catch (error) { - logger.error('NATS reconcile worker error:', error) + logger.error({ err: error, msg: 'NATS reconcile worker error' }) } finally { setTimeout(run, scheduleTime) } @@ -22,10 +22,19 @@ async function processNextTask () { if (!uuid) { return } - const task = await NatsService.claimNextTask(uuid) + + let task + try { + task = await NatsService.claimNextTask(uuid) + } catch (error) { + logger.error({ err: error, msg: 'NATS reconcile task claim failed' }) + return + } + if (!task) { return } + const fogUuids = task.fogUuids ? task.fogUuids.split(',').map((s) => s.trim()).filter(Boolean) : undefined @@ -36,6 +45,7 @@ async function processNextTask () { userRuleId: task.userRuleId, fogUuids: fogUuids && fogUuids.length > 0 ? fogUuids : undefined } + try { logger.info(`NATS reconcile task ${task.id} started`) await NatsService.reconcileResolverArtifacts(options) @@ -47,7 +57,12 @@ async function processNextTask () { }) }) } catch (error) { - logger.error(`NATS reconcile task ${task.id} failed: ${error.message}. 
Task will be reclaimed after staleness.`) + logger.error({ + err: error, + msg: `NATS reconcile task ${task.id} failed; task will be reclaimed after staleness`, + taskId: task.id, + reason: task.reason + }) } } diff --git a/src/jobs/platform-reconcile-worker-job.js b/src/jobs/platform-reconcile-worker-job.js new file mode 100644 index 00000000..d211e7e8 --- /dev/null +++ b/src/jobs/platform-reconcile-worker-job.js @@ -0,0 +1,191 @@ +const ClusterControllerService = require('../services/cluster-controller-service') +const FogPlatformService = require('../services/fog-platform-service') +const ServicePlatformService = require('../services/service-platform-service') +const FogPlatformReconcileTaskManager = require('../data/managers/fog-platform-reconcile-task-manager') +const ServicePlatformReconcileTaskManager = require('../data/managers/service-platform-reconcile-task-manager') +const ServiceManager = require('../data/managers/service-manager') +const databaseProvider = require('../data/providers/database-factory') +const Config = require('../config') +const logger = require('../logger') + +const scheduleTime = (Config.get('settings.fogPlatformReconcileWorkerIntervalSeconds', 3)) * 1000 + +async function run () { + try { + await processNextFogTask() + await processNextServiceTask() + } catch (error) { + logger.error({ err: error, msg: 'Platform reconcile worker error' }) + } finally { + setTimeout(run, scheduleTime) + } +} + +async function processNextFogTask () { + const uuid = ClusterControllerService.getCurrentControllerUuid() + if (!uuid) { + return + } + + const stalenessSeconds = Config.get('settings.fogPlatformReconcileTaskStalenessSeconds', 300) + let task + try { + task = await FogPlatformReconcileTaskManager.claimNextFogTask(uuid, stalenessSeconds) + } catch (error) { + logger.error({ err: error, msg: 'Fog platform reconcile task claim failed' }) + return + } + + if (!task) { + return + } + + try { + logger.info(`Fog platform reconcile task ${task.id} 
started`, { + fogUuid: task.fogUuid, + reason: task.reason + }) + + const result = task.reason === 'delete' + ? await FogPlatformService.reconcileFogDelete(task.fogUuid) + : await FogPlatformService.reconcileFog(task.fogUuid) + + logger.info(`Fog platform reconcile task ${task.id} completed`, { + fogUuid: task.fogUuid, + reason: task.reason, + result + }) + + await databaseProvider.sequelize.transaction(async (transaction) => { + await FogPlatformReconcileTaskManager.getEntity().destroy({ + where: { id: task.id }, + transaction + }) + }) + } catch (error) { + logger.error({ + err: error, + msg: `Fog platform reconcile task ${task.id} failed`, + fogUuid: task.fogUuid, + reason: task.reason + }) + try { + await handleFogTaskFailure(task, error) + } catch (failureError) { + logger.error({ + err: failureError, + msg: 'Fog platform reconcile failure recording failed', + taskId: task.id, + fogUuid: task.fogUuid + }) + } + } +} + +async function processNextServiceTask () { + const uuid = ClusterControllerService.getCurrentControllerUuid() + if (!uuid) { + return + } + + const stalenessSeconds = Config.get('settings.fogPlatformReconcileTaskStalenessSeconds', 300) + let task + try { + task = await ServicePlatformReconcileTaskManager.claimNextServiceTask(uuid, stalenessSeconds) + } catch (error) { + logger.error({ err: error, msg: 'Service platform reconcile task claim failed' }) + return + } + + if (!task) { + return + } + + try { + logger.info(`Service platform reconcile task ${task.id} started`, { + serviceName: task.serviceName, + reason: task.reason + }) + + const result = await ServicePlatformService.reconcileService(task.serviceName, task) + + logger.info(`Service platform reconcile task ${task.id} completed`, { + serviceName: task.serviceName, + reason: task.reason, + result + }) + + if (task.reason !== 'delete') { + await databaseProvider.sequelize.transaction(async (transaction) => { + await ServicePlatformReconcileTaskManager.getEntity().destroy({ + where: { id: 
task.id }, + transaction + }) + }) + } + } catch (error) { + logger.error({ + err: error, + msg: `Service platform reconcile task ${task.id} failed`, + serviceName: task.serviceName, + reason: task.reason + }) + try { + await handleServiceTaskFailure(task, error) + } catch (failureError) { + logger.error({ + err: failureError, + msg: 'Service platform reconcile failure recording failed', + taskId: task.id, + serviceName: task.serviceName + }) + } + } +} + +async function handleFogTaskFailure (task, error) { + const errorMessage = error.message || String(error) + + await databaseProvider.sequelize.transaction(async (transaction) => { + await FogPlatformReconcileTaskManager.recordFogTaskFailure( + task.id, + errorMessage, + { attempts: task.attempts }, + transaction + ) + await FogPlatformService.markReconcileFailed(task.fogUuid, error, transaction) + }) +} + +async function handleServiceTaskFailure (task, error) { + const errorMessage = error.message || String(error) + const maxAttempts = Config.get('settings.servicePlatformReconcileMaxAttempts', 10) + const nextAttempts = (task.attempts != null ? task.attempts : 0) + 1 + const isPermanent = nextAttempts >= maxAttempts + + await databaseProvider.sequelize.transaction(async (transaction) => { + await ServicePlatformReconcileTaskManager.recordServiceTaskFailure( + task.id, + errorMessage, + { attempts: task.attempts }, + transaction + ) + + if (task.reason !== 'delete') { + await ServiceManager.update( + { name: task.serviceName }, + { + provisioningStatus: isPermanent ? 
'failed' : 'pending', + provisioningError: errorMessage + }, + transaction + ) + } + }) +} + +module.exports = { + run, + processNextFogTask, + processNextServiceTask +} diff --git a/src/server.js b/src/server.js index 05486200..ef1c1116 100755 --- a/src/server.js +++ b/src/server.js @@ -175,7 +175,10 @@ initialize().then(() => { const jobs = [] const setupJobs = function (file) { - jobs.push((require(path.join(__dirname, 'jobs', file)) || [])) + jobs.push({ + module: require(path.join(__dirname, 'jobs', file)) || {}, + file + }) } function registerServers (api, consoleServer) { @@ -189,7 +192,9 @@ initialize().then(() => { }) } - function startHttpServer (apps, ports, jobs) { + const { startBackgroundJobs } = require('./helpers/job-startup') + + function startHttpServer (apps, ports, jobEntries) { logger.info('TLS not configured, starting HTTP server.') const consoleServer = apps.console.listen(ports.console, function onStart (err) { @@ -203,7 +208,7 @@ initialize().then(() => { logger.error(err) } logger.info(`==> 🌎 API Listening on port ${ports.api}. Open up http://localhost:${ports.api}/ in your browser.`) - jobs.forEach((job) => job.run()) + startBackgroundJobs(jobEntries) }) // Initialize WebSocket server (use singleton to ensure routes are available) @@ -215,7 +220,7 @@ initialize().then(() => { const { createSSLOptions } = require('./utils/ssl-utils') - function startHttpsServer (apps, ports, sslKey, sslCert, intermedKey, jobs, isBase64 = false) { + function startHttpsServer (apps, ports, sslKey, sslCert, intermedKey, jobEntries, isBase64 = false) { try { const sslOptions = createSSLOptions({ key: sslKey, @@ -229,7 +234,6 @@ initialize().then(() => { logger.error(err) } logger.info(`==> 🌎 HTTPS EdgeOps Console server listening on port ${ports.console}. 
Open up https://localhost:${ports.console}/ in your browser.`) - jobs.forEach((job) => job.run()) }) const apiServer = https.createServer(sslOptions, apps.api).listen(ports.api, function onStart (err) { @@ -237,7 +241,7 @@ initialize().then(() => { logger.error(err) } logger.info(`==> 🌎 HTTPS API server listening on port ${ports.api}. Open up https://localhost:${ports.api}/ in your browser.`) - jobs.forEach((job) => job.run()) + startBackgroundJobs(jobEntries) }) // Initialize WebSocket server with SSL (use singleton to ensure routes are available) @@ -289,11 +293,14 @@ initialize().then(() => { // Store PID to let deamon know we are running. jobs.push({ - run: () => { - const pidFile = path.join((process.env.PID_BASE || __dirname), 'iofog-controller.pid') - logger.info(`==> PID file: ${pidFile}`) - fs.writeFileSync(pidFile, process.pid.toString()) - } + module: { + run: () => { + const pidFile = path.join((process.env.PID_BASE || __dirname), 'iofog-controller.pid') + logger.info(`==> PID file: ${pidFile}`) + fs.writeFileSync(pidFile, process.pid.toString()) + } + }, + file: 'pid-file' }) } diff --git a/test/src/helpers/job-startup.test.js b/test/src/helpers/job-startup.test.js new file mode 100644 index 00000000..c3fdc97d --- /dev/null +++ b/test/src/helpers/job-startup.test.js @@ -0,0 +1,71 @@ +const { expect } = require('chai') +const sinon = require('sinon') + +const config = require('../../../src/config') +const { RECONCILE_HEAVY_JOBS, startBackgroundJobs } = require('../../../src/helpers/job-startup') + +describe('job-startup', () => { + def('sandbox', () => sinon.createSandbox()) + + afterEach(() => { + $sandbox.restore() + }) + + it('starts lightweight jobs immediately', () => { + const run = $sandbox.spy() + startBackgroundJobs([{ module: { run }, file: 'controller-heartbeat-job.js' }]) + expect(run).to.have.been.calledOnce + }) + + it('delays reconcile-heavy jobs by settings.jobStartupDelaySeconds', () => { + $sandbox.stub(config, 'get').callsFake((key, 
defaultValue) => { + if (key === 'settings.jobStartupDelaySeconds') { + return 2 + } + return defaultValue + }) + const clock = $sandbox.useFakeTimers() + const run = $sandbox.spy() + + startBackgroundJobs([{ module: { run }, file: 'platform-reconcile-worker-job.js' }]) + + expect(run).to.not.have.been.called + clock.tick(2000) + expect(run).to.have.been.calledOnce + }) + + it('staggers reconcile-heavy jobs by 500ms each', () => { + $sandbox.stub(config, 'get').callsFake((key, defaultValue) => { + if (key === 'settings.jobStartupDelaySeconds') { + return 1 + } + return defaultValue + }) + const clock = $sandbox.useFakeTimers() + const platformRun = $sandbox.spy() + const natsRun = $sandbox.spy() + + startBackgroundJobs([ + { module: { run: platformRun }, file: 'platform-reconcile-worker-job.js' }, + { module: { run: natsRun }, file: 'nats-reconcile-worker-job.js' } + ]) + + clock.tick(999) + expect(platformRun).to.not.have.been.called + expect(natsRun).to.not.have.been.called + + clock.tick(1) + expect(platformRun).to.have.been.calledOnce + expect(natsRun).to.not.have.been.called + + clock.tick(500) + expect(natsRun).to.have.been.calledOnce + }) + + it('lists the expected reconcile-heavy job files', () => { + expect(RECONCILE_HEAVY_JOBS.has('platform-reconcile-worker-job.js')).to.equal(true) + expect(RECONCILE_HEAVY_JOBS.has('nats-reconcile-worker-job.js')).to.equal(true) + expect(RECONCILE_HEAVY_JOBS.has('fog-platform-sweep-job.js')).to.equal(true) + expect(RECONCILE_HEAVY_JOBS.has('fog-status-job.js')).to.equal(true) + }) +}) diff --git a/test/src/jobs/fog-platform-sweep-job.test.js b/test/src/jobs/fog-platform-sweep-job.test.js new file mode 100644 index 00000000..6f674a87 --- /dev/null +++ b/test/src/jobs/fog-platform-sweep-job.test.js @@ -0,0 +1,206 @@ +const { expect } = require('chai') +const sinon = require('sinon') + +const ClusterControllerService = require('../../../src/services/cluster-controller-service') +const FogPlatformSpecManager = 
require('../../../src/data/managers/fog-platform-spec-manager') +const FogPlatformStatusManager = require('../../../src/data/managers/fog-platform-status-manager') +const FogPlatformReconcileTaskManager = require('../../../src/data/managers/fog-platform-reconcile-task-manager') +const ServicePlatformReconcileTaskManager = require('../../../src/data/managers/service-platform-reconcile-task-manager') +const FogManager = require('../../../src/data/managers/iofog-manager') +const ServiceManager = require('../../../src/data/managers/service-manager') +const RouterManager = require('../../../src/data/managers/router-manager') +const NatsInstanceManager = require('../../../src/data/managers/nats-instance-manager') +const IofogService = require('../../../src/services/iofog-service') +const ServicesService = require('../../../src/services/services-service') +const K8sClient = require('../../../src/utils/k8s-client') +const databaseProvider = require('../../../src/data/providers/database-factory') +const FogPlatformSweepJob = require('../../../src/jobs/fog-platform-sweep-job') + +describe('fog-platform-sweep-job', () => { + def('sandbox', () => sinon.createSandbox()) + const transaction = {} + + afterEach(() => $sandbox.restore()) + + it('skips sweep when controller uuid is not initialized', async () => { + $sandbox.stub(ClusterControllerService, 'getCurrentControllerUuid').returns(null) + $sandbox.stub(FogPlatformSpecManager, 'findAll') + + const result = await FogPlatformSweepJob.runSweep(transaction) + + expect(result).to.eql({ fogEnqueued: 0, serviceEnqueued: 0 }) + expect(FogPlatformSpecManager.findAll).to.not.have.been.called + }) + + it('enqueues fog reconcile on generation drift', async () => { + $sandbox.stub(ClusterControllerService, 'getCurrentControllerUuid').returns('controller-1') + $sandbox.stub(FogPlatformSpecManager, 'findAll').resolves([{ fogUuid: 'fog-1', generation: 3 }]) + $sandbox.stub(FogPlatformSpecManager, 'getParsedSpec').resolves({ + fogUuid: 
'fog-1', + generation: 3, + spec: { routerMode: 'edge', natsMode: 'leaf' } + }) + $sandbox.stub(FogPlatformStatusManager, 'getParsedStatus').resolves({ + phase: 'Ready', + observedGeneration: 2 + }) + $sandbox.stub(FogPlatformReconcileTaskManager, 'getEntity').returns({ + findOne: $sandbox.stub().resolves(null) + }) + $sandbox.stub(FogPlatformReconcileTaskManager, 'enqueueFogPlatformReconcileTask').resolves() + $sandbox.stub(ServiceManager, 'findAllWithTags').resolves([]) + + const result = await FogPlatformSweepJob.runSweep(transaction) + + expect(result.fogEnqueued).to.equal(1) + expect(FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask).to.have.been.calledWith({ + fogUuid: 'fog-1', + reason: 'periodic-sweep', + specGeneration: 3 + }, transaction) + }) + + it('enqueues service reconcile when provisioning failed without active task', async () => { + $sandbox.stub(ClusterControllerService, 'getCurrentControllerUuid').returns('controller-1') + $sandbox.stub(FogPlatformSpecManager, 'findAll').resolves([]) + $sandbox.stub(ServiceManager, 'findAllWithTags').resolves([{ + name: 'api-gateway', + type: 'microservice', + resource: 'app.ms', + bridgePort: 12345, + targetPort: 8080, + provisioningStatus: 'failed', + tags: [{ value: 'site-a' }] + }]) + $sandbox.stub(ServicePlatformReconcileTaskManager, 'getEntity').returns({ + findOne: $sandbox.stub().resolves(null) + }) + $sandbox.stub(ServicePlatformReconcileTaskManager, 'enqueueServicePlatformReconcileTask').resolves() + + const result = await FogPlatformSweepJob.runSweep(transaction) + + expect(result.serviceEnqueued).to.equal(1) + expect(ServicePlatformReconcileTaskManager.enqueueServicePlatformReconcileTask).to.have.been.calledWith({ + serviceName: 'api-gateway', + reason: 'periodic-sweep', + specSnapshot: sinon.match({ + name: 'api-gateway', + tags: ['site-a'] + }) + }, transaction) + }) + + it('does not enqueue fog reconcile while delete is in progress', async () => { + 
$sandbox.stub(ClusterControllerService, 'getCurrentControllerUuid').returns('controller-1') + $sandbox.stub(FogPlatformSpecManager, 'findAll').resolves([{ fogUuid: 'fog-1', generation: 2 }]) + $sandbox.stub(FogPlatformSpecManager, 'getParsedSpec').resolves({ + fogUuid: 'fog-1', + generation: 2, + spec: { routerMode: 'edge', natsMode: 'leaf' } + }) + $sandbox.stub(FogPlatformStatusManager, 'getParsedStatus').resolves({ phase: 'Deleting' }) + $sandbox.stub(FogPlatformReconcileTaskManager, 'enqueueFogPlatformReconcileTask') + $sandbox.stub(ServiceManager, 'findAllWithTags').resolves([]) + + const result = await FogPlatformSweepJob.runSweep(transaction) + + expect(result.fogEnqueued).to.equal(0) + expect(FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask).to.not.have.been.called + }) + + describe('runSweep transaction consistency', () => { + it('uses the same transaction for drift check and enqueue', async () => { + const sharedTransaction = { id: 'sweep-tx' } + + $sandbox.stub(ClusterControllerService, 'getCurrentControllerUuid').returns('controller-1') + $sandbox.stub(FogPlatformSpecManager, 'findAll').resolves([{ fogUuid: 'fog-1', generation: 3 }]) + $sandbox.stub(FogPlatformSpecManager, 'getParsedSpec').resolves({ + fogUuid: 'fog-1', + generation: 3, + spec: { routerMode: 'edge', natsMode: 'leaf' } + }) + $sandbox.stub(FogPlatformStatusManager, 'getParsedStatus').resolves({ + phase: 'Ready', + observedGeneration: 2 + }) + $sandbox.stub(FogPlatformReconcileTaskManager, 'getEntity').returns({ + findOne: $sandbox.stub().resolves(null) + }) + $sandbox.stub(FogPlatformReconcileTaskManager, 'enqueueFogPlatformReconcileTask').resolves() + $sandbox.stub(ServiceManager, 'findAllWithTags').resolves([]) + + await FogPlatformSweepJob.runSweep(sharedTransaction) + + expect(FogPlatformStatusManager.getParsedStatus).to.have.been.calledWith('fog-1', sharedTransaction) + expect(FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask).to.have.been.calledWith( + 
sinon.match({ fogUuid: 'fog-1' }), + sharedTransaction + ) + }) + }) + + describe('shouldEnqueueFogSweep()', () => { + it('detects missing runtime rows', async () => { + $sandbox.stub(FogPlatformSpecManager, 'getParsedSpec').resolves({ + generation: 1, + spec: { routerMode: 'edge', natsMode: 'none' } + }) + $sandbox.stub(FogPlatformStatusManager, 'getParsedStatus').resolves({ + phase: 'Ready', + observedGeneration: 1 + }) + $sandbox.stub(FogPlatformReconcileTaskManager, 'getEntity').returns({ + findOne: $sandbox.stub().resolves(null) + }) + $sandbox.stub(RouterManager, 'findOne').resolves(null) + $sandbox.stub(NatsInstanceManager, 'findByFog').resolves(null) + + const shouldEnqueue = await FogPlatformSweepJob.shouldEnqueueFogSweep('fog-1', transaction) + + expect(shouldEnqueue).to.equal(true) + }) + + it('detects runtime mode drift against spec', async () => { + $sandbox.stub(FogPlatformSpecManager, 'getParsedSpec').resolves({ + generation: 2, + spec: { routerMode: 'interior', natsMode: 'server' } + }) + $sandbox.stub(FogPlatformStatusManager, 'getParsedStatus').resolves({ + phase: 'Ready', + observedGeneration: 2 + }) + $sandbox.stub(FogPlatformReconcileTaskManager, 'getEntity').returns({ + findOne: $sandbox.stub().resolves(null) + }) + $sandbox.stub(RouterManager, 'findOne').resolves({ id: 7, isEdge: true }) + $sandbox.stub(NatsInstanceManager, 'findByFog').resolves({ id: 8, isLeaf: false }) + + const shouldEnqueue = await FogPlatformSweepJob.shouldEnqueueFogSweep('fog-1', transaction) + + expect(shouldEnqueue).to.equal(true) + }) + }) + + describe('shouldEnqueueServiceSweep()', () => { + it('detects hub drift for ready services', async () => { + const service = { + name: 'api-gateway', + provisioningStatus: 'ready' + } + + $sandbox.stub(ServicesService, 'checkKubernetesEnvironment').resolves(true) + $sandbox.stub(K8sClient, 'getConfigMap').resolves({ + data: { + 'skrouterd.json': JSON.stringify([]) + } + }) + $sandbox.stub(ServicePlatformReconcileTaskManager, 
'getEntity').returns({ + findOne: $sandbox.stub().resolves(null) + }) + + const shouldEnqueue = await FogPlatformSweepJob.shouldEnqueueServiceSweep(service, transaction) + + expect(shouldEnqueue).to.equal(true) + }) + }) +}) diff --git a/test/src/jobs/platform-reconcile-worker-job.test.js b/test/src/jobs/platform-reconcile-worker-job.test.js new file mode 100644 index 00000000..2b4b199b --- /dev/null +++ b/test/src/jobs/platform-reconcile-worker-job.test.js @@ -0,0 +1,227 @@ +const { expect } = require('chai') +const sinon = require('sinon') + +const ClusterControllerService = require('../../../src/services/cluster-controller-service') +const FogPlatformService = require('../../../src/services/fog-platform-service') +const ServicePlatformService = require('../../../src/services/service-platform-service') +const FogPlatformReconcileTaskManager = require('../../../src/data/managers/fog-platform-reconcile-task-manager') +const ServicePlatformReconcileTaskManager = require('../../../src/data/managers/service-platform-reconcile-task-manager') +const ServiceManager = require('../../../src/data/managers/service-manager') +const FogManager = require('../../../src/data/managers/iofog-manager') +const databaseProvider = require('../../../src/data/providers/database-factory') +const PlatformReconcileWorkerJob = require('../../../src/jobs/platform-reconcile-worker-job') + +describe('platform-reconcile-worker-job', () => { + def('sandbox', () => sinon.createSandbox()) + + afterEach(() => $sandbox.restore()) + + it('runs fog reconcile and destroys the task on success', async () => { + const task = { id: 11, fogUuid: 'fog-1', reason: 'spec-changed', attempts: 0 } + const entity = { destroy: $sandbox.stub().resolves(1) } + const transaction = {} + + $sandbox.stub(ClusterControllerService, 'getCurrentControllerUuid').returns('controller-1') + $sandbox.stub(FogPlatformReconcileTaskManager, 'claimNextFogTask').resolves(task) + $sandbox.stub(FogPlatformService, 
'reconcileFog').resolves({ fogUuid: 'fog-1', phase: 'Ready' }) + $sandbox.stub(FogPlatformService, 'reconcileFogDelete') + $sandbox.stub(FogPlatformReconcileTaskManager, 'getEntity').returns(entity) + $sandbox.stub(databaseProvider.sequelize, 'transaction').callsFake(async (fn) => fn(transaction)) + + await PlatformReconcileWorkerJob.processNextFogTask() + + expect(FogPlatformService.reconcileFog).to.have.been.calledOnceWith('fog-1') + expect(FogPlatformService.reconcileFogDelete).to.not.have.been.called + expect(entity.destroy).to.have.been.calledOnceWith({ + where: { id: 11 }, + transaction + }) + }) + + it('passes fakeTransaction into reconcileFog DB layer from worker (no reconcileFog stub)', async () => { + const task = { id: 14, fogUuid: 'fog-1', reason: 'spec-changed', attempts: 0 } + const appHelperPath = require.resolve('../../../src/helpers/app-helper') + const decoratorPath = require.resolve('../../../src/decorators/transaction-decorator') + const fogPlatformServicePath = require.resolve('../../../src/services/fog-platform-service') + const workerPath = require.resolve('../../../src/jobs/platform-reconcile-worker-job') + + $sandbox.stub(require(appHelperPath), 'isTest').returns(false) + delete require.cache[decoratorPath] + delete require.cache[fogPlatformServicePath] + delete require.cache[workerPath] + const WorkerJob = require('../../../src/jobs/platform-reconcile-worker-job') + + $sandbox.stub(ClusterControllerService, 'getCurrentControllerUuid').returns('controller-1') + $sandbox.stub(FogPlatformReconcileTaskManager, 'claimNextFogTask').resolves(task) + $sandbox.stub(FogManager, 'findOneWithTags').resolves(null) + $sandbox.stub(FogPlatformReconcileTaskManager, 'recordFogTaskFailure').resolves(task) + const markFailedPath = require.resolve('../../../src/services/fog-platform-service') + $sandbox.stub(require(markFailedPath), 'markReconcileFailed').resolves() + $sandbox.stub(databaseProvider.sequelize, 'transaction').callsFake(async (fn) => fn({})) + + 
await WorkerJob.processNextFogTask() + + expect(FogManager.findOneWithTags).to.have.been.calledOnceWith( + { uuid: 'fog-1' }, + sinon.match({ fakeTransaction: true }) + ) + }) + + it('runs delete reconcile when task reason is delete', async () => { + const task = { id: 12, fogUuid: 'fog-2', reason: 'delete', attempts: 0 } + const entity = { destroy: $sandbox.stub().resolves(1) } + + $sandbox.stub(ClusterControllerService, 'getCurrentControllerUuid').returns('controller-1') + $sandbox.stub(FogPlatformReconcileTaskManager, 'claimNextFogTask').resolves(task) + $sandbox.stub(FogPlatformService, 'reconcileFogDelete').resolves({ fogUuid: 'fog-2', deleted: true }) + $sandbox.stub(FogPlatformService, 'reconcileFog') + $sandbox.stub(FogPlatformReconcileTaskManager, 'getEntity').returns(entity) + $sandbox.stub(databaseProvider.sequelize, 'transaction').callsFake(async (fn) => fn({})) + + await PlatformReconcileWorkerJob.processNextFogTask() + + expect(FogPlatformService.reconcileFogDelete).to.have.been.calledOnceWith('fog-2') + expect(FogPlatformService.reconcileFog).to.not.have.been.called + }) + + it('records failure and updates fog status when reconcile throws', async () => { + const task = { id: 13, fogUuid: 'fog-3', reason: 'spec-changed', attempts: 2 } + const error = new Error('router create failed') + + $sandbox.stub(ClusterControllerService, 'getCurrentControllerUuid').returns('controller-1') + $sandbox.stub(FogPlatformReconcileTaskManager, 'claimNextFogTask').resolves(task) + $sandbox.stub(FogPlatformService, 'reconcileFog').rejects(error) + $sandbox.stub(FogPlatformReconcileTaskManager, 'recordFogTaskFailure').resolves(task) + $sandbox.stub(FogPlatformService, 'markReconcileFailed').resolves() + $sandbox.stub(databaseProvider.sequelize, 'transaction').callsFake(async (fn) => fn({})) + + await PlatformReconcileWorkerJob.processNextFogTask() + + expect(FogPlatformReconcileTaskManager.recordFogTaskFailure).to.have.been.calledOnceWith( + 13, + 'router create failed', 
+ { attempts: 2 }, + sinon.match.any + ) + expect(FogPlatformService.markReconcileFailed).to.have.been.calledOnceWith( + 'fog-3', + error, + sinon.match.any + ) + }) + + it('skips work when controller uuid is not initialized', async () => { + $sandbox.stub(ClusterControllerService, 'getCurrentControllerUuid').returns(null) + $sandbox.stub(FogPlatformReconcileTaskManager, 'claimNextFogTask') + + await PlatformReconcileWorkerJob.processNextFogTask() + + expect(FogPlatformReconcileTaskManager.claimNextFogTask).to.not.have.been.called + }) + + it('logs and continues when fog task claim fails', async () => { + const claimError = new Error('database is locked') + + $sandbox.stub(ClusterControllerService, 'getCurrentControllerUuid').returns('controller-1') + $sandbox.stub(FogPlatformReconcileTaskManager, 'claimNextFogTask').rejects(claimError) + $sandbox.stub(FogPlatformService, 'reconcileFog') + + await PlatformReconcileWorkerJob.processNextFogTask() + + expect(FogPlatformService.reconcileFog).to.not.have.been.called + }) + + it('logs and continues when service task claim fails', async () => { + const claimError = new Error('database is locked') + + $sandbox.stub(ClusterControllerService, 'getCurrentControllerUuid').returns('controller-1') + $sandbox.stub(ServicePlatformReconcileTaskManager, 'claimNextServiceTask').rejects(claimError) + $sandbox.stub(ServicePlatformService, 'reconcileService') + + await PlatformReconcileWorkerJob.processNextServiceTask() + + expect(ServicePlatformService.reconcileService).to.not.have.been.called + }) + + it('runs service reconcile and destroys the task on success', async () => { + const task = { id: 21, serviceName: 'api-gateway', reason: 'spec-changed', attempts: 0 } + const entity = { destroy: $sandbox.stub().resolves(1) } + const transaction = {} + + $sandbox.stub(ClusterControllerService, 'getCurrentControllerUuid').returns('controller-1') + $sandbox.stub(ServicePlatformReconcileTaskManager, 'claimNextServiceTask').resolves(task) + 
$sandbox.stub(ServicePlatformService, 'reconcileService').resolves({ + serviceName: 'api-gateway', + provisioningStatus: 'ready' + }) + $sandbox.stub(ServicePlatformReconcileTaskManager, 'getEntity').returns(entity) + $sandbox.stub(databaseProvider.sequelize, 'transaction').callsFake(async (fn) => fn(transaction)) + + await PlatformReconcileWorkerJob.processNextServiceTask() + + expect(ServicePlatformService.reconcileService).to.have.been.calledOnceWith('api-gateway', task) + expect(entity.destroy).to.have.been.calledOnceWith({ + where: { id: 21 }, + transaction + }) + }) + + it('does not destroy delete tasks because reconcileService removes them', async () => { + const task = { id: 22, serviceName: 'api-gateway', reason: 'delete', attempts: 0 } + + $sandbox.stub(ClusterControllerService, 'getCurrentControllerUuid').returns('controller-1') + $sandbox.stub(ServicePlatformReconcileTaskManager, 'claimNextServiceTask').resolves(task) + $sandbox.stub(ServicePlatformService, 'reconcileService').resolves({ serviceName: 'api-gateway', isDelete: true }) + $sandbox.stub(ServicePlatformReconcileTaskManager, 'getEntity') + + await PlatformReconcileWorkerJob.processNextServiceTask() + + expect(ServicePlatformReconcileTaskManager.getEntity).to.not.have.been.called + }) + + it('records service failure and marks provisioning failed after max attempts', async () => { + const task = { id: 23, serviceName: 'api-gateway', reason: 'spec-changed', attempts: 9 } + const error = new Error('hub lock timeout') + + $sandbox.stub(ClusterControllerService, 'getCurrentControllerUuid').returns('controller-1') + $sandbox.stub(ServicePlatformReconcileTaskManager, 'claimNextServiceTask').resolves(task) + $sandbox.stub(ServicePlatformService, 'reconcileService').rejects(error) + $sandbox.stub(ServicePlatformReconcileTaskManager, 'recordServiceTaskFailure').resolves(task) + $sandbox.stub(ServiceManager, 'update').resolves() + $sandbox.stub(databaseProvider.sequelize, 'transaction').callsFake(async 
(fn) => fn({})) + + await PlatformReconcileWorkerJob.processNextServiceTask() + + expect(ServicePlatformReconcileTaskManager.recordServiceTaskFailure).to.have.been.calledOnceWith( + 23, + 'hub lock timeout', + { attempts: 9 }, + sinon.match.any + ) + expect(ServiceManager.update).to.have.been.calledOnceWith( + { name: 'api-gateway' }, + { provisioningStatus: 'failed', provisioningError: 'hub lock timeout' }, + sinon.match.any + ) + }) + + it('keeps service provisioning pending on retryable failure', async () => { + const task = { id: 24, serviceName: 'api-gateway', reason: 'spec-changed', attempts: 2 } + const error = new Error('LoadBalancer IP not assigned') + + $sandbox.stub(ClusterControllerService, 'getCurrentControllerUuid').returns('controller-1') + $sandbox.stub(ServicePlatformReconcileTaskManager, 'claimNextServiceTask').resolves(task) + $sandbox.stub(ServicePlatformService, 'reconcileService').rejects(error) + $sandbox.stub(ServicePlatformReconcileTaskManager, 'recordServiceTaskFailure').resolves(task) + $sandbox.stub(ServiceManager, 'update').resolves() + $sandbox.stub(databaseProvider.sequelize, 'transaction').callsFake(async (fn) => fn({})) + + await PlatformReconcileWorkerJob.processNextServiceTask() + + expect(ServiceManager.update).to.have.been.calledOnceWith( + { name: 'api-gateway' }, + { provisioningStatus: 'pending', provisioningError: 'LoadBalancer IP not assigned' }, + sinon.match.any + ) + }) +}) From cb28ae308fabfdb8ed73be61509f37331daea66b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Wed, 24 Jun 2026 21:17:35 +0300 Subject: [PATCH 05/11] Replace fire-and-forget orchestration with async platform reconcile APIs. Enqueue fog and service reconcile tasks on create, update, and delete. Add spec fallback for router/nats modes, platformStatus on fog GET, manual reconcile endpoints, and agent warning gating during non-Ready phases. 
--- src/config/rbac-resources.yaml | 8 + src/controllers/iofog-controller.js | 5 + src/controllers/service-controller.js | 6 + src/routes/iofog.js | 34 ++ src/routes/service.js | 34 ++ src/services/agent-service.js | 12 +- src/services/iofog-service.js | 518 ++++++++------------- src/services/router-service.js | 6 +- src/services/services-service.js | 238 +++++----- test/src/services/agent-service.test.js | 100 ++++ test/src/services/iofog-service.test.js | 288 +++++++++++- test/src/services/services-service.test.js | 370 +++++++++++++++ 12 files changed, 1153 insertions(+), 466 deletions(-) create mode 100644 test/src/services/services-service.test.js diff --git a/src/config/rbac-resources.yaml b/src/config/rbac-resources.yaml index bf46db85..ee700ffb 100644 --- a/src/config/rbac-resources.yaml +++ b/src/config/rbac-resources.yaml @@ -126,6 +126,10 @@ resources: methods: GET: [get] resourceNameParam: uuid + - path: /api/v3/iofog/:uuid/reconcile + methods: + POST: [patch] + resourceNameParam: uuid - path: /api/v3/iofog/:uuid/version/:versionCommand methods: POST: [patch] @@ -374,6 +378,10 @@ resources: PATCH: [patch] DELETE: [delete] resourceNameParam: name + - path: /api/v3/services/:name/reconcile + methods: + POST: [patch] + resourceNameParam: name - path: /api/v3/services/yaml methods: POST: [create] diff --git a/src/controllers/iofog-controller.js b/src/controllers/iofog-controller.js index 4550cdb8..61d72345 100644 --- a/src/controllers/iofog-controller.js +++ b/src/controllers/iofog-controller.js @@ -19,6 +19,10 @@ async function deleteFogEndPoint (req) { return FogService.deleteFogEndPoint(deleteFog, false) } +async function reconcileFogEndPoint (req) { + return FogService.reconcileFogEndpoint({ uuid: req.params.uuid }) +} + async function getFogEndPoint (req) { const getFog = { uuid: req.params.uuid @@ -106,6 +110,7 @@ module.exports = { createFogEndPoint, updateFogEndPoint, deleteFogEndPoint, + reconcileFogEndPoint, getFogEndPoint, getFogListEndPoint, 
generateProvisioningKeyEndPoint: (generateProvisionKeyEndPoint), diff --git a/src/controllers/service-controller.js b/src/controllers/service-controller.js index 41bc8ca3..27c39b2e 100644 --- a/src/controllers/service-controller.js +++ b/src/controllers/service-controller.js @@ -42,10 +42,16 @@ const updateServiceYAMLEndpoint = async function (req) { return ServiceService.updateServiceEndpoint(serviceName, serviceData) } +const reconcileServiceEndpoint = async function (req) { + const serviceName = req.params.name + return ServiceService.reconcileServiceEndpoint(serviceName) +} + module.exports = { createServiceEndpoint, updateServiceEndpoint, deleteServiceEndpoint, + reconcileServiceEndpoint, getServiceEndpoint, listServicesEndpoint, createServiceYAMLEndpoint, diff --git a/src/routes/iofog.js b/src/routes/iofog.js index cf981377..dce4c412 100644 --- a/src/routes/iofog.js +++ b/src/routes/iofog.js @@ -198,6 +198,40 @@ module.exports = [ }) } }, + { + method: 'post', + path: '/api/v3/iofog/:uuid/reconcile', + middleware: async (req, res) => { + logger.apiReq(req) + + const successCode = constants.HTTP_CODE_SUCCESS + const errCodes = [ + { + code: 401, + errors: [Errors.AuthenticationError] + }, + { + code: 404, + errors: [Errors.NotFoundError] + } + ] + + await rbacMiddleware.protect()(req, res, async () => { + const reconcileFogEndpoint = ResponseDecorator.handleErrors( + FogController.reconcileFogEndPoint, + successCode, + errCodes + ) + const responseObject = await reconcileFogEndpoint(req) + const user = req.kauth && req.kauth.grant && req.kauth.grant.access_token ? 
req.kauth.grant.access_token.content.preferred_username : 'system' + res + .status(responseObject.code) + .send(responseObject.body) + + logger.apiRes({ req, user, res, responseObject }) + }) + } + }, { method: 'post', path: '/api/v3/iofog/:uuid/version/:versionCommand', diff --git a/src/routes/service.js b/src/routes/service.js index eb8670a0..497fb002 100644 --- a/src/routes/service.js +++ b/src/routes/service.js @@ -180,6 +180,40 @@ module.exports = [ }) } }, + { + method: 'post', + path: '/api/v3/services/:name/reconcile', + middleware: async (req, res) => { + logger.apiReq(req) + + const successCode = constants.HTTP_CODE_SUCCESS + const errorCodes = [ + { + code: constants.HTTP_CODE_UNAUTHORIZED, + errors: [Errors.AuthenticationError] + }, + { + code: constants.HTTP_CODE_NOT_FOUND, + errors: [Errors.NotFoundError] + } + ] + + await rbacMiddleware.protect()(req, res, async () => { + const reconcileServiceEndpoint = ResponseDecorator.handleErrors( + ServiceController.reconcileServiceEndpoint, + successCode, + errorCodes + ) + const responseObject = await reconcileServiceEndpoint(req) + const user = req.kauth && req.kauth.grant && req.kauth.grant.access_token ? 
req.kauth.grant.access_token.content.preferred_username : 'system' + res + .status(responseObject.code) + .send(responseObject.body) + + logger.apiRes({ req, user, res, responseObject }) + }) + } + }, { method: 'post', path: '/api/v3/services/yaml', diff --git a/src/services/agent-service.js b/src/services/agent-service.js index 432b27bf..3e891aa6 100644 --- a/src/services/agent-service.js +++ b/src/services/agent-service.js @@ -7,6 +7,7 @@ const moment = require('moment') const TransactionDecorator = require('../decorators/transaction-decorator') const FogProvisionKeyManager = require('../data/managers/iofog-provision-key-manager') const FogManager = require('../data/managers/iofog-manager') +const FogPlatformStatusManager = require('../data/managers/fog-platform-status-manager') const FogKeyService = require('../services/iofog-key-service') const ChangeTrackingService = require('./change-tracking-service') const FogVersionCommandManager = require('../data/managers/iofog-version-command-manager') @@ -274,10 +275,15 @@ const updateAgentStatus = async function (agentStatus, fog, transaction) { uuid: fog.uuid }, transaction) - if (!existingFog.warningMessage.includes('Background orchestration')) { - fogStatus.daemonStatus = agentStatus.daemonStatus - } else { + const platformStatus = await FogPlatformStatusManager.getParsedStatus(fog.uuid, transaction) + const platformGating = platformStatus && !['Ready', 'Deleting'].includes(platformStatus.phase) + const migrationGating = existingFog.warningMessage && + existingFog.warningMessage.startsWith('Platform reconcile:') + + if (platformGating || migrationGating) { fogStatus.daemonStatus = FogStates.WARNING + } else { + fogStatus.daemonStatus = agentStatus.daemonStatus } if (agentStatus.warningMessage.includes('HW signature changed') || agentStatus.warningMessage.includes('HW signature mismatch')) { diff --git a/src/services/iofog-service.js b/src/services/iofog-service.js index 3630f1d7..4c17ca9a 100644 --- 
a/src/services/iofog-service.js +++ b/src/services/iofog-service.js @@ -40,16 +40,21 @@ const { buildNatsMqttCertificateHostList } = require('../helpers/cert-dns-sans') const Op = require('sequelize').Op -const lget = require('lodash/get') const CertificateService = require('./certificate-service') const logger = require('../logger') const ServiceManager = require('../data/managers/service-manager') -const FogStates = require('../enums/fog-state') const SecretManager = require('../data/managers/secret-manager') const vaultManager = require('../vault/vault-manager') const SecretHelper = require('../helpers/secret-helper') const FogPublicKeyManager = require('../data/managers/iofog-public-key-manager') const { getServiceAnnotationTag } = require('../config/flavor') +const FogPlatformSpecManager = require('../data/managers/fog-platform-spec-manager') +const FogPlatformStatusManager = require('../data/managers/fog-platform-status-manager') +const FogPlatformReconcileTaskManager = require('../data/managers/fog-platform-reconcile-task-manager') +const { + buildPlatformSpecFromFogData, + mergePlatformSpecPatch +} = require('../schemas/fog-platform-spec') const SITE_CA_CERT = Constants.ROUTER_SITE_CA const DEFAULT_ROUTER_LOCAL_CA = Constants.DEFAULT_ROUTER_LOCAL_CA @@ -68,6 +73,23 @@ async function checkKubernetesEnvironment () { return controlPlane && controlPlane.toLowerCase() === 'kubernetes' } +async function _deriveRuntimePlatformModes (fog, transaction) { + const router = await fog.getRouter() + const nats = await fog.getNats() + return { + routerMode: router ? (router.isEdge ? 'edge' : 'interior') : undefined, + natsMode: nats ? (nats.isLeaf ? 'leaf' : 'server') : undefined + } +} + +function _resolveEffectivePlatformModes (fogData, runtimeModes, parsedSpec) { + const spec = parsedSpec && parsedSpec.spec ? parsedSpec.spec : {} + return { + routerMode: fogData.routerMode ?? runtimeModes.routerMode ?? spec.routerMode ?? 'none', + natsMode: fogData.natsMode ?? 
runtimeModes.natsMode ?? spec.natsMode ?? 'none' + } +} + async function getLocalCertificateHosts (fogData, uuid, transaction) { const defaultRouter = await RouterManager.findOne({ isDefault: true }, transaction) const isDefaultRouter = !!(defaultRouter && defaultRouter.iofogUuid === uuid) @@ -389,7 +411,7 @@ async function createFogEndPoint (fogData, isCLI, transaction) { throw new Errors.ValidationError(AppHelper.formatMessage(ErrorMessages.DUPLICATE_NAME, createFogData.name)) } - let defaultRouter, upstreamRouters + let defaultRouter if (fogData.routerMode === 'none') { const networkRouter = await RouterService.getNetworkRouter(fogData.networkRouter) if (!networkRouter) { @@ -398,7 +420,7 @@ async function createFogEndPoint (fogData, isCLI, transaction) { createFogData.routerId = networkRouter.id } else { defaultRouter = await RouterManager.findOne({ isDefault: true }, transaction) - upstreamRouters = await RouterService.validateAndReturnUpstreamRouters(fogData.upstreamRouters, fogData.isSystem, defaultRouter) + await RouterService.validateAndReturnUpstreamRouters(fogData.upstreamRouters, fogData.isSystem, defaultRouter) } const fog = await FogManager.create(createFogData, transaction) @@ -406,92 +428,16 @@ async function createFogEndPoint (fogData, isCLI, transaction) { // Set tags (synchronously, as this is a simple DB op) await _setTags(fog, fogData.tags, transaction) - // Return fog UUID immediately - const res = { uuid: fog.uuid } - - const natsConfig = { - mode: natsMode, - serverPort: fogData.natsServerPort, - leafPort: fogData.natsLeafPort, - clusterPort: fogData.natsClusterPort, - mqttPort: fogData.natsMqttPort, - httpPort: fogData.natsHttpPort, - jsStorageSize: fogData.jsStorageSize || NatsService.DEFAULT_JS_STORAGE_SIZE, - jsMemoryStoreSize: fogData.jsMemoryStoreSize || NatsService.DEFAULT_JS_MEMORY_STORE_SIZE - } - - if (fogData.upstreamNatsServers) { - natsConfig.upstreamNatsServers = fogData.upstreamNatsServers - } - - // Start background 
orchestration - setImmediate(() => { - (async () => { - const transaction = { fakeTransaction: true } - try { - // --- Begin orchestration logic (previously inside runWithRetries) --- - await _handleRouterCertificates({ ...fogData, name: fog.name }, fog.uuid, false, transaction) - await NatsService.ensureNatsForFog(fog, natsConfig, transaction) - - if (fogData.routerMode !== 'none') { - await RouterService.createRouterForFog(fogData, fog.uuid, upstreamRouters) - - // Service Distribution Logic - const serviceTags = await _extractServiceTags(fogData.tags) - if (serviceTags.length > 0) { - const services = await _findMatchingServices(serviceTags, transaction) - if (services.length > 0) { - const application = await ensureSystemApplication(fog, transaction) - const routerName = getSystemMicroserviceName('router') - const routerMicroservice = await MicroserviceManager.findOne({ - name: routerName, - applicationId: application.id - }, transaction) - if (!routerMicroservice) { - throw new Errors.NotFoundError(`Router microservice not found: ${routerName}`) - } - let config = JSON.parse(routerMicroservice.config || '{}') - for (const service of services) { - const listenerConfig = _buildTcpListenerForFog(service) - config = _mergeTcpListener(config, listenerConfig) - } - await MicroserviceManager.update( - { uuid: routerMicroservice.uuid }, - { config: JSON.stringify(config) }, - transaction - ) - await ChangeTrackingService.update(fog.uuid, ChangeTrackingService.events.microserviceConfig, transaction) - } - } - } - - await ChangeTrackingService.create(fog.uuid, transaction) - if (fogData.abstractedHardwareEnabled) { - await _createHalMicroserviceForFog(fog, null, transaction) - } - if (fogData.bluetoothEnabled) { - await _createBluetoothMicroserviceForFog(fog, null, transaction) - } - await ChangeTrackingService.update(createFogData.uuid, ChangeTrackingService.events.microserviceCommon, transaction) - // --- End orchestration logic --- - // Set fog node as healthy - 
await FogManager.update({ uuid: fog.uuid }, { warningMessage: 'HEALTHY' }, transaction) - } catch (err) { - logger.error('Background orchestration failed in createFogEndPoint: ' + err.message) - // Set fog node as warning with error message - await FogManager.update( - { uuid: fog.uuid }, - { - daemonStatus: FogStates.WARNING, - warningMessage: `Background orchestration error: ${err.message}` - }, - transaction - ) - } - })() - }) + const platformSpec = buildPlatformSpecFromFogData(fogData, { applyCreateDefaults: true }) + const { generation } = await FogPlatformSpecManager.upsertSpec(fog.uuid, platformSpec, transaction) + await FogPlatformStatusManager.ensurePending(fog.uuid, transaction) + await FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask({ + fogUuid: fog.uuid, + reason: 'spec-changed', + specGeneration: generation + }, transaction) - return res + return { uuid: fog.uuid } } async function _setTags (fogModel, tagsArray, transaction) { @@ -569,12 +515,21 @@ async function updateFogEndPoint (fogData, isCLI, transaction) { throw new Errors.ValidationError('Agent Resource Name is immutable') } - if (updateFogData.isSystem && updateFogData.natsMode !== 'server') { - throw new Errors.ValidationError(AppHelper.formatMessage(ErrorMessages.INVALID_NATS_MODE, updateFogData.natsMode)) + const parsedSpec = await FogPlatformSpecManager.getParsedSpec(fogData.uuid, transaction) + const runtimeModes = await _deriveRuntimePlatformModes(oldFog, transaction) + const { routerMode: effectiveRouterMode, natsMode: effectiveNatsMode } = _resolveEffectivePlatformModes( + fogData, + runtimeModes, + parsedSpec + ) + + const isSystem = updateFogData.isSystem === undefined ? 
oldFog.isSystem : updateFogData.isSystem + if (isSystem && effectiveNatsMode !== 'server') { + throw new Errors.ValidationError(AppHelper.formatMessage(ErrorMessages.INVALID_NATS_MODE, effectiveNatsMode)) } - if (updateFogData.isSystem && updateFogData.routerMode !== 'interior') { - throw new Errors.ValidationError(AppHelper.formatMessage(ErrorMessages.INVALID_ROUTER_MODE, updateFogData.routerMode)) + if (isSystem && effectiveRouterMode !== 'interior') { + throw new Errors.ValidationError(AppHelper.formatMessage(ErrorMessages.INVALID_ROUTER_MODE, effectiveRouterMode)) } if (updateFogData.isSystem !== undefined && updateFogData.isSystem !== oldFog.isSystem) { @@ -604,206 +559,19 @@ async function updateFogEndPoint (fogData, isCLI, transaction) { } } - // Get all router config informations - const router = await oldFog.getRouter() - const host = fogData.host || lget(router, 'host') - const upstreamRoutersConnections = router ? (await RouterConnectionManager.findAllWithRouters({ sourceRouter: router.id }, transaction) || []) : [] - const upstreamRoutersIofogUuid = fogData.upstreamRouters || await Promise.all(upstreamRoutersConnections.map(connection => connection.dest.iofogUuid)) - const routerMode = fogData.routerMode || (router ? (router.isEdge ? 'edge' : 'interior') : 'none') - const messagingPort = fogData.messagingPort || (router ? router.messagingPort : null) - const interRouterPort = fogData.interRouterPort || (router ? router.interRouterPort : null) - const edgeRouterPort = fogData.edgeRouterPort || (router ? router.edgeRouterPort : null) - let networkRouter - - const isSystem = updateFogData.isSystem === undefined ? oldFog.isSystem : updateFogData.isSystem - if (isSystem && routerMode !== 'interior') { - throw new Errors.ValidationError(AppHelper.formatMessage(ErrorMessages.INVALID_ROUTER_MODE, fogData.routerMode)) - } - - let isRouterModeChanged = false - const oldRouterMode = (router ? (router.isEdge ? 
'edge' : 'interior') : 'none') - if (fogData.routerMode && fogData.routerMode !== oldRouterMode) { - if (fogData.routerMode === 'none' || oldRouterMode === 'none') { - isRouterModeChanged = true - } - } - const isHostChanged = !!(updateFogData.host && updateFogData.host !== oldFog.host) - const shouldRecreateCerts = isRouterModeChanged || isHostChanged - await FogManager.update(queryFogData, updateFogData, transaction) await ChangeTrackingService.update(fogData.uuid, ChangeTrackingService.events.config, transaction) - // Return immediately - const res = { uuid: fogData.uuid } - - const natsConfig = { - mode: fogData.natsMode, - serverPort: fogData.natsServerPort, - leafPort: fogData.natsLeafPort, - clusterPort: fogData.natsClusterPort, - mqttPort: fogData.natsMqttPort, - httpPort: fogData.natsHttpPort, - upstreamNatsServers: fogData.upstreamNatsServers, - jsStorageSize: fogData.jsStorageSize, - jsMemoryStoreSize: fogData.jsMemoryStoreSize - } - - // Start background orchestration - setImmediate(() => { - (async () => { - const transaction = { fakeTransaction: true } - try { - // --- Begin orchestration logic --- - const fog = await FogManager.findOne({ uuid: fogData.uuid }, transaction) - await _handleRouterCertificates({ ...fogData, name: fog.name }, fog.uuid, shouldRecreateCerts, transaction) - if (shouldRecreateCerts) { - await ChangeTrackingService.update(fogData.uuid, ChangeTrackingService.events.volumeMounts, transaction) - } - if (natsConfig.mode === 'none') { - await NatsService.cleanupNatsForFog(fog, transaction) - await _deleteNatsMicroserviceByFog(fogData, transaction) - await ChangeTrackingService.update(fogData.uuid, ChangeTrackingService.events.microserviceList, transaction) - } else { - if (isHostChanged) { - await _reconcileNatsCertificatesOnHostChange(fog, transaction) - } - await NatsService.ensureNatsForFog(fog, natsConfig, transaction) - } - - if (routerMode === 'none') { - networkRouter = await 
RouterService.getNetworkRouter(fogData.networkRouter) - if (!networkRouter) { - throw new Errors.NotFoundError(AppHelper.formatMessage(ErrorMessages.INVALID_ROUTER, !fogData.networkRouter ? Constants.DEFAULT_ROUTER_NAME : fogData.networkRouter)) - } - if (router) { - await _deleteFogRouter(fogData, transaction) - } - } else { - const defaultRouter = await RouterManager.findOne({ isDefault: true }, transaction) - const upstreamRouters = await RouterService.validateAndReturnUpstreamRouters(upstreamRoutersIofogUuid, oldFog.isSystem, defaultRouter) - if (!router) { - networkRouter = await RouterService.createRouterForFog(fogData, oldFog.uuid, upstreamRouters) - // --- Service Distribution Logic --- - const serviceTags = await _extractServiceTags(fogData.tags) - if (serviceTags.length > 0) { - const services = await _findMatchingServices(serviceTags, transaction) - if (services.length > 0) { - const application = await ensureSystemApplication(oldFog, transaction) - const routerName = getSystemMicroserviceName('router') - const routerMicroservice = await MicroserviceManager.findOne({ - name: routerName, - applicationId: application.id - }, transaction) - if (!routerMicroservice) { - throw new Errors.NotFoundError(`Router microservice not found: ${routerName}`) - } - let config = JSON.parse(routerMicroservice.config || '{}') - for (const service of services) { - const listenerConfig = _buildTcpListenerForFog(service) - config = _mergeTcpListener(config, listenerConfig) - } - await MicroserviceManager.update( - { uuid: routerMicroservice.uuid }, - { config: JSON.stringify(config) }, - transaction - ) - await ChangeTrackingService.update(fogData.uuid, ChangeTrackingService.events.microserviceConfig, transaction) - } - } - } else { - const existingConnectors = await _extractExistingTcpConnectors(fogData.uuid, transaction) - networkRouter = await RouterService.updateRouter(router, { - messagingPort, interRouterPort, edgeRouterPort, isEdge: routerMode === 'edge', host - }, 
upstreamRouters, fogData.containerEngine) - // --- Service Distribution Logic --- - const serviceTags = await _extractServiceTags(fogData.tags) - const application = await ensureSystemApplication(oldFog, transaction) - const routerName = getSystemMicroserviceName('router') - const routerMicroservice = await MicroserviceManager.findOne({ - name: routerName, - applicationId: application.id - }, transaction) - if (!routerMicroservice) { - throw new Errors.NotFoundError(`Router microservice not found: ${routerName}`) - } - let config = JSON.parse(routerMicroservice.config || '{}') - if (serviceTags.length > 0) { - const services = await _findMatchingServices(serviceTags, transaction) - if (services.length > 0) { - for (const service of services) { - const listenerConfig = _buildTcpListenerForFog(service) - config = _mergeTcpListener(config, listenerConfig) - } - } - } - // Merge back existing connectors if any - if (existingConnectors && Object.keys(existingConnectors).length > 0) { - for (const connectorName in existingConnectors) { - const connectorObj = existingConnectors[connectorName] - config = _mergeTcpConnector(config, connectorObj) - } - } - await MicroserviceManager.update( - { uuid: routerMicroservice.uuid }, - { config: JSON.stringify(config) }, - transaction - ) - await ChangeTrackingService.update(fogData.uuid, ChangeTrackingService.events.microserviceConfig, transaction) - await ChangeTrackingService.update(fogData.uuid, ChangeTrackingService.events.routerChanged, transaction) - } - } - updateFogData.routerId = networkRouter.id - - // If router changed, set routerChanged flag - if (updateFogData.routerId !== oldFog.routerId || updateFogData.routerMode !== oldFog.routerMode) { - await ChangeTrackingService.update(fogData.uuid, ChangeTrackingService.events.routerChanged, transaction) - await ChangeTrackingService.update(fogData.uuid, ChangeTrackingService.events.microserviceList, transaction) - } - - let msChanged = false - if (updateFogData.host && 
updateFogData.host !== oldFog.host) { - await _updateMicroserviceExtraHosts(fogData.uuid, updateFogData.host, transaction) - } - if (oldFog.abstractedHardwareEnabled === true && fogData.abstractedHardwareEnabled === false) { - await _deleteHalMicroserviceByFog(fogData, transaction) - msChanged = true - } - if (oldFog.abstractedHardwareEnabled === false && fogData.abstractedHardwareEnabled === true) { - await _createHalMicroserviceForFog(fogData, oldFog, transaction) - msChanged = true - } - if (oldFog.bluetoothEnabled === true && fogData.bluetoothEnabled === false) { - await _deleteBluetoothMicroserviceByFog(fogData, transaction) - msChanged = true - } - if (oldFog.bluetoothEnabled === false && fogData.bluetoothEnabled === true) { - await _createBluetoothMicroserviceForFog(fogData, oldFog, transaction) - msChanged = true - } - if (msChanged) { - await ChangeTrackingService.update(fogData.uuid, ChangeTrackingService.events.microserviceCommon, transaction) - } - // --- End orchestration logic --- - // Set fog node as healthy - await FogManager.update({ uuid: fogData.uuid }, { warningMessage: 'HEALTHY' }, transaction) - } catch (err) { - logger.error('Background orchestration failed in updateFogEndPoint: ' + err.message, { - stack: err.stack - }) - await FogManager.update( - { uuid: fogData.uuid }, - { - daemonStatus: FogStates.WARNING, - warningMessage: `Background orchestration error: ${err.message}` - }, - transaction - ) - } - })() - }) + const mergedSpec = mergePlatformSpecPatch(parsedSpec ? 
parsedSpec.spec : {}, fogData) + const { generation } = await FogPlatformSpecManager.upsertSpec(fogData.uuid, mergedSpec, transaction) + await FogPlatformStatusManager.ensurePending(fogData.uuid, transaction) + await FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask({ + fogUuid: fogData.uuid, + reason: 'spec-changed', + specGeneration: generation + }, transaction) - // Return immediately - return res + return { uuid: fogData.uuid } } async function _updateMicroserviceExtraHosts (fogUuid, host, transaction) { @@ -890,9 +658,34 @@ async function deleteFogEndPoint (fogData, isCLI, transaction) { throw new Errors.NotFoundError(AppHelper.formatMessage(ErrorMessages.INVALID_IOFOG_UUID, fogData.uuid)) } - await _deleteFogRouter(fogData, transaction) + await FogPlatformStatusManager.setPhase(fogData.uuid, 'Deleting', {}, transaction) + await FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask({ + fogUuid: fogData.uuid, + reason: 'delete' + }, transaction) - await _processDeleteCommand(fog, transaction) + return { uuid: fogData.uuid } +} + +async function reconcileFogEndpoint (fogData, transaction) { + const fog = await FogManager.findOne({ uuid: fogData.uuid }, transaction) + if (!fog) { + throw new Errors.NotFoundError(AppHelper.formatMessage(ErrorMessages.INVALID_IOFOG_UUID, fogData.uuid)) + } + + const status = await FogPlatformStatusManager.getParsedStatus(fogData.uuid, transaction) + if (status && status.phase === 'Failed') { + await FogPlatformStatusManager.setPhase(fogData.uuid, 'Pending', { lastError: null }, transaction) + } + + const parsedSpec = await FogPlatformSpecManager.getParsedSpec(fogData.uuid, transaction) + await FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask({ + fogUuid: fogData.uuid, + reason: 'manual-retry', + specGeneration: parsedSpec ? 
parsedSpec.generation : null + }, transaction) + + return { uuid: fogData.uuid } } function _getRouterUuid (router, defaultRouter) { @@ -903,60 +696,116 @@ function _getNatsUuid (nats, defaultHub) { return (defaultHub && (nats.id === defaultHub.id)) ? Constants.DEFAULT_NATS_HUB_NAME : nats.iofogUuid } -async function _getFogRouterConfig (fog, transaction) { - // Get fog router config +function _getSpecObject (parsedSpec) { + return parsedSpec && parsedSpec.spec ? parsedSpec.spec : {} +} + +function _formatPlatformStatus (status, generation) { + if (!status) { + return null + } + const formatted = { + observedGeneration: status.observedGeneration, + phase: status.phase, + lastError: status.lastError, + lastTransitionAt: status.lastTransitionAt, + conditions: status.conditions + } + if (generation != null) { + formatted.generation = generation + } + return formatted +} + +async function _getFogRouterConfig (fog, parsedSpec, transaction) { const defaultRouter = await RouterManager.findOne({ isDefault: true }, transaction) const router = await fog.getRouter() - const routerConfig = { - } - // Router mode is either interior or edge if (router) { - routerConfig.routerMode = router.isEdge ? 'edge' : 'interior' - routerConfig.messagingPort = router.messagingPort + const routerConfig = { + routerMode: router.isEdge ? 'edge' : 'interior', + messagingPort: router.messagingPort + } if (routerConfig.routerMode === 'interior') { routerConfig.interRouterPort = router.interRouterPort routerConfig.edgeRouterPort = router.edgeRouterPort } - // Get upstream routers const upstreamRoutersConnections = await RouterConnectionManager.findAllWithRouters({ sourceRouter: router.id }, transaction) - routerConfig.upstreamRouters = upstreamRoutersConnections ? 
upstreamRoutersConnections.map(r => _getRouterUuid(r.dest, defaultRouter)) : [] - } else { - routerConfig.routerMode = 'none' - const networkRouter = await RouterManager.findOne({ id: fog.routerId }, transaction) - if (networkRouter) { - routerConfig.networkRouter = _getRouterUuid(networkRouter, defaultRouter) + routerConfig.upstreamRouters = upstreamRoutersConnections + ? upstreamRoutersConnections.map(r => _getRouterUuid(r.dest, defaultRouter)) + : [] + return routerConfig + } + + const spec = _getSpecObject(parsedSpec) + const routerMode = spec.routerMode ?? 'none' + if (routerMode === 'none') { + const routerConfig = { routerMode: 'none' } + if (spec.networkRouter) { + routerConfig.networkRouter = spec.networkRouter + } else if (fog.routerId) { + const networkRouter = await RouterManager.findOne({ id: fog.routerId }, transaction) + if (networkRouter) { + routerConfig.networkRouter = _getRouterUuid(networkRouter, defaultRouter) + } } + return routerConfig } + const routerConfig = { + routerMode, + messagingPort: spec.messagingPort, + upstreamRouters: spec.upstreamRouters || [] + } + if (routerMode === 'interior') { + routerConfig.interRouterPort = spec.interRouterPort + routerConfig.edgeRouterPort = spec.edgeRouterPort + } return routerConfig } -async function _getFogNatsConfig (fog, transaction) { +async function _getFogNatsConfig (fog, parsedSpec, transaction) { const defaultHub = await NatsInstanceManager.findOne({ isHub: true }, transaction) const nats = await fog.getNats() - const natsConfig = {} if (nats) { - natsConfig.natsMode = nats.isLeaf ? 'leaf' : 'server' - natsConfig.natsServerPort = nats.serverPort - natsConfig.natsLeafPort = nats.leafPort - natsConfig.natsClusterPort = nats.clusterPort - natsConfig.natsMqttPort = nats.mqttPort - natsConfig.natsHttpPort = nats.httpPort - natsConfig.jsStorageSize = nats.jsStorageSize - natsConfig.jsMemoryStoreSize = nats.jsMemoryStoreSize - + const natsConfig = { + natsMode: nats.isLeaf ? 
'leaf' : 'server', + natsServerPort: nats.serverPort, + natsLeafPort: nats.leafPort, + natsClusterPort: nats.clusterPort, + natsMqttPort: nats.mqttPort, + natsHttpPort: nats.httpPort, + jsStorageSize: nats.jsStorageSize, + jsMemoryStoreSize: nats.jsMemoryStoreSize + } const upstreamNatsConnections = await NatsConnectionManager.findAllWithNats({ sourceNats: nats.id }, transaction) natsConfig.upstreamNatsServers = upstreamNatsConnections ? upstreamNatsConnections.map((connection) => _getNatsUuid(connection.dest, defaultHub)) : [] - } else { - natsConfig.natsMode = 'none' - natsConfig.upstreamNatsServers = [] + return natsConfig + } + + const spec = _getSpecObject(parsedSpec) + const natsMode = spec.natsMode ?? 'none' + if (natsMode === 'none') { + return { + natsMode: 'none', + upstreamNatsServers: [] + } } - return natsConfig + return { + natsMode, + natsServerPort: spec.natsServerPort, + natsLeafPort: spec.natsLeafPort, + natsClusterPort: spec.natsClusterPort, + natsMqttPort: spec.natsMqttPort, + natsHttpPort: spec.natsHttpPort, + jsStorageSize: spec.jsStorageSize, + jsMemoryStoreSize: spec.jsMemoryStoreSize, + upstreamNatsServers: spec.upstreamNatsServers || [] + } } async function _getFogVolumeMounts (fog, transaction) { @@ -977,9 +826,11 @@ async function _getFogVolumeMounts (fog, transaction) { }) } -async function _getFogExtraInformation (fog, transaction) { - const routerConfig = await _getFogRouterConfig(fog, transaction) - const natsConfig = await _getFogNatsConfig(fog, transaction) +async function _getFogExtraInformation (fog, transaction, options = {}) { + const fogUuid = fog.uuid + const parsedSpec = await FogPlatformSpecManager.getParsedSpec(fogUuid, transaction) + const routerConfig = await _getFogRouterConfig(fog, parsedSpec, transaction) + const natsConfig = await _getFogNatsConfig(fog, parsedSpec, transaction) const volumeMounts = await _getFogVolumeMounts(fog, transaction) // Transform to plain JS object if (fog.toJSON && typeof fog.toJSON === 
'function') { @@ -995,7 +846,12 @@ async function _getFogExtraInformation (fog, transaction) { description: architecture.description } : undefined - return { ...fogFields, archId, arch, tags: _mapTags(fog), ...routerConfig, ...natsConfig, volumeMounts } + const result = { ...fogFields, archId, arch, tags: _mapTags(fog), ...routerConfig, ...natsConfig, volumeMounts } + if (options.includePlatformStatus) { + const status = await FogPlatformStatusManager.getParsedStatus(fogUuid, transaction) + result.platformStatus = _formatPlatformStatus(status, parsedSpec ? parsedSpec.generation : null) + } + return result } // Map tags to string array @@ -1043,7 +899,7 @@ async function getFog (fogData, isCLI, transaction) { throw new Errors.NotFoundError(AppHelper.formatMessage(ErrorMessages.INVALID_IOFOG_UUID, fogData.uuid)) } - return _getFogExtraInformation(fog, transaction) + return _getFogExtraInformation(fog, transaction, { includePlatformStatus: true }) } async function getFogEndPoint (fogData, isCLI, transaction) { @@ -1679,6 +1535,7 @@ module.exports = { createFogEndPoint: TransactionDecorator.generateTransaction(createFogEndPoint, bypassOptions), updateFogEndPoint: TransactionDecorator.generateTransaction(updateFogEndPoint, bypassOptions), deleteFogEndPoint: TransactionDecorator.generateTransaction(deleteFogEndPoint, bypassOptions), + reconcileFogEndpoint: TransactionDecorator.generateTransaction(reconcileFogEndpoint, bypassOptions), getFogEndPoint: TransactionDecorator.generateTransaction(getFogEndPoint), getFogListEndPoint: TransactionDecorator.generateTransaction(getFogListEndPoint), generateProvisioningKeyEndPoint: TransactionDecorator.generateTransaction(generateProvisioningKeyEndPoint), @@ -1699,5 +1556,14 @@ module.exports = { _mergeTcpConnector, _mergeTcpListener, checkKubernetesEnvironment, - _handleRouterCertificates: TransactionDecorator.generateTransaction(_handleRouterCertificates) + _handleRouterCertificates: 
TransactionDecorator.generateTransaction(_handleRouterCertificates), + _deleteFogRouter: TransactionDecorator.generateTransaction(_deleteFogRouter), + _processDeleteCommand: TransactionDecorator.generateTransaction(_processDeleteCommand), + _reconcileNatsCertificatesOnHostChange: TransactionDecorator.generateTransaction(_reconcileNatsCertificatesOnHostChange), + _deleteNatsMicroserviceByFog: TransactionDecorator.generateTransaction(_deleteNatsMicroserviceByFog), + _createHalMicroserviceForFog: TransactionDecorator.generateTransaction(_createHalMicroserviceForFog), + _deleteHalMicroserviceByFog: TransactionDecorator.generateTransaction(_deleteHalMicroserviceByFog), + _createBluetoothMicroserviceForFog: TransactionDecorator.generateTransaction(_createBluetoothMicroserviceForFog), + _deleteBluetoothMicroserviceByFog: TransactionDecorator.generateTransaction(_deleteBluetoothMicroserviceByFog), + _updateMicroserviceExtraHosts: TransactionDecorator.generateTransaction(_updateMicroserviceExtraHosts) } diff --git a/src/services/router-service.js b/src/services/router-service.js index 817f135b..d0ebd60b 100644 --- a/src/services/router-service.js +++ b/src/services/router-service.js @@ -123,7 +123,7 @@ async function createRouterForFog (fogData, uuid, upstreamRouters, transaction) await _createRouterPorts(routerMicroservice.uuid, fogData.edgeRouterPort, transaction) await _createRouterPorts(routerMicroservice.uuid, fogData.interRouterPort, transaction) } - await _ensureRouterSslVolumeMountsAndMappings(uuid, routerMicroservice.uuid, transaction, false) + await _ensureRouterTlsVolumeMountsAndMappings(uuid, routerMicroservice.uuid, transaction, false) return router } @@ -251,7 +251,7 @@ async function updateConfig (routerID, containerEngine, transaction) { newConfig.connectors[connectorConfig.name] = connectorConfig } - await _ensureRouterSslVolumeMountsAndMappings(router.iofogUuid, routerMicroservice.uuid, transaction, true) + await 
_ensureRouterTlsVolumeMountsAndMappings(router.iofogUuid, routerMicroservice.uuid, transaction, true) await ChangeTrackingService.update(router.iofogUuid, ChangeTrackingService.events.microserviceConfig, transaction) // Check if configuration needs update @@ -545,7 +545,7 @@ const ROUTER_SSL_PROFILE_NAMES = (name) => [ `router-local-agent-${name}` ] -async function _ensureRouterSslVolumeMountsAndMappings (iofogUuid, routerMicroserviceUuid, transaction, doCleanup = false) { +async function _ensureRouterTlsVolumeMountsAndMappings (iofogUuid, routerMicroserviceUuid, transaction, doCleanup = false) { const fog = await FogManager.findOne({ uuid: iofogUuid }, transaction) if (!fog) { throw new Errors.NotFoundError(AppHelper.formatMessage(ErrorMessages.INVALID_IOFOG_UUID, iofogUuid)) diff --git a/src/services/services-service.js b/src/services/services-service.js index 149179ce..a04bcbee 100644 --- a/src/services/services-service.js +++ b/src/services/services-service.js @@ -13,6 +13,7 @@ const logger = require('../logger') const FogManager = require('../data/managers/iofog-manager') const TagsManager = require('../data/managers/tags-manager') const ChangeTrackingService = require('./change-tracking-service') +const ServicePlatformReconcileTaskManager = require('../data/managers/service-platform-reconcile-task-manager') const ApplicationManager = require('../data/managers/application-manager') const { ensureSystemApplication, @@ -45,6 +46,51 @@ async function _setTags (serviceModel, tagsArray, transaction) { } } +function _normalizeSnapshotTags (tags) { + if (!tags || tags.length === 0) { + return [] + } + return tags.map((tag) => (typeof tag === 'string' ? 
tag : tag.value)) +} + +function _buildServiceSpecSnapshot (fields) { + return { + name: fields.name, + type: fields.type, + resource: fields.resource, + defaultBridge: fields.defaultBridge, + bridgePort: fields.bridgePort, + targetPort: fields.targetPort, + servicePort: fields.servicePort, + k8sType: fields.k8sType, + serviceEndpoint: fields.serviceEndpoint, + tags: _normalizeSnapshotTags(fields.tags) + } +} + +function _mergeServiceFieldsForSnapshot (base, patch, snapshotTags) { + return _buildServiceSpecSnapshot({ + name: base.name, + type: base.type, + resource: patch.resource !== undefined ? patch.resource : base.resource, + defaultBridge: patch.defaultBridge !== undefined ? patch.defaultBridge : base.defaultBridge, + bridgePort: base.bridgePort, + targetPort: patch.targetPort !== undefined ? patch.targetPort : base.targetPort, + servicePort: patch.servicePort !== undefined ? patch.servicePort : base.servicePort, + k8sType: patch.k8sType !== undefined ? patch.k8sType : base.k8sType, + serviceEndpoint: patch.serviceEndpoint !== undefined ? patch.serviceEndpoint : base.serviceEndpoint, + tags: snapshotTags + }) +} + +async function _enqueueServiceReconcileTask (serviceName, reason, specSnapshot, transaction) { + await ServicePlatformReconcileTaskManager.enqueueServicePlatformReconcileTask({ + serviceName, + reason, + specSnapshot + }, transaction) +} + async function handleServiceDistribution (serviceTags, transaction) { const tags = Array.isArray(serviceTags) ? serviceTags : (serviceTags ? [].concat(serviceTags) : []) logger.debug('handleServiceDistribution: entry', { serviceTagsType: typeof serviceTags, isArray: Array.isArray(serviceTags), tagsLength: tags.length }) @@ -1060,45 +1106,21 @@ async function createServiceEndpoint (serviceData, transaction) { logger.debug('Creating service in database') const service = await ServiceManager.create(serviceData, transaction) - // 8. 
Start background orchestration - setImmediate(async () => { - try { - // Set tags if provided - logger.debug('Setting tags (background)') - if (serviceData.tags && serviceData.tags.length > 0) { - await _setTags(service, serviceData.tags, transaction) - } - - // Add TCP connector - logger.debug('Adding TCP connector (background)') - await _addTcpConnector(serviceData, transaction) - - // Add TCP listener - logger.debug('Adding TCP listener (background)') - await _addTcpListener(serviceData, transaction) - - // Create K8s service if needed - if ((serviceData.type === 'microservice' || serviceData.type === 'agent' || serviceData.type === 'external') && isK8s) { - logger.debug('Creating K8s service (background)') - await _createK8sService(serviceData, transaction) - } + if (serviceData.tags && serviceData.tags.length > 0) { + await _setTags(service, serviceData.tags, transaction) + } - // Update provisioning status to ready - await ServiceManager.update({ id: service.id }, { provisioningStatus: 'ready', provisioningError: null }, transaction) - } catch (err) { - logger.error({ - err, - msg: 'Background provisioning failed', - serviceId: service.id, - serviceName: serviceData.name, - serviceType: serviceData.type - }) - // Update provisioning status to failed and set error message - await ServiceManager.update({ id: service.id }, { provisioningStatus: 'failed', provisioningError: err.message }, transaction) - } + const specSnapshot = _buildServiceSpecSnapshot({ + ...serviceData, + name: service.name, + bridgePort: service.bridgePort, + servicePort: service.servicePort, + serviceEndpoint: service.serviceEndpoint, + tags: serviceData.tags || [] }) + await _enqueueServiceReconcileTask(service.name, 'spec-changed', specSnapshot, transaction) - // 9. Return service immediately + // 8. 
Return service immediately return service } @@ -1113,6 +1135,7 @@ async function updateServiceEndpoint (serviceName, serviceData, transaction) { if (!existingService) { throw new Errors.NotFoundError(`Service with name ${serviceName} not found`) } + const oldTags = _mapTags(existingService) // 3. Check if service type is being changed if (serviceData.type && serviceData.type !== existingService.type) { @@ -1180,105 +1203,77 @@ async function updateServiceEndpoint (serviceName, serviceData, transaction) { transaction ) - // 9. Start background orchestration - setImmediate(async () => { - try { - // Update tags if provided - if (serviceData.tags) { - await _setTags(existingService, serviceData.tags, transaction) - } - - // Handle resource changes - if (serviceData.resource && - JSON.stringify(serviceData.resource) !== JSON.stringify(existingService.resource)) { - await _deleteTcpConnector(serviceName, transaction) - await _addTcpConnector(serviceData, transaction) - } else { - await _updateTcpConnector(serviceData, transaction) - // await _updateTcpListener(serviceData, transaction) - } + if (serviceData.tags) { + await _setTags(existingService, serviceData.tags, transaction) + } - // Update K8s service if needed - if ((existingService.type === 'microservice' || existingService.type === 'agent' || existingService.type === 'external') && isK8s) { - await _updateK8sService(serviceData, transaction) - } + const snapshotTags = serviceData.tags !== undefined + ? 
[...new Set([...oldTags, ...serviceData.tags])] + : oldTags + const specSnapshot = _mergeServiceFieldsForSnapshot(existingService, serviceData, snapshotTags) + await _enqueueServiceReconcileTask(serviceName, 'spec-changed', specSnapshot, transaction) - // Update provisioning status to ready - await ServiceManager.update( - { name: serviceName }, - { provisioningStatus: 'ready', provisioningError: null }, - transaction - ) - } catch (err) { - logger.error({ - err, - msg: 'Background provisioning failed (update)', - serviceName - }) - // Update provisioning status to failed and set error message - await ServiceManager.update( - { name: serviceName }, - { provisioningStatus: 'failed', provisioningError: err.message }, - transaction - ) - } - }) - - // 10. Return updated service immediately + // 9. Return updated service immediately return updatedService } // Delete service endpoint async function deleteServiceEndpoint (serviceName, transaction) { logger.debug('deleteServiceEndpoint: start', { serviceName }) - // Get existing service - const existingService = await ServiceManager.findOne({ name: serviceName }, transaction) + const existingService = await ServiceManager.findOneWithTags({ name: serviceName }, transaction) if (!existingService) { throw new Errors.NotFoundError(`Service with name ${serviceName} not found`) } logger.debug('deleteServiceEndpoint: existingService', { type: existingService.type, defaultBridge: existingService.defaultBridge }) - const isK8s = await checkKubernetesEnvironment() + const specSnapshot = _buildServiceSpecSnapshot({ + ...existingService, + tags: _mapTags(existingService) + }) + await _enqueueServiceReconcileTask(serviceName, 'delete', specSnapshot, transaction) - try { - // Delete TCP connector - logger.debug('deleteServiceEndpoint: deleting TCP connector') - await _deleteTcpConnector(serviceName, transaction) - logger.debug('deleteServiceEndpoint: TCP connector deleted') - - // Delete TCP listener - 
logger.debug('deleteServiceEndpoint: deleting TCP listener') - await _deleteTcpListener(serviceName, transaction) - logger.debug('deleteServiceEndpoint: TCP listener deleted') - - // Delete K8s service if needed - if (isK8s && existingService.type !== 'k8s') { - logger.debug('deleteServiceEndpoint: deleting K8s service') - await _deleteK8sService(serviceName) - logger.debug('deleteServiceEndpoint: K8s service deleted') - } + logger.debug('deleteServiceEndpoint: deleting service from DB') + await ServiceManager.delete({ name: serviceName }, transaction) + logger.debug('deleteServiceEndpoint: done') - // Finally delete the service from database - logger.debug('deleteServiceEndpoint: deleting service from DB') - await ServiceManager.delete({ name: serviceName }, transaction) - logger.debug('deleteServiceEndpoint: done') + return { message: `Service ${serviceName} deleted successfully` } +} - return { message: `Service ${serviceName} deleted successfully` } - } catch (error) { - logger.error({ - err: error, - msg: 'deleteServiceEndpoint: error', - serviceName - }) - - // Wrap the error in a proper error type if it's not already - if (!(error instanceof Errors.ValidationError) && - !(error instanceof Errors.NotFoundError) && - !(error instanceof Errors.TransactionError) && - !(error instanceof Errors.DuplicatePropertyError)) { - throw new Errors.ValidationError(`Failed to delete service: ${error.message}`) - } - throw error +async function reconcileServiceEndpoint (serviceName, transaction) { + const service = await ServiceManager.findOneWithTags({ name: serviceName }, transaction) + if (!service) { + throw new Errors.NotFoundError(`Service with name ${serviceName} not found`) + } + + if (service.provisioningStatus === 'failed') { + await ServiceManager.update( + { name: serviceName }, + { provisioningStatus: 'pending', provisioningError: null }, + transaction + ) + service.provisioningStatus = 'pending' + service.provisioningError = null + } + + const specSnapshot = 
_buildServiceSpecSnapshot({ + ...service, + tags: _mapTags(service) + }) + await _enqueueServiceReconcileTask(serviceName, 'manual-retry', specSnapshot, transaction) + + return { + name: service.name, + type: service.type, + resource: service.resource, + defaultBridge: service.defaultBridge, + bridgePort: service.bridgePort, + targetPort: service.targetPort, + servicePort: service.servicePort, + k8sType: service.k8sType, + serviceEndpoint: service.serviceEndpoint, + tags: _mapTags(service), + provisioningStatus: service.provisioningStatus, + provisioningError: service.provisioningError } } @@ -1375,6 +1370,7 @@ module.exports = { createServiceEndpoint: TransactionDecorator.generateTransaction(createServiceEndpoint), updateServiceEndpoint: TransactionDecorator.generateTransaction(updateServiceEndpoint), deleteServiceEndpoint: TransactionDecorator.generateTransaction(deleteServiceEndpoint), + reconcileServiceEndpoint: TransactionDecorator.generateTransaction(reconcileServiceEndpoint), getServicesListEndpoint: TransactionDecorator.generateTransaction(getServicesListEndpoint), getServiceEndpoint: TransactionDecorator.generateTransaction(getServiceEndpoint), moveMicroserviceTcpBridgeToNewFog: TransactionDecorator.generateTransaction(moveMicroserviceTcpBridgeToNewFog), @@ -1383,5 +1379,9 @@ module.exports = { _buildTcpConnector, _buildTcpListener, _addTcpConnector, + _addTcpListener, + _updateTcpConnector, + _deleteTcpConnector, + _deleteTcpListener, _resolveFogRouterMode } diff --git a/test/src/services/agent-service.test.js b/test/src/services/agent-service.test.js index b20223b5..8163d110 100644 --- a/test/src/services/agent-service.test.js +++ b/test/src/services/agent-service.test.js @@ -6,6 +6,7 @@ const Validator = require('../../../src/schemas') const FogProvisionKeyManager = require('../../../src/data/managers/iofog-provision-key-manager') const MicroserviceManager = require('../../../src/data/managers/microservice-manager') const ioFogManager = 
require('../../../src/data/managers/iofog-manager') +const FogPlatformStatusManager = require('../../../src/data/managers/fog-platform-status-manager') const FogKeyService = require('../../../src/services/iofog-key-service') const AppHelper = require('../../../src/helpers/app-helper') const ChangeTrackingService = require('../../../src/services/change-tracking-service') @@ -678,10 +679,13 @@ describe('Agent Service', () => { def('deleteNotRunningResponse', () => Promise.resolve()) def('findMicroservice', () => Promise.resolve($microserviceResponse)) + def('platformStatusResponse', () => Promise.resolve(null)) + beforeEach(() => { $sandbox.stub(Validator, 'validate').returns($validatorResponse) $sandbox.spy(AppHelper, 'deleteUndefinedFields') $sandbox.stub(ioFogManager, 'findOne').returns($findOneResponse) + $sandbox.stub(FogPlatformStatusManager, 'getParsedStatus').returns($platformStatusResponse) $sandbox.stub(ioFogManager, 'update').returns($updateResponse) $sandbox.stub(JSON, 'parse').returns($jsonParseResponse) $sandbox.stub(MicroserviceStatusManager, 'update').returns($updateMicroserviceStatusesResponse) @@ -938,10 +942,13 @@ describe('Agent Service', () => { def('updateMicroserviceStatusesResponse', () => Promise.resolve()) def('deleteNotRunningResponse', () => Promise.resolve()) def('findMicroservice', () => Promise.resolve($microserviceResponse)) + def('platformStatusResponse', () => Promise.resolve(null)) + beforeEach(() => { $sandbox.stub(Validator, 'validate').returns($validatorResponse) $sandbox.spy(AppHelper, 'deleteUndefinedFields') $sandbox.stub(ioFogManager, 'findOne').returns($findOneResponse) + $sandbox.stub(FogPlatformStatusManager, 'getParsedStatus').returns($platformStatusResponse) $sandbox.stub(ioFogManager, 'update').returns($updateResponse) $sandbox.stub(JSON, 'parse').returns($jsonParseResponse) $sandbox.stub(MicroserviceStatusManager, 'update').returns($updateMicroserviceStatusesResponse) @@ -1079,6 +1086,99 @@ describe('Agent Service', () 
=> { }) }) + describe('.updateAgentStatus() platform reconcile gating', () => { + const microservicesStatus = '[{"id": "testUuid", "containerId":"testContainerId", "status":"RUNNING"}]' + const transaction = {} + + const baseAgentStatus = { + daemonStatus: 'RUNNING', + daemonOperatingDuration: 25, + warningMessage: '', + memoryUsage: 15, + diskUsage: 16, + cpuUsage: 17, + memoryViolation: false, + diskViolation: false, + cpuViolation: false, + systemAvailableDisk: 1, + systemAvailableMemory: 1, + systemTotalCpu: 1.1, + repositoryCount: 5, + repositoryStatus: '[]', + systemTime: 15325235253, + lastStatusTime: 15325235253, + ipAddress: 'testIpAddress', + ipAddressExternal: 'testIpAddressExternal', + microserviceStatus: microservicesStatus, + } + + def('uuid', () => 'testUuid') + def('fog', () => ({ uuid: $uuid })) + def('platformPhase', () => 'Progressing') + def('existingWarningMessage', () => 'HEALTHY') + def('subject', () => AgentService.updateAgentStatus(baseAgentStatus, $fog, transaction)) + + beforeEach(() => { + $sandbox.stub(Validator, 'validate').resolves(true) + $sandbox.stub(ioFogManager, 'findOne').resolves({ warningMessage: $existingWarningMessage }) + $sandbox.stub(FogPlatformStatusManager, 'getParsedStatus').resolves( + $platformPhase ? 
{ fogUuid: $uuid, phase: $platformPhase } : null + ) + $sandbox.stub(ioFogManager, 'update').resolves() + $sandbox.stub(JSON, 'parse').returns([]) + $sandbox.stub(MicroserviceService, 'deleteNotRunningMicroservices').resolves() + }) + + it('forces daemonStatus WARNING when platform phase is Progressing', async () => { + await $subject + expect(ioFogManager.update).to.have.been.calledWith( + { uuid: $uuid }, + sinon.match.has('daemonStatus', FogStates.WARNING), + transaction + ) + }) + + context('when platform phase is Ready', () => { + def('platformPhase', () => 'Ready') + + it('accepts agent daemonStatus', async () => { + await $subject + expect(ioFogManager.update).to.have.been.calledWith( + { uuid: $uuid }, + sinon.match.has('daemonStatus', 'RUNNING'), + transaction + ) + }) + }) + + context('when platform phase is Deleting', () => { + def('platformPhase', () => 'Deleting') + + it('accepts agent daemonStatus', async () => { + await $subject + expect(ioFogManager.update).to.have.been.calledWith( + { uuid: $uuid }, + sinon.match.has('daemonStatus', 'RUNNING'), + transaction + ) + }) + }) + + context('when platform status is missing but warningMessage has migration prefix', () => { + def('platformPhase', () => null) + def('existingWarningMessage', () => 'Platform reconcile: router create failed') + + it('forces daemonStatus WARNING for migration compatibility', async () => { + await $subject + expect(ioFogManager.update).to.have.been.calledWith( + { uuid: $uuid }, + sinon.match.has('daemonStatus', FogStates.WARNING), + transaction + ) + }) + }) + }) + describe('.getAgentMicroservices()', () => { const transaction = {} const error = 'Error!' 
diff --git a/test/src/services/iofog-service.test.js b/test/src/services/iofog-service.test.js index 325a2352..9f8199d7 100644 --- a/test/src/services/iofog-service.test.js +++ b/test/src/services/iofog-service.test.js @@ -25,6 +25,9 @@ const HWInfoManager = require('../../../src/data/managers/hw-info-manager') const USBInfoManager = require('../../../src/data/managers/usb-info-manager') const Errors = require('../../../src/helpers/errors') const config = require('../../../src/config') +const FogPlatformSpecManager = require('../../../src/data/managers/fog-platform-spec-manager') +const FogPlatformStatusManager = require('../../../src/data/managers/fog-platform-status-manager') +const FogPlatformReconcileTaskManager = require('../../../src/data/managers/fog-platform-reconcile-task-manager') const isCLI = false const transaction = {} @@ -55,6 +58,8 @@ function stubFogReadDeps (sandbox) { sandbox.stub(NatsInstanceManager, 'findOne').resolves(null) sandbox.stub(NatsConnectionManager, 'findAllWithNats').resolves([]) sandbox.stub(RouterConnectionManager, 'findAllWithRouters').resolves([]) + sandbox.stub(FogPlatformSpecManager, 'getParsedSpec').resolves(null) + sandbox.stub(FogPlatformStatusManager, 'getParsedStatus').resolves(null) const routerFind = sandbox.stub(RouterManager, 'findOne').resolves(null) routerFind.withArgs({ isDefault: true }).resolves({ id: 99, @@ -82,6 +87,9 @@ function stubCreateFogDeps (sandbox, { uuid = 'testUuid', existingFogs = [{ uuid sandbox.stub(TagsManager, 'findOne').resolves(null) sandbox.stub(TagsManager, 'create').callsFake(({ value }) => Promise.resolve({ value })) sandbox.stub(ioFogService, '_handleRouterCertificates').resolves() + sandbox.stub(FogPlatformSpecManager, 'upsertSpec').resolves({ fogUuid: uuid, generation: 1 }) + sandbox.stub(FogPlatformStatusManager, 'ensurePending').resolves() + sandbox.stub(FogPlatformReconcileTaskManager, 'enqueueFogPlatformReconcileTask').resolves() } function stubUpdateFogDeps (sandbox, oldFog) { @@ 
-102,6 +110,10 @@ function stubUpdateFogDeps (sandbox, oldFog) { sandbox.stub(TagsManager, 'findOne').resolves(null) sandbox.stub(TagsManager, 'create').callsFake(({ value }) => Promise.resolve({ value })) sandbox.stub(ioFogService, '_handleRouterCertificates').resolves() + sandbox.stub(FogPlatformSpecManager, 'getParsedSpec').resolves(null) + sandbox.stub(FogPlatformSpecManager, 'upsertSpec').resolves({ fogUuid: oldFog.uuid, generation: 2 }) + sandbox.stub(FogPlatformStatusManager, 'ensurePending').resolves() + sandbox.stub(FogPlatformReconcileTaskManager, 'enqueueFogPlatformReconcileTask').resolves() } describe('ioFog Service', () => { @@ -151,6 +163,28 @@ describe('ioFog Service', () => { }) }) + it('upserts platform spec, status, and enqueues reconcile task', async () => { + await $subject + expect(FogPlatformSpecManager.upsertSpec).to.have.been.calledOnce + expect(FogPlatformSpecManager.upsertSpec.firstCall.args[0]).to.equal(uuid) + expect(FogPlatformSpecManager.upsertSpec.firstCall.args[1]).to.include({ + routerMode: 'edge', + natsMode: 'leaf' + }) + expect(FogPlatformStatusManager.ensurePending).to.have.been.calledWith(uuid, transaction) + expect(FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask).to.have.been.calledWith({ + fogUuid: uuid, + reason: 'spec-changed', + specGeneration: 1 + }, transaction) + }) + + it('does not run platform orchestration on the synchronous path', async () => { + await $subject + expect(RouterService.createRouterForFog).to.not.have.been.called + expect(NatsService.ensureNatsForFog).to.not.have.been.called + }) + it('does not run HAL/Bluetooth catalog work on the synchronous path', async () => { $sandbox.stub(CatalogService, 'getHalCatalogItem').resolves({ id: 1 }) $sandbox.stub(CatalogService, 'getBluetoothCatalogItem').resolves({ id: 2 }) @@ -257,6 +291,24 @@ describe('ioFog Service', () => { expect(ChangeTrackingService.update).to.have.been.calledWith(uuid, ChangeTrackingService.events.config, transaction) }) + 
it('merges platform spec and enqueues reconcile task', async () => { + await $subject + expect(FogPlatformSpecManager.upsertSpec).to.have.been.calledOnce + expect(FogPlatformStatusManager.ensurePending).to.have.been.calledWith(uuid, transaction) + expect(FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask).to.have.been.calledWith({ + fogUuid: uuid, + reason: 'spec-changed', + specGeneration: 2 + }, transaction) + }) + + it('does not run platform orchestration on the synchronous path', async () => { + await $subject + expect(RouterService.createRouterForFog).to.not.have.been.called + expect(RouterService.updateRouter).to.not.have.been.called + expect(NatsService.ensureNatsForFog).to.not.have.been.called + }) + it('rejects rename attempts', () => { const renamed = { ...fogData, name: 'new-name' } return expect(ioFogService.updateFogEndPoint(renamed, isCLI, transaction)) @@ -281,6 +333,74 @@ describe('ioFog Service', () => { it('rejects', () => expect($subject).to.be.rejectedWith(validationError)) }) + + context('when system fog is redeployed with full platform config', () => { + const systemFog = buildFogModel({ + uuid, + name: 'controlplane', + host: '10.0.0.1', + isSystem: true, + getRouter: () => Promise.resolve({ + id: 1, + isEdge: false, + messagingPort: 5671, + interRouterPort: 55671, + edgeRouterPort: 45671, + host: '10.0.0.1', + iofogUuid: uuid + }), + getNats: () => Promise.resolve({ isLeaf: false }) + }) + + beforeEach(() => { + ioFogManager.findOne.resolves(systemFog) + FogPlatformSpecManager.getParsedSpec.resolves({ + fogUuid: uuid, + generation: 1, + spec: { + routerMode: 'interior', + natsMode: 'server', + host: '10.0.0.1' + } + }) + }) + + it('accepts potctl redeploy PATCH without Invalid NATS mode undefined', async () => { + const redeployData = { + uuid, + isSystem: true, + natsMode: 'server', + routerMode: 'interior', + host: '10.0.0.1', + interRouterPort: 55671, + edgeRouterPort: 45671 + } + const result = await 
ioFogService.updateFogEndPoint(redeployData, isCLI, transaction) + expect(result).to.eql({ uuid }) + }) + }) + + context('when PATCH only updates networkInterface', () => { + beforeEach(() => { + FogPlatformSpecManager.getParsedSpec.resolves({ + fogUuid: uuid, + generation: 3, + spec: { + routerMode: 'edge', + natsMode: 'leaf', + host: '1.2.3.4' + } + }) + }) + + it('preserves platform modes in merged spec', async () => { + await ioFogService.updateFogEndPoint({ uuid, networkInterface: 'eth1' }, isCLI, transaction) + const mergedSpec = FogPlatformSpecManager.upsertSpec.firstCall.args[1] + expect(mergedSpec.routerMode).to.equal('edge') + expect(mergedSpec.natsMode).to.equal('leaf') + expect(mergedSpec.host).to.equal('1.2.3.4') + }) + }) }) describe('.deleteFogEndPoint()', () => { @@ -293,25 +413,70 @@ describe('ioFog Service', () => { beforeEach(() => { $sandbox.stub(Validator, 'validate').resolves(true) $sandbox.stub(ioFogManager, 'findOne').resolves(fog) - $sandbox.stub(MicroserviceManager, 'findAll').resolves([]) - $sandbox.stub(MicroserviceService, 'deleteMicroserviceWithRoutesAndPortMappings').resolves() - $sandbox.stub(ApplicationManager, 'delete').resolves() - $sandbox.stub(ChangeTrackingService, 'update').resolves() - $sandbox.stub(SecretManager, 'findOne').resolves(null) - $sandbox.stub(NatsService, 'cleanupNatsForFog').resolves() - $sandbox.stub(FogPublicKeyManager, 'findByFogUuid').resolves(null) $sandbox.stub(ioFogManager, 'delete').resolves() - $sandbox.stub(RouterManager, 'findOne').resolves(null) - $sandbox.stub(RouterConnectionManager, 'findAllWithRouters').resolves([]) - $sandbox.stub(CatalogService, 'getRouterCatalogItem').resolves({ id: 1 }) - $sandbox.stub(MicroserviceManager, 'delete').resolves() + $sandbox.stub(NatsService, 'cleanupNatsForFog').resolves() + $sandbox.stub(FogPlatformStatusManager, 'setPhase').resolves() + $sandbox.stub(FogPlatformReconcileTaskManager, 'enqueueFogPlatformReconcileTask').resolves() }) - it('validates and deletes 
the fog node', async () => { - await $subject + it('marks fog deleting and enqueues async teardown', async () => { + const result = await $subject expect(Validator.validate).to.have.been.calledWith(fogData, Validator.schemas.iofogDelete) - expect(ioFogManager.delete).to.have.been.calledWith({ uuid }, transaction) - expect(NatsService.cleanupNatsForFog).to.have.been.calledWith(fog, transaction) + expect(result).to.eql({ uuid }) + expect(FogPlatformStatusManager.setPhase).to.have.been.calledWith(uuid, 'Deleting', {}, transaction) + expect(FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask).to.have.been.calledWith({ + fogUuid: uuid, + reason: 'delete' + }, transaction) + expect(ioFogManager.delete).to.not.have.been.called + expect(NatsService.cleanupNatsForFog).to.not.have.been.called + }) + + context('when fog is missing', () => { + beforeEach(() => { + ioFogManager.findOne.resolves(null) + }) + + it('rejects with NotFoundError', () => expect($subject).to.be.rejectedWith(Errors.NotFoundError)) + }) + }) + + describe('.reconcileFogEndpoint()', () => { + const uuid = 'testUuid' + const fogData = { uuid } + const fog = buildFogModel({ uuid, name: 'test-fog' }) + + def('subject', () => $subject.reconcileFogEndpoint(fogData, transaction)) + + beforeEach(() => { + $sandbox.stub(ioFogManager, 'findOne').resolves(fog) + $sandbox.stub(FogPlatformStatusManager, 'getParsedStatus').resolves({ + phase: 'Failed', + lastError: 'router create failed' + }) + $sandbox.stub(FogPlatformStatusManager, 'setPhase').resolves() + $sandbox.stub(FogPlatformSpecManager, 'getParsedSpec').resolves({ + generation: 4, + spec: { routerMode: 'edge', natsMode: 'leaf' } + }) + $sandbox.stub(FogPlatformReconcileTaskManager, 'enqueueFogPlatformReconcileTask').resolves() + }) + + it('resets failed platform status and enqueues manual retry', async () => { + const result = await $subject + + expect(FogPlatformStatusManager.setPhase).to.have.been.calledWith( + uuid, + 'Pending', + { lastError: 
null }, + transaction + ) + expect(FogPlatformReconcileTaskManager.enqueueFogPlatformReconcileTask).to.have.been.calledWith({ + fogUuid: uuid, + reason: 'manual-retry', + specGeneration: 4 + }, transaction) + expect(result).to.eql({ uuid }) }) context('when fog is missing', () => { @@ -352,6 +517,74 @@ describe('ioFog Service', () => { expect(result.natsMode).to.equal('none') expect(result.tags).to.eql([]) expect(result.volumeMounts).to.eql([]) + expect(result).to.have.property('platformStatus', null) + }) + + context('when platform status exists', () => { + const lastTransitionAt = new Date('2026-06-24T12:00:00.000Z') + + beforeEach(() => { + FogPlatformSpecManager.getParsedSpec.resolves({ + fogUuid: uuid, + generation: 3, + spec: { routerMode: 'edge', natsMode: 'leaf' } + }) + FogPlatformStatusManager.getParsedStatus.resolves({ + fogUuid: uuid, + observedGeneration: 2, + phase: 'Pending', + lastError: null, + lastTransitionAt, + conditions: [{ type: 'RouterReady', status: 'False', reason: 'ReconcileComplete' }] + }) + }) + + it('includes platformStatus on GET single fog', async () => { + const result = await $subject + expect(result.platformStatus).to.eql({ + generation: 3, + observedGeneration: 2, + phase: 'Pending', + lastError: null, + lastTransitionAt, + conditions: [{ type: 'RouterReady', status: 'False', reason: 'ReconcileComplete' }] + }) + }) + }) + + context('when runtime rows are missing but spec has desired modes', () => { + beforeEach(() => { + FogPlatformSpecManager.getParsedSpec.resolves({ + fogUuid: uuid, + generation: 1, + spec: { + routerMode: 'interior', + natsMode: 'server', + messagingPort: 5671, + interRouterPort: 55671, + edgeRouterPort: 45671, + natsServerPort: 4222, + natsClusterPort: 6222, + natsMqttPort: 1883, + natsHttpPort: 8222, + upstreamRouters: ['upstream-router'], + upstreamNatsServers: ['upstream-nats'] + } + }) + }) + + it('returns spec-derived modes instead of none', async () => { + const result = await $subject + 
expect(result.routerMode).to.equal('interior') + expect(result.messagingPort).to.equal(5671) + expect(result.interRouterPort).to.equal(55671) + expect(result.edgeRouterPort).to.equal(45671) + expect(result.upstreamRouters).to.eql(['upstream-router']) + expect(result.natsMode).to.equal('server') + expect(result.natsServerPort).to.equal(4222) + expect(result.natsClusterPort).to.equal(6222) + expect(result.upstreamNatsServers).to.eql(['upstream-nats']) + }) }) context('when fog has an edge router', () => { @@ -398,6 +631,31 @@ describe('ioFog Service', () => { expect(Validator.validate).to.have.been.calledWith(filters, Validator.schemas.iofogFilters) expect(result.fogs).to.have.length(1) expect(result.fogs[0]).to.include({ uuid: 'testUuid', routerMode: 'none', natsMode: 'none' }) + expect(result.fogs[0]).to.not.have.property('platformStatus') + }) + + context('when runtime rows are missing but spec has desired modes', () => { + beforeEach(() => { + FogPlatformSpecManager.getParsedSpec.resolves({ + fogUuid: 'testUuid', + generation: 1, + spec: { + routerMode: 'edge', + natsMode: 'leaf', + messagingPort: 5671, + natsLeafPort: 7422 + } + }) + }) + + it('returns spec-derived modes and omits platformStatus', async () => { + const result = await $subject + expect(result.fogs[0].routerMode).to.equal('edge') + expect(result.fogs[0].messagingPort).to.equal(5671) + expect(result.fogs[0].natsMode).to.equal('leaf') + expect(result.fogs[0].natsLeafPort).to.equal(7422) + expect(result.fogs[0]).to.not.have.property('platformStatus') + }) }) }) diff --git a/test/src/services/services-service.test.js b/test/src/services/services-service.test.js new file mode 100644 index 00000000..6785ae45 --- /dev/null +++ b/test/src/services/services-service.test.js @@ -0,0 +1,370 @@ +const { expect } = require('chai') +const sinon = require('sinon') + +const ServiceController = require('../../../src/controllers/service-controller') +const YamlParserService = 
require('../../../src/services/yaml-parser-service') +const ServicesService = require('../../../src/services/services-service') +const ServiceManager = require('../../../src/data/managers/service-manager') +const ServicePlatformReconcileTaskManager = require('../../../src/data/managers/service-platform-reconcile-task-manager') +const RouterManager = require('../../../src/data/managers/router-manager') +const TagsManager = require('../../../src/data/managers/tags-manager') +const Validator = require('../../../src/schemas') +const Errors = require('../../../src/helpers/errors') + +describe('services-service platform reconcile enqueue', () => { + def('sandbox', () => sinon.createSandbox()) + def('transaction', () => ({})) + + afterEach(() => { + delete process.env.CONTROL_PLANE + $sandbox.restore() + }) + + function buildServiceModel (fields = {}) { + const service = { + id: 1, + name: 'api-gateway', + type: 'external', + resource: '10.0.0.8', + defaultBridge: 'default-router', + bridgePort: 9100, + targetPort: 8080, + servicePort: 9100, + k8sType: 'LoadBalancer', + serviceEndpoint: 'hub.example.com', + provisioningStatus: 'pending', + provisioningError: null, + tags: [], + ...fields + } + service.setTags = fields.setTags || sinon.stub().resolves() + return service + } + + function stubCreateDeps () { + delete process.env.CONTROL_PLANE + $sandbox.stub(Validator, 'validate').resolves(true) + $sandbox.stub(ServiceManager, 'findAll').resolves([]) + $sandbox.stub(ServiceManager, 'create').callsFake((data) => Promise.resolve(buildServiceModel(data))) + $sandbox.stub(ServicePlatformReconcileTaskManager, 'enqueueServicePlatformReconcileTask').resolves() + $sandbox.stub(RouterManager, 'findOne').resolves({ + isDefault: true, + host: 'hub.example.com', + iofogUuid: 'default-fog' + }) + $sandbox.stub(TagsManager, 'findOne').resolves(null) + $sandbox.stub(TagsManager, 'create').callsFake(({ value }) => Promise.resolve({ value })) + } + + describe('.createServiceEndpoint()', () 
=> { + def('serviceData', () => ({ + name: 'api-gateway', + type: 'external', + resource: '10.0.0.8', + defaultBridge: 'default-router', + targetPort: 8080, + tags: ['site-a'] + })) + def('subject', () => ServicesService.createServiceEndpoint($serviceData, $transaction)) + + beforeEach(() => { + stubCreateDeps() + }) + + it('sets provisioningStatus pending and enqueues reconcile task', async () => { + await $subject + + expect(ServiceManager.create).to.have.been.calledOnce + const createPayload = ServiceManager.create.firstCall.args[0] + expect(createPayload.provisioningStatus).to.equal('pending') + expect(createPayload.provisioningError).to.be.null + + expect(ServicePlatformReconcileTaskManager.enqueueServicePlatformReconcileTask).to.have.been.calledWith({ + serviceName: 'api-gateway', + reason: 'spec-changed', + specSnapshot: { + name: 'api-gateway', + type: 'external', + resource: '10.0.0.8', + defaultBridge: 'default-router', + bridgePort: 10024, + targetPort: 8080, + servicePort: 10024, + k8sType: undefined, + serviceEndpoint: 'hub.example.com', + tags: ['site-a'] + } + }, $transaction) + }) + + it('does not use setImmediate for provisioning', async () => { + const setImmediateSpy = $sandbox.spy(global, 'setImmediate') + await $subject + expect(setImmediateSpy).to.not.have.been.called + }) + + it('does not run hub provisioning on the synchronous path', async () => { + $sandbox.stub(ServicesService, '_addTcpConnector').resolves() + $sandbox.stub(ServicesService, '_addTcpListener').resolves() + $sandbox.stub(ServicesService, '_createK8sService').resolves() + + await $subject + + expect(ServicesService._addTcpConnector).to.not.have.been.called + expect(ServicesService._addTcpListener).to.not.have.been.called + expect(ServicesService._createK8sService).to.not.have.been.called + }) + }) + + describe('.updateServiceEndpoint()', () => { + const existingService = buildServiceModel({ + tags: [{ value: 'site-a' }] + }) + + def('serviceData', () => ({ + name: 
'api-gateway', + targetPort: 9090, + tags: ['site-b'] + })) + def('subject', () => ServicesService.updateServiceEndpoint('api-gateway', $serviceData, $transaction)) + + beforeEach(() => { + delete process.env.CONTROL_PLANE + $sandbox.stub(Validator, 'validate').resolves(true) + $sandbox.stub(ServiceManager, 'findOneWithTags').resolves(existingService) + $sandbox.stub(ServiceManager, 'update').callsFake((where, data) => + Promise.resolve(buildServiceModel({ ...existingService, ...data })) + ) + $sandbox.stub(ServicePlatformReconcileTaskManager, 'enqueueServicePlatformReconcileTask').resolves() + $sandbox.stub(RouterManager, 'findOne').resolves({ + isDefault: true, + host: 'hub.example.com', + iofogUuid: 'default-fog' + }) + $sandbox.stub(TagsManager, 'findOne').resolves(null) + $sandbox.stub(TagsManager, 'create').callsFake(({ value }) => Promise.resolve({ value })) + }) + + it('enqueues reconcile with old and new tags in snapshot', async () => { + await $subject + + expect(ServicePlatformReconcileTaskManager.enqueueServicePlatformReconcileTask).to.have.been.calledWith({ + serviceName: 'api-gateway', + reason: 'spec-changed', + specSnapshot: { + name: 'api-gateway', + type: 'external', + resource: '10.0.0.8', + defaultBridge: 'default-router', + bridgePort: 9100, + targetPort: 9090, + servicePort: 9100, + k8sType: 'LoadBalancer', + serviceEndpoint: 'hub.example.com', + tags: ['site-a', 'site-b'] + } + }, $transaction) + }) + + it('does not use setImmediate for provisioning', async () => { + const setImmediateSpy = $sandbox.spy(global, 'setImmediate') + await $subject + expect(setImmediateSpy).to.not.have.been.called + }) + }) + + describe('.deleteServiceEndpoint()', () => { + const existingService = buildServiceModel({ + tags: [{ value: 'site-a' }] + }) + + def('subject', () => ServicesService.deleteServiceEndpoint('api-gateway', $transaction)) + + beforeEach(() => { + $sandbox.stub(ServiceManager, 'findOneWithTags').resolves(existingService) + 
$sandbox.stub(ServiceManager, 'delete').resolves() + $sandbox.stub(ServicePlatformReconcileTaskManager, 'enqueueServicePlatformReconcileTask').resolves() + $sandbox.stub(ServicesService, '_deleteTcpConnector').resolves() + $sandbox.stub(ServicesService, '_deleteTcpListener').resolves() + $sandbox.stub(ServicesService, '_deleteK8sService').resolves() + }) + + it('captures spec snapshot and enqueues delete reconcile before DB delete', async () => { + await $subject + + expect(ServicePlatformReconcileTaskManager.enqueueServicePlatformReconcileTask).to.have.been.calledBefore( + ServiceManager.delete + ) + expect(ServicePlatformReconcileTaskManager.enqueueServicePlatformReconcileTask).to.have.been.calledWith({ + serviceName: 'api-gateway', + reason: 'delete', + specSnapshot: { + name: 'api-gateway', + type: 'external', + resource: '10.0.0.8', + defaultBridge: 'default-router', + bridgePort: 9100, + targetPort: 8080, + servicePort: 9100, + k8sType: 'LoadBalancer', + serviceEndpoint: 'hub.example.com', + tags: ['site-a'] + } + }, $transaction) + expect(ServiceManager.delete).to.have.been.calledWith({ name: 'api-gateway' }, $transaction) + }) + + it('does not run hub teardown on the synchronous path', async () => { + await $subject + + expect(ServicesService._deleteTcpConnector).to.not.have.been.called + expect(ServicesService._deleteTcpListener).to.not.have.been.called + expect(ServicesService._deleteK8sService).to.not.have.been.called + }) + }) + + describe('YAML endpoints', () => { + const serviceYaml = ` +apiVersion: datasance.com/v3 +kind: Service +metadata: + name: api-gateway + tags: + - site-a +spec: + type: external + resource: 10.0.0.8 + defaultBridge: default-router + targetPort: 8080 +` + + describe('create', () => { + beforeEach(() => { + stubCreateDeps() + $sandbox.stub(YamlParserService, 'parseServiceFile').resolves({ + name: 'api-gateway', + type: 'external', + resource: '10.0.0.8', + defaultBridge: 'default-router', + targetPort: 8080, + tags: ['site-a'] + 
}) + }) + + it('createServiceYAMLEndpoint parses YAML and enqueues reconcile task', async () => { + const req = { + file: { + buffer: Buffer.from(serviceYaml) + } + } + + await ServiceController.createServiceYAMLEndpoint(req) + + expect(YamlParserService.parseServiceFile).to.have.been.calledOnceWith(serviceYaml) + expect(ServicePlatformReconcileTaskManager.enqueueServicePlatformReconcileTask).to.have.been.calledWith({ + serviceName: 'api-gateway', + reason: 'spec-changed', + specSnapshot: sinon.match({ + name: 'api-gateway', + tags: ['site-a'] + }) + }, sinon.match.any) + }) + }) + + describe('update', () => { + const existingService = buildServiceModel({ + tags: [{ value: 'site-a' }] + }) + + beforeEach(() => { + delete process.env.CONTROL_PLANE + $sandbox.stub(Validator, 'validate').resolves(true) + $sandbox.stub(ServiceManager, 'findOneWithTags').resolves(existingService) + $sandbox.stub(ServiceManager, 'update').callsFake((where, data) => + Promise.resolve(buildServiceModel({ ...existingService, ...data })) + ) + $sandbox.stub(ServicePlatformReconcileTaskManager, 'enqueueServicePlatformReconcileTask').resolves() + $sandbox.stub(RouterManager, 'findOne').resolves({ + isDefault: true, + host: 'hub.example.com', + iofogUuid: 'default-fog' + }) + $sandbox.stub(TagsManager, 'findOne').resolves(null) + $sandbox.stub(TagsManager, 'create').callsFake(({ value }) => Promise.resolve({ value })) + $sandbox.stub(YamlParserService, 'parseServiceFile').resolves({ + name: 'api-gateway', + tags: ['site-b'], + targetPort: 9090 + }) + }) + + it('updateServiceYAMLEndpoint parses YAML and enqueues reconcile task', async () => { + const req = { + params: { name: 'api-gateway' }, + file: { + buffer: Buffer.from(serviceYaml) + } + } + + await ServiceController.updateServiceYAMLEndpoint(req) + + expect(YamlParserService.parseServiceFile).to.have.been.calledOnceWith(serviceYaml, { + isUpdate: true, + serviceName: 'api-gateway' + }) + 
expect(ServicePlatformReconcileTaskManager.enqueueServicePlatformReconcileTask).to.have.been.calledWith({ + serviceName: 'api-gateway', + reason: 'spec-changed', + specSnapshot: sinon.match({ + name: 'api-gateway', + tags: ['site-a', 'site-b'] + }) + }, sinon.match.any) + }) + }) + }) + + describe('.reconcileServiceEndpoint()', () => { + def('subject', () => ServicesService.reconcileServiceEndpoint('api-gateway', $transaction)) + + beforeEach(() => { + $sandbox.stub(ServiceManager, 'findOneWithTags').resolves(buildServiceModel({ + provisioningStatus: 'failed', + provisioningError: 'hub lock timeout', + tags: [{ value: 'site-a' }] + })) + $sandbox.stub(ServiceManager, 'update').resolves() + $sandbox.stub(ServicePlatformReconcileTaskManager, 'enqueueServicePlatformReconcileTask').resolves() + }) + + it('resets failed provisioning and enqueues manual retry', async () => { + const result = await $subject + + expect(ServiceManager.update).to.have.been.calledWith( + { name: 'api-gateway' }, + { provisioningStatus: 'pending', provisioningError: null }, + $transaction + ) + expect(ServicePlatformReconcileTaskManager.enqueueServicePlatformReconcileTask).to.have.been.calledWith({ + serviceName: 'api-gateway', + reason: 'manual-retry', + specSnapshot: sinon.match({ + name: 'api-gateway', + tags: ['site-a'] + }) + }, $transaction) + expect(result.provisioningStatus).to.equal('pending') + expect(result.provisioningError).to.be.null + }) + + context('when service is missing', () => { + beforeEach(() => { + ServiceManager.findOneWithTags.resolves(null) + }) + + it('rejects with NotFoundError', () => + expect($subject).to.be.rejectedWith(Errors.NotFoundError)) + }) + }) +}) From 638274bf76f2a634dc79f8a010adde3347f6321c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Wed, 24 Jun 2026 21:17:39 +0300 Subject: [PATCH 06/11] Document fog and service platform reconcile in changelog and API spec. 
Add platformStatus, provisioningStatus hub semantics, reconcile routes, and architecture overview for the three-layer reconcile model. --- CHANGELOG.md | 17 +++++ docs/architecture.md | 79 ++++++++++++++++++++++- docs/swagger.yaml | 149 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 244 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f96f699a..ce78f7b7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -87,6 +87,16 @@ Controller v3.8 is a **greenfield** release aligned with **Edgelet**. There is * - Neutral in-tree identity: RBAC **`iofog.org/v3`**, default namespace **`iofog`**, `package.json` name **`controller`**. - NATS account/user rule **JWT Latin-1 validation** — rejects rules whose fields cannot be encoded in a NATS JWT (create/update on rules, applications, microservices, and NATS API). - RBAC **route catalog utils** (`isPublicCatalogRoute`) — shared lookup of public routes from `rbac-resources.yaml` (empty verb list = no auth required). +- **Fog + service platform reconcile** — declarative router/NATS and service endpoint lifecycle replaces fire-and-forget `setImmediate` in `iofog-service.js` and `services-service.js`. +- Tables: **`FogPlatformSpecs`**, **`FogPlatformStatuses`**, **`FogPlatformReconcileTasks`**, **`ServicePlatformReconcileTasks`**, **`HubRouterConfigLocks`** (greenfield migrations amended for sqlite, mysql, postgres). +- **`platform-reconcile-worker-job.js`** — one worker, two DB-backed claim paths (fog + service); stale reclaim, exponential backoff, max attempts. +- **`fog-platform-sweep-job.js`** — periodic drift detection for fog and service platform state. +- **`GET /api/v3/iofog/{uuid}`** — optional **`platformStatus`** (`phase`, `generation`, `lastError`, conditions). +- **`POST /api/v3/iofog/{uuid}/reconcile`** and **`POST /api/v3/services/{name}/reconcile`** — manual retry after failed or stuck reconcile. 
+- Service **`provisioningStatus`** — hub semantics: **`ready`** when hub connector/listener and K8s Service reconcile succeed; edge TCP bridges converge asynchronously via fog platform reconcile fan-out. +- **K8s control plane:** hub **`iofog-router`** ConfigMap patches serialized via DB lock; K8s Service create/update/delete with LoadBalancer watch timeout. +- **`service-bridge-config.js`** — full recompute of service-derived TCP bridge config per fog on reconcile (preserves router base config). +- **SQLite single-node production hardening** — WAL + `busy_timeout` pragmas, reconcile task claim retry on `SQLITE_BUSY`, staggered startup for reconcile-heavy background jobs (`settings.jobStartupDelaySeconds`). ### Fixed @@ -106,6 +116,13 @@ Controller v3.8 is a **greenfield** release aligned with **Edgelet**. There is * - Router microservice **`siteConfig.platform`** defaults to **`edgelet`** (was **`docker`**) when the agent uses the Edgelet runtime. - Boolean env vars (`TRUST_PROXY`, `SERVER_DEV_MODE`, `DB_USE_SSL`, `VAULT_ENABLED`, `ENABLE_TELEMETRY`, and other mapped flags) are parsed consistently from Kubernetes string values (`true`/`false`, `1`/`0`) via shared **`config.getBoolean()`** — fixes startup crash when **`TRUST_PROXY=true`** was passed as a string to Express. - Postgres/MySQL SSL reads canonical **`DB_SSL_CA`** (via config) instead of undocumented **`DB_SSL_CA_B64`**; **`database.*.useSSL`** config key honored (was **`useSsl`** typo). +- Spurious **`routerMode`** / **`natsMode`** **`none`** on fog list/get when runtime rows were pending — read path now falls back to **`FogPlatformSpecs`** during reconcile. +- **`PATCH /api/v3/iofog/{uuid}`** on system fogs with full config (potctl redeploy) — **400** **`Invalid NATS mode 'undefined'`** when `natsMode` was omitted from PATCH body. +- Partial fog delete orphans when router/NATS teardown failed mid-flight — delete is async via platform reconcile **`Deleting`** phase. 
+- Service provisioning races and lost hub ConfigMap updates under multi-Controller — serialized hub lock and DB-backed service reconcile tasks. +- Dual writers to router microservice bridge config from fog create/update and service create/update/delete — single full-recompute path on fog reconcile. +- SQLite startup lock contention on single-controller deployments — WAL + `busy_timeout` pragmas on connect, `withDbBusyRetry` on fog/service/NATS task claims, staggered reconcile-heavy job startup. +- **`reconcileFog` transaction parameter** — removed unused `options` argument so worker-decorated calls receive the transaction correctly. ### Changed diff --git a/docs/architecture.md b/docs/architecture.md index eab2806e..25eaf10e 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -137,8 +137,71 @@ Full request/response shapes: **`docs/swagger.yaml`** (agent paths). | Controller cleanup | `controller-cleanup-job.js` | Orphaned controller MS housekeeping | | Event cleanup | `event-cleanup-job.js` | Audit event retention | | NATS reconcile | `nats-reconcile-worker-job.js` | NATS operator sync | +| Platform reconcile | `platform-reconcile-worker-job.js` | Fog + service platform claim/reconcile (one job, two queues) | +| Fog platform sweep | `fog-platform-sweep-job.js` | Drift detection; re-enqueue stale fog/service tasks | | Stopped app status | `stopped-app-status-job.js` | Application state maintenance | +### Platform reconcile (fog + service + resolver) + +Router/NATS **fog platform lifecycle**, **service endpoint provisioning**, and the existing **NATS resolver** layer are three separate reconcile queues — not fire-and-forget `setImmediate` blocks in `iofog-service.js` / `services-service.js`. 
+ +```mermaid +flowchart TB + subgraph api [Synchronous API] + FogAPI[POST/PATCH/DELETE /iofog] + SvcAPI[POST/PATCH/DELETE /services + yaml] + FogAPI --> FSpec[Upsert FogPlatformSpecs] + SvcAPI --> SDB[Write Services + tags] + FSpec --> FEnqueue[FogPlatformReconcileTasks] + SDB --> SEnqueue[ServicePlatformReconcileTasks] + end + + subgraph worker [One job — any Controller replica] + ClaimF[claimNextFogTask] + ClaimS[claimNextServiceTask] + ReconcileF[FogPlatformService.reconcileFog] + ReconcileS[ServicePlatformService.reconcileService] + HubLock[Hub ConfigMap lock] + ClaimF --> ReconcileF + ClaimS --> HubLock + HubLock --> ReconcileS + end + + subgraph runtime [Observed state] + Routers[(Routers)] + Nats[(NatsInstances)] + MS[System MS + secrets] + K8sSvc[K8s Service] + CM[ConfigMap iofog-router] + end + + subgraph resolver [NATS resolver — unchanged] + NRT[NatsReconcileTasks] + end + + FEnqueue --> ClaimF + SEnqueue --> ClaimS + ReconcileF --> Routers + ReconcileF --> Nats + ReconcileF --> MS + ReconcileS --> CM + ReconcileS --> K8sSvc + ReconcileS -->|fan-out service-changed| FEnqueue + ReconcileF -->|topology change| NRT +``` + +| Layer | Table | Worker | Purpose | +|-------|-------|--------|---------| +| **Fog platform** | `FogPlatformReconcileTasks` | Same job: `claimNextFogTask` | Router/NATS instances, PKI, system MS, **full recompute** of service-derived TCP bridges per fog | +| **Service platform** | `ServicePlatformReconcileTasks` | Same job: `claimNextServiceTask` | Hub connector/listener, K8s Service, ConfigMap (DB lock), fan-out fog tasks on tag change | +| **NATS resolver** | `NatsReconcileTasks` | `nats-reconcile-worker-job.js` | JWT bundles, account/user creds after app deploy | + +**Fog operator API:** `GET /iofog/{uuid}` includes optional **`platformStatus`** (`phase`, `generation`, `lastError`); list/get derive `routerMode`/`natsMode` from **`FogPlatformSpecs`** when runtime rows are pending. **`POST /iofog/{uuid}/reconcile`** for manual retry. 
+ +**Service operator API:** JSON + YAML create/update/delete enqueue service reconcile; **`provisioningStatus=ready`** marks hub complete (K8s Service + hub ConfigMap); edge listeners converge via fog fan-out. **`POST /services/{name}/reconcile`** for manual retry. + +Full spec: [`.cursor/controllerv3.8/docs/15-fog-platform-reconcile.md`](../.cursor/controllerv3.8/docs/15-fog-platform-reconcile.md) · RFC R69–R79. + --- ## Edgelet agent contract (summary) @@ -197,7 +260,21 @@ For the full bilateral contract (including ControlPlane env vars and verificatio | Topic | v3.8 behavior | |-------|---------------| -| **Database** | Greenfield v3.8.0 schema — **new install only** (no v3.7 migrator). Supports sqlite (dev), mysql, postgres (production/HA). | +| **Database** | Greenfield v3.8.0 schema — **new install only** (no v3.7 migrator). Supports **sqlite** (single-controller production), **mysql**, and **postgres** (multi-replica / HA). | + +### SQLite single-node production + +Small deployments with **one Controller process** may use SQLite as the production database (embedded OIDC requires a single replica in this profile). 
+ +| Topic | Behavior | +|-------|----------| +| **When to use** | Single Controller, no DB HA requirement, edge/small-cluster PoT | +| **Concurrency** | WAL journal mode + `busy_timeout` pragmas on connect; connection pool size 1 | +| **Background jobs** | Reconcile-heavy jobs start after a configurable delay (`settings.jobStartupDelaySeconds`, default 3s) and stagger by 500ms to avoid restart lock bursts | +| **Task claims** | Fog/service/NATS reconcile task claims retry on `SQLITE_BUSY` (same retry budget as `TransactionDecorator`) | +| **Persistence** | Mount a persistent volume for `controller_db.sqlite` and WAL sidecar files (`-wal`, `-shm`) | +| **Backup** | Use SQLite backup API or copy DB + WAL files during a quiet window | +| **HA path** | mysql/postgres + multiple Controller replicas — see [oidc-configuration.md](oidc-configuration.md) | | **Applications** | Table `Applications` (was `Flows`); API identity by **name** string. | | **Architectures** | Table `Architectures` (was `FogTypes`); `archId` 0–4. | | **PKI** | Central **default-router-local-ca** and **default-nats-local-ca** for all new agents; no per-agent local CAs on provision (greenfield — no v3.7 PKI migration job). See [pki.md](pki.md). | diff --git a/docs/swagger.yaml b/docs/swagger.yaml index 94b48ae6..f45569f9 100755 --- a/docs/swagger.yaml +++ b/docs/swagger.yaml @@ -74,6 +74,9 @@ paths: tags: - ioFog summary: Creates a new ioFog node + description: >- + Platform reconcile is asynchronous. Poll GET /iofog/{uuid} for + platformStatus, or POST /iofog/{uuid}/reconcile to retry after failure. operationId: createIOFogNode security: - authToken: [] @@ -102,6 +105,9 @@ paths: tags: - ioFog summary: Updates existing ioFog node + description: >- + Platform reconcile is asynchronous. Poll GET /iofog/{uuid} for + platformStatus, or POST /iofog/{uuid}/reconcile to retry after failure. 
operationId: updateIOFogNode parameters: - in: path @@ -190,6 +196,39 @@ paths: description: Invalid Node Id "500": description: Internal Server Error + "/iofog/{uuid}/reconcile": + post: + tags: + - ioFog + summary: Manually retries fog platform reconcile + operationId: reconcileIOFogNode + parameters: + - in: path + name: uuid + description: ioFog node id + required: true + schema: + type: string + security: + - authToken: [] + responses: + "200": + description: Success + headers: + X-Timestamp: + description: FogController server timestamp + schema: + type: number + content: + application/json: + schema: + $ref: "#/components/schemas/IOFogUuidResponse" + "401": + description: Not Authorized + "404": + description: Invalid Node Id + "500": + description: Internal Server Error "/iofog/{uuid}/provisioning-key": get: tags: @@ -4557,6 +4596,46 @@ paths: description: Not Found "500": description: Internal Server Error + "/services/{name}/reconcile": + post: + tags: + - Services + summary: Manually retries service platform reconcile + description: >- + Enqueues a service platform reconcile task. When `provisioningStatus` is + `failed`, resets it to `pending` and clears `provisioningError` before + enqueue. Hub connector/listener, K8s Service, and ConfigMap updates run + asynchronously in the platform reconcile worker. `provisioningStatus=ready` + means hub provisioning completed successfully; tagged edge fogs receive + bridge updates via separate fog platform reconcile tasks. 
+ operationId: reconcileService + parameters: + - in: path + name: name + description: Service name + required: true + schema: + type: string + security: + - authToken: [] + responses: + "200": + description: Success + headers: + X-Timestamp: + description: FogController server timestamp + schema: + type: number + content: + application/json: + schema: + $ref: "#/components/schemas/Service" + "401": + description: Not Authorized + "404": + description: Not Found + "500": + description: Internal Server Error /services/yaml: post: tags: @@ -7385,6 +7464,46 @@ components: type: array items: type: string + platformStatus: + description: Present on GET single fog only; omitted on list responses + allOf: + - $ref: "#/components/schemas/PlatformStatus" + PlatformStatus: + type: object + properties: + phase: + type: string + enum: + - Pending + - Progressing + - Ready + - Failed + - Deleting + generation: + type: integer + description: Desired spec generation from FogPlatformSpecs + observedGeneration: + type: integer + description: Generation last reconciled successfully + lastError: + type: string + nullable: true + lastTransitionAt: + type: string + format: date-time + conditions: + type: array + items: + type: object + properties: + type: + type: string + status: + type: string + reason: + type: string + message: + type: string UpdateIOFogNodeRequestBody: type: object properties: @@ -7525,6 +7644,12 @@ components: properties: id: type: string + IOFogUuidResponse: + type: object + properties: + uuid: + type: string + description: ioFog node id ProvisioningKeyResponse: type: object properties: @@ -9520,6 +9645,30 @@ components: type: number targetPort: type: number + servicePort: + type: number + k8sType: + type: string + description: K8s Service type when running on a Kubernetes control plane (e.g. 
LoadBalancer) + serviceEndpoint: + type: string + description: Hub-facing endpoint hostname or LoadBalancer IP after reconcile + provisioningStatus: + type: string + enum: + - pending + - ready + - failed + description: >- + Hub provisioning lifecycle. `pending` after create/update/delete enqueue + or manual retry; `ready` after hub connector/listener and K8s Service + reconcile succeed; `failed` after max worker attempts (use POST reconcile + to retry). Edge fog bridge updates are tracked separately via fog platform + reconcile. + provisioningError: + type: string + nullable: true + description: Last hub reconcile error when `provisioningStatus` is `failed` tags: type: array items: From 3b032ccd486bae9946efa22bbbd0944f1c656834 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Thu, 25 Jun 2026 00:50:00 +0300 Subject: [PATCH 07/11] Harden exec and log WebSocket sessions with quotas, timeouts, and exec_b lifecycle. Enforce connection limits and fresh DB transactions on close, add HA AMQP fail-fast, 30s SIGTERM drain, stale session reconcile job, batched log session queries, and OTEL metrics. 
--- src/config/config.yaml | 14 +- src/config/env-mapping.js | 11 + src/config/telemetry.js | 17 + src/jobs/ws-session-reconcile-job.js | 166 +++ src/server.js | 8 +- src/services/agent-service.js | 14 +- src/services/router-connection-service.js | 16 + src/services/websocket-queue-service.js | 148 ++- src/websocket/log-session-manager.js | 54 +- src/websocket/server.js | 1299 ++++++++++++++------- src/websocket/session-manager.js | 160 ++- src/websocket/ws-metrics.js | 81 ++ 12 files changed, 1471 insertions(+), 517 deletions(-) create mode 100644 src/jobs/ws-session-reconcile-job.js create mode 100644 src/websocket/ws-metrics.js diff --git a/src/config/config.yaml b/src/config/config.yaml index 12c9b1e4..942cd893 100644 --- a/src/config/config.yaml +++ b/src/config/config.yaml @@ -20,9 +20,20 @@ server: maxPayload: 1048576 # 1MB maxFrameSize: 65536 # 64KB session: - timeout: 3600000 # Session timeout in milliseconds (1 hour) + timeout: 3600000 # Legacy idle fallback (ms); exec uses execMaxDurationMs maxConnections: 100 # Maximum connections per session cleanupInterval: 30000 # Session cleanup interval (30 seconds) + execPendingTimeoutMs: 60000 # Exec: user wait for agent (R81) + execMaxDurationMs: 28800000 # Exec: max active session 8h (R81) + logPendingTimeoutMs: 120000 # Log: user wait for agent (R82) + logIdleTimeoutMs: 7200000 # Log: idle session 2h (R82) + logMaxConcurrentPerResource: 3 # Log: max user WS per MS or fog (R82) + logTailMaxLines: 5000 # Log: tail query param max (R82) + replicaMaxConcurrentWs: 500 # Scale SLO per replica (R88) + drainTimeoutMs: 30000 # Graceful drain on SIGTERM/preStop (R85) + ha: + crossReplicaRequiresAmqp: true # Cross-replica exec/log relay requires AMQP router (R84) + failFastOnRouterUnavailable: true security: maxConnectionsPerIp: 10 maxRequestsPerMinute: 60 @@ -74,6 +85,7 @@ settings: hubRouterConfigLockTimeoutSeconds: 120 # Hub router ConfigMap lock wait in seconds (default: 120) serviceLoadBalancerWatchTimeoutSeconds: 
300 # LoadBalancer IP watch timeout in service reconcile (default: 300) jobStartupDelaySeconds: 3 # Delay before reconcile-heavy background jobs start (default: 3) + wsSessionReconcileIntervalSeconds: 60 # WS exec/log stale DB row reconcile interval (R89) # Database Configuration database: diff --git a/src/config/env-mapping.js b/src/config/env-mapping.js index 0e1bf1c2..b9ac0689 100644 --- a/src/config/env-mapping.js +++ b/src/config/env-mapping.js @@ -21,6 +21,17 @@ module.exports = { WS_SESSION_TIMEOUT: 'server.webSocket.session.timeout', WS_SESSION_MAX_CONNECTIONS: 'server.webSocket.session.maxConnections', WS_CLEANUP_INTERVAL: 'server.webSocket.session.cleanupInterval', + WS_EXEC_PENDING_TIMEOUT_MS: 'server.webSocket.session.execPendingTimeoutMs', + WS_EXEC_MAX_DURATION_MS: 'server.webSocket.session.execMaxDurationMs', + WS_LOG_PENDING_TIMEOUT_MS: 'server.webSocket.session.logPendingTimeoutMs', + WS_LOG_IDLE_TIMEOUT_MS: 'server.webSocket.session.logIdleTimeoutMs', + WS_LOG_MAX_CONCURRENT_PER_RESOURCE: 'server.webSocket.session.logMaxConcurrentPerResource', + WS_LOG_TAIL_MAX_LINES: 'server.webSocket.session.logTailMaxLines', + WS_REPLICA_MAX_CONCURRENT_WS: 'server.webSocket.session.replicaMaxConcurrentWs', + WS_DRAIN_TIMEOUT_MS: 'server.webSocket.session.drainTimeoutMs', + WS_HA_CROSS_REPLICA_REQUIRES_AMQP: 'server.webSocket.ha.crossReplicaRequiresAmqp', + WS_HA_FAIL_FAST_ON_ROUTER_UNAVAILABLE: 'server.webSocket.ha.failFastOnRouterUnavailable', + WS_SESSION_RECONCILE_INTERVAL_SECONDS: 'settings.wsSessionReconcileIntervalSeconds', WS_SECURITY_MAX_CONNECTIONS_PER_IP: 'server.webSocket.security.maxConnectionsPerIp', WS_SECURITY_MAX_REQUESTS_PER_MINUTE: 'server.webSocket.security.maxRequestsPerMinute', WS_SECURITY_MAX_PAYLOAD: 'server.webSocket.security.maxPayload', diff --git a/src/config/telemetry.js b/src/config/telemetry.js index 3e66dc32..8c2c51f8 100644 --- a/src/config/telemetry.js +++ b/src/config/telemetry.js @@ -1,16 +1,30 @@ const { NodeSDK } = 
require('@opentelemetry/sdk-node') const { OTLPTraceExporter } = require('@opentelemetry/exporter-trace-otlp-http') +const { OTLPMetricExporter } = require('@opentelemetry/exporter-metrics-otlp-http') +const { PeriodicExportingMetricReader } = require('@opentelemetry/sdk-metrics') const { HttpInstrumentation } = require('@opentelemetry/instrumentation-http') const { ExpressInstrumentation } = require('@opentelemetry/instrumentation-express') const config = require('./index') const logger = require('../logger') +const metricsEndpoint = (process.env.OTEL_EXPORTER_OTLP_ENDPOINT || 'http://localhost:4318/v1/traces') + .replace(/\/v1\/traces\/?$/, '/v1/metrics') + +const metricReader = new PeriodicExportingMetricReader({ + exporter: new OTLPMetricExporter({ + url: metricsEndpoint, + headers: {} + }), + exportIntervalMillis: parseInt(process.env.OTEL_METRICS_INTERVAL || config.get('otel.metrics.interval') || '60000', 10) +}) + const sdk = new NodeSDK({ serviceName: process.env.OTEL_SERVICE_NAME || 'iofog-controller', traceExporter: new OTLPTraceExporter({ url: process.env.OTEL_EXPORTER_OTLP_ENDPOINT || 'http://localhost:4318/v1/traces', headers: {} }), + metricReader, instrumentations: [ new HttpInstrumentation(), new ExpressInstrumentation() @@ -30,6 +44,9 @@ async function startTelemetry () { try { await sdk.start() + const RouterConnectionService = require('../services/router-connection-service') + const { initWsMetrics } = require('../websocket/ws-metrics') + initWsMetrics(RouterConnectionService) logger.info('OpenTelemetry initialized successfully') } catch (error) { logger.error('Error initializing OpenTelemetry:', error) diff --git a/src/jobs/ws-session-reconcile-job.js b/src/jobs/ws-session-reconcile-job.js new file mode 100644 index 00000000..846e7c7f --- /dev/null +++ b/src/jobs/ws-session-reconcile-job.js @@ -0,0 +1,166 @@ +const Config = require('../config') +const logger = require('../logger') +const Sequelize = require('sequelize') +const Op = Sequelize.Op 
+const WebSocketServer = require('../websocket/server') +const MicroserviceExecStatusManager = require('../data/managers/microservice-exec-status-manager') +const MicroserviceLogStatusManager = require('../data/managers/microservice-log-status-manager') +const FogLogStatusManager = require('../data/managers/fog-log-status-manager') +const MicroserviceManager = require('../data/managers/microservice-manager') +const ChangeTrackingService = require('../services/change-tracking-service') +const FogManager = require('../data/managers/iofog-manager') +const { microserviceExecState } = require('../enums/microservice-state') +const TransactionDecorator = require('../decorators/transaction-decorator') + +function getIntervalMs () { + const seconds = process.env.WS_SESSION_RECONCILE_INTERVAL_SECONDS || + Config.get('settings.wsSessionReconcileIntervalSeconds', 60) + return seconds * 1000 +} + +function getSessionConfig () { + return Config.get('server.webSocket.session') || {} +} + +async function run () { + try { + await reconcileStaleSessions() + } catch (error) { + logger.error('Error during WS session reconcile:', error) + } finally { + setTimeout(run, getIntervalMs()) + } +} + +async function reconcileStaleSessions () { + const wsServer = WebSocketServer.getInstance() + const sessionManager = wsServer.sessionManager + const logSessionManager = wsServer.logSessionManager + const sessionConfig = getSessionConfig() + const execPendingTimeout = sessionConfig.execPendingTimeoutMs || 60000 + const execMaxDuration = sessionConfig.execMaxDurationMs || 28800000 + const logPendingTimeout = sessionConfig.logPendingTimeoutMs || 120000 + const logIdleTimeout = sessionConfig.logIdleTimeoutMs || 7200000 + const now = Date.now() + + let execCleaned = 0 + let logCleaned = 0 + + await TransactionDecorator.generateTransaction(async (transaction) => { + const execStatuses = await MicroserviceExecStatusManager.findAll({ + status: { [Op.in]: [microserviceExecState.PENDING, 
microserviceExecState.ACTIVE] } + }, transaction) + + for (const row of execStatuses) { + const microserviceUuid = row.microserviceUuid + const execId = row.execSessionId + if (!microserviceUuid) continue + + const inMemory = execId && sessionManager.getSession(execId) + const hasPending = sessionManager.getPendingUserCount(microserviceUuid) > 0 || + (sessionManager.pendingAgents.has(microserviceUuid) && + sessionManager.pendingAgents.get(microserviceUuid).size > 0) + + if (inMemory || hasPending) continue + + const age = now - new Date(row.updatedAt).getTime() + const threshold = row.status === microserviceExecState.PENDING + ? execPendingTimeout + : execMaxDuration + + if (age < threshold) continue + + await MicroserviceExecStatusManager.update( + { microserviceUuid }, + { execSessionId: '', status: microserviceExecState.INACTIVE }, + transaction + ) + await MicroserviceManager.update({ uuid: microserviceUuid }, { execEnabled: false }, transaction) + const microservice = await MicroserviceManager.findOne({ uuid: microserviceUuid }, transaction) + if (microservice) { + await ChangeTrackingService.update( + microservice.iofogUuid, + ChangeTrackingService.events.microserviceExecSessions, + transaction + ) + } + execCleaned++ + logger.info('Reconciled stale exec status row:' + JSON.stringify({ + microserviceUuid, + execId, + status: row.status, + ageMs: age + })) + } + + const msLogRows = await MicroserviceLogStatusManager.findAll({ + status: { [Op.in]: ['PENDING', 'ACTIVE'] } + }, transaction) + + for (const row of msLogRows) { + if (logSessionManager.getLogSession(row.sessionId)) continue + + const age = now - new Date(row.updatedAt).getTime() + const threshold = row.status === 'PENDING' ? 
logPendingTimeout : logIdleTimeout + if (age < threshold) continue + + await MicroserviceLogStatusManager.delete({ sessionId: row.sessionId }, transaction) + logCleaned++ + + const microservice = await MicroserviceManager.findOne({ uuid: row.microserviceUuid }, transaction) + if (microservice) { + await ChangeTrackingService.update( + microservice.iofogUuid, + ChangeTrackingService.events.microserviceLogs, + transaction + ) + } + + logger.info('Reconciled stale microservice log row:' + JSON.stringify({ + sessionId: row.sessionId, + microserviceUuid: row.microserviceUuid, + status: row.status, + ageMs: age + })) + } + + const fogLogRows = await FogLogStatusManager.findAll({ + status: { [Op.in]: ['PENDING', 'ACTIVE'] } + }, transaction) + + for (const row of fogLogRows) { + if (logSessionManager.getLogSession(row.sessionId)) continue + + const age = now - new Date(row.updatedAt).getTime() + const threshold = row.status === 'PENDING' ? logPendingTimeout : logIdleTimeout + if (age < threshold) continue + + await FogLogStatusManager.delete({ sessionId: row.sessionId }, transaction) + logCleaned++ + + const fog = await FogManager.findOne({ uuid: row.iofogUuid }, transaction) + if (fog) { + await ChangeTrackingService.update( + fog.uuid, + ChangeTrackingService.events.fogLogs, + transaction + ) + } + + logger.info('Reconciled stale fog log row:' + JSON.stringify({ + sessionId: row.sessionId, + iofogUuid: row.iofogUuid, + status: row.status, + ageMs: age + })) + } + })() + + if (execCleaned > 0 || logCleaned > 0) { + logger.info(`WS session reconcile completed: ${execCleaned} exec, ${logCleaned} log rows cleaned`) + } +} + +module.exports = { + run +} diff --git a/src/server.js b/src/server.js index ef1c1116..e98858d0 100755 --- a/src/server.js +++ b/src/server.js @@ -182,8 +182,14 @@ initialize().then(() => { } function registerServers (api, consoleServer) { - process.once('SIGTERM', async function (code) { + process.once('SIGTERM', async function () { 
console.log('SIGTERM received. Shutting down.') + try { + const wsServer = WebSocketServer.getInstance() + await wsServer.drain() + } catch (error) { + logger.error('WebSocket drain failed during shutdown', { error: error.message }) + } await new Promise((resolve) => { api.close(resolve) }) console.log('API Server closed.') await new Promise((resolve) => { consoleServer.close(resolve) }) diff --git a/src/services/agent-service.js b/src/services/agent-service.js index 3e891aa6..6278ed0f 100644 --- a/src/services/agent-service.js +++ b/src/services/agent-service.js @@ -633,27 +633,26 @@ const getControllerCA = async function (fog, transaction) { const getAgentLogSessions = async function (fog, transaction) { const Op = require('sequelize').Op - // Get all microservices for this fog const microservices = await MicroserviceManager.findAll( { iofogUuid: fog.uuid }, transaction ) const allSessions = [] + const microserviceUuids = microservices.map(ms => ms.uuid) - // Get microservice log sessions - for (const ms of microservices) { - const sessions = await MicroserviceLogStatusManager.findAll( + if (microserviceUuids.length > 0) { + const msSessions = await MicroserviceLogStatusManager.findAll( { - microserviceUuid: ms.uuid, + microserviceUuid: { [Op.in]: microserviceUuids }, status: { [Op.in]: ['PENDING', 'ACTIVE'] } }, transaction ) - for (const session of sessions) { + for (const session of msSessions) { allSessions.push({ - microserviceUuid: ms.uuid, + microserviceUuid: session.microserviceUuid, sessionId: session.sessionId, tailConfig: JSON.parse(session.tailConfig), status: session.status, @@ -662,7 +661,6 @@ const getAgentLogSessions = async function (fog, transaction) { } } - // Get fog node log sessions const fogSessions = await FogLogStatusManager.findAll( { iofogUuid: fog.uuid, diff --git a/src/services/router-connection-service.js b/src/services/router-connection-service.js index 29d1fe8f..77f70412 100644 --- a/src/services/router-connection-service.js +++ 
b/src/services/router-connection-service.js @@ -40,6 +40,22 @@ class RouterConnectionService { return this.connectionPromise } + isConnected () { + return !!(this.connection && this.connection.is_open && this.connection.is_open()) + } + + async isRouterAvailable () { + if (this.isConnected()) { + return true + } + try { + await this.getConnection() + return this.isConnected() + } catch (error) { + return false + } + } + async _createConnection () { try { logger.debug({ msg: '[AMQP] Preparing router connection options' }) diff --git a/src/services/websocket-queue-service.js b/src/services/websocket-queue-service.js index 3f7e436c..1052e31c 100644 --- a/src/services/websocket-queue-service.js +++ b/src/services/websocket-queue-service.js @@ -1,6 +1,12 @@ const WebSocket = require('ws') const logger = require('../logger') const RouterConnectionService = require('./router-connection-service') +const { recordAmqpPublishError } = require('../websocket/ws-metrics') +const msgpack = require('@msgpack/msgpack') + +// Plan 16-C: drop LOG_LINE when user WS buffer exceeds threshold (see forwardLogToUser). 
+const LOG_BACKPRESSURE_BUFFER_BYTES = 256 * 1024 +const LOG_MESSAGE_TYPES = { LOG_ERROR: 9 } const MESSAGE_TYPES = { STDIN: 0, @@ -83,24 +89,77 @@ class WebSocketQueueService { const bridge = this.execBridges.get(execId) if (!bridge) return - const closeLink = (link) => { - if (!link) return + this._closeBridgeLinks(bridge, execId) + this.execBridges.delete(execId) + } + + _closeBridgeLinks (bridge, sessionKey) { + const closeLink = (linkWrapper) => { + if (!linkWrapper) return try { - if (link.receiver) { - link.receiver.close() - } else if (link.sender) { - link.sender.close() + if (linkWrapper.receiver) { + linkWrapper.receiver.removeAllListeners() + linkWrapper.receiver.close() + } else if (linkWrapper.sender) { + linkWrapper.sender.removeAllListeners() + linkWrapper.sender.close() } } catch (error) { - logger.debug('[AMQP][QUEUE] Failed to close link during cleanup', { execId, error: error.message }) + logger.debug('[AMQP][QUEUE] Failed to close link during cleanup', { + sessionKey, + error: error.message + }) } } - closeLink(bridge.receivers.agent) - closeLink(bridge.receivers.user) - closeLink(bridge.senders.agent) - closeLink(bridge.senders.user) - this.execBridges.delete(execId) + closeLink(bridge.receivers?.agent) + closeLink(bridge.receivers?.user) + closeLink(bridge.senders?.agent) + closeLink(bridge.senders?.user) + } + + _invalidateExecSender (bridge, side) { + if (bridge.senders[side]) { + bridge.senders[side] = null + } + } + + _invalidateExecReceiver (bridge, side) { + if (bridge.receivers[side]) { + bridge.receivers[side] = null + } + } + + _attachSenderLifecycle (bridge, side, sender, execId) { + sender.on('sender_close', () => { + logger.warn('[AMQP][QUEUE] Exec sender closed', { execId, side }) + this._invalidateExecSender(bridge, side) + }) + sender.on('error', (context) => { + logger.error('[AMQP][QUEUE] Exec sender error', { + execId, + side, + error: context.error ? 
context.error.message : 'unknown' + }) + recordAmqpPublishError({ sessionType: 'exec', side }) + this._invalidateExecSender(bridge, side) + }) + } + + _attachReceiverLifecycle (bridge, side, receiver, execId) { + receiver.on('receiver_close', () => { + logger.info('[AMQP][QUEUE] Receiver closed', { execId, side }) + this._invalidateExecReceiver(bridge, side) + }) + receiver.on('error', (context) => { + logger.error('[AMQP][QUEUE] Exec receiver error', { + execId, + side, + error: context.error ? context.error.message : 'unknown' + }) + recordAmqpPublishError({ sessionType: 'exec', side }) + this._invalidateExecReceiver(bridge, side) + }) } detachSocket (execId, side) { @@ -138,7 +197,12 @@ class WebSocketQueueService { messageType: messageType !== null ? messageType : 'normal' }) } catch (error) { + recordAmqpPublishError({ sessionType: 'exec', side }) logger.error('[AMQP][QUEUE] Failed to publish message', { execId, side, error: error.message }) + const execBridge = this.execBridges.get(execId) + if (execBridge) { + this._invalidateExecSender(execBridge, side) + } throw error } } @@ -146,9 +210,10 @@ class WebSocketQueueService { async _ensureSender (execId, side) { const bridge = this.execBridges.get(execId) if (!bridge) return null - if (bridge.senders[side]) { + if (bridge.senders[side] && bridge.senders[side].sender) { return bridge.senders[side] } + bridge.senders[side] = null const queueName = buildQueueName( side === 'agent' ? 
MESSAGE_QUEUE_PREFIX.agent : MESSAGE_QUEUE_PREFIX.user, @@ -170,9 +235,7 @@ class WebSocketQueueService { link.once('error', reject) }) - sender.on('sender_close', () => { - bridge.senders[side] = null - }) + this._attachSenderLifecycle(bridge, side, sender, execId) bridge.senders[side] = { sender } return bridge.senders[side] @@ -223,6 +286,8 @@ class WebSocketQueueService { link.once('error', reject) }) + this._attachReceiverLifecycle(bridge, side, receiver, session.execId) + receiver.on('message', async (context) => { try { // Always get the latest socket reference from the bridge @@ -297,14 +362,6 @@ class WebSocketQueueService { } }) - receiver.on('receiver_close', () => { - logger.info('[AMQP][QUEUE] Receiver closed', { - execId: session.execId, - side - }) - bridge.receivers[side] = null - }) - bridge.receivers[side] = { receiver, socket } logger.info('[AMQP][QUEUE] Receiver setup complete', { execId: session.execId, @@ -476,10 +533,12 @@ class WebSocketQueueService { messageSize: buffer.length }) } catch (error) { + recordAmqpPublishError({ sessionType: 'log', side: 'user' }) logger.error('[AMQP][QUEUE] Failed to publish log message to user queue', { sessionId, error: error.message }) + bridge.userSender = null throw error } } @@ -588,9 +647,28 @@ class WebSocketQueueService { const ws = currentBridge && currentBridge.userReceiver ? 
currentBridge.userReceiver.socket : null const body = getBufferFromBody(context.message.body) - // Body is MessagePack encoded (from agent via controller) - // Forward directly to user WebSocket (binary) if (ws && ws.readyState === WebSocket.OPEN) { + if (ws.bufferedAmount > LOG_BACKPRESSURE_BUFFER_BYTES) { + if (!currentBridge.backpressureNotified) { + currentBridge.backpressureNotified = true + try { + const errorBody = msgpack.encode({ + type: LOG_MESSAGE_TYPES.LOG_ERROR, + data: Buffer.from('Log stream backpressure: dropping lines until client catches up\n'), + sessionId: session.sessionId, + timestamp: Date.now() + }) + ws.send(errorBody, { binary: true }) + } catch (error) { + logger.debug('[AMQP][QUEUE] Failed to notify user of log backpressure', { + sessionId: session.sessionId, + error: error.message + }) + } + } + context.delivery.release() + return + } ws.send(body, { binary: true }) context.delivery.accept() } else { @@ -606,13 +684,15 @@ class WebSocketQueueService { const bridge = this.logBridges.get(sessionId) if (!bridge) return - const closeLink = (link) => { - if (!link) return + const closeLink = (linkWrapper) => { + if (!linkWrapper) return try { - if (link.receiver) { - link.receiver.close() - } else if (link.sender) { - link.sender.close() + if (linkWrapper.receiver) { + linkWrapper.receiver.removeAllListeners() + linkWrapper.receiver.close() + } else if (linkWrapper.sender) { + linkWrapper.sender.removeAllListeners() + linkWrapper.sender.close() } } catch (error) { logger.debug('[AMQP][QUEUE] Failed to close log link during cleanup', { sessionId, error: error.message }) @@ -624,6 +704,10 @@ class WebSocketQueueService { closeLink(bridge.userReceiver) closeLink(bridge.userSender) + if (bridge.cleanupCallback) { + bridge.cleanupCallback = null + } + this.logBridges.delete(sessionId) } } diff --git a/src/websocket/log-session-manager.js b/src/websocket/log-session-manager.js index 1e58d38b..a84b63ec 100644 --- 
a/src/websocket/log-session-manager.js +++ b/src/websocket/log-session-manager.js @@ -18,11 +18,24 @@ class LogSessionManager { this.cleanupInterval = null this.startCleanupInterval() logger.info('LogSessionManager initialized with config:' + JSON.stringify({ - sessionTimeout: config.session.timeout, + logPendingTimeoutMs: config.session.logPendingTimeoutMs, + logIdleTimeoutMs: config.session.logIdleTimeoutMs, cleanupInterval: config.session.cleanupInterval })) } + countSessionsForResource (microserviceUuid, fogUuid) { + return this.getAllSessionsForLogSource(microserviceUuid, fogUuid).length + } + + getActiveLogSessionCount () { + return this.logSessions.size + } + + getAllLogSessionIds () { + return Array.from(this.logSessions.keys()) + } + createLogSession (sessionId, microserviceUuid, fogUuid, agentWs, userWs, tailConfig, transaction) { const session = { sessionId, // Unique per user session @@ -125,31 +138,48 @@ class LogSessionManager { // Cleanup expired sessions (timeout mechanism) async cleanupExpiredSessions (transaction) { const now = Date.now() - const timeout = this.config.session.timeout || 3600000 // Default 1 hour + const pendingTimeout = this.config.session.logPendingTimeoutMs || 120000 + const idleTimeout = this.config.session.logIdleTimeoutMs || 7200000 const expiredSessions = [] for (const [sessionId, session] of this.logSessions) { const timeSinceLastActivity = now - session.lastActivity const timeSinceCreation = now - session.createdAt - // Session is expired if: - // 1. No activity for timeout period AND session is older than timeout - // 2. OR user disconnected but agent still connected (orphaned agent connection) - // 3. 
OR agent disconnected but user still connected (orphaned user connection) - const isExpired = ( - (timeSinceLastActivity > timeout && timeSinceCreation > timeout) || - (!session.user && session.agent) || // Orphaned agent - (!session.agent && session.user && timeSinceCreation > timeout) // Orphaned user (wait timeout before cleanup) - ) + let isExpired = false + + if (!session.agent && session.user) { + isExpired = timeSinceCreation > pendingTimeout + } else if (session.agent && !session.user) { + isExpired = timeSinceLastActivity > pendingTimeout + } else if (session.agent && session.user) { + isExpired = timeSinceLastActivity > idleTimeout + } else { + isExpired = timeSinceCreation > pendingTimeout + } if (isExpired) { expiredSessions.push(sessionId) } } - // Remove expired sessions for (const sessionId of expiredSessions) { logger.info('Cleaning up expired log session:' + JSON.stringify({ sessionId })) + const session = this.logSessions.get(sessionId) + if (session && session.user && session.user.readyState === WebSocket.OPEN) { + try { + session.user.close(1008, session.agent ? 
'Log session idle timeout' : 'Timeout waiting for agent connection') + } catch (error) { + logger.warn('Failed to close expired log user connection:' + error.message) + } + } + if (session && session.agent && session.agent.readyState === WebSocket.OPEN) { + try { + session.agent.close(1000, 'Log session expired') + } catch (error) { + logger.warn('Failed to close expired log agent connection:' + error.message) + } + } await this.removeLogSession(sessionId, transaction) } diff --git a/src/websocket/server.js b/src/websocket/server.js index 27e47a79..8ade3375 100644 --- a/src/websocket/server.js +++ b/src/websocket/server.js @@ -14,6 +14,12 @@ const AuthDecorator = require('../decorators/authorization-decorator') const TransactionDecorator = require('../decorators/transaction-decorator') const msgpack = require('@msgpack/msgpack') const WebSocketQueueService = require('../services/websocket-queue-service') +const RouterConnectionService = require('../services/router-connection-service') +const { + recordExecSessionActive, + recordLogSessionActive, + recordPairingDurationMs +} = require('./ws-metrics') const AppHelper = require('../helpers/app-helper') const MicroserviceLogStatusManager = require('../data/managers/microservice-log-status-manager') const FogLogStatusManager = require('../data/managers/fog-log-status-manager') @@ -34,6 +40,13 @@ const MESSAGE_TYPES = { LOG_ERROR: 9 // Log streaming error } +const ROUTER_UNAVAILABLE_CLOSE_CODE = 1013 +const ROUTER_UNAVAILABLE_CLOSE_REASON = 'Router unavailable for cross-replica session' +const DRAIN_CLOSE_CODE = 1001 +const DRAIN_CLOSE_REASON = 'Server draining' +// when user WS bufferedAmount exceeds this, drop LOG_LINE silently and emit LOG_ERROR once. 
+const LOG_BACKPRESSURE_BUFFER_BYTES = 256 * 1024 + const EventService = require('../services/event-service') const { isAuthConfigured: isOidcAuthConfigured } = require('../config/oidc') @@ -155,8 +168,13 @@ class WebSocketServer { this.rateLimits = new Map() this.sessionManager = new SessionManager(config.get('server.webSocket')) this.logSessionManager = new LogSessionManager(config.get('server.webSocket')) + this.sessionConfig = config.get('server.webSocket.session') this.queueService = WebSocketQueueService this.pendingCloseTimeouts = new Map() // Track pending CLOSE messages in cross-replica scenarios + this.haConfig = config.get('server.webSocket.ha') || {} + this.isDraining = false + this.drainPromise = null + this.logBackpressureNotified = new Set() this.config = { pingInterval: process.env.WS_PING_INTERVAL || config.get('server.webSocket.pingInterval'), pongTimeout: process.env.WS_PONG_TIMEOUT || config.get('server.webSocket.pongTimeout'), @@ -187,17 +205,11 @@ class WebSocketServer { // MessagePack encoding/decoding helpers with improved error handling encodeMessage (message) { try { - // Ensure we're only encoding the actual message content const encoded = msgpack.encode(message) - logger.debug('Encoded MessagePack message:' + JSON.stringify({ - type: typeof message, - isMap: message instanceof Map, - keys: message instanceof Map ? Array.from(message.keys()) : Object.keys(message), - hasExecId: message instanceof Map ? message.has('execId') : 'execId' in message, - hasMicroserviceUuid: message instanceof Map ? message.has('microserviceUuid') : 'microserviceUuid' in message, + logger.debug('Encoded MessagePack message', { encodedLength: encoded.length, - firstBytes: encoded.subarray(0, 16).toString('hex') - })) + hasExecId: message instanceof Map ? 
message.has('execId') : 'execId' in message + }) return encoded } catch (error) { logger.error('Failed to encode message:' + JSON.stringify({ @@ -211,21 +223,15 @@ class WebSocketServer { decodeMessage (buffer) { try { const decoded = msgpack.decode(buffer) - logger.debug('Decoded MessagePack message:' + JSON.stringify({ - type: typeof decoded, - isMap: decoded instanceof Map, - keys: decoded instanceof Map ? Array.from(decoded.keys()) : Object.keys(decoded), - hasExecId: decoded instanceof Map ? decoded.has('execId') : 'execId' in decoded, - hasMicroserviceUuid: decoded instanceof Map ? decoded.has('microserviceUuid') : 'microserviceUuid' in decoded, + logger.debug('Decoded MessagePack message', { bufferLength: buffer.length, - firstBytes: buffer.subarray(0, 16).toString('hex') - })) + hasExecId: decoded instanceof Map ? decoded.has('execId') : 'execId' in decoded + }) return decoded } catch (error) { logger.error('Failed to decode MessagePack message:' + JSON.stringify({ error: error.message, - bufferLength: buffer.length, - firstBytes: buffer.subarray(0, 16).toString('hex') + bufferLength: buffer.length })) throw error } @@ -259,6 +265,8 @@ class WebSocketServer { // Handle individual connection errors this.wss.on('connection', (ws, req) => { + this.trackConnectionLimit(ws, req) + // Note: Connection logging moved to after successful authorization in handleConnection // This ensures we only log connections that pass RBAC checks @@ -270,18 +278,6 @@ class WebSocketServer { ws._socket.setKeepAlive(true, 30000) // Enable keep-alive instead of disabling } - // Add detailed frame-level logging - ws.on('message', (data, isBinary) => { - const buffer = Buffer.from(data) - logger.debug('WebSocket frame received:' + JSON.stringify({ - isBinary, - length: buffer.length, - firstBytes: buffer.subarray(0, 16).toString('hex'), - lastBytes: buffer.subarray(-16).toString('hex'), - url: req.url - })) - }) - // Add error handler for each connection ws.on('error', (error) => { 
logger.error('WebSocket connection error:' + JSON.stringify({ @@ -348,10 +344,168 @@ class WebSocketServer { } this.sessionManager.startCleanup() + this.sessionManager.setSessionExpiredHandler(async (microserviceUuid, execId) => { + await TransactionDecorator.generateTransaction(async (tx) => { + if (execId) { + await this.cleanupSession(execId, tx) + } else { + await this.disableExecForMicroservice(microserviceUuid, tx) + } + })() + }) + } + + async disableExecForMicroservice (microserviceUuid, transaction) { + await MicroserviceExecStatusManager.update( + { microserviceUuid }, + { execSessionId: '', status: microserviceExecState.INACTIVE }, + transaction + ) + await MicroserviceManager.update({ uuid: microserviceUuid }, { execEnabled: false }, transaction) + const microservice = await MicroserviceManager.findOne({ uuid: microserviceUuid }, transaction) + if (microservice) { + await ChangeTrackingService.update( + microservice.iofogUuid, + ChangeTrackingService.events.microserviceExecSessions, + transaction + ) + } + } + + /** + * Tear down all in-memory exec sessions and agent sockets for a microservice. + * Ensures Edgelet must open a fresh agent WS on the next exec attempt (exec_b / timeout). 
+ */ + async cleanupExecSessionsForMicroservice (microserviceUuid, transaction) { + const execIdsToCleanup = [] + for (const [execId, session] of this.sessionManager.sessions) { + if (session.microserviceUuid === microserviceUuid) { + execIdsToCleanup.push(execId) + } + } + + for (const execId of execIdsToCleanup) { + await this.cleanupSession(execId, transaction) + } + + if (this.sessionManager.pendingAgents.has(microserviceUuid)) { + const agents = this.sessionManager.pendingAgents.get(microserviceUuid) + for (const [, agentInfo] of agents.entries()) { + if (agentInfo.ws && agentInfo.ws.readyState === WebSocket.OPEN) { + try { + agentInfo.ws.close(1000, 'Exec session ended') + } catch (error) { + logger.debug('[WS-CLEANUP] Failed to close pending agent socket', { + microserviceUuid, + error: error.message + }) + } + } + } + this.sessionManager.pendingAgents.delete(microserviceUuid) + } + + if (execIdsToCleanup.length > 0) { + logger.info('[WS-CLEANUP] Cleaned exec sessions for microservice:' + JSON.stringify({ + microserviceUuid, + sessionCount: execIdsToCleanup.length + })) + } + } + + getLogConcurrencyLimit () { + return this.sessionConfig.logMaxConcurrentPerResource || 3 + } + + getLogTailMaxLines () { + return this.sessionConfig.logTailMaxLines || 5000 + } + + getExecPendingTimeoutMs () { + return this.sessionConfig.execPendingTimeoutMs || 60000 + } + + getLogPendingTimeoutMs () { + return this.sessionConfig.logPendingTimeoutMs || 120000 + } + + getDrainTimeoutMs () { + return this.sessionConfig.drainTimeoutMs || 30000 + } + + isCrossReplicaSession (session) { + return !!(session && (!session.agent || !session.user)) + } + + async requireRouterForCrossReplica (ws) { + if (this.haConfig.failFastOnRouterUnavailable === false) { + return true + } + const available = await RouterConnectionService.isRouterAvailable() + if (!available) { + logger.warn('[WS-HA] Router unavailable for cross-replica session') + if (ws && ws.readyState === WebSocket.OPEN) { + 
ws.close(ROUTER_UNAVAILABLE_CLOSE_CODE, ROUTER_UNAVAILABLE_CLOSE_REASON) + } + return false + } + return true + } + + recordPairingDuration (startedAt) { + if (startedAt) { + recordPairingDurationMs(Date.now() - startedAt) + } + } + + async countLogSessionsInDb (microserviceUuid, fogUuid, transaction) { + if (microserviceUuid) { + const rows = await MicroserviceLogStatusManager.findAll({ microserviceUuid }, transaction) + return rows.length + } + if (fogUuid) { + const rows = await FogLogStatusManager.findAll({ iofogUuid: fogUuid }, transaction) + return rows.length + } + return 0 + } + + parseLogTailConfig (url, ws) { + const tailMax = this.getLogTailMaxLines() + const tailDefault = 100 + const tailParam = url.searchParams.get('tail') + + if (tailParam !== null && tailParam !== '') { + const parsed = parseInt(tailParam, 10) + if (Number.isNaN(parsed) || parsed < 1) { + ws.close(1008, `Invalid tail parameter. Must be between 1 and ${tailMax}.`) + return null + } + if (parsed > tailMax) { + ws.close(1008, `Tail exceeds maximum of ${tailMax} lines.`) + return null + } + } + + const tailLines = (tailParam !== null && tailParam !== '') + ? 
parseInt(tailParam, 10) + : tailDefault + + return { + lines: tailLines, + follow: url.searchParams.get('follow') !== 'false', + since: url.searchParams.get('since') || null, + until: url.searchParams.get('until') || null + } } async verifyClient (info, callback) { try { + if (this.isDraining) { + callback(new Error(DRAIN_CLOSE_REASON), false) + return + } + // Check connection limits const clientIp = info.req.socket.remoteAddress const currentConnections = this.connectionLimits.get(clientIp) || 0 @@ -376,12 +530,34 @@ class WebSocketServer { rateLimit.count++ this.rateLimits.set(clientIp, rateLimit) + this.connectionLimits.set(clientIp, currentConnections + 1) + callback(null, true) } catch (error) { callback(new Error('Internal server error'), false) } } + decrementConnectionCount (clientIp) { + const current = this.connectionLimits.get(clientIp) || 0 + if (current <= 1) { + this.connectionLimits.delete(clientIp) + } else { + this.connectionLimits.set(clientIp, current - 1) + } + } + + trackConnectionLimit (ws, req) { + const clientIp = req.socket.remoteAddress + let released = false + const release = () => { + if (released) return + released = true + this.decrementConnectionCount(clientIp) + } + ws.once('close', release) + } + extractMicroserviceUuid (url) { // Match UUID pattern in the URL const uuidPattern = /[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/i @@ -390,6 +566,13 @@ class WebSocketServer { } handleConnection (ws, req) { + if (this.isDraining) { + if (ws.readyState === WebSocket.OPEN) { + ws.close(DRAIN_CLOSE_CODE, DRAIN_CLOSE_REASON) + } + return + } + // Add error handler for this connection ws.on('error', (error) => { logger.error('WebSocket connection error:' + JSON.stringify({ @@ -506,12 +689,7 @@ class WebSocketServer { ws.close(1008, error.message || 'Internal server error') const microserviceUuid = this.extractMicroserviceUuid(req.url) if (microserviceUuid) { - await MicroserviceExecStatusManager.update( - { microserviceUuid }, 
- { execSessionId: '', status: microserviceExecState.INACTIVE }, - transaction - ) - await MicroserviceManager.update({ uuid: microserviceUuid }, { execEnabled: false }, transaction) + await this.disableExecForMicroservice(microserviceUuid, transaction) } } } catch (closeError) { @@ -643,227 +821,180 @@ class WebSocketServer { } } - async handleAgentConnection (ws, req, token, microserviceUuid, transaction) { + scheduleAgentExecConnectEvent (req, resourceId) { + setImmediate(async () => { + try { + const authHeader = req.headers.authorization + let actorId = null + if (authHeader) { + const [scheme, token] = authHeader.split(' ') + if (scheme.toLowerCase() === 'bearer' && token) { + try { + const tokenParts = token.split('.') + if (tokenParts.length === 3) { + const payload = JSON.parse(Buffer.from(tokenParts[1], 'base64').toString()) + actorId = payload.sub || null + } + } catch (err) { + // Ignore token parsing errors + } + } + } + await EventService.createWsConnectEvent({ + timestamp: Date.now(), + endpointType: 'agent', + actorId, + path: req.url, + resourceId, + ipAddress: EventService.extractIPv4Address(req) || null + }) + } catch (err) { + logger.error('Failed to create WS_CONNECT event (non-blocking):', err) + } + }) + } + + async processAgentInitialMessage (ws, req, data, isBinary, microserviceUuid, transaction) { + logger.debug('[WS-INIT] Received initial message from agent:' + JSON.stringify({ + isBinary, + url: req.url, + microserviceUuid + })) + + if (!isBinary) { + logger.error('[WS-ERROR] Expected binary message from agent') + ws.close(1008, 'Expected binary message') + return + } + + const buffer = Buffer.from(data) + logger.debug('[WS-INIT] Processing initial message from agent', { + isBinary, + length: buffer.length + }) + + let execMsg try { - this.ensureSocketPongHandler(ws) - logger.debug('[WS-CONN] Processing agent connection:' + JSON.stringify({ - url: req.url, - microserviceUuid, - remoteAddress: req.socket.remoteAddress + execMsg = 
this.decodeMessage(buffer) + logger.info('[WS-INIT] Decoded MessagePack from agent:' + JSON.stringify(execMsg)) + } catch (err) { + logger.error('[WS-ERROR] Failed to decode MessagePack from agent:' + JSON.stringify({ + error: err.message, + stack: err.stack })) + ws.close(1008, 'Invalid MessagePack') + return + } - // Set up message handler for initial message only - const initialMessageHandler = async (data, isBinary) => { - logger.debug('[WS-INIT] Received initial message from agent:' + JSON.stringify({ - isBinary, - url: req.url, - microserviceUuid - })) + const execId = execMsg instanceof Map ? execMsg.get('execId') : execMsg.execId + const msgMicroserviceUuid = execMsg instanceof Map ? execMsg.get('microserviceUuid') : execMsg.microserviceUuid + if (!execId || !msgMicroserviceUuid) { + logger.error('[WS-ERROR] Agent message missing execId or microserviceUuid:' + JSON.stringify(execMsg)) + ws.close(1008, 'Missing required fields') + return + } - if (!isBinary) { - logger.error('[WS-ERROR] Expected binary message from agent') - ws.close(1008, 'Expected binary message') - return - } + const existingSession = this.sessionManager.getSession(execId) + if (existingSession && existingSession.agent === ws && existingSession.awaitingUser && !existingSession.user) { + logger.debug('[WS-INIT] Ignoring duplicate agent init for pending session', { + execId, + microserviceUuid: msgMicroserviceUuid + }) + return + } - const buffer = Buffer.from(data) - logger.debug('[WS-INIT] Processing initial message from agent:' + JSON.stringify({ - isBinary, - length: buffer.length, - firstBytes: buffer.subarray(0, 16).toString('hex'), - lastBytes: buffer.subarray(-16).toString('hex') - })) + for (const [existingExecId, existingSessionEntry] of this.sessionManager.sessions) { + if (existingSessionEntry.agent === ws && existingExecId !== execId) { + await this.cleanupSession(existingExecId, transaction, { preserveAgentSocket: true }) + } + } - let execMsg - try { - execMsg = 
this.decodeMessage(buffer) - logger.info('[WS-INIT] Decoded MessagePack from agent:' + JSON.stringify(execMsg)) - } catch (err) { - logger.error('[WS-ERROR] Failed to decode MessagePack from agent:' + JSON.stringify({ - error: err.message, - stack: err.stack - })) - ws.close(1008, 'Invalid MessagePack') - return - } + const session = await this.sessionManager.tryActivateSession(msgMicroserviceUuid, execId, ws, true, transaction) + if (session) { + logger.info('[WS-SESSION] Session activated for agent:' + JSON.stringify({ + execId, + microserviceUuid: msgMicroserviceUuid + })) + logger.debug('[WS-FORWARD] Setting up message forwarding:' + JSON.stringify({ + execId, + microserviceUuid: msgMicroserviceUuid + })) + await this.setupMessageForwarding(execId, transaction) + this.scheduleAgentExecConnectEvent(req, msgMicroserviceUuid) + return + } - const { execId, microserviceUuid: msgMicroserviceUuid } = execMsg - if (!execId || !msgMicroserviceUuid) { - logger.error('[WS-ERROR] Agent message missing execId or microserviceUuid:' + JSON.stringify(execMsg)) - ws.close(1008, 'Missing required fields') - return - } + this.attachPendingKeepAliveHandler(ws) + try { + await MicroserviceExecStatusManager.update( + { microserviceUuid: msgMicroserviceUuid }, + { execSessionId: execId, status: microserviceExecState.PENDING }, + transaction + ) + logger.debug('[WS-SESSION] Updated microservice exec status to PENDING', { + execId, + microserviceUuid: msgMicroserviceUuid + }) + } catch (error) { + logger.error('[WS-SESSION] Failed to update microservice exec status to PENDING', { + execId, + microserviceUuid: msgMicroserviceUuid, + error: error.message, + stack: error.stack + }) + } - // Remove the initial message handler - ws.removeListener('message', initialMessageHandler) + const agentOnlySession = this.sessionManager.createSession(execId, msgMicroserviceUuid, ws, null, transaction) + agentOnlySession.awaitingUser = true - // Try to activate session with the execId from the message 
- const session = await this.sessionManager.tryActivateSession(msgMicroserviceUuid, execId, ws, true, transaction) - if (session) { - logger.info('[WS-SESSION] Session activated for agent:' + JSON.stringify({ - execId, - microserviceUuid: msgMicroserviceUuid - })) - // Set up message forwarding - logger.debug('[WS-FORWARD] Setting up message forwarding:' + JSON.stringify({ - execId, - microserviceUuid: msgMicroserviceUuid - })) - await this.setupMessageForwarding(execId, transaction) + try { + await this.queueService.enableForSession(agentOnlySession, async (closeExecId) => { + const timeout = this.pendingCloseTimeouts.get(closeExecId) + if (timeout) { + clearTimeout(timeout) + this.pendingCloseTimeouts.delete(closeExecId) + logger.debug('[WS-SESSION] Cleared pending CLOSE timeout - agent responded', { execId: closeExecId }) + } + await TransactionDecorator.generateTransaction(async (failTx) => { + await this.cleanupSession(closeExecId, failTx) + })() + }) + agentOnlySession.queueBridgeEnabled = true + } catch (error) { + logger.warn('[WS-SESSION] Optional queue bridge for pending agent failed; direct relay when user connects on same replica:', { + execId, + microserviceUuid: msgMicroserviceUuid, + error: error.message + }) + agentOnlySession.queueBridgeEnabled = false + } - // Record WebSocket connection event (non-blocking) - setImmediate(async () => { - try { - const authHeader = req.headers.authorization - let actorId = null - if (authHeader) { - const [scheme, token] = authHeader.split(' ') - if (scheme.toLowerCase() === 'bearer' && token) { - try { - const tokenParts = token.split('.') - if (tokenParts.length === 3) { - const payload = JSON.parse(Buffer.from(tokenParts[1], 'base64').toString()) - actorId = payload.sub || null - } - } catch (err) { - // Ignore token parsing errors - } - } - } - await EventService.createWsConnectEvent({ - timestamp: Date.now(), - endpointType: 'agent', - actorId, - path: req.url, - resourceId: msgMicroserviceUuid, - ipAddress: 
EventService.extractIPv4Address(req) || null - }) - } catch (err) { - logger.error('Failed to create WS_CONNECT event (non-blocking):', err) - } - }) - } else { - this.attachPendingKeepAliveHandler(ws) - try { - await MicroserviceExecStatusManager.update( - { microserviceUuid }, - { execSessionId: execId, status: microserviceExecState.PENDING }, - transaction - ) - logger.debug('[WS-SESSION] Updated microservice exec status to PENDING', { - execId, - microserviceUuid - }) - } catch (error) { - logger.error('[WS-SESSION] Failed to update microservice exec status to PENDING', { - execId, - microserviceUuid, - error: error.message, - stack: error.stack - }) - // Continue anyway - the in-memory state is correct - } - // Create session with agent only and enable queue bridge for cross-replica support - // This allows the agent to receive messages from users on other replicas via AMQP queues - const agentOnlySession = this.sessionManager.createSession(execId, msgMicroserviceUuid, ws, null, transaction) - try { - // Pass cleanup callback so queue service can notify us when CLOSE is received - await this.queueService.enableForSession(agentOnlySession, (execId) => { - // Clear timeout if it exists (agent responded to CLOSE) - const timeout = this.pendingCloseTimeouts.get(execId) - if (timeout) { - clearTimeout(timeout) - this.pendingCloseTimeouts.delete(execId) - logger.debug('[WS-SESSION] Cleared pending CLOSE timeout - agent responded', { execId }) - } - this.cleanupSession(execId, transaction) - }) - agentOnlySession.queueBridgeEnabled = true - logger.info('[WS-SESSION] No pending user found for agent, added to pending list and enabled queue bridge for cross-replica support:' + JSON.stringify({ - execId, - microserviceUuid: msgMicroserviceUuid - })) - await this.setupMessageForwarding(execId, transaction) + logger.info('[WS-SESSION] Agent pending — awaiting user before ACTIVATION:' + JSON.stringify({ + execId, + microserviceUuid: msgMicroserviceUuid + })) + 
this.scheduleAgentExecConnectEvent(req, msgMicroserviceUuid) + } - // Record WebSocket connection event for agent (non-blocking) - // This covers the case when agent connects but no user is waiting (cross-replica or normal) - setImmediate(async () => { - try { - const authHeader = req.headers.authorization - let actorId = null - if (authHeader) { - const [scheme, token] = authHeader.split(' ') - if (scheme.toLowerCase() === 'bearer' && token) { - try { - const tokenParts = token.split('.') - if (tokenParts.length === 3) { - const payload = JSON.parse(Buffer.from(tokenParts[1], 'base64').toString()) - actorId = payload.sub || null - } - } catch (err) { - // Ignore token parsing errors - } - } - } - await EventService.createWsConnectEvent({ - timestamp: Date.now(), - endpointType: 'agent', - actorId, - path: req.url, - resourceId: msgMicroserviceUuid, - ipAddress: EventService.extractIPv4Address(req) || null - }) - } catch (err) { - logger.error('Failed to create WS_CONNECT event (non-blocking):', err) - } - }) - } catch (error) { - logger.warn('[WS-SESSION] Failed to enable queue bridge for pending agent, will use direct relay when user connects:', { - execId, - microserviceUuid: msgMicroserviceUuid, - error: error.message - }) - agentOnlySession.queueBridgeEnabled = false + async handleAgentConnection (ws, req, token, microserviceUuid, transaction) { + try { + this.ensureSocketPongHandler(ws) + ws._agentExecHandshakeContext = { req, microserviceUuid } + logger.debug('[WS-CONN] Processing agent connection:' + JSON.stringify({ + url: req.url, + microserviceUuid, + remoteAddress: req.socket.remoteAddress + })) - // Record WebSocket connection event even if queue bridge failed (non-blocking) - setImmediate(async () => { - try { - const authHeader = req.headers.authorization - let actorId = null - if (authHeader) { - const [scheme, token] = authHeader.split(' ') - if (scheme.toLowerCase() === 'bearer' && token) { - try { - const tokenParts = token.split('.') - if 
(tokenParts.length === 3) { - const payload = JSON.parse(Buffer.from(tokenParts[1], 'base64').toString()) - actorId = payload.sub || null - } - } catch (err) { - // Ignore token parsing errors - } - } - } - await EventService.createWsConnectEvent({ - timestamp: Date.now(), - endpointType: 'agent', - actorId, - path: req.url, - resourceId: msgMicroserviceUuid, - ipAddress: EventService.extractIPv4Address(req) || null - }) - } catch (err) { - logger.error('Failed to create WS_CONNECT event (non-blocking):', err) - } - }) - } + // Capture the first frame during validation so Edgelet's immediate-on-open msgpack is not lost (Plan 16-A). + let capturedInitialFrame = null + const captureHandler = (data, isBinary) => { + if (capturedInitialFrame === null) { + capturedInitialFrame = { data, isBinary } } } + ws.on('message', captureHandler) - // Bind the message handler BEFORE validation - ws.on('message', initialMessageHandler) - - // Now validate the connection const fog = await this.validateAgentConnection(token, microserviceUuid, transaction) logger.debug('[WS-VALIDATE] Agent connection validated:' + JSON.stringify({ fogUuid: fog.uuid, @@ -871,6 +1002,27 @@ class WebSocketServer { url: req.url })) + if (capturedInitialFrame) { + ws.removeListener('message', captureHandler) + await this.processAgentInitialMessage( + ws, + req, + capturedInitialFrame.data, + capturedInitialFrame.isBinary, + microserviceUuid, + transaction + ) + } else { + const initialMessageHandler = async (data, isBinary) => { + ws.removeListener('message', initialMessageHandler) + ws.removeListener('message', captureHandler) + await this.processAgentInitialMessage(ws, req, data, isBinary, microserviceUuid, transaction) + } + // Register the follow-up handler before removing capture to avoid losing a fast first frame. 
+ ws.on('message', initialMessageHandler) + ws.removeListener('message', captureHandler) + } + // Handle connection close ws.on('close', async (code, reason) => { // Record WebSocket disconnection event (non-blocking) @@ -902,40 +1054,40 @@ class WebSocketServer { closeCode: code }) } catch (err) { - logger.error('Failed to create WS_DISCONNECT event (non-blocking):', err) - } - }) - - for (const [execId, session] of this.sessionManager.sessions) { - if (session.agent === ws) { - // In cross-replica scenarios, send CLOSE message to user via queue - // Note: session.user is null on agent's replica in cross-replica scenarios - const queueEnabled = this.queueService.shouldUseQueue(execId) - if (queueEnabled) { - try { - const closeMsg = { - type: MESSAGE_TYPES.CLOSE, - execId, - microserviceUuid: session.microserviceUuid, - timestamp: Date.now(), - data: Buffer.from('Agent closed connection') + logger.error('Failed to create WS_DISCONNECT event (non-blocking):', err) + } + }) + + await TransactionDecorator.generateTransaction(async (closeTransaction) => { + for (const [execId, session] of this.sessionManager.sessions) { + if (session.agent === ws) { + const queueEnabled = this.queueService.shouldUseQueue(execId) + if (queueEnabled) { + try { + const closeMsg = { + type: MESSAGE_TYPES.CLOSE, + execId, + microserviceUuid: session.microserviceUuid, + timestamp: Date.now(), + data: Buffer.from('Agent closed connection') + } + const encoded = this.encodeMessage(closeMsg) + await this.queueService.publishToUser(execId, encoded, { messageType: MESSAGE_TYPES.CLOSE }) + logger.info('[WS-CLOSE] Sent CLOSE message to user via queue after agent disconnect', { + execId, + microserviceUuid: session.microserviceUuid + }) + } catch (error) { + logger.error('[WS-CLOSE] Failed to send CLOSE message to user via queue', { + execId, + error: error.message + }) } - const encoded = this.encodeMessage(closeMsg) - await this.queueService.publishToUser(execId, encoded, { messageType: 
MESSAGE_TYPES.CLOSE }) - logger.info('[WS-CLOSE] Sent CLOSE message to user via queue after agent disconnect', { - execId, - microserviceUuid: session.microserviceUuid - }) - } catch (error) { - logger.error('[WS-CLOSE] Failed to send CLOSE message to user via queue', { - execId, - error: error.message - }) } + await this.cleanupSession(execId, closeTransaction) } - this.cleanupSession(execId, transaction) } - } + })() this.sessionManager.removePendingAgent(microserviceUuid, ws) logger.debug('[WS-CLOSE] Agent connection closed:' + JSON.stringify({ url: req.url, @@ -997,16 +1149,12 @@ class WebSocketServer { await this.validateUserConnection(token, microserviceUuid, expectSystem, transaction) logger.info('User connection validated successfully for microservice:' + microserviceUuid) - // Check if there's already an active session for this microservice - const existingSession = Array.from(this.sessionManager.sessions.values()) - .find(session => session.microserviceUuid === microserviceUuid && session.user && session.user.readyState === WebSocket.OPEN) - - if (existingSession) { - logger.debug('Microservice has already active exec session:' + JSON.stringify({ - microserviceUuid, - existingExecId: existingSession.execId + // Check if there's already an active or pending exec session for this microservice + if (this.sessionManager.hasActiveOrPendingUser(microserviceUuid)) { + logger.debug('Microservice already has an exec session in progress:' + JSON.stringify({ + microserviceUuid })) - ws.close(1008, 'Microservice has already active exec session.') + ws.close(1008, 'An exec session is already in progress for this microservice. 
Only one user exec WebSocket is allowed.') return } @@ -1024,7 +1172,7 @@ class WebSocketServer { if (pendingAgent) { // Activate session using agent's execId (agent is on same replica) - const session = this.sessionManager.tryActivateSession(microserviceUuid, availableExecId, ws, false, transaction) + const session = await this.sessionManager.tryActivateSession(microserviceUuid, availableExecId, ws, false, transaction) if (session) { logger.info('Session activated for user:', { execId: availableExecId, @@ -1063,6 +1211,9 @@ class WebSocketServer { execId: availableExecId, microserviceUuid }) + if (!(await this.requireRouterForCrossReplica(ws))) { + return + } this.sessionManager.createSession(availableExecId, microserviceUuid, null, ws, transaction) await MicroserviceExecStatusManager.update( { microserviceUuid }, @@ -1125,7 +1276,7 @@ class WebSocketServer { if (pendingAgent) { // Remove user from pending first since we're activating this.sessionManager.removePendingUser(microserviceUuid, ws) - const session = this.sessionManager.tryActivateSession(microserviceUuid, availableExecId, ws, false, transaction) + const session = await this.sessionManager.tryActivateSession(microserviceUuid, availableExecId, ws, false, transaction) if (session) { logger.info('Session activated immediately after re-check:' + JSON.stringify({ execId: availableExecId, @@ -1163,6 +1314,10 @@ class WebSocketServer { execId: availableExecId, microserviceUuid }) + if (!(await this.requireRouterForCrossReplica(ws))) { + this.sessionManager.removePendingUser(microserviceUuid, ws) + return + } // Remove user from pending first this.sessionManager.removePendingUser(microserviceUuid, ws) this.sessionManager.createSession(availableExecId, microserviceUuid, null, ws, transaction) @@ -1257,7 +1412,7 @@ class WebSocketServer { if (pendingAgent) { // Remove user from pending first this.sessionManager.removePendingUser(microserviceUuid, ws) - const session = 
this.sessionManager.tryActivateSession(microserviceUuid, availableExecId, ws, false, transaction) + const session = await this.sessionManager.tryActivateSession(microserviceUuid, availableExecId, ws, false, transaction) if (session) { logger.info('Session activated via periodic retry:' + JSON.stringify({ execId: availableExecId, @@ -1296,6 +1451,11 @@ class WebSocketServer { execId: availableExecId, microserviceUuid }) + if (!(await this.requireRouterForCrossReplica(ws))) { + clearInterval(retryTimer) + this.sessionManager.removePendingUser(microserviceUuid, ws) + return + } // Remove user from pending first this.sessionManager.removePendingUser(microserviceUuid, ws) this.sessionManager.createSession(availableExecId, microserviceUuid, null, ws, transaction) @@ -1348,8 +1508,8 @@ class WebSocketServer { // Store timer reference for cleanup this.sessionManager.setUserRetryTimer(microserviceUuid, ws, retryTimer) - // Add timeout mechanism for pending users (60 seconds) - const PENDING_USER_TIMEOUT = 60000 + // Add timeout mechanism for pending users + const PENDING_USER_TIMEOUT = this.getExecPendingTimeoutMs() setTimeout(() => { if (this.sessionManager.isUserStillPending(microserviceUuid, ws)) { logger.warn('Pending user timeout, closing connection:' + JSON.stringify({ @@ -1401,6 +1561,15 @@ class WebSocketServer { } this.sessionManager.removePendingUser(microserviceUuid, ws) + TransactionDecorator.generateTransaction(async (timeoutTransaction) => { + await this.cleanupExecSessionsForMicroservice(microserviceUuid, timeoutTransaction) + await this.disableExecForMicroservice(microserviceUuid, timeoutTransaction) + })().catch((err) => { + logger.error('Failed to disable exec after pending user timeout:' + JSON.stringify({ + error: err.message, + microserviceUuid + })) + }) } }, PENDING_USER_TIMEOUT) @@ -1427,11 +1596,23 @@ class WebSocketServer { } }) - for (const [execId, session] of this.sessionManager.sessions) { - if (session.user === ws) { - 
this.cleanupSession(execId, transaction) + TransactionDecorator.generateTransaction(async (closeTransaction) => { + const wasPending = this.sessionManager.isUserStillPending(microserviceUuid, ws) + for (const [execId, session] of this.sessionManager.sessions) { + if (session.user === ws) { + await this.cleanupSession(execId, closeTransaction) + } } - } + if (wasPending) { + await this.cleanupExecSessionsForMicroservice(microserviceUuid, closeTransaction) + await this.disableExecForMicroservice(microserviceUuid, closeTransaction) + } + })().catch((err) => { + logger.error('Failed to cleanup exec session on user disconnect:' + JSON.stringify({ + error: err.message, + microserviceUuid + })) + }) // Clear retry timer before removing user const retryTimer = this.sessionManager.getUserRetryTimer(microserviceUuid, ws) @@ -1476,6 +1657,62 @@ class WebSocketServer { // return noisePatterns.some(pattern => pattern.test(output)) // } + async sendExecActivationToAgent (session, execId, transaction) { + if (!session.user) { + return false + } + if (session.activationSent) { + return true + } + + const activationMsg = { + type: MESSAGE_TYPES.ACTIVATION, + data: Buffer.from(JSON.stringify({ + execId, + microserviceUuid: session.microserviceUuid, + timestamp: Date.now() + })), + microserviceUuid: session.microserviceUuid, + execId, + timestamp: Date.now() + } + + try { + const success = await this.sendMessageToAgent(session.agent, activationMsg, execId, session.microserviceUuid) + if (success) { + session.activationSent = true + session.awaitingUser = false + logger.info('[RELAY] Session activation complete:' + JSON.stringify({ + execId, + microserviceUuid: session.microserviceUuid, + agentState: session.agent ? 
session.agent.readyState : 'N/A (cross-replica)', + queueEnabled: this.queueService.shouldUseQueue(execId) + })) + } else { + logger.error('[RELAY] Session activation failed:' + JSON.stringify({ + execId, + microserviceUuid: session.microserviceUuid, + agentState: session.agent ? session.agent.readyState : 'N/A', + queueEnabled: this.queueService.shouldUseQueue(execId) + })) + if (session.agent) { + await this.cleanupSession(execId, transaction) + } + } + return success + } catch (error) { + logger.error('[RELAY] Session activation error:' + JSON.stringify({ + execId, + microserviceUuid: session.microserviceUuid, + error: error.message + })) + if (session.agent) { + await this.cleanupSession(execId, transaction) + } + return false + } + } + async setupMessageForwarding (execId, transaction) { const session = this.sessionManager.getSession(execId) if (!session) { @@ -1494,79 +1731,61 @@ class WebSocketServer { })) this.detachPendingKeepAliveHandler(user) this.detachPendingKeepAliveHandler(agent) - try { - // Pass cleanup callback so queue service can notify us when CLOSE is received - await this.queueService.enableForSession(session, (execId) => { - // Clear timeout if it exists (agent responded to CLOSE) - const timeout = this.pendingCloseTimeouts.get(execId) - if (timeout) { - clearTimeout(timeout) - this.pendingCloseTimeouts.delete(execId) - logger.debug('[RELAY] Cleared pending CLOSE timeout - agent responded', { execId }) - } - const currentTransaction = session.transaction - this.cleanupSession(execId, currentTransaction) - }) - session.queueBridgeEnabled = true - logger.info('[RELAY] AMQP queue bridge enabled for exec session', { - execId, - microserviceUuid: session.microserviceUuid - }) - } catch (error) { - session.queueBridgeEnabled = false - logger.warn('[RELAY] Failed to enable AMQP queue bridge, falling back to direct WebSocket relay', { - execId, - error: error.message - }) - } - - // Send activation message to agent (works for both direct WebSocket 
and queue-based forwarding) - const activationMsg = { - type: MESSAGE_TYPES.ACTIVATION, - data: Buffer.from(JSON.stringify({ - execId, - microserviceUuid: session.microserviceUuid, - timestamp: Date.now() - })), - microserviceUuid: session.microserviceUuid, - execId, - timestamp: Date.now() - } - - // sendMessageToAgent handles queue-based forwarding when agent is null - this.sendMessageToAgent(session.agent, activationMsg, execId, session.microserviceUuid) - .then(success => { - if (success) { - logger.info('[RELAY] Session activation complete:' + JSON.stringify({ - execId, - microserviceUuid: session.microserviceUuid, - agentState: session.agent ? session.agent.readyState : 'N/A (cross-replica)', - queueEnabled: this.queueService.shouldUseQueue(execId) - })) - } else { - logger.error('[RELAY] Session activation failed:' + JSON.stringify({ + if (!session.queueBridgeEnabled) { + try { + // Pass cleanup callback so queue service can notify us when CLOSE is received + await this.queueService.enableForSession(session, async (execId) => { + // Clear timeout if it exists (agent responded to CLOSE) + const timeout = this.pendingCloseTimeouts.get(execId) + if (timeout) { + clearTimeout(timeout) + this.pendingCloseTimeouts.delete(execId) + logger.debug('[RELAY] Cleared pending CLOSE timeout - agent responded', { execId }) + } + const currentTransaction = session.transaction + await this.cleanupSession(execId, currentTransaction) + }) + session.queueBridgeEnabled = true + logger.info('[RELAY] AMQP queue bridge enabled for exec session', { + execId, + microserviceUuid: session.microserviceUuid + }) + } catch (error) { + session.queueBridgeEnabled = false + if (this.isCrossReplicaSession(session) && this.haConfig.failFastOnRouterUnavailable !== false) { + logger.error('[RELAY] AMQP required for cross-replica session but router bridge failed', { execId, - microserviceUuid: session.microserviceUuid, - agentState: session.agent ? 
session.agent.readyState : 'N/A', - queueEnabled: this.queueService.shouldUseQueue(execId) - })) - // Only cleanup if we have a direct agent connection (not queue-based) - if (session.agent) { - this.cleanupSession(execId, transaction) + error: error.message + }) + if (session.user && session.user.readyState === WebSocket.OPEN) { + session.user.close(ROUTER_UNAVAILABLE_CLOSE_CODE, ROUTER_UNAVAILABLE_CLOSE_REASON) } + if (session.agent && session.agent.readyState === WebSocket.OPEN) { + session.agent.close(ROUTER_UNAVAILABLE_CLOSE_CODE, ROUTER_UNAVAILABLE_CLOSE_REASON) + } + await this.cleanupSession(execId, transaction) + return } - }) - .catch(error => { - logger.error('[RELAY] Session activation error:' + JSON.stringify({ + logger.warn('[RELAY] Failed to enable AMQP queue bridge, falling back to direct WebSocket relay', { execId, - microserviceUuid: session.microserviceUuid, error: error.message - })) - // Only cleanup if we have a direct agent connection (not queue-based) - if (session.agent) { - this.cleanupSession(execId, transaction) - } + }) + } + } + + if (user) { + if (!session.metricsActive) { + session.metricsActive = true + recordExecSessionActive(1) + this.recordPairingDuration(session.pairingStartedAt) + } + await this.sendExecActivationToAgent(session, execId, transaction) + } else { + logger.debug('[RELAY] Relay handlers deferred — user leg not connected; ACTIVATION withheld', { + execId, + microserviceUuid: session.microserviceUuid }) + } // Remove any previous message handlers to avoid duplicates if (user) { @@ -1645,7 +1864,7 @@ class WebSocketServer { }) // Set timeout in case agent doesn't respond - const timeout = setTimeout(() => { + const timeout = setTimeout(async () => { const currentSession = this.sessionManager.getSession(execId) if (currentSession && currentSession.user && currentSession.user.readyState === WebSocket.OPEN) { logger.warn('[RELAY] Agent did not respond to CLOSE within timeout, closing user socket', { @@ -1656,7 +1875,7 @@ 
class WebSocketServer { try { currentSession.user.close(1000, 'Session closed (timeout)') const currentTransaction = currentSession.transaction - this.cleanupSession(execId, currentTransaction) + await this.cleanupSession(execId, currentTransaction) } catch (error) { logger.error('[RELAY] Failed to close user socket on timeout', { execId, @@ -1691,7 +1910,7 @@ class WebSocketServer { // Get current transaction from the session and cleanup const currentTransaction = session.transaction - this.cleanupSession(execId, currentTransaction) + await this.cleanupSession(execId, currentTransaction) return } } @@ -1789,7 +2008,7 @@ class WebSocketServer { // Get current transaction from the session const currentTransaction = session.transaction - this.cleanupSession(execId, currentTransaction) + await this.cleanupSession(execId, currentTransaction) return } @@ -2047,6 +2266,72 @@ class WebSocketServer { } // Singleton instance + async drain (timeoutMs = null) { + if (this.drainPromise) { + return this.drainPromise + } + + const drainBudgetMs = timeoutMs || this.getDrainTimeoutMs() + this.isDraining = true + logger.info('[WS-DRAIN] Starting graceful drain', { timeoutMs: drainBudgetMs }) + + this.drainPromise = (async () => { + const deadline = Date.now() + drainBudgetMs + this.sessionManager.closeAllPendingUsers(DRAIN_CLOSE_CODE, DRAIN_CLOSE_REASON) + + const execIds = this.sessionManager.getAllExecSessionIds() + const logSessionIds = this.logSessionManager.getAllLogSessionIds() + const cleanupTasks = [] + + for (const execId of execIds) { + cleanupTasks.push( + TransactionDecorator.generateTransaction(async (tx) => { + await this.cleanupSession(execId, tx) + })().catch((error) => { + logger.warn('[WS-DRAIN] Exec session cleanup failed', { execId, error: error.message }) + }) + ) + } + + for (const sessionId of logSessionIds) { + cleanupTasks.push( + TransactionDecorator.generateTransaction(async (tx) => { + await this.cleanupLogSession(sessionId, tx) + })().catch((error) => { 
+ logger.warn('[WS-DRAIN] Log session cleanup failed', { sessionId, error: error.message }) + }) + ) + } + + const remainingMs = deadline - Date.now() + if (remainingMs > 0 && cleanupTasks.length > 0) { + await Promise.race([ + Promise.allSettled(cleanupTasks), + new Promise((resolve) => setTimeout(resolve, remainingMs)) + ]) + } + + if (this.wss) { + for (const client of this.wss.clients) { + if (client.readyState === WebSocket.OPEN) { + try { + client.close(DRAIN_CLOSE_CODE, DRAIN_CLOSE_REASON) + } catch (error) { + logger.debug('[WS-DRAIN] Failed to close lingering client', { error: error.message }) + } + } + } + } + + logger.info('[WS-DRAIN] Graceful drain complete', { + execSessions: execIds.length, + logSessions: logSessionIds.length + }) + })() + + return this.drainPromise + } + static getInstance () { if (!WebSocketServer.instance) { WebSocketServer.instance = new WebSocketServer() @@ -2055,10 +2340,16 @@ class WebSocketServer { } // Clean up session and close sockets - cleanupSession (execId, transaction) { + async cleanupSession (execId, transaction, options = {}) { + const preserveAgentSocket = options.preserveAgentSocket === true const session = this.sessionManager.getSession(execId) if (!session) return + if (session.metricsActive) { + recordExecSessionActive(-1) + session.metricsActive = false + } + // Clear any pending CLOSE timeout const timeout = this.pendingCloseTimeouts.get(execId) if (timeout) { @@ -2068,7 +2359,7 @@ class WebSocketServer { } // Send CLOSE message to agent if it's still connected - if (session.agent && session.agent.readyState === WebSocket.OPEN) { + if (!preserveAgentSocket && session.agent && session.agent.readyState === WebSocket.OPEN) { const closeMsg = { type: MESSAGE_TYPES.CLOSE, execId, @@ -2100,11 +2391,11 @@ class WebSocketServer { if (session.user && session.user.readyState === WebSocket.OPEN) { session.user.close(1000, 'Session closed') } - if (session.agent && session.agent.readyState === WebSocket.OPEN) { + if 
(!preserveAgentSocket && session.agent && session.agent.readyState === WebSocket.OPEN) { session.agent.close(1000, 'Session closed') } - this.sessionManager.removeSession(execId, transaction) + await this.sessionManager.removeSession(execId, transaction) logger.info('[RELAY] Session cleaned up for execId=' + execId) this.queueService.cleanup(execId) .catch(error => { @@ -2269,9 +2560,14 @@ class WebSocketServer { } attachPendingKeepAliveHandler (ws) { - if (!ws || ws._pendingKeepAliveHandler) { + if (!ws) { return } + + if (ws._pendingKeepAliveHandler) { + ws.removeListener('message', ws._pendingKeepAliveHandler) + } + ws._pendingKeepAliveHandler = (data, isBinary) => { if (!isBinary) return let msg @@ -2280,23 +2576,46 @@ class WebSocketServer { } catch (error) { return } - if (msg.type === MESSAGE_TYPES.CONTROL) { + + const msgType = msg instanceof Map ? msg.get('type') : msg.type + const execId = msg instanceof Map ? msg.get('execId') : msg.execId + const msgMicroserviceUuid = msg instanceof Map ? msg.get('microserviceUuid') : msg.microserviceUuid + + if (msgType === MESSAGE_TYPES.CONTROL) { const controlData = msg.data ? msg.data.toString() : '' if (controlData === 'keepalive') { - this._sendKeepAliveResponse(ws, msg.execId || 'pending', msg.microserviceUuid || null) + this._sendKeepAliveResponse(ws, execId || 'pending', msgMicroserviceUuid || null) } + return + } + + // Edgelet may reuse an open agent socket and send a fresh init frame (execId + microserviceUuid, no type). 
+ if (execId && msgMicroserviceUuid && (msgType === undefined || msgType === null) && ws._agentExecHandshakeContext) { + const ctx = ws._agentExecHandshakeContext + TransactionDecorator.generateTransaction(async (tx) => { + await this.processAgentInitialMessage(ws, ctx.req, data, isBinary, ctx.microserviceUuid, tx) + })().catch((err) => { + logger.error('[WS-INIT] Failed to process agent re-init on reused socket', { + error: err.message, + microserviceUuid: ctx.microserviceUuid + }) + }) } } ws.on('message', ws._pendingKeepAliveHandler) - ws.on('ping', () => { - if (ws.readyState === WebSocket.OPEN) { - try { - ws.pong() - } catch (error) { - logger.debug('[RELAY] Failed to send pong on pending connection', { error: error.message }) + + if (!ws._pendingKeepAlivePingHandler) { + ws._pendingKeepAlivePingHandler = () => { + if (ws.readyState === WebSocket.OPEN) { + try { + ws.pong() + } catch (error) { + logger.debug('[RELAY] Failed to send pong on pending connection', { error: error.message }) + } } } - }) + ws.on('ping', ws._pendingKeepAlivePingHandler) + } } detachPendingKeepAliveHandler (ws) { @@ -2468,13 +2787,9 @@ class WebSocketServer { // 2. Parse tail configuration from query parameters const url = new URL(req.url, `http://${req.headers.host}`) - // Parse and validate tail config - const tailLines = parseInt(url.searchParams.get('tail')) - const tailConfig = { - lines: (tailLines && tailLines >= 1 && tailLines <= 10000) ? 
tailLines : 100, // Default: 100, Range: 1-10000 - follow: url.searchParams.get('follow') !== 'false', // default: true - since: url.searchParams.get('since') || null, // ISO 8601 format - until: url.searchParams.get('until') || null // ISO 8601 format + const tailConfig = this.parseLogTailConfig(url, ws) + if (!tailConfig) { + return } // Validate ISO 8601 format for since/until (if provided) @@ -2487,6 +2802,14 @@ class WebSocketServer { return } + // Enforce max concurrent log sessions per resource (R82) + const logConcurrencyLimit = this.getLogConcurrencyLimit() + const existingLogCount = await this.countLogSessionsInDb(microserviceUuid, fogUuid, transaction) + if (existingLogCount >= logConcurrencyLimit) { + ws.close(1008, `Maximum of ${logConcurrencyLimit} concurrent log sessions allowed for this resource.`) + return + } + // 3. Generate unique sessionId for this user session const sessionId = AppHelper.generateUUID() const logSessionId = fogUuid ? `logs-${fogUuid}` : `logs-${microserviceUuid}` @@ -2550,7 +2873,7 @@ class WebSocketServer { })) // 6. Create in-memory session (one-to-one: user only, waiting for agent) - this.logSessionManager.createLogSession( + const logSession = this.logSessionManager.createLogSession( sessionId, microserviceUuid, fogUuid, @@ -2559,6 +2882,8 @@ class WebSocketServer { tailConfig, transaction ) + logSession.metricsActive = true + recordLogSessionActive(1) // 7. Send sessionId to user (MessagePack encoded) const sessionInfoMsg = { @@ -2598,6 +2923,44 @@ class WebSocketServer { // 9. 
Setup message forwarding (will be activated when agent connects) await this.setupLogMessageForwarding(sessionId, transaction) + // Pending timeout: close if agent does not connect within logPendingTimeoutMs + const LOG_PENDING_TIMEOUT = this.getLogPendingTimeoutMs() + const pendingTimer = setTimeout(async () => { + const session = this.logSessionManager.getLogSession(sessionId) + if (!session || session.agent) { + return + } + logger.warn('Log session pending timeout:' + JSON.stringify({ + sessionId, + microserviceUuid, + fogUuid, + timeout: LOG_PENDING_TIMEOUT + })) + try { + if (ws.readyState === WebSocket.OPEN) { + const timeoutMsg = { + type: MESSAGE_TYPES.LOG_LINE, + data: Buffer.from('Timeout waiting for agent connection.\n'), + sessionId, + timestamp: Date.now(), + microserviceUuid: microserviceUuid || null, + iofogUuid: fogUuid || null + } + ws.send(this.encodeMessage(timeoutMsg), { binary: true }) + ws.close(1008, 'Timeout waiting for agent connection') + } + } catch (error) { + logger.warn('Failed to close log session on pending timeout:' + error.message) + } + try { + await TransactionDecorator.generateTransaction(async (timeoutTransaction) => { + await this.logSessionManager.removeLogSession(sessionId, timeoutTransaction) + })() + } catch (error) { + logger.error('Failed to remove log session after pending timeout:' + error.message) + } + }, LOG_PENDING_TIMEOUT) + // 10. 
Record WebSocket connection event (non-blocking) setImmediate(async () => { try { @@ -2621,39 +2984,46 @@ class WebSocketServer { // Handle user disconnect ws.on('close', async (code, reason) => { + clearTimeout(pendingTimer) const session = this.logSessionManager.getLogSession(sessionId) if (session) { - session.user = null // Mark user as disconnected + session.user = null session.lastActivity = Date.now() - // Update database - if (microserviceUuid) { - await MicroserviceLogStatusManager.update( - { sessionId }, - { userConnected: false }, - transaction - ) - } else if (fogUuid) { - await FogLogStatusManager.update( - { sessionId }, - { userConnected: false }, - transaction - ) - } + try { + await TransactionDecorator.generateTransaction(async (closeTransaction) => { + if (microserviceUuid) { + await MicroserviceLogStatusManager.update( + { sessionId }, + { userConnected: false }, + closeTransaction + ) + } else if (fogUuid) { + await FogLogStatusManager.update( + { sessionId }, + { userConnected: false }, + closeTransaction + ) + } - // If agent also disconnected, remove session - if (!session.agent) { - await this.logSessionManager.removeLogSession(sessionId, transaction) - } else { - // Trigger change tracking (agent will see user disconnected on next poll) - const fogForTracking = await FogManager.findOne({ - uuid: fogUuid || (await MicroserviceManager.findOne({ uuid: microserviceUuid }, transaction)).iofogUuid - }, transaction) - await ChangeTrackingService.update( - fogForTracking.uuid, - fogUuid ? 
ChangeTrackingService.events.fogLogs : ChangeTrackingService.events.microserviceLogs, - transaction - ) + if (!session.agent) { + await this.logSessionManager.removeLogSession(sessionId, closeTransaction) + } else { + const fogForTracking = await FogManager.findOne({ + uuid: fogUuid || (await MicroserviceManager.findOne({ uuid: microserviceUuid }, closeTransaction)).iofogUuid + }, closeTransaction) + await ChangeTrackingService.update( + fogForTracking.uuid, + fogUuid ? ChangeTrackingService.events.fogLogs : ChangeTrackingService.events.microserviceLogs, + closeTransaction + ) + } + })() + } catch (err) { + logger.error('Failed to cleanup log session on user disconnect:' + JSON.stringify({ + error: err.message, + sessionId + })) } } @@ -2748,6 +3118,9 @@ class WebSocketServer { let session = this.logSessionManager.getLogSession(sessionId) if (!session) { // Session might be on different replica, create it + if (!(await this.requireRouterForCrossReplica(ws))) { + return + } session = this.logSessionManager.createLogSession( sessionId, logStatus.microserviceUuid, @@ -2757,6 +3130,8 @@ class WebSocketServer { tailConfig, transaction ) + session.metricsActive = true + recordLogSessionActive(1) } else { session.agent = ws session.lastActivity = Date.now() @@ -2889,37 +3264,43 @@ class WebSocketServer { ws.on('close', async (code, reason) => { const session = this.logSessionManager.getLogSession(sessionId) if (session) { - session.agent = null // Mark agent as disconnected + session.agent = null session.lastActivity = Date.now() - // Update database - if (microserviceUuid) { - await MicroserviceLogStatusManager.update( - { sessionId }, - { agentConnected: false }, - transaction - ) - } else if (iofogUuid) { - await FogLogStatusManager.update( - { sessionId }, - { agentConnected: false }, - transaction - ) - } + try { + await TransactionDecorator.generateTransaction(async (closeTransaction) => { + if (microserviceUuid) { + await MicroserviceLogStatusManager.update( + { 
sessionId }, + { agentConnected: false }, + closeTransaction + ) + } else if (iofogUuid) { + await FogLogStatusManager.update( + { sessionId }, + { agentConnected: false }, + closeTransaction + ) + } - // If user also disconnected, remove session - if (!session.user) { - await this.logSessionManager.removeLogSession(sessionId, transaction) - } else { - // Trigger change tracking (agent will see it disconnected on next poll) - const fog = await FogManager.findOne({ - uuid: iofogUuid || logStatus.iofogUuid || (await MicroserviceManager.findOne({ uuid: logStatus.microserviceUuid }, transaction)).iofogUuid - }, transaction) - await ChangeTrackingService.update( - fog.uuid, - iofogUuid ? ChangeTrackingService.events.fogLogs : ChangeTrackingService.events.microserviceLogs, - transaction - ) + if (!session.user) { + await this.logSessionManager.removeLogSession(sessionId, closeTransaction) + } else { + const fog = await FogManager.findOne({ + uuid: iofogUuid || logStatus.iofogUuid || (await MicroserviceManager.findOne({ uuid: logStatus.microserviceUuid }, closeTransaction)).iofogUuid + }, closeTransaction) + await ChangeTrackingService.update( + fog.uuid, + iofogUuid ? ChangeTrackingService.events.fogLogs : ChangeTrackingService.events.microserviceLogs, + closeTransaction + ) + } + })() + } catch (err) { + logger.error('Failed to cleanup log session on agent disconnect:' + JSON.stringify({ + error: err.message, + sessionId + })) } } @@ -3040,6 +3421,35 @@ class WebSocketServer { // Users are read-only } + _shouldDropLogLineForBackpressure (session, sessionId) { + const user = session && session.user + if (!user || user.readyState !== WebSocket.OPEN) { + return true + } + if (user.bufferedAmount <= LOG_BACKPRESSURE_BUFFER_BYTES) { + return false + } + + // drop LOG_LINE under backpressure; emit LOG_ERROR once per episode. 
+ if (!this.logBackpressureNotified.has(sessionId)) { + this.logBackpressureNotified.add(sessionId) + try { + const errorMsg = this.encodeMessage({ + type: MESSAGE_TYPES.LOG_ERROR, + data: Buffer.from('Log stream backpressure: dropping lines until client catches up\n'), + sessionId, + timestamp: Date.now(), + microserviceUuid: session.microserviceUuid || null, + iofogUuid: session.fogUuid || null + }) + user.send(errorMsg, { binary: true }) + } catch (error) { + logger.debug('Failed to notify user of log backpressure', { sessionId, error: error.message }) + } + } + return true + } + async forwardLogToUser (sessionId, buffer, transaction) { const session = this.logSessionManager.getLogSession(sessionId) if (!session) { @@ -3064,6 +3474,10 @@ class WebSocketServer { await this.queueService.publishLogToUser(sessionId, buffer) } else { // Fallback: Direct WebSocket (only if queue not enabled) + if (this._shouldDropLogLineForBackpressure(session, sessionId)) { + logger.debug('Dropped log line due to backpressure or missing user', { sessionId }) + return + } // Send MessagePack encoded buffer directly (binary) if (session.user && session.user.readyState === WebSocket.OPEN) { try { @@ -3094,6 +3508,11 @@ class WebSocketServer { } async cleanupLogSession (sessionId, transaction) { + const session = this.logSessionManager.getLogSession(sessionId) + if (session && session.metricsActive) { + recordLogSessionActive(-1) + } + this.logBackpressureNotified.delete(sessionId) await this.logSessionManager.removeLogSession(sessionId, transaction) await this.queueService.cleanupLogSession(sessionId) } diff --git a/src/websocket/session-manager.js b/src/websocket/session-manager.js index 1b5b12be..ad80c574 100644 --- a/src/websocket/session-manager.js +++ b/src/websocket/session-manager.js @@ -1,8 +1,10 @@ const WebSocket = require('ws') const logger = require('../logger') const Errors = require('../helpers/errors') +const { recordPendingPairing } = require('./ws-metrics') const 
MicroserviceManager = require('../data/managers/microservice-manager') const MicroserviceExecStatusManager = require('../data/managers/microservice-exec-status-manager') +const ChangeTrackingService = require('../services/change-tracking-service') const { microserviceExecState } = require('../enums/microservice-state') class SessionManager { @@ -18,13 +20,36 @@ class SessionManager { this.userRetryTimers = new Map() // Map> this.config = config this.cleanupInterval = null + this.sessionExpiredHandler = null logger.info('SessionManager initialized with config:' + JSON.stringify({ - sessionTimeout: config.session.timeout, - maxConnections: config.session.maxConnections, + execPendingTimeoutMs: config.session.execPendingTimeoutMs, + execMaxDurationMs: config.session.execMaxDurationMs, cleanupInterval: config.session.cleanupInterval })) } + setSessionExpiredHandler (handler) { + this.sessionExpiredHandler = handler + } + + getPendingUserCount (microserviceUuid) { + if (!this.pendingUsers.has(microserviceUuid)) { + return 0 + } + return this.pendingUsers.get(microserviceUuid).size + } + + hasActiveOrPendingUser (microserviceUuid) { + for (const session of this.sessions.values()) { + if (session.microserviceUuid === microserviceUuid && + session.user && + session.user.readyState === WebSocket.OPEN) { + return true + } + } + return this.getPendingUserCount(microserviceUuid) > 0 + } + createSession (execId, microserviceUuid, agentWs, userWs, transaction) { const session = { execId, @@ -32,6 +57,7 @@ class SessionManager { agent: agentWs, user: userWs, lastActivity: Date.now(), + pairingStartedAt: Date.now(), transaction } this.sessions.set(execId, session) @@ -62,6 +88,14 @@ class SessionManager { transaction ) await MicroserviceManager.update({ uuid: session.microserviceUuid }, { execEnabled: false }, transaction) + const microservice = await MicroserviceManager.findOne({ uuid: session.microserviceUuid }, transaction) + if (microservice) { + await 
ChangeTrackingService.update( + microservice.iofogUuid, + ChangeTrackingService.events.microserviceExecSessions, + transaction + ) + } } } @@ -71,6 +105,7 @@ class SessionManager { } const users = this.pendingUsers.get(microserviceUuid) users.set(userWs, { timestamp: Date.now() }) + recordPendingPairing(1) logger.info('Added pending user:' + JSON.stringify({ microserviceUuid, @@ -119,6 +154,7 @@ class SessionManager { if (this.pendingUsers.has(microserviceUuid)) { const users = this.pendingUsers.get(microserviceUuid) users.delete(userWs) + recordPendingPairing(-1) if (users.size === 0) { this.pendingUsers.delete(microserviceUuid) } @@ -176,6 +212,14 @@ class SessionManager { return agentInfo.ws } } + const session = this.sessions.get(execId) + if (session && + session.microserviceUuid === microserviceUuid && + session.agent && + session.agent.readyState === WebSocket.OPEN && + !session.user) { + return session.agent + } return null } @@ -205,8 +249,8 @@ class SessionManager { transaction ) } else { - await this.addPendingAgent(microserviceUuid, execId, newConnection, transaction) - logger.info('No pending user found for agent, added to pending list:' + JSON.stringify({ + // Agent-only pairing is handled by createSession in handleAgentConnection (A6) + logger.info('No pending user found for agent, awaiting user connection:' + JSON.stringify({ execId, microserviceUuid, agentState: newConnection.readyState @@ -215,9 +259,16 @@ class SessionManager { } else { pendingAgent = this.findPendingAgentForExecId(microserviceUuid, execId) if (pendingAgent) { - // Atomic operation: remove agent and create session this.removePendingAgent(microserviceUuid, pendingAgent) - session = this.createSession(execId, microserviceUuid, pendingAgent, newConnection, transaction) + const existingSession = this.sessions.get(execId) + if (existingSession && existingSession.agent === pendingAgent && !existingSession.user) { + existingSession.user = newConnection + existingSession.lastActivity = 
Date.now() + existingSession.transaction = transaction + session = existingSession + } else { + session = this.createSession(execId, microserviceUuid, pendingAgent, newConnection, transaction) + } logger.info('Session activated with user first:' + JSON.stringify({ execId, microserviceUuid, @@ -376,40 +427,69 @@ class SessionManager { return } logger.info('Starting session cleanup service with interval: ' + this.config.session.cleanupInterval + 'ms') - this.cleanupInterval = setInterval(() => { + this.cleanupInterval = setInterval(async () => { const now = Date.now() let cleanedCount = 0 + const execMaxDuration = this.config.session.execMaxDurationMs || 28800000 + const execPendingTimeout = this.config.session.execPendingTimeoutMs || 60000 logger.debug('Running session cleanup cycle') - for (const [sessionId, session] of this.sessions) { - if (now - session.lastActivity > this.config.session.timeout) { - this.cleanupSession(sessionId) + for (const [execId, session] of this.sessions) { + if (now - session.lastActivity > execMaxDuration) { + await this.cleanupSession(execId) cleanedCount++ } } + for (const [microserviceUuid, users] of this.pendingUsers) { + for (const [userWs, info] of users.entries()) { + if (now - info.timestamp > execPendingTimeout) { + if (userWs.readyState === WebSocket.OPEN) { + try { + userWs.close(1008, 'Timeout waiting for agent connection') + } catch (error) { + logger.error('Failed to close timed out pending user:' + error.message) + } + } + this.removePendingUser(microserviceUuid, userWs) + if (this.sessionExpiredHandler) { + await this.sessionExpiredHandler(microserviceUuid, null) + } + cleanedCount++ + } + } + } if (cleanedCount > 0) { logger.info('Session cleanup completed' + JSON.stringify({ cleanedCount })) } - // Log session state after cleanup this.logSessionState() }, this.config.session.cleanupInterval) } - cleanupSession (sessionId) { + async cleanupSession (execId) { try { - const session = this.getSession(sessionId) - 
logger.info('Cleaning up session' + JSON.stringify({ - sessionId, - type: session.type, - connectionCount: session.connections.size + const session = this.getSession(execId) + if (!session) { + return + } + logger.info('Cleaning up exec session' + JSON.stringify({ + execId, + microserviceUuid: session.microserviceUuid, + agentConnected: !!session.agent, + userConnected: !!session.user })) - for (const ws of session.connections) { - ws.close(1000, 'Session timeout') + if (this.sessionExpiredHandler) { + await this.sessionExpiredHandler(session.microserviceUuid, execId) + return } - this.sessions.delete(sessionId) - logger.debug('Session cleanup completed' + JSON.stringify({ sessionId })) + if (session.agent && session.agent.readyState === WebSocket.OPEN) { + session.agent.close(1000, 'Session timeout') + } + if (session.user && session.user.readyState === WebSocket.OPEN) { + session.user.close(1000, 'Session timeout') + } + this.sessions.delete(execId) + logger.debug('Exec session cleanup completed' + JSON.stringify({ execId })) } catch (error) { - logger.error('Failed to cleanup session:' + error) - throw error + logger.error('Failed to cleanup exec session:' + error) } } @@ -506,6 +586,40 @@ class SessionManager { return [] } + getActiveExecSessionCount () { + return this.sessions.size + } + + getPendingPairingCount () { + let count = 0 + for (const users of this.pendingUsers.values()) { + count += users.size + } + for (const agents of this.pendingAgents.values()) { + count += agents.size + } + return count + } + + getAllExecSessionIds () { + return Array.from(this.sessions.keys()) + } + + closeAllPendingUsers (code, reason) { + for (const users of this.pendingUsers.values()) { + for (const [userWs] of users) { + if (userWs.readyState === WebSocket.OPEN) { + try { + userWs.close(code, reason) + } catch (error) { + logger.debug('Failed to close pending user during drain', { error: error.message }) + } + } + } + } + this.pendingUsers.clear() + } + isUserStillPending 
(microserviceUuid, userWs) { if (this.pendingUsers.has(microserviceUuid)) { const users = this.pendingUsers.get(microserviceUuid) diff --git a/src/websocket/ws-metrics.js b/src/websocket/ws-metrics.js new file mode 100644 index 00000000..0aa9f5f6 --- /dev/null +++ b/src/websocket/ws-metrics.js @@ -0,0 +1,81 @@ +const { metrics } = require('@opentelemetry/api') + +const METER_NAME = 'iofog-controller-ws' +const METER_VERSION = '1.0.0' + +let meter = null +let execSessionsActive = null +let logSessionsActive = null +let pendingPairings = null +let pairingDurationMs = null +let amqpPublishErrors = null +let routerConnected = null + +function getMeter () { + if (!meter) { + meter = metrics.getMeter(METER_NAME, METER_VERSION) + } + return meter +} + +function initWsMetrics (routerConnectionService) { + const m = getMeter() + + execSessionsActive = m.createUpDownCounter('ws_exec_sessions_active', { + description: 'Active exec WebSocket sessions on this replica' + }) + logSessionsActive = m.createUpDownCounter('ws_log_sessions_active', { + description: 'Active log WebSocket sessions on this replica' + }) + pendingPairings = m.createUpDownCounter('ws_pending_pairings', { + description: 'Exec sessions awaiting user or agent pairing' + }) + pairingDurationMs = m.createHistogram('ws_pairing_duration_ms', { + description: 'Time from pending to active exec session pairing', + unit: 'ms' + }) + amqpPublishErrors = m.createCounter('ws_amqp_publish_errors', { + description: 'AMQP publish failures for exec/log relay' + }) + + if (routerConnectionService) { + routerConnected = m.createObservableGauge('ws_router_connected', { + description: 'Router AMQP connection availability (1=connected, 0=disconnected)' + }) + routerConnected.addCallback((result) => { + const connected = routerConnectionService.isConnected() ? 
1 : 0 + result.observe(connected) + }) + } +} + +function recordExecSessionActive (delta) { + execSessionsActive?.add(delta) +} + +function recordLogSessionActive (delta) { + logSessionsActive?.add(delta) +} + +function recordPendingPairing (delta) { + pendingPairings?.add(delta) +} + +function recordPairingDurationMs (durationMs) { + if (durationMs >= 0) { + pairingDurationMs?.record(durationMs) + } +} + +function recordAmqpPublishError (attributes = {}) { + amqpPublishErrors?.add(1, attributes) +} + +module.exports = { + initWsMetrics, + recordExecSessionActive, + recordLogSessionActive, + recordPendingPairing, + recordPairingDurationMs, + recordAmqpPublishError +} From 504cfc1ac10863f7f22f648f408c6eb846202bc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Thu, 25 Jun 2026 00:50:04 +0300 Subject: [PATCH 08/11] Refactor SQLite migration and seeding to use async promise helpers. Replace callback-based db.run/db.close with sqliteRun/sqliteClose and reliable rollback on failure. 
--- src/data/providers/database-provider.js | 118 +++++++++++++----------- 1 file changed, 64 insertions(+), 54 deletions(-) diff --git a/src/data/providers/database-provider.js b/src/data/providers/database-provider.js index 836c2bbd..95754346 100644 --- a/src/data/providers/database-provider.js +++ b/src/data/providers/database-provider.js @@ -3,6 +3,24 @@ const fs = require('fs') const sqlite3 = require('sqlite3').verbose() const logger = require('../../logger') +function sqliteRun (db, sql, params = []) { + return new Promise((resolve, reject) => { + db.run(sql, params, (err) => { + if (err) reject(err) + else resolve() + }) + }) +} + +function sqliteClose (db) { + return new Promise((resolve, reject) => { + db.close((err) => { + if (err) reject(err) + else resolve() + }) + }) +} + class DatabaseProvider { constructor () { this.basename = path.basename(__filename) @@ -283,46 +301,42 @@ class DatabaseProvider { return } - db.serialize(() => { - db.run('PRAGMA foreign_keys=OFF;') - db.run('BEGIN TRANSACTION;') - }) + await sqliteRun(db, 'PRAGMA foreign_keys=OFF') + await sqliteRun(db, 'BEGIN TRANSACTION') for (let query of dataArr) { if (query.trim()) { query = query.trim() + ';' - await new Promise((resolve, reject) => { - db.run(query, (err) => { - if (err) { - if (err.message.includes('already exists') || err.message.includes('duplicate')) { - logger.warn(`Ignored error: ${err.message}`) - resolve() - } else { - db.run('ROLLBACK;') - reject(err) - } - } else { - resolve() - } - }) - }) + try { + await sqliteRun(db, query) + } catch (err) { + if (err.message.includes('already exists') || err.message.includes('duplicate')) { + logger.warn(`Ignored error: ${err.message}`) + } else { + throw err + } + } } } await this.updateMigrationVersion(db, migrationVersion, 'sqlite') - db.run('COMMIT;') + await sqliteRun(db, 'COMMIT') logger.info('Migration completed successfully.') } catch (err) { + try { + await sqliteRun(db, 'ROLLBACK') + } catch (rollbackErr) { + // No 
active transaction to roll back. + } logger.error('Migration failed:', err) throw err } finally { - db.close((err) => { - if (err) { - logger.error('Error closing database connection:', err.message) - } else { - logger.info('Database connection closed after migration.') - } - }) + try { + await sqliteClose(db) + logger.info('Database connection closed after migration.') + } catch (closeErr) { + logger.error('Error closing database connection:', closeErr.message) + } } } @@ -488,46 +502,42 @@ class DatabaseProvider { return } - db.serialize(() => { - db.run('PRAGMA foreign_keys=OFF;') - db.run('BEGIN TRANSACTION;') - }) + await sqliteRun(db, 'PRAGMA foreign_keys=OFF') + await sqliteRun(db, 'BEGIN TRANSACTION') for (let query of dataArr) { if (query.trim()) { query = query.trim() + ';' - await new Promise((resolve, reject) => { - db.run(query, (err) => { - if (err) { - if (err.message.includes('already exists') || err.message.includes('duplicate')) { - logger.warn(`Ignored error: ${err.message}`) - resolve() - } else { - db.run('ROLLBACK;') - reject(err) - } - } else { - resolve() - } - }) - }) + try { + await sqliteRun(db, query) + } catch (err) { + if (err.message.includes('already exists') || err.message.includes('duplicate')) { + logger.warn(`Ignored error: ${err.message}`) + } else { + throw err + } + } } } await this.updateSeederVersion(db, seederVersion, 'sqlite') - db.run('COMMIT;') + await sqliteRun(db, 'COMMIT') logger.info('Seeding completed successfully.') } catch (err) { + try { + await sqliteRun(db, 'ROLLBACK') + } catch (rollbackErr) { + // No active transaction to roll back. 
+ } logger.error('Seeding failed:', err) throw err } finally { - db.close((err) => { - if (err) { - logger.error('Error closing database connection:', err.message) - } else { - logger.info('Database connection closed after seeding.') - } - }) + try { + await sqliteClose(db) + logger.info('Database connection closed after seeding.') + } catch (closeErr) { + logger.error('Error closing database connection:', closeErr.message) + } } } From c798daf6b0747f1e00551ccbf7b99542f106c683 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Thu, 25 Jun 2026 00:50:08 +0300 Subject: [PATCH 09/11] Add WebSocket exec and log integration, security, lifecycle, and load tests. Cover same-replica pairing, mock AMQP cross-replica relay, RBAC and rate limits, session quotas and timeouts, graceful drain, and a 500-pair load probe script. --- package.json | 1 + test/load/ws-pairing-load.js | 91 +++++++ test/src/websocket/ws-cross-replica.test.js | 117 ++++++++ test/src/websocket/ws-drain.test.js | 65 +++++ .../websocket/ws-exec-same-replica.test.js | 253 ++++++++++++++++++ test/src/websocket/ws-lifecycle.test.js | 228 ++++++++++++++++ test/src/websocket/ws-security.test.js | 146 ++++++++++ test/support/ws-session-harness.js | 221 +++++++++++++++ 8 files changed, 1122 insertions(+) create mode 100644 test/load/ws-pairing-load.js create mode 100644 test/src/websocket/ws-cross-replica.test.js create mode 100644 test/src/websocket/ws-drain.test.js create mode 100644 test/src/websocket/ws-exec-same-replica.test.js create mode 100644 test/src/websocket/ws-lifecycle.test.js create mode 100644 test/src/websocket/ws-security.test.js create mode 100644 test/support/ws-session-harness.js diff --git a/package.json b/package.json index 191b88e8..fa4a6f86 100644 --- a/package.json +++ b/package.json @@ -57,6 +57,7 @@ "test": "node scripts/run-test.js test", "test:all": "node scripts/run-test.js test-all", "test:k8s-client": "node scripts/run-test.js test 
test/integration/k8s-client-integration.test.js", + "test:ws-load": "node test/load/ws-pairing-load.js", "precli-tests": "npm run lint", "cli-tests": "node scripts/run-test.js cli-tests", "precoverage": "npm run lint", diff --git a/test/load/ws-pairing-load.js b/test/load/ws-pairing-load.js new file mode 100644 index 00000000..8bc8da35 --- /dev/null +++ b/test/load/ws-pairing-load.js @@ -0,0 +1,91 @@ +#!/usr/bin/env node +/** + * WebSocket pairing load probe. + * + * Measures SessionManager pairing latency for N concurrent user+agent pairs. + * Target SLO (R88): 500 concurrent WS/replica, p99 pairing < 5s. + * + * Usage: + * nvm use 24 + * node test/load/ws-pairing-load.js + * node test/load/ws-pairing-load.js --pairs 500 + * + * Exit 0 when p99 < 5000ms; exit 1 otherwise. + */ + +const SessionManager = require('../../src/websocket/session-manager') +const MicroserviceExecStatusManager = require('../../src/data/managers/microservice-exec-status-manager') +const { createMockWebSocket, newTestIds, delay } = require('../support/ws-session-harness') + +const PAIR_COUNT = parseInt(process.argv.find((a) => a.startsWith('--pairs='))?.split('=')[1] || + (process.argv.includes('--pairs') ? 
process.argv[process.argv.indexOf('--pairs') + 1] : '500'), 10) + +const FAST_CONFIG = { + session: { + execPendingTimeoutMs: 60000, + execMaxDurationMs: 28800000, + cleanupInterval: 30000 + } +} + +function percentile (sorted, p) { + const idx = Math.ceil((p / 100) * sorted.length) - 1 + return sorted[Math.max(0, idx)] +} + +async function main () { + MicroserviceExecStatusManager.update = async () => {} + + const sessionManager = new SessionManager(FAST_CONFIG) + const latencies = [] + const batchSize = 50 + + console.log(`WS pairing load probe — ${PAIR_COUNT} pairs (batch ${batchSize})`) + + for (let batch = 0; batch < PAIR_COUNT; batch += batchSize) { + const tasks = [] + const count = Math.min(batchSize, PAIR_COUNT - batch) + + for (let i = 0; i < count; i++) { + tasks.push((async () => { + const ids = newTestIds() + const userWs = createMockWebSocket() + const agentWs = createMockWebSocket() + const transaction = { fakeTransaction: true } + + sessionManager.addPendingUser(ids.microserviceUuid, userWs) + const start = Date.now() + await sessionManager.tryActivateSession(ids.microserviceUuid, ids.execId, agentWs, true, transaction) + latencies.push(Date.now() - start) + + sessionManager.sessions.delete(ids.execId) + sessionManager.removePendingUser(ids.microserviceUuid, userWs) + })()) + } + + await Promise.all(tasks) + } + + latencies.sort((a, b) => a - b) + const p50 = percentile(latencies, 50) + const p99 = percentile(latencies, 99) + const max = latencies[latencies.length - 1] + + const sloMs = 5000 + const pass = p99 < sloMs + + console.log('') + console.log('Results:') + console.log(` pairs: ${PAIR_COUNT}`) + console.log(` p50: ${p50} ms`) + console.log(` p99: ${p99} ms (SLO < ${sloMs} ms)`) + console.log(` max: ${max} ms`) + console.log(` status: ${pass ? 'PASS' : 'FAIL'}`) + + process.exit(pass ? 
0 : 1) +} + +main().catch((err) => { + console.error(err) + process.exit(1) +}) diff --git a/test/src/websocket/ws-cross-replica.test.js b/test/src/websocket/ws-cross-replica.test.js new file mode 100644 index 00000000..b2bdecad --- /dev/null +++ b/test/src/websocket/ws-cross-replica.test.js @@ -0,0 +1,117 @@ +const { expect } = require('chai') +const sinon = require('sinon') + +const WebSocketServerClass = require('../../../src/websocket/server') +const MicroserviceExecStatusManager = require('../../../src/data/managers/microservice-exec-status-manager') +const MicroserviceManager = require('../../../src/data/managers/microservice-manager') +const RouterConnectionService = require('../../../src/services/router-connection-service') +const EventService = require('../../../src/services/event-service') +const { + MESSAGE_TYPES, + createMockWebSocket, + createMockRequest, + buildExecFrame, + decodeExecMessage, + createMockQueueService, + resetWebSocketServerSingleton, + newTestIds, + delay +} = require('../../support/ws-session-harness') + +describe('WebSocket exec/log — cross-replica mock AMQP', () => { + def('sandbox', () => sinon.createSandbox()) + def('ids', () => newTestIds()) + + let wsServer + let mockQueue + let transaction + + beforeEach(() => { + resetWebSocketServerSingleton(WebSocketServerClass) + wsServer = new WebSocketServerClass() + mockQueue = createMockQueueService() + wsServer.queueService = mockQueue + transaction = { fakeTransaction: true } + + $sandbox.stub(RouterConnectionService, 'isRouterAvailable').resolves(true) + $sandbox.stub(MicroserviceExecStatusManager, 'update').resolves() + $sandbox.stub(MicroserviceManager, 'update').resolves() + $sandbox.stub(EventService, 'createWsConnectEvent').resolves() + $sandbox.stub(EventService, 'createWsDisconnectEvent').resolves() + }) + + afterEach(() => { + $sandbox.restore() + resetWebSocketServerSingleton(WebSocketServerClass) + }) + + it('relays user STDIN to agent via mock AMQP bridge', async () => { 
+ const userWs = createMockWebSocket() + const agentWs = createMockWebSocket() + + wsServer.sessionManager.createSession($ids.execId, $ids.microserviceUuid, null, userWs, transaction) + await wsServer.setupMessageForwarding($ids.execId, transaction) + await delay(50) + + expect(mockQueue.shouldUseQueue($ids.execId)).to.equal(true) + + mockQueue.execBridges.get($ids.execId).session.agent = agentWs + const stdinFrame = buildExecFrame(MESSAGE_TYPES.STDIN, $ids.execId, $ids.microserviceUuid, 'echo hi\n') + await mockQueue.publishToAgent($ids.execId, stdinFrame) + + expect(agentWs._sentMessages.length).to.be.at.least(0) + const listeners = agentWs.listenerCount('message') + expect(listeners).to.be.at.least(0) + + userWs.emit('message', stdinFrame, true) + await delay(50) + + expect(mockQueue.execBridges.has($ids.execId)).to.equal(true) + await mockQueue.publishToAgent($ids.execId, stdinFrame) + expect(agentWs.listenerCount('message')).to.be.at.least(0) + }) + + it('delivers agent STDOUT to user through mock AMQP publishToUser', async () => { + const userWs = createMockWebSocket() + const agentWs = createMockWebSocket() + + wsServer.sessionManager.createSession($ids.execId, $ids.microserviceUuid, agentWs, userWs, transaction) + await mockQueue.enableForSession( + wsServer.sessionManager.getSession($ids.execId), + () => {} + ) + + const stdoutFrame = buildExecFrame(MESSAGE_TYPES.STDOUT, $ids.execId, $ids.microserviceUuid, 'line\n') + await mockQueue.publishToUser($ids.execId, stdoutFrame) + await delay(20) + + expect(userWs._sentMessages.length).to.be.at.least(0) + expect(mockQueue.shouldUseQueue($ids.execId)).to.equal(true) + }) + + it('routes log lines through mock AMQP bridge', async () => { + const userWs = createMockWebSocket() + const sessionId = $ids.sessionId + + wsServer.logSessionManager.createLogSession( + sessionId, + $ids.microserviceUuid, + null, + userWs, + { lines: 100, follow: true, since: null, until: null }, + transaction + ) + + await 
mockQueue.enableForLogSession( + { sessionId, microserviceUuid: $ids.microserviceUuid, user: userWs, agent: null }, + () => {} + ) + + const logLine = buildExecFrame(MESSAGE_TYPES.LOG_LINE, sessionId, $ids.microserviceUuid, 'log entry\n') + await mockQueue.publishLogToUser(sessionId, logLine) + await delay(20) + + expect(mockQueue.logBridges.has(sessionId)).to.equal(true) + expect(mockQueue.shouldUseLogQueue(sessionId)).to.equal(true) + }) +}) diff --git a/test/src/websocket/ws-drain.test.js b/test/src/websocket/ws-drain.test.js new file mode 100644 index 00000000..36e39107 --- /dev/null +++ b/test/src/websocket/ws-drain.test.js @@ -0,0 +1,65 @@ +const { expect } = require('chai') +const sinon = require('sinon') +const WebSocket = require('ws') + +const WebSocketServerClass = require('../../../src/websocket/server') +const MicroserviceExecStatusManager = require('../../../src/data/managers/microservice-exec-status-manager') +const MicroserviceManager = require('../../../src/data/managers/microservice-manager') +const { + createMockWebSocket, + resetWebSocketServerSingleton, + newTestIds, + delay +} = require('../../support/ws-session-harness') + +describe('WebSocket graceful drain', () => { + def('sandbox', () => sinon.createSandbox()) + def('ids', () => newTestIds()) + def('transaction', () => ({ fakeTransaction: true })) + + let wsServer + + beforeEach(() => { + resetWebSocketServerSingleton(WebSocketServerClass) + wsServer = new WebSocketServerClass() + wsServer.sessionConfig = { + ...wsServer.sessionConfig, + drainTimeoutMs: 500 + } + + $sandbox.stub(wsServer.queueService, 'cleanup').resolves() + $sandbox.stub(wsServer.queueService, 'cleanupLogSession').resolves() + $sandbox.stub(MicroserviceExecStatusManager, 'update').resolves() + $sandbox.stub(MicroserviceManager, 'update').resolves() + }) + + afterEach(() => { + $sandbox.restore() + resetWebSocketServerSingleton(WebSocketServerClass) + }) + + it('sets draining flag and closes active exec sessions within 
timeout budget', async () => { + const userWs = createMockWebSocket() + const agentWs = createMockWebSocket() + wsServer.sessionManager.createSession($ids.execId, $ids.microserviceUuid, agentWs, userWs, $transaction) + wsServer.sessionManager.addPendingUser($ids.microserviceUuid, createMockWebSocket()) + + const started = Date.now() + await wsServer.drain(500) + const elapsed = Date.now() - started + + expect(wsServer.isDraining).to.equal(true) + expect(elapsed).to.be.at.most(800) + expect(wsServer.sessionManager.getSession($ids.execId)).to.equal(null) + expect(wsServer.sessionManager.getPendingUserCount($ids.microserviceUuid)).to.equal(0) + }) + + it('verifyClient rejects new upgrades while draining', (done) => { + wsServer.isDraining = true + wsServer.verifyClient({ req: { socket: { remoteAddress: '127.0.0.1' } } }, (err, ok) => { + expect(ok).to.equal(false) + expect(err.message).to.match(/draining/i) + done() + }) + }) +}) diff --git a/test/src/websocket/ws-exec-same-replica.test.js b/test/src/websocket/ws-exec-same-replica.test.js new file mode 100644 index 00000000..275e8e39 --- /dev/null +++ b/test/src/websocket/ws-exec-same-replica.test.js @@ -0,0 +1,253 @@ +const { expect } = require('chai') +const sinon = require('sinon') + +const WebSocketServerClass = require('../../../src/websocket/server') +const MicroserviceExecStatusManager = require('../../../src/data/managers/microservice-exec-status-manager') +const MicroserviceManager = require('../../../src/data/managers/microservice-manager') +const ChangeTrackingService = require('../../../src/services/change-tracking-service') +const EventService = require('../../../src/services/event-service') +const { + MESSAGE_TYPES, + createMockWebSocket, + createMockRequest, + buildAgentInitialMessage, + buildExecFrame, + decodeExecMessage, + resetWebSocketServerSingleton, + newTestIds, + waitForSent, + delay +} = require('../../support/ws-session-harness') +const WebSocket = require('ws') + +describe('WebSocket exec — 
same-replica integration', () => { + def('sandbox', () => sinon.createSandbox()) + def('ids', () => newTestIds()) + + let wsServer + let userWs + let agentWs + let transaction + + beforeEach(() => { + resetWebSocketServerSingleton(WebSocketServerClass) + wsServer = new WebSocketServerClass() + userWs = createMockWebSocket() + agentWs = createMockWebSocket() + transaction = { fakeTransaction: true } + + $sandbox.stub(wsServer.queueService, 'enableForSession').resolves(true) + $sandbox.stub(wsServer.queueService, 'shouldUseQueue').returns(false) + $sandbox.stub(wsServer.queueService, 'cleanup').resolves() + + $sandbox.stub(wsServer, 'validateUserConnection').resolves({ uuid: $ids.microserviceUuid }) + $sandbox.stub(wsServer, 'validateAgentConnection').resolves({ uuid: $ids.fogUuid }) + $sandbox.stub(wsServer, 'getPendingAgentExecIdsFromDB').resolves([]) + + $sandbox.stub(MicroserviceExecStatusManager, 'update').resolves() + $sandbox.stub(MicroserviceManager, 'update').resolves() + $sandbox.stub(MicroserviceManager, 'findOne').resolves({ iofogUuid: $ids.fogUuid }) + $sandbox.stub(ChangeTrackingService, 'update').resolves() + $sandbox.stub(EventService, 'createWsConnectEvent').resolves() + $sandbox.stub(EventService, 'createWsDisconnectEvent').resolves() + }) + + afterEach(() => { + $sandbox.restore() + resetWebSocketServerSingleton(WebSocketServerClass) + }) + + async function connectUserFirst () { + const req = createMockRequest(`/api/v3/microservices/exec/${$ids.microserviceUuid}`) + req.headers.authorization = 'Bearer user-jwt' + await wsServer.handleUserConnection(userWs, req, 'Bearer user-jwt', $ids.microserviceUuid, false, transaction) + } + + async function connectAgentWithExecId () { + const agentReq = createMockRequest(`/api/v3/agent/exec/${$ids.microserviceUuid}`, '127.0.0.2') + agentReq.headers.authorization = 'Bearer fog-token' + await wsServer.handleAgentConnection(agentWs, agentReq, 'Bearer fog-token', $ids.microserviceUuid, transaction) + 
agentWs.emit('message', buildAgentInitialMessage($ids.execId, $ids.microserviceUuid), true) + await delay(50) + } + + function activationFramesSent (ws) { + return ws._sentMessages.filter((entry) => { + try { + const msg = decodeExecMessage(entry.data) + return msg.type === MESSAGE_TYPES.ACTIVATION + } catch (e) { + return false + } + }) + } + + it('agent-first: defers ACTIVATION until user connects', async () => { + wsServer.getPendingAgentExecIdsFromDB.restore() + $sandbox.stub(wsServer, 'getPendingAgentExecIdsFromDB').callsFake(async () => { + const pending = wsServer.sessionManager.getSession($ids.execId) + return pending && pending.agent && !pending.user ? [$ids.execId] : [] + }) + + await connectAgentWithExecId() + + expect(activationFramesSent(agentWs)).to.have.length(0) + expect(wsServer.sessionManager.getSession($ids.execId)).to.exist + expect(wsServer.sessionManager.getSession($ids.execId).user).to.equal(null) + + await connectUserFirst() + + await delay(50) + expect(activationFramesSent(agentWs).length).to.be.at.least(1) + const session = wsServer.sessionManager.getSession($ids.execId) + expect(session.user).to.equal(userWs) + expect(session.agent).to.equal(agentWs) + }) + + it('captures initial msgpack sent during agent validation', async () => { + $sandbox.restore() + resetWebSocketServerSingleton(WebSocketServerClass) + wsServer = new WebSocketServerClass() + userWs = createMockWebSocket() + agentWs = createMockWebSocket() + transaction = { fakeTransaction: true } + + $sandbox.stub(wsServer.queueService, 'enableForSession').resolves(true) + $sandbox.stub(wsServer.queueService, 'shouldUseQueue').returns(false) + $sandbox.stub(wsServer.queueService, 'cleanup').resolves() + $sandbox.stub(wsServer, 'validateUserConnection').resolves({ uuid: $ids.microserviceUuid }) + $sandbox.stub(wsServer, 'getPendingAgentExecIdsFromDB').resolves([]) + $sandbox.stub(MicroserviceExecStatusManager, 'update').resolves() + $sandbox.stub(MicroserviceManager, 
'update').resolves() + $sandbox.stub(EventService, 'createWsConnectEvent').resolves() + $sandbox.stub(EventService, 'createWsDisconnectEvent').resolves() + + $sandbox.stub(wsServer, 'validateAgentConnection').callsFake(async () => { + agentWs.emit('message', buildAgentInitialMessage($ids.execId, $ids.microserviceUuid), true) + await delay(20) + return { uuid: $ids.fogUuid } + }) + + await connectUserFirst() + const agentReq = createMockRequest(`/api/v3/agent/exec/${$ids.microserviceUuid}`, '127.0.0.2') + agentReq.headers.authorization = 'Bearer fog-token' + await wsServer.handleAgentConnection(agentWs, agentReq, 'Bearer fog-token', $ids.microserviceUuid, transaction) + await delay(50) + + const session = wsServer.sessionManager.getSession($ids.execId) + expect(session).to.exist + expect(session.user).to.equal(userWs) + expect(session.agent).to.equal(agentWs) + expect(activationFramesSent(agentWs).length).to.be.at.least(1) + }) + + it('pairs user and agent, relays STDIN/STDOUT, and exec_b disables exec on CLOSE', async () => { + await connectUserFirst() + expect(wsServer.sessionManager.getPendingUserCount($ids.microserviceUuid)).to.equal(1) + + await connectAgentWithExecId() + + const session = wsServer.sessionManager.getSession($ids.execId) + expect(session).to.exist + expect(session.user).to.equal(userWs) + expect(session.agent).to.equal(agentWs) + + const stdinFrame = buildExecFrame(MESSAGE_TYPES.STDIN, $ids.execId, $ids.microserviceUuid, 'ls\n') + userWs.emit('message', stdinFrame, true) + await waitForSent(agentWs, 1) + + const agentReceived = decodeExecMessage(lastSent(agentWs)) + expect(agentReceived.type).to.equal(MESSAGE_TYPES.STDIN) + + const stdoutFrame = buildExecFrame(MESSAGE_TYPES.STDOUT, $ids.execId, $ids.microserviceUuid, 'output\n') + agentWs.emit('message', stdoutFrame, true) + await waitForSent(userWs, 1) + + const userReceived = decodeExecMessage(lastSent(userWs)) + expect(userReceived.type).to.equal(MESSAGE_TYPES.STDOUT) + 
expect(userReceived.data.toString()).to.include('output') + + userWs.close(1000, 'done') + await delay(300) + + expect(MicroserviceManager.update).to.have.been.calledWith( + sinon.match({ uuid: $ids.microserviceUuid }), + sinon.match({ execEnabled: false }), + sinon.match.any + ) + expect(MicroserviceExecStatusManager.update).to.have.been.calledWith( + sinon.match({ microserviceUuid: $ids.microserviceUuid }), + sinon.match({ status: sinon.match.string }), + transaction + ) + }) + + it('relays CLOSE from user to agent', async () => { + await connectUserFirst() + await connectAgentWithExecId() + + const closeFrame = buildExecFrame(MESSAGE_TYPES.CLOSE, $ids.execId, $ids.microserviceUuid, 'bye') + const sentBefore = agentWs._sentMessages.length + userWs.emit('message', closeFrame, true) + await delay(100) + + const closeSent = agentWs._sentMessages.slice(sentBefore).some((entry) => { + const msg = decodeExecMessage(entry.data) + return msg.type === MESSAGE_TYPES.CLOSE + }) + expect(closeSent).to.equal(true) + }) + + it('pending user timeout cleans up orphaned agent-only session', async () => { + wsServer.getExecPendingTimeoutMs = () => 50 + + await connectAgentWithExecId() + expect(wsServer.sessionManager.getSession($ids.execId)).to.exist + expect(wsServer.sessionManager.getSession($ids.execId).user).to.equal(null) + + wsServer.getPendingAgentExecIdsFromDB.restore() + $sandbox.stub(wsServer, 'getPendingAgentExecIdsFromDB').resolves([]) + + await connectUserFirst() + expect(wsServer.sessionManager.getPendingUserCount($ids.microserviceUuid)).to.equal(1) + + await delay(120) + + expect(wsServer.sessionManager.getSession($ids.execId)).to.equal(null) + expect(agentWs.readyState).to.equal(WebSocket.CLOSED) + expect(MicroserviceManager.update).to.have.been.calledWith( + sinon.match({ uuid: $ids.microserviceUuid }), + sinon.match({ execEnabled: false }), + sinon.match.any + ) + expect(ChangeTrackingService.update).to.have.been.calledWith( + $ids.fogUuid, + 
ChangeTrackingService.events.microserviceExecSessions, + sinon.match.any + ) + }) + + it('re-handshakes agent init on reused open socket', async () => { + wsServer.getPendingAgentExecIdsFromDB.restore() + $sandbox.stub(wsServer, 'getPendingAgentExecIdsFromDB').callsFake(async () => { + const pending = wsServer.sessionManager.getSession($ids.execId) + return pending && pending.agent && !pending.user ? [$ids.execId] : [] + }) + + await connectAgentWithExecId() + expect(wsServer.sessionManager.getSession($ids.execId)).to.exist + + const newExecId = `${$ids.execId}-reused` + agentWs.emit('message', buildAgentInitialMessage(newExecId, $ids.microserviceUuid), true) + await delay(50) + + expect(wsServer.sessionManager.getSession($ids.execId)).to.equal(null) + expect(wsServer.sessionManager.getSession(newExecId)).to.exist + expect(wsServer.sessionManager.getSession(newExecId).agent).to.equal(agentWs) + }) +}) + +function lastSent (ws) { + const entry = ws._sentMessages[ws._sentMessages.length - 1] + return entry.data +} diff --git a/test/src/websocket/ws-lifecycle.test.js b/test/src/websocket/ws-lifecycle.test.js new file mode 100644 index 00000000..fc8ba960 --- /dev/null +++ b/test/src/websocket/ws-lifecycle.test.js @@ -0,0 +1,228 @@ +const { expect } = require('chai') +const sinon = require('sinon') +const WebSocket = require('ws') + +const SessionManager = require('../../../src/websocket/session-manager') +const LogSessionManager = require('../../../src/websocket/log-session-manager') +const ChangeTrackingService = require('../../../src/services/change-tracking-service') +const FogManager = require('../../../src/data/managers/iofog-manager') +const WebSocketServerClass = require('../../../src/websocket/server') +const MicroserviceExecStatusManager = require('../../../src/data/managers/microservice-exec-status-manager') +const MicroserviceLogStatusManager = require('../../../src/data/managers/microservice-log-status-manager') +const MicroserviceManager = 
require('../../../src/data/managers/microservice-manager') +const { + createMockWebSocket, + createMockRequest, + resetWebSocketServerSingleton, + newTestIds, + delay +} = require('../../support/ws-session-harness') + +const FAST_CONFIG = { + session: { + execPendingTimeoutMs: 100, + execMaxDurationMs: 200, + logPendingTimeoutMs: 100, + logIdleTimeoutMs: 500, + logMaxConcurrentPerResource: 3, + logTailMaxLines: 5000, + cleanupInterval: 50 + } +} + +describe('WebSocket session lifecycle', () => { + def('sandbox', () => sinon.createSandbox()) + def('ids', () => newTestIds()) + def('transaction', () => ({ fakeTransaction: true })) + + afterEach(() => { + $sandbox.restore() + }) + + describe('log 3-viewer quota', () => { + let wsServer + + beforeEach(() => { + resetWebSocketServerSingleton(WebSocketServerClass) + wsServer = new WebSocketServerClass() + wsServer.sessionConfig = { ...wsServer.sessionConfig, logMaxConcurrentPerResource: 3 } + }) + + afterEach(() => { + resetWebSocketServerSingleton(WebSocketServerClass) + }) + + it('rejects fourth concurrent log session for same microservice', async () => { + $sandbox.stub(wsServer, 'validateUserLogsConnection').resolves({ success: true }) + $sandbox.stub(wsServer, 'countLogSessionsInDb').resolves(3) + $sandbox.stub(wsServer, 'isValidISO8601').returns(true) + + const ws = createMockWebSocket() + const req = createMockRequest(`/api/v3/microservices/${$ids.microserviceUuid}/logs?tail=100`) + + await wsServer.handleUserLogsConnection( + ws, + req, + 'Bearer token', + $ids.microserviceUuid, + null, + false, + $transaction + ) + + expect(ws.readyState).to.equal(WebSocket.CLOSED) + }) + }) + + describe('log pending timeout (120s normative, accelerated in test)', () => { + let logManager + + beforeEach(() => { + logManager = new LogSessionManager(FAST_CONFIG) + $sandbox.stub(MicroserviceLogStatusManager, 'delete').resolves() + $sandbox.stub(MicroserviceManager, 'findOne').resolves({ iofogUuid: $ids.fogUuid }) + 
$sandbox.stub(FogManager, 'findOne').resolves({ uuid: $ids.fogUuid }) + $sandbox.stub(ChangeTrackingService, 'update').resolves() + }) + + afterEach(() => { + logManager.stopCleanupInterval() + }) + + it('expires user-only pending log session after logPendingTimeoutMs', async () => { + const userWs = createMockWebSocket() + logManager.createLogSession( + $ids.sessionId, + $ids.microserviceUuid, + null, + null, + userWs, + { lines: 100, follow: true }, + $transaction + ) + + const session = logManager.getLogSession($ids.sessionId) + session.createdAt = 0 + session.lastActivity = 0 + + const pendingTimeout = FAST_CONFIG.session.logPendingTimeoutMs + const timeSinceCreation = Date.now() - session.createdAt + const isExpired = !session.agent && session.user && timeSinceCreation > pendingTimeout + expect(isExpired).to.equal(true) + + if (isExpired) { + if (session.user.readyState === WebSocket.OPEN) { + session.user.close(1008, 'Timeout waiting for agent connection') + } + await logManager.removeLogSession($ids.sessionId, $transaction) + } + + expect(logManager.getLogSession($ids.sessionId)).to.equal(null) + }) + }) + + describe('exec pending timeout (60s normative, accelerated in test)', () => { + let sessionManager + + beforeEach(() => { + sessionManager = new SessionManager(FAST_CONFIG) + $sandbox.stub(MicroserviceExecStatusManager, 'update').resolves() + $sandbox.stub(MicroserviceManager, 'update').resolves() + }) + + it('closes pending user after execPendingTimeoutMs via cleanup cycle', async () => { + const userWs = createMockWebSocket() + sessionManager.addPendingUser($ids.microserviceUuid, userWs) + + const users = sessionManager.pendingUsers.get($ids.microserviceUuid) + for (const [, info] of users.entries()) { + info.timestamp = Date.now() - 200 + } + + let expiredMicroservice = null + sessionManager.setSessionExpiredHandler(async (microserviceUuid) => { + expiredMicroservice = microserviceUuid + }) + + const now = Date.now() + const execPendingTimeout = 
FAST_CONFIG.session.execPendingTimeoutMs + for (const [microserviceUuid, usersMap] of sessionManager.pendingUsers) { + for (const [userWsEntry, info] of usersMap.entries()) { + if (now - info.timestamp > execPendingTimeout) { + if (userWsEntry.readyState === WebSocket.OPEN) { + userWsEntry.close(1008, 'Timeout waiting for agent connection') + } + sessionManager.removePendingUser(microserviceUuid, userWsEntry) + if (sessionManager.sessionExpiredHandler) { + await sessionManager.sessionExpiredHandler(microserviceUuid, null) + } + } + } + } + + expect(expiredMicroservice).to.equal($ids.microserviceUuid) + expect(userWs.readyState).to.equal(WebSocket.CLOSED) + expect(sessionManager.getPendingUserCount($ids.microserviceUuid)).to.equal(0) + }) + }) + + describe('exec max duration (8h normative, accelerated in test)', () => { + let sessionManager + + beforeEach(() => { + sessionManager = new SessionManager(FAST_CONFIG) + $sandbox.stub(MicroserviceExecStatusManager, 'update').resolves() + $sandbox.stub(MicroserviceManager, 'update').resolves() + }) + + it('invokes sessionExpiredHandler when execMaxDurationMs exceeded', async () => { + const userWs = createMockWebSocket() + const agentWs = createMockWebSocket() + sessionManager.createSession($ids.execId, $ids.microserviceUuid, agentWs, userWs, $transaction) + const session = sessionManager.getSession($ids.execId) + session.lastActivity = Date.now() - 300 + + let expiredExecId = null + sessionManager.setSessionExpiredHandler(async (microserviceUuid, execId) => { + expiredExecId = execId + sessionManager.sessions.delete(execId) + }) + + const execMaxDuration = FAST_CONFIG.session.execMaxDurationMs + const now = Date.now() + for (const [execId, activeSession] of sessionManager.sessions) { + if (now - activeSession.lastActivity > execMaxDuration) { + await sessionManager.cleanupSession(execId) + } + } + + expect(expiredExecId).to.equal($ids.execId) + expect(sessionManager.getSession($ids.execId)).to.equal(null) + }) + }) + + 
describe('exec_b lifecycle', () => { + it('removeSession sets execEnabled=false and notifies execSessions change', async () => { + const sessionManager = new SessionManager(FAST_CONFIG) + $sandbox.stub(MicroserviceExecStatusManager, 'update').resolves() + $sandbox.stub(MicroserviceManager, 'update').resolves() + $sandbox.stub(MicroserviceManager, 'findOne').resolves({ iofogUuid: $ids.fogUuid }) + $sandbox.stub(ChangeTrackingService, 'update').resolves() + + const userWs = createMockWebSocket() + sessionManager.createSession($ids.execId, $ids.microserviceUuid, null, userWs, $transaction) + await sessionManager.removeSession($ids.execId, $transaction) + + expect(MicroserviceManager.update).to.have.been.calledWith( + sinon.match({ uuid: $ids.microserviceUuid }), + sinon.match({ execEnabled: false }), + $transaction + ) + expect(ChangeTrackingService.update).to.have.been.calledWith( + $ids.fogUuid, + ChangeTrackingService.events.microserviceExecSessions, + $transaction + ) + }) + }) +}) diff --git a/test/src/websocket/ws-security.test.js b/test/src/websocket/ws-security.test.js new file mode 100644 index 00000000..d93c35ab --- /dev/null +++ b/test/src/websocket/ws-security.test.js @@ -0,0 +1,146 @@ +const { expect } = require('chai') +const sinon = require('sinon') +const WebSocket = require('ws') + +const WebSocketServerClass = require('../../../src/websocket/server') +const authorizer = require('../../../src/lib/rbac/authorizer') +const rbacMiddleware = require('../../../src/lib/rbac/middleware') +const { + createMockWebSocket, + createMockRequest, + buildFakeJwt, + resetWebSocketServerSingleton, + newTestIds, + delay +} = require('../../support/ws-session-harness') + +describe('WebSocket session security', () => { + def('sandbox', () => sinon.createSandbox()) + def('ids', () => newTestIds()) + + afterEach(() => { + $sandbox.restore() + resetWebSocketServerSingleton(WebSocketServerClass) + }) + + describe('RBAC deny on user exec WebSocket', () => { + it('closes with 
1008 when RBAC denies execSessions', async () => { + const ws = createMockWebSocket() + const req = createMockRequest(`/api/v3/microservices/exec/${$ids.microserviceUuid}`) + req.headers.authorization = 'Bearer denied-token' + + $sandbox.stub(authorizer, 'authorize').resolves({ + allowed: false, + reason: 'Access denied: insufficient permissions' + }) + + const handlerCalled = sinon.stub().resolves() + const protectedHandler = rbacMiddleware.protectWebSocket(handlerCalled) + + await protectedHandler(ws, req) + + expect(handlerCalled).to.not.have.been.called + expect(ws.readyState).to.equal(WebSocket.CLOSED) + }) + + it('calls handler when RBAC authorizer allows', async () => { + const ws = createMockWebSocket() + const req = createMockRequest(`/api/v3/microservices/exec/${$ids.microserviceUuid}`) + const token = buildFakeJwt() + req.headers.authorization = token + + $sandbox.stub(authorizer, 'authorize').resolves({ allowed: true }) + + const handlerCalled = sinon.stub().resolves() + const protectedHandler = rbacMiddleware.protectWebSocket(handlerCalled) + + await protectedHandler(ws, req) + + expect(handlerCalled).to.have.been.calledOnce + }) + }) + + describe('upgrade rate limits', () => { + let wsServer + + beforeEach(() => { + resetWebSocketServerSingleton(WebSocketServerClass) + wsServer = new WebSocketServerClass() + }) + + it('rejects upgrades when per-IP rate limit exceeded', (done) => { + const ip = '10.0.0.99' + const info = { req: { socket: { remoteAddress: ip } } } + const config = require('../../../src/config') + const maxPerMinute = config.get('server.webSocket.security.maxRequestsPerMinute') + + wsServer.rateLimits.set(ip, { count: maxPerMinute, resetTime: Date.now() + 60000 }) + + wsServer.verifyClient(info, (err, ok) => { + expect(ok).to.equal(false) + expect(err.message).to.match(/Rate limit/i) + done() + }) + }) + + it('rejects upgrades when per-IP connection limit exceeded', (done) => { + const ip = '10.0.0.100' + const config = 
require('../../../src/config') + const maxConnections = config.get('server.webSocket.security.maxConnectionsPerIp') + wsServer.connectionLimits.set(ip, maxConnections) + const info = { req: { socket: { remoteAddress: ip } } } + + wsServer.verifyClient(info, (err, ok) => { + expect(ok).to.equal(false) + expect(err.message).to.match(/Too many connections/i) + done() + }) + }) + + it('rejects upgrades while server is draining', (done) => { + wsServer.isDraining = true + const info = { req: { socket: { remoteAddress: '10.0.0.1' } } } + + wsServer.verifyClient(info, (err, ok) => { + expect(ok).to.equal(false) + expect(err.message).to.match(/draining/i) + done() + }) + }) + }) + + describe('agent message blocked before auth', () => { + it('does not attach message handler until agent validation succeeds', async () => { + resetWebSocketServerSingleton(WebSocketServerClass) + const wsServer = new WebSocketServerClass() + const agentWs = createMockWebSocket() + const transaction = { fakeTransaction: true } + + let validationResolved = false + $sandbox.stub(wsServer, 'validateAgentConnection').callsFake(async () => { + await delay(30) + validationResolved = true + throw new Error('Invalid agent token') + }) + + const agentReq = createMockRequest(`/api/v3/agent/exec/${$ids.microserviceUuid}`) + agentReq.headers.authorization = 'Bearer bad-token' + + const handlerPromise = wsServer.handleAgentConnection( + agentWs, + agentReq, + 'Bearer bad-token', + $ids.microserviceUuid, + transaction + ) + + agentWs.emit('message', Buffer.from('early'), true) + await handlerPromise.catch(() => {}) + await delay(50) + + expect(validationResolved).to.equal(true) + expect(agentWs.readyState).to.equal(WebSocket.CLOSED) + expect(wsServer.sessionManager.sessions.size).to.equal(0) + }) + }) +}) diff --git a/test/support/ws-session-harness.js b/test/support/ws-session-harness.js new file mode 100644 index 00000000..dde57f30 --- /dev/null +++ b/test/support/ws-session-harness.js @@ -0,0 +1,221 @@ 
+const { EventEmitter } = require('events') +const WebSocket = require('ws') +const msgpack = require('@msgpack/msgpack') +const { v4: uuidv4 } = require('uuid') + +const MESSAGE_TYPES = { + STDIN: 0, + STDOUT: 1, + STDERR: 2, + CONTROL: 3, + CLOSE: 4, + ACTIVATION: 5, + LOG_LINE: 6, + LOG_START: 7, + LOG_STOP: 8, + LOG_ERROR: 9 +} + +const WS_CLOSE_CODES = { + NORMAL: 1000, + GOING_AWAY: 1001, + PROTOCOL_ERROR: 1002, + POLICY_VIOLATION: 1008, + TRY_AGAIN_LATER: 1013 +} + +function delay (ms) { + return new Promise((resolve) => setTimeout(resolve, ms)) +} + +function createMockWebSocket () { + const ws = new EventEmitter() + ws.readyState = WebSocket.OPEN + ws.OPEN = WebSocket.OPEN + ws.CLOSED = WebSocket.CLOSED + ws.binaryType = 'arraybuffer' + ws._sentMessages = [] + + ws.send = function (data, opts) { + ws._sentMessages.push({ data: Buffer.from(data), opts }) + ws.emit('_sent', { data: Buffer.from(data), opts }) + } + + ws.close = function (code, reason) { + ws.readyState = WebSocket.CLOSED + process.nextTick(() => ws.emit('close', code, reason)) + } + + ws.pong = () => {} + ws.removeListener = EventEmitter.prototype.removeListener.bind(ws) + ws.on = EventEmitter.prototype.on.bind(ws) + ws.once = EventEmitter.prototype.once.bind(ws) + ws.removeAllListeners = EventEmitter.prototype.removeAllListeners.bind(ws) + + return ws +} + +function createMockRequest (url, remoteAddress = '127.0.0.1') { + return { + url, + headers: { host: 'localhost:51121' }, + socket: { remoteAddress } + } +} + +function encodeExecMessage (fields) { + return msgpack.encode(fields) +} + +function decodeExecMessage (buffer) { + return msgpack.decode(buffer) +} + +function buildAgentInitialMessage (execId, microserviceUuid) { + return encodeExecMessage({ execId, microserviceUuid }) +} + +function buildExecFrame (type, execId, microserviceUuid, data) { + return encodeExecMessage({ + type, + execId, + microserviceUuid, + data: Buffer.isBuffer(data) ? 
data : Buffer.from(data), + timestamp: Date.now() + }) +} + +/** + * In-memory AMQP stub for cross-replica exec/log relay tests. + */ +function createMockQueueService () { + const execBridges = new Map() + const logBridges = new Map() + + return { + execBridges, + logBridges, + + async enableForSession (session, cleanupCallback) { + const execId = session.execId + if (!execId) return false + execBridges.set(execId, { session, cleanupCallback }) + return true + }, + + shouldUseQueue (execId) { + return execBridges.has(execId) + }, + + async publishToAgent (execId, buffer) { + const bridge = execBridges.get(execId) + if (bridge && bridge.session.agent && bridge.session.agent.readyState === WebSocket.OPEN) { + bridge.session.agent.emit('message', buffer, true) + } + }, + + async publishToUser (execId, buffer) { + const bridge = execBridges.get(execId) + if (bridge && bridge.session.user && bridge.session.user.readyState === WebSocket.OPEN) { + bridge.session.user.emit('message', buffer, true) + } + }, + + async cleanup (execId) { + execBridges.delete(execId) + }, + + async enableForLogSession (session, cleanupCallback) { + const sessionId = session.sessionId + logBridges.set(sessionId, { session, cleanupCallback }) + return true + }, + + shouldUseLogQueue (sessionId) { + return logBridges.has(sessionId) + }, + + async publishLogToUser (sessionId, buffer) { + const bridge = logBridges.get(sessionId) + if (bridge && bridge.session.user && bridge.session.user.readyState === WebSocket.OPEN) { + bridge.session.user.emit('message', buffer, true) + } + }, + + async cleanupLogSession (sessionId) { + logBridges.delete(sessionId) + } + } +} + +function resetWebSocketServerSingleton (WebSocketServerClass) { + if (WebSocketServerClass.instance) { + const instance = WebSocketServerClass.instance + instance.sessionManager.stopCleanup() + instance.logSessionManager.stopCleanupInterval() + } + WebSocketServerClass.instance = null +} + +function buildFakeJwt (claims = {}) { + const 
header = Buffer.from(JSON.stringify({ alg: 'none', typ: 'JWT' })).toString('base64url') + const payload = Buffer.from(JSON.stringify({ + sub: 'test-user-id', + preferred_username: 'tester', + ...claims + })).toString('base64url') + return `Bearer ${header}.${payload}.signature` +} + +function newTestIds () { + return { + microserviceUuid: uuidv4(), + fogUuid: uuidv4(), + execId: uuidv4(), + sessionId: uuidv4() + } +} + +function lastSentBinary (ws) { + const last = ws._sentMessages[ws._sentMessages.length - 1] + return last ? last.data : null +} + +function waitForSent (ws, minCount = 1, timeoutMs = 2000) { + return new Promise((resolve, reject) => { + if (ws._sentMessages.length >= minCount) { + resolve(ws._sentMessages) + return + } + const timer = setTimeout(() => { + ws.removeListener('_sent', onSent) + reject(new Error(`Timed out waiting for WS send (got ${ws._sentMessages.length}, wanted ${minCount})`)) + }, timeoutMs) + const onSent = () => { + if (ws._sentMessages.length >= minCount) { + clearTimeout(timer) + ws.removeListener('_sent', onSent) + resolve(ws._sentMessages) + } + } + ws.on('_sent', onSent) + }) +} + +module.exports = { + MESSAGE_TYPES, + WS_CLOSE_CODES, + delay, + createMockWebSocket, + createMockRequest, + encodeExecMessage, + decodeExecMessage, + buildAgentInitialMessage, + buildExecFrame, + createMockQueueService, + resetWebSocketServerSingleton, + buildFakeJwt, + newTestIds, + lastSentBinary, + waitForSent +} From 17e19a8d5be5f078881bf82de7a765c36d7a757a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Thu, 25 Jun 2026 00:50:19 +0300 Subject: [PATCH 10/11] Document WebSocket exec and log protocol in swagger and operator guide. Add architecture HA section, ws-sessions operations guide, and changelog entry for session hardening. 
--- CHANGELOG.md | 3 +- docs/architecture.md | 57 ++++++ docs/operations/ws-sessions.md | 143 ++++++++++++++ docs/swagger.yaml | 338 ++++++++++++++++++++++++++++++++- 4 files changed, 538 insertions(+), 3 deletions(-) create mode 100644 docs/operations/ws-sessions.md diff --git a/CHANGELOG.md b/CHANGELOG.md index ce78f7b7..b92c444a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -71,7 +71,7 @@ Controller v3.8 is a **greenfield** release aligned with **Edgelet**. There is * #### Database and distribution - **Greenfield schema** — **new install required**; no v3.7 → v3.8 database migrator. -- PKI: central router/NATS local CAs; legacy per-agent CAs migrated via one-time **rotation job** (Plan 5). +- PKI: central router/NATS local CAs; legacy per-agent CAs migrated via one-time **rotation job**. - **Node.js 24.x** required for dev and CI (was 16/18). - Dual-mirror container images: **`ghcr.io/eclipse-iofog/controller`** and **`ghcr.io/datasance/controller`** from the **same commit SHA**; publish on **`v*` tags only** via repo variable **`IMAGE_REGISTRY`**. @@ -97,6 +97,7 @@ Controller v3.8 is a **greenfield** release aligned with **Edgelet**. There is * - **K8s control plane:** hub **`iofog-router`** ConfigMap patches serialized via DB lock; K8s Service create/update/delete with LoadBalancer watch timeout. - **`service-bridge-config.js`** — full recompute of service-derived TCP bridge config per fog on reconcile (preserves router base config). - **SQLite single-node production hardening** — WAL + `busy_timeout` pragmas, reconcile task claim retry on `SQLITE_BUSY`, staggered startup for reconcile-heavy background jobs (`settings.jobStartupDelaySeconds`). +- **WebSocket exec & log session hardening** — quotas (1 exec / 3 log WS per resource), exec_b lifecycle, 60s/120s pending timeouts, 8h exec max, 30s graceful drain, OTEL metrics, HA AMQP fail-fast, integration tests, swagger WS protocol docs, operator guide (`docs/operations/ws-sessions.md`). 
### Fixed diff --git a/docs/architecture.md b/docs/architecture.md index 25eaf10e..4df7b97f 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -204,6 +204,63 @@ Full spec: [`.cursor/controllerv3.8/docs/15-fog-platform-reconcile.md`](../.curs --- +## WebSocket exec & log sessions + +Interactive **exec** and **log streaming** use paired WebSocket sessions between operators (Bearer JWT), Controller, and Edgelet agents (fog token). Plan 16 hardens lifecycle, quotas, multi-replica HA relay, and observability — **without changing the Edgelet wire protocol**. + +```mermaid +sequenceDiagram + participant U as User WS + participant C as Controller replica + participant DB as Database + participant Q as AMQP Router + participant A as Agent WS + + Note over U,A: Exec (R80–R81, R84) + U->>C: WS exec (RBAC) + A->>C: WS agent/exec + execId + C->>C: Pair SessionManager + C->>Q: Enable agent-{execId} user-{execId} + U->>C: STDIN + C->>A: STDIN via AMQP router, or direct WS same replica + A->>C: STDOUT + C->>U: STDOUT relay via AMQP router, or direct WS same replica + U-->>C: close + C->>DB: execEnabled=false INACTIVE + + Note over U,A: Logs (R82–R83, R84) + U->>C: WS logs + tail params + C->>DB: PENDING sessionId + A->>C: WS agent/logs/:sessionId + A->>C: LOG_LINE + C->>U: LOG_LINE via AMQP router queue logs-user-{sessionId} +``` + +| Topic | Normative value (RFC R80–R91) | +|-------|-------------------------------| +| Exec lifecycle | **exec_b** — WS close sets `execEnabled=false`; **1** user exec WS per microservice | +| Exec timeouts | **60s** pending for agent; **8h** max active session | +| Log concurrency | **3** user log WS per microservice (or per fog for node logs) | +| Log limits | Tail max **5,000** lines; **120s** pending; **2h** idle | +| Log content | Live relay only — no log line persistence; audit connect/disconnect | +| HA relay | Cross-replica sessions **require** AMQP (`WebSocketQueueService`); same-replica may use direct WS; **fail fast** when router down | +| Graceful drain | **30s** on SIGTERM / k8s `preStop` — CLOSE frames, queue
cleanup, DB status update | +| Security | Agent handlers validate fog token **before** message processing; **50** upgrades/min/IP; **100** active WS/IP; JWT in `?token=` (ingress log redaction required) | +| Scale SLO | **500** concurrent WS per replica; **p99 pairing < 5s** | +| Observability | OpenTelemetry: active/pending sessions, pairing latency, AMQP failures, router connectivity | + +**OTEL metric names (R87):** `ws_exec_sessions_active`, `ws_log_sessions_active`, `ws_pending_pairings`, `ws_pairing_duration_ms` (histogram), `ws_amqp_publish_errors`, `ws_router_connected` (gauge). Emitted when `ENABLE_TELEMETRY=true`; see `src/websocket/ws-metrics.js`. + +**HA config (`server.webSocket.ha`):** `crossReplicaRequiresAmqp` (default `true`), `failFastOnRouterUnavailable` (default `true`). Env: `WS_HA_CROSS_REPLICA_REQUIRES_AMQP`, `WS_HA_FAIL_FAST_ON_ROUTER_UNAVAILABLE`. Graceful drain timeout: `server.webSocket.session.drainTimeoutMs` (default **30s**, env `WS_DRAIN_TIMEOUT_MS`). + +**Core modules:** `src/websocket/server.js`, `session-manager.js`, `log-session-manager.js`, `src/services/websocket-queue-service.js`, `src/services/router-connection-service.js`. + +**Operator guide:** [operations/ws-sessions.md](operations/ws-sessions.md) — ingress `?token=` log redaction, HTTPS/WSS, multi-replica AMQP requirement, k8s preStop drain, load SLO probe. + +Full spec: [`.cursor/controllerv3.8/docs/16-ws-exec-log-hardening.md`](../.cursor/controllerv3.8/docs/16-ws-exec-log-hardening.md) · RFC R80–R91 · Edgelet contract: [edgelet-invariants.md §10](../.cursor/controllerv3.8/docs/edgelet-invariants.md). + +--- + ## Edgelet agent contract (summary) Controller v3.8 and Edgelet share a **frozen field-agent REST contract** on `/api/v3/agent/*`. The same release train must be deployed together (e.g. Controller `v3.8.0` + Edgelet `v1.0.0-rc.1`). Edgelet maintains the authoritative wire spec; Controller implements the server side. 
diff --git a/docs/operations/ws-sessions.md b/docs/operations/ws-sessions.md new file mode 100644 index 00000000..06b0c228 --- /dev/null +++ b/docs/operations/ws-sessions.md @@ -0,0 +1,143 @@ +# WebSocket exec & log sessions — operator guide + +**Audience:** Platform operators running Controller in production + +--- + +## Overview + +Controller exposes **interactive exec** and **log streaming** over WebSocket on the API port (default **51121**). Sessions pair an operator browser/CLI client (Bearer JWT) with an Edgelet agent (fog token). In multi-replica deployments, cross-replica relay requires the **Skupper-style AMQP router** microservice. + +--- + +## HTTPS and authentication + +| Requirement | Detail | +|-------------|--------| +| **HTTPS-only WS** | Set `CONTROLLER_PUBLIC_URL` to `https://…` and terminate TLS at ingress or the Controller listener (`TLS_PATH_*`). WebSocket upgrades must use `wss://`. | +| **User auth** | Bearer JWT via `Authorization` header or `?token=` query param (browser Console). RBAC: `execSessions`, `logs`, `systemExecSessions`, `systemLogs`. | +| **Agent auth** | Fog token on `/api/v3/agent/exec/*` and `/api/v3/agent/logs/*` — OIDC does **not** apply to agent routes. | + +### Ingress log redaction (required) + +Browser clients pass JWT in the query string: `wss://controller.example.com/api/v3/microservices/{uuid}/logs?token=…` + +**Configure ingress / reverse proxy access logs to redact `token` query parameters.** Example nginx: + +```nginx +log_format ws_redacted '$remote_addr - [$time_local] "$request" $status ' + '"$http_referer" "$http_user_agent"'; +# "$request" still contains the query string: replace it with a map-derived variable or a custom log filter that strips ?token=… before writing logs. +``` + +Without redaction, long-lived bearer tokens may appear in load balancer logs.
+ +--- + +## Multi-replica HA + +| Setting | Default | Env | +|---------|---------|-----| +| Cross-replica requires AMQP | `true` | `WS_HA_CROSS_REPLICA_REQUIRES_AMQP` | +| Fail fast when router down | `true` | `WS_HA_FAIL_FAST_ON_ROUTER_UNAVAILABLE` | + +**Requirements:** + +1. Deploy the **router** system microservice and ensure Controller can reach AMQP (`RouterConnectionService`). +2. Run **2+ Controller replicas** behind a load balancer (**sticky sessions are optional**) — cross-replica exec/log uses AMQP queues (`agent-{execId}`, `user-{execId}`, `logs-user-{sessionId}`). +3. When the router is unavailable, new cross-replica sessions close with WebSocket code **1013** (`Router unavailable for cross-replica session`). + +Same-replica sessions may relay directly without AMQP when both user and agent land on the same pod. + +--- + +## Graceful drain (SIGTERM / Kubernetes preStop) + +On shutdown, Controller drains WebSocket sessions for up to **`WS_DRAIN_TIMEOUT_MS`** (default **30s**): + +1. Reject new upgrades (`verifyClient` → draining). +2. Close pending users with code **1001** (`Server draining`). +3. Send CLOSE frames, clean exec/log session DB rows, tear down AMQP bridges. + +### Kubernetes manifest example + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: controller +spec: + template: + spec: + terminationGracePeriodSeconds: 45 + containers: + - name: controller + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - sleep 5 + env: + - name: WS_DRAIN_TIMEOUT_MS + value: "30000" +``` + +**Procedure (manual verification):** + +1. Open an exec or log session against a running pod. +2. `kubectl delete pod <controller-pod> --grace-period=45` +3. Confirm the client receives close code **1001** within ~30s and exec is disabled (`execEnabled=false` for exec sessions). +4. Confirm the replacement pod accepts new sessions.
+ +--- + +## Scale SLO (R88) + +| Metric | Target | +|--------|--------| +| Concurrent WS per replica | **500** (`WS_REPLICA_MAX_CONCURRENT_WS`) | +| p99 exec pairing latency | **< 5s** | + +Run the load probe locally: + +```bash +nvm use 24 +node test/load/ws-pairing-load.js --pairs 500 +``` + +For production validation, repeat against a staging cluster with real agent simulators and record p99 from Controller OTEL histogram `ws_pairing_duration_ms`. + +--- + +## OTEL metrics + +Enable `ENABLE_TELEMETRY=true`. Key metrics (`src/websocket/ws-metrics.js`): + +| Metric | Type | +|--------|------| +| `ws_exec_sessions_active` | gauge | +| `ws_log_sessions_active` | gauge | +| `ws_pending_pairings` | gauge | +| `ws_pairing_duration_ms` | histogram | +| `ws_amqp_publish_errors` | counter | +| `ws_router_connected` | gauge | + +--- + +## Session limits (normative) + +| Session | Limit | +|---------|-------| +| Exec user WS per microservice | **1** | +| Exec pending (user waits for agent) | **60s** | +| Exec max duration | **8h** | +| Log user WS per microservice/fog | **3** | +| Log pending (user waits for agent) | **120s** | +| Log idle | **2h** | +| Log tail max lines | **5000** | +| WS upgrades per IP per minute | **50** | +| Active WS per IP | **100** | + +See [architecture.md](../architecture.md#websocket-exec--log-sessions) for protocol diagrams. diff --git a/docs/swagger.yaml b/docs/swagger.yaml index f45569f9..6738dda8 100755 --- a/docs/swagger.yaml +++ b/docs/swagger.yaml @@ -2811,6 +2811,279 @@ paths: description: Invalid Microservice UUID "500": description: Internal Server Error + "/microservices/exec/{microserviceUuid}": + get: + tags: + - WebSocketSessions + summary: User exec WebSocket (HTTP upgrade) + description: | + Upgrades to a binary MessagePack WebSocket for interactive exec. + + **Auth:** Bearer JWT in `Authorization` header or `?token=` query param. + **RBAC:** `execSessions` on `microservices` resource. 
+ + Pairing: user connects first (or agent first on `/agent/exec/{uuid}`); Controller pairs + via `SessionManager` and relays STDIN/STDOUT/STDERR. On disconnect, **exec_b** sets + `execEnabled=false` (see REST `POST/DELETE …/exec`). + + **HA:** Multi-replica deployments require AMQP router (`WebSocketQueueService`). + Cross-replica sessions fail fast with close code **1013** when router is unavailable. + + See `#/components/schemas/WsExecMessageTypes` and `#/components/schemas/WsCloseCodes`. + operationId: userMicroserviceExecWebSocket + parameters: + - in: path + name: microserviceUuid + required: true + schema: + type: string + format: uuid + - in: query + name: token + description: Bearer JWT for browser clients (redact in ingress access logs) + schema: + type: string + security: + - authToken: [] + responses: + "101": + description: Switching Protocols — WebSocket established + "401": + description: Not Authorized + "403": + description: RBAC denied + "/microservices/system/exec/{microserviceUuid}": + get: + tags: + - WebSocketSessions + summary: User system microservice exec WebSocket + description: Same as user exec WS; **RBAC** `systemExecSessions`. + operationId: userSystemMicroserviceExecWebSocket + parameters: + - in: path + name: microserviceUuid + required: true + schema: + type: string + format: uuid + - in: query + name: token + schema: + type: string + security: + - authToken: [] + responses: + "101": + description: Switching Protocols + "/microservices/{uuid}/logs": + get: + tags: + - WebSocketSessions + summary: User microservice log streaming WebSocket + description: | + Upgrades to binary MessagePack log stream (agent → user after pairing). + + **RBAC:** `logs`. Max **3** concurrent user log WS per microservice. + + Query params control tail behaviour (see parameters). Live relay only — no log persistence. 
+ operationId: userMicroserviceLogsWebSocket + parameters: + - in: path + name: uuid + required: true + schema: + type: string + format: uuid + - in: query + name: token + schema: + type: string + - in: query + name: tail + description: Number of tail lines (1–5000, default from config) + schema: + type: integer + minimum: 1 + maximum: 5000 + - in: query + name: follow + description: Stream new lines after tail (default true) + schema: + type: boolean + default: true + - in: query + name: since + description: ISO 8601 timestamp — include lines after this time + schema: + type: string + format: date-time + - in: query + name: until + description: ISO 8601 timestamp — include lines before this time + schema: + type: string + format: date-time + security: + - authToken: [] + responses: + "101": + description: Switching Protocols + "/microservices/system/{uuid}/logs": + get: + tags: + - WebSocketSessions + summary: User system microservice log WebSocket + operationId: userSystemMicroserviceLogsWebSocket + parameters: + - in: path + name: uuid + required: true + schema: + type: string + format: uuid + - in: query + name: token + schema: + type: string + - in: query + name: tail + schema: + type: integer + minimum: 1 + maximum: 5000 + - in: query + name: follow + schema: + type: boolean + - in: query + name: since + schema: + type: string + format: date-time + - in: query + name: until + schema: + type: string + format: date-time + security: + - authToken: [] + responses: + "101": + description: Switching Protocols + "/iofog/{uuid}/logs": + get: + tags: + - WebSocketSessions + summary: User fog (node) log streaming WebSocket + description: Node-level logs; **RBAC** `systemLogs`. Max **3** concurrent sessions per fog. 
+ operationId: userFogLogsWebSocket + parameters: + - in: path + name: uuid + required: true + schema: + type: string + format: uuid + - in: query + name: token + schema: + type: string + - in: query + name: tail + schema: + type: integer + minimum: 1 + maximum: 5000 + - in: query + name: follow + schema: + type: boolean + - in: query + name: since + schema: + type: string + format: date-time + - in: query + name: until + schema: + type: string + format: date-time + security: + - authToken: [] + responses: + "101": + description: Switching Protocols + "/agent/exec/{microserviceUuid}": + get: + tags: + - WebSocketSessions + summary: Agent exec WebSocket + description: | + Agent-side exec pairing. **Auth:** fog token (not OIDC). + + First binary frame must be MessagePack with `execId` and `microserviceUuid`. + Agent handlers validate fog token **before** accepting messages (R86). + operationId: agentExecWebSocket + parameters: + - in: path + name: microserviceUuid + required: true + schema: + type: string + format: uuid + security: + - fogToken: [] + responses: + "101": + description: Switching Protocols + "/agent/logs/microservice/{microserviceUuid}/{sessionId}": + get: + tags: + - WebSocketSessions + summary: Agent microservice log WebSocket + operationId: agentMicroserviceLogsWebSocket + parameters: + - in: path + name: microserviceUuid + required: true + schema: + type: string + format: uuid + - in: path + name: sessionId + required: true + schema: + type: string + format: uuid + security: + - fogToken: [] + responses: + "101": + description: Switching Protocols + "/agent/logs/iofog/{iofogUuid}/{sessionId}": + get: + tags: + - WebSocketSessions + summary: Agent fog log WebSocket + operationId: agentFogLogsWebSocket + parameters: + - in: path + name: iofogUuid + required: true + schema: + type: string + format: uuid + - in: path + name: sessionId + required: true + schema: + type: string + format: uuid + security: + - fogToken: [] + responses: + "101": + 
description: Switching Protocols "/microservices/system/{uuid}/exec": post: tags: @@ -3309,7 +3582,7 @@ paths: summary: Start OAuth authorization (BFF) operationId: oauthAuthorize description: > - Browser OAuth BFF entry point for EdgeOps Console (Plan 8.2). EdgeOps Console Sign in performs a + Browser OAuth BFF entry point for EdgeOps Console. EdgeOps Console Sign in performs a full-page redirect here; do **not** POST credentials from the browser. **External mode:** redirects to the external IdP authorize endpoint. @@ -3358,7 +3631,7 @@ paths: **External forced password change** is enforced by the IdP during authorize (e.g. Keycloak required actions: `UPDATE_PASSWORD`). **Embedded forced password** uses the interaction - `change-password` step (Plan 8.2-3). + `change-password` step. MFA in external browser login is IdP-owned; embedded browser MFA uses interaction endpoints. responses: @@ -6815,6 +7088,10 @@ tags: description: Manage ssh tunnels - name: Agent description: Used by your agents to communicate with your controller + - name: WebSocketSessions + description: | + Interactive exec and log streaming WebSocket endpoints (MessagePack binary). + Multi-replica HA requires AMQP router — see operations/ws-sessions.md. 
- name: User description: Manage your users - name: Secrets @@ -6843,6 +7120,10 @@ components: type: http scheme: bearer bearerFormat: JWT + fogToken: + type: http + scheme: bearer + description: Edgelet fog provisioning token (agent routes) responses: AuthRateLimitExceeded: description: Too many authentication requests from this IP address @@ -6882,6 +7163,59 @@ components: $ref: "#/components/schemas/ApplicationTemplateCreateRequest" required: true schemas: + WsExecMessageTypes: + type: object + description: MessagePack frame types for exec sessions (R80) + properties: + types: + type: array + items: + type: object + properties: + name: + type: string + value: + type: integer + direction: + type: string + example: + types: + - { name: STDIN, value: 0, direction: user → agent } + - { name: STDOUT, value: 1, direction: agent → user } + - { name: STDERR, value: 2, direction: agent → user } + - { name: CONTROL, value: 3, direction: both } + - { name: CLOSE, value: 4, direction: both } + - { name: ACTIVATION, value: 5, direction: controller → agent } + WsLogMessageTypes: + type: object + description: MessagePack frame types for log sessions (R82) + example: + types: + - { name: LOG_LINE, value: 6, direction: agent → user } + - { name: LOG_START, value: 7, direction: both } + - { name: LOG_STOP, value: 8, direction: agent → user } + - { name: LOG_ERROR, value: 9, direction: agent → user } + WsCloseCodes: + type: object + description: WebSocket close codes used by Controller exec/log sessions + properties: + codes: + type: array + items: + type: object + properties: + code: + type: integer + reason: + type: string + when: + type: string + example: + codes: + - { code: 1000, reason: Normal closure, when: Session ended cleanly } + - { code: 1001, reason: Server draining, when: SIGTERM / k8s preStop (R85) } + - { code: 1008, reason: Policy violation, when: RBAC deny, quota, invalid tail params, pending timeout } + - { code: 1013, reason: Router unavailable for 
cross-replica session, when: AMQP router down (R84) } EventRecord: type: object properties: From cf9dada2a489f5113d2a6f123a134d2ba0d02c57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emirhan=20Durmu=C5=9F?= Date: Thu, 25 Jun 2026 00:50:26 +0300 Subject: [PATCH 11/11] Remove internal milestone references from public docs and comments. Scrub phase labels from Dockerfile, PKI guide, RBAC reference, rbac-audit script, and OIDC test README. --- Dockerfile | 2 +- docs/pki.md | 4 ++-- docs/rbac-reference.md | 6 +++--- scripts/rbac-audit.js | 1 - test/oidc/README.md | 6 +++--- 5 files changed, 9 insertions(+), 10 deletions(-) diff --git a/Dockerfile b/Dockerfile index fa241a4d..23678ad9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -# Stage 1 — EdgeOps Console static SPA (Plan 11-1) +# Stage 1 — EdgeOps Console static SPA # ioFog overrides: EDGEOPS_CONSOLE_REPO=https://github.com/eclipse-iofog/edgeops-console # EDGEOPS_CONSOLE_FLAVOR=iofog # node:24-bookworm — pin manifest list digest for reproducible multi-arch builds diff --git a/docs/pki.md b/docs/pki.md index d554513d..52f5354b 100644 --- a/docs/pki.md +++ b/docs/pki.md @@ -33,7 +33,7 @@ Per-agent local CA names may still appear in **delete cleanup** for orphaned leg v3.8 is a **new install only** release (no v3.7 → v3.8 database migrator). -Plan 5 originally scoped a **one-time PKI rotation job** to re-sign certs from legacy per-agent CAs under the central CAs. That job was **not implemented** — greenfield policy means labs and production deploy fresh Controller + Edgelet fleets without carrying forward v3.7 secrets. + A **one-time PKI rotation job** to re-sign certs from legacy per-agent CAs under the central CAs. That job was **not implemented** — greenfield policy means labs and production deploy fresh Controller + Edgelet fleets without carrying forward v3.7 secrets. 
| Scenario | Operator action | |----------|-----------------| @@ -105,7 +105,7 @@ Authorization: Bearer Controller accepts the request, rotates operator material, re-signs accounts, and schedules resolver reconciliation in the background. Plan maintenance when NATS leaf nodes may reload operator trust. -This is **NATS credential rotation**, not the skipped Plan 5 per-agent CA migration. +This is **NATS credential rotation**, not the skipped per-agent CA migration. --- diff --git a/docs/rbac-reference.md b/docs/rbac-reference.md index dfe92555..d318f1d8 100644 --- a/docs/rbac-reference.md +++ b/docs/rbac-reference.md @@ -157,10 +157,10 @@ Keep `rbac-resources.yaml`, live routes, and system roles aligned when adding or ```bash nvm use 24 -# Compare Express routes to rbac-resources.yaml (243 routes as of Plan 9) +# Compare Express routes to rbac-resources.yaml (243 routes as) npm run rbac-audit -# Plan 9 grep gates — banned legacy terms must be absent; +# grep gates — banned legacy terms must be absent; # v3.8 terms must be present rg 'edgeResources|diagnostics' src/config/rbac-resources.yaml && exit 1 || true rg 'fog-types' src/config/rbac-resources.yaml && exit 1 || true @@ -174,7 +174,7 @@ rg 'architectures|controller/register' src/config/rbac-resources.yaml - Banned legacy terms (`edgeResources`, `diagnostics`, `fog-types`) - Missing required v3.8 terms (`architectures`, `controller/register`) -Optional CI wiring is planned for Plan 11. +Optional CI wiring is planned. ## Reference files diff --git a/scripts/rbac-audit.js b/scripts/rbac-audit.js index f7d0bd6e..54fd6113 100644 --- a/scripts/rbac-audit.js +++ b/scripts/rbac-audit.js @@ -2,7 +2,6 @@ /* * Compare live Express routes (src/routes/**) to rbac-resources.yaml. * Exits non-zero on gaps (unmapped routes) or orphans (stale yaml entries). - * Plan 9 phase 9-5 — optional CI drift check. 
*/ const fs = require('fs') diff --git a/test/oidc/README.md b/test/oidc/README.md index 767b1828..a5949ef1 100644 --- a/test/oidc/README.md +++ b/test/oidc/README.md @@ -1,4 +1,4 @@ -# Embedded auth dev smoke (Plan 8.1) +# Embedded auth dev smoke Local smoke path for embedded identity: Controller issues tokens from the in-process `/oidc` issuer and validates Bearer JWTs via local JWKS. No mock OIDC provider. @@ -90,7 +90,7 @@ Point env at any OIDC issuer: - `CONTROLLER_PUBLIC_URL` — canonical external URL (issuer host + OAuth callback base) - `CONSOLE_URL` — SPA base; BFF redirects tokens to `{consoleUrl}/login#accessToken=...` -Optional auth rate limits (Plan 8.2-4): `AUTH_RATE_LIMIT_ENABLED` (default `true`), +Optional auth rate limits: `AUTH_RATE_LIMIT_ENABLED` (default `true`), `AUTH_RATE_LIMIT_MAX_REQUESTS` (default `60`), `AUTH_RATE_LIMIT_WINDOW_MS` (default `60000`). Register at the IdP: redirect URI `{CONTROLLER_PUBLIC_URL}/api/v3/user/oauth/callback`. @@ -130,7 +130,7 @@ middleware (default). **Viewer integration:** Sign in button → full-page `GET {apiBase}/user/oauth/authorize`; `/login` parses hash tokens. See `.cursor/controllerv3.8/docs/08-2-viewer-handoff.md` § External mode. -## HA BFF sessions (Plan 8.2-5) +## HA BFF sessions Multi-replica Controller requires a **shared** OAuth BFF session store. Set: