From 967f6d54e02886cb70ba4a01496ed1c33793f17e Mon Sep 17 00:00:00 2001 From: Riddhesh Sanghvi Date: Mon, 29 Jun 2026 21:56:27 +0530 Subject: [PATCH 1/2] fix(backup): make the per-site backup/restore lock crash-safe The per-site lock was a file-existence lock (fs->exists check + fs->dumpFile create), removed only on success and one disk-space error path. Any other exit -- archive/DB/upload failure, restore mismatch, OOM/SIGKILL, Ctrl-C -- left a stale .lock that permanently blocked every future backup AND restore of that site with 'Another backup/restore process is running' until an operator deleted it by hand. The check-then-create was also a TOCTOU race between same-site backup and restore. Convert it to an flock() lock on a handle opened with the 'e' (O_CLOEXEC) flag, acquired non-blocking in pre_backup_restore_checks() and released by an idempotent release_site_backup_lock() on the success paths and via register_shutdown_function. flock is released by the OS on process death (verified for SIGKILL), so the lock can no longer go stale; O_CLOEXEC stops backup subprocesses (rclone/mysqldump/docker exec) from inheriting the descriptor and holding it after the parent exits (verified: c+ stays held, c+e releases). The global lock fd gains the same 'e' flag. Also remove the lock-file deletion from EE_Site_Command::shut_down_function(): with flock, unlinking a held lock file lets a later process re-lock a fresh inode at the same path and break mutual exclusion. --- src/helper/Site_Backup_Restore.php | 67 ++++++++++++++++++++++++++---- src/helper/class-ee-site.php | 13 +++--- 2 files changed, 64 insertions(+), 16 deletions(-) diff --git a/src/helper/Site_Backup_Restore.php b/src/helper/Site_Backup_Restore.php index a287b639..3a5cf7a5 100644 --- a/src/helper/Site_Backup_Restore.php +++ b/src/helper/Site_Backup_Restore.php @@ -51,6 +51,9 @@ class Site_Backup_Restore { // Global backup lock handle for serializing backups private $global_backup_lock_handle = null; + // Per-site backup/restore lock handle (flock-based, auto-released on exit) + private $site_backup_lock_handle = null; + public function __construct() { $this->fs = new Filesystem(); } @@ -152,7 +155,7 @@ public function backup( $args, $assoc_args = [] ) { $this->rclone_upload( $backup_dir ); $this->fs->remove( $backup_dir ); - $this->fs->remove( EE_BACKUP_DIR . '/' . $this->site_data['site_url'] . '.lock' ); + $this->release_site_backup_lock(); // Mark backup as completed and send success callback $this->dash_backup_completed = true; @@ -300,7 +303,7 @@ public function restore( $args, $assoc_args = [] ) { EE::log( 'Reloading site.' ); EE::run_command( [ 'site', 'reload', $this->site_data['site_url'] ], [], [] ); - $this->fs->remove( EE_BACKUP_DIR . '/' . $this->site_data['site_url'] . '.lock' ); + $this->release_site_backup_lock(); EE::success( 'Site restored successfully.' ); @@ -946,16 +949,47 @@ private function pre_backup_restore_checks() { $lock_file = EE_BACKUP_DIR . '/' . $this->site_data['site_url'] . '.lock'; - if ( $this->fs->exists( $lock_file ) ) { + // Per-site lock guarding against a concurrent backup/restore of the SAME + // site. Uses flock() rather than file existence so the OS releases it + // automatically if the process dies mid-operation -- the previous + // file-existence lock was removed only on success paths, so any crash, + // OOM, or error exit left a stale `.lock` that permanently blocked all + // future backups/restores of that site. Opened with the 'e' flag + // (O_CLOEXEC) so backup subprocesses (rclone, mysqldump, docker exec) + // don't inherit the descriptor and keep the lock held after we exit. + $this->site_backup_lock_handle = fopen( $lock_file, 'c+e' ); + + if ( ! $this->site_backup_lock_handle ) { + $this->capture_error( + 'Cannot create backup lock file', + self::ERROR_TYPE_FILESYSTEM, + 5002 + ); + EE::error( 'Cannot create backup lock file.' ); + } + + // Non-blocking: fail fast if another backup/restore holds this site's lock. + if ( ! flock( $this->site_backup_lock_handle, LOCK_EX | LOCK_NB ) ) { + fclose( $this->site_backup_lock_handle ); + $this->site_backup_lock_handle = null; $this->capture_error( 'Another backup/restore process is already running for this site', self::ERROR_TYPE_LOCK, 2003 ); EE::error( 'Another backup/restore process is running. Please wait for it to complete.' ); - } else { - $this->fs->dumpFile( $lock_file, 'lock' ); } + + // Release on graceful exit (EE::error/exit, PHP fatal, Ctrl-C). On + // SIGTERM/SIGKILL/OOM the shutdown handler does not run, but the OS + // releases the flock on process death -- so the lock is freed in every case. + register_shutdown_function( [ $this, 'release_site_backup_lock' ] ); + + // Record the holder for debugging only; flock is the source of truth. + ftruncate( $this->site_backup_lock_handle, 0 ); + rewind( $this->site_backup_lock_handle ); + fwrite( $this->site_backup_lock_handle, $this->site_data['site_url'] . ' (PID: ' . getmypid() . ')' ); + fflush( $this->site_backup_lock_handle ); } private function pre_backup_check() { @@ -987,7 +1021,7 @@ private function pre_backup_check() { 3001 ); - $this->fs->remove( EE_BACKUP_DIR . '/' . $this->site_data['site_url'] . '.lock' ); + $this->release_site_backup_lock(); EE::error( $error_message ); } } @@ -1904,8 +1938,10 @@ private function acquire_global_backup_lock() { $this->fs->mkdir( EE_BACKUP_DIR ); } - // Open file handle (creates if doesn't exist) - $this->global_backup_lock_handle = fopen( $lock_file, 'c+' ); + // Open file handle (creates if doesn't exist). The 'e' flag (O_CLOEXEC) + // stops backup subprocesses (rclone, mysqldump, docker exec) from + // inheriting this descriptor and holding the lock after this process exits. + $this->global_backup_lock_handle = fopen( $lock_file, 'c+e' ); if ( ! $this->global_backup_lock_handle ) { $this->capture_error( @@ -1963,4 +1999,19 @@ public function release_global_backup_lock() { EE::debug( 'Released global backup lock' ); } } + + /** + * Release the per-site backup/restore lock. + * Safe to call multiple times (idempotent). + * + * @return void + */ + public function release_site_backup_lock() { + if ( $this->site_backup_lock_handle ) { + flock( $this->site_backup_lock_handle, LOCK_UN ); + fclose( $this->site_backup_lock_handle ); + $this->site_backup_lock_handle = null; + EE::debug( 'Released per-site backup lock' ); + } + } } diff --git a/src/helper/class-ee-site.php b/src/helper/class-ee-site.php index 08ad268e..eea6df62 100644 --- a/src/helper/class-ee-site.php +++ b/src/helper/class-ee-site.php @@ -2205,14 +2205,11 @@ protected function shut_down_function() { $logger = \EE::get_file_logger()->withName( 'site-command' ); $error = error_get_last(); - // Check if the $this->site_data is set and it is array and $this->site_data['site_url'] is set. - if ( isset( $this->site_data ) && is_array( $this->site_data ) && isset( $this->site_data['site_url'] ) ) { - // release lock if there. - $lock_file = EE_BACKUP_DIR . '/' . $this->site_data['site_url'] . '.lock'; - if ( $this->fs->exists( $lock_file ) ) { - $this->fs->remove( $lock_file ); - } - } + // The per-site backup/restore lock is now an flock() held by + // Site_Backup_Restore and released automatically on process exit. It must + // NOT be deleted here: unlinking a file that another process currently + // holds an flock on lets a later process create a fresh inode at the same + // path and acquire its own lock, silently breaking mutual exclusion. if ( isset( $error ) && $error['type'] === E_ERROR ) { \EE::warning( 'An Error occurred. Initiating clean-up.' ); From 7a2404a07138f00a27806335d334ce95eb820edd Mon Sep 17 00:00:00 2001 From: Riddhesh Sanghvi Date: Mon, 29 Jun 2026 23:24:43 +0530 Subject: [PATCH 2/2] docs(backup): fix grammar in per-site lock comment "a flock" reads correctly (flock is pronounced with a consonant sound); addresses a review comment on the lock-robustness change. --- src/helper/class-ee-site.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/helper/class-ee-site.php b/src/helper/class-ee-site.php index eea6df62..25d44fb2 100644 --- a/src/helper/class-ee-site.php +++ b/src/helper/class-ee-site.php @@ -2205,10 +2205,10 @@ protected function shut_down_function() { $logger = \EE::get_file_logger()->withName( 'site-command' ); $error = error_get_last(); - // The per-site backup/restore lock is now an flock() held by + // The per-site backup/restore lock is now a flock() held by // Site_Backup_Restore and released automatically on process exit. It must // NOT be deleted here: unlinking a file that another process currently - // holds an flock on lets a later process create a fresh inode at the same + // holds a flock on lets a later process create a fresh inode at the same // path and acquire its own lock, silently breaking mutual exclusion. if ( isset( $error ) && $error['type'] === E_ERROR ) {